diff --git a/.github/workflows/archiver.yml b/.github/workflows/archiver.yml index 114a2ec0ca..5ac17d45a2 100644 --- a/.github/workflows/archiver.yml +++ b/.github/workflows/archiver.yml @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: A. Valassi (Sep 2024) for the MG5aMC CUDACPP plugin. -# Further modified by: A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: D. Massaro, A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin. #---------------------------------------------------------------------------------------------------------------------------------- diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml index 7dd6a2f963..72ffe64b17 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/c-cpp.yml @@ -1,3 +1,8 @@ +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, S. Roiser, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. + name: C/C++ CI on: diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/__init__.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/__init__.py index b21e98934e..b8ac77c3b8 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/__init__.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/__init__.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Sep 2021) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: D. Massaro, O. Mattelaer, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. # AV - Rename the plugin as CUDACPP_OUTPUT (even if the madgraph4gpu directory is still called CUDACPP_SA_OUTPUT) # This can be used in mg5amcnlo in one of two ways: diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/aloha/template_files/gpu/helas.cu b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/aloha/template_files/gpu/helas.cu index 3679e681e1..405faee649 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/aloha/template_files/gpu/helas.cu +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/aloha/template_files/gpu/helas.cu @@ -2,10 +2,10 @@ ! Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. ! Created by: J. Alwall (Sep 2010) for the MG5aMC CPP backend. !========================================================================== -! Copyright (C) 2020-2024 CERN and UCLouvain. +! Copyright (C) 2020-2025 CERN and UCLouvain. ! Licensed under the GNU Lesser General Public License (version 3 or later). ! Modified by: O. Mattelaer (Mar 2020) for the MG5aMC CUDACPP plugin. -! Further modified by: O. Mattelaer, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +! Further modified by: O. Mattelaer, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
!========================================================================== //========================================================================== diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/aloha/template_files/gpu/helas.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/aloha/template_files/gpu/helas.h index fcfc4b3153..67b3ba40c4 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/aloha/template_files/gpu/helas.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/aloha/template_files/gpu/helas.h @@ -2,10 +2,10 @@ ! Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. ! Created by: J. Alwall (Sep 2010) for the MG5aMC CPP backend. !========================================================================== -! Copyright (C) 2020-2024 CERN and UCLouvain. +! Copyright (C) 2020-2025 CERN and UCLouvain. ! Licensed under the GNU Lesser General Public License (version 3 or later). ! Modified by: O. Mattelaer (Mar 2020) for the MG5aMC CUDACPP plugin. -! Further modified by: O. Mattelaer, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +! Further modified by: O. Mattelaer, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. !========================================================================== //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py index b1739da73d..262d39a736 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py @@ -1,4 +1,4 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Aug 2023) for the MG5aMC CUDACPP plugin. # Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_cc.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_cc.inc index 3f2f65688f..1e2905dad8 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_cc.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_cc.inc @@ -1,10 +1,10 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. 
//========================================================================== // This file has been automatically generated for CUDA/C++ standalone by %(info_lines)s diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc index 77538b7e1c..a7b1c6f9fd 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc @@ -1,10 +1,10 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by %(info_lines)s @@ -217,7 +217,7 @@ namespace mg5amcCpu #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #endif // Compute the output couplings (e.g. gc10 and gc11) from the input gs - template + template __device__ inline void G2COUP( const fptype gs[], fptype couplings[], diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h index 49b928db67..a45024704a 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h @@ -1,4 +1,4 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. // Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
#ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include <cassert> //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1,
type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h index 6a4b946e74..086aa6a616 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h @@ -1,4 +1,4 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. // Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin.
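
Aside, not part of the patch: the gpu*/gpuBlas* macro layer above lets one source file compile against either CUDA (nvcc) or HIP (hipcc), and the gpuBlasT* aliases pick the single or double precision BLAS flavour from MGONGPU_FPTYPE2_FLOAT. The sketch below shows how a caller might drive a GEMM through this layer; all function and buffer names here are hypothetical, and checkGpuBlas is the error-checking helper added to GpuRuntime.h in the next hunk.

// Hypothetical sketch: devTmp = devColorMat * devJampsRe through the macro layer above
#include "GpuAbstraction.h"
#include "GpuRuntime.h"
void sketchBlasGemm( const fptype2* devColorMat, const fptype2* devJampsRe, fptype2* devTmp, int ncolor, int nevt )
{
#ifndef MGONGPU_HAS_NO_BLAS
  gpuStream_t stream;
  gpuStreamCreate( &stream ); // checkGpu is built into the gpuStream* macros
  gpuBlasHandle_t handle;
  checkGpuBlas( gpuBlasCreate( &handle ) );
  checkGpuBlas( gpuBlasSetStream( handle, stream ) ); // BLAS calls on this handle run on 'stream'
  const fptype2 alpha = 1, beta = 0;
  // gpuBlasTgemm resolves to cublasSgemm/cublasDgemm under nvcc and hipblasSgemm/hipblasDgemm under hipcc
  checkGpuBlas( gpuBlasTgemm( handle, GPUBLAS_OP_N, GPUBLAS_OP_N, ncolor, nevt, ncolor, &alpha, devColorMat, ncolor, devJampsRe, ncolor, &beta, devTmp, ncolor ) );
  checkGpuBlas( gpuBlasDestroy( handle ) );
  gpuStreamDestroy( stream );
#endif
}
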
@@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc index 703ea3781c..a68ae314eb 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc @@ -1,4 +1,4 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. // Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. @@ -166,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a host array" ); @@ -193,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -208,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -220,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ?
m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -314,16 +314,28 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() + , m_pHelWfs() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_helBlasHandles() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -341,12 +353,82 @@ namespace mg5amcGpu sstr << "MatrixElementKernelDevice: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); + // Create the "one-helicity" wavefunction buffer that will be used for helicity filtering + m_pHelWfs.reset( new DeviceBufferSimple( CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices?
+ std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { +#ifndef MGONGPU_HAS_NO_BLAS + if( m_helBlasHandles[ihel] ) gpuBlasDestroy( m_helBlasHandles[ihel] ); +#endif + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -363,21 +445,61 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. 
Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity + // Attach a different stream to each cuBLAS/hipBLAS handle + if( m_blasColorSum ) + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + checkGpuBlas( gpuBlasCreate( &m_helBlasHandles[ighel] ) ); + checkGpuBlas( gpuBlasSetStream( m_helBlasHandles[ighel], m_helStreams[ighel] ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_helBlasHandles[ighel], CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelWfs.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -385,17 +507,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* ghelBlasHandles = ( m_blasColorSum ? m_helBlasHandles : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* ghelBlasHandles = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -403,8 +527,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? 
m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h index 8da04d7945..c901874333 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h @@ -1,4 +1,4 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. // Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. @@ -8,9 +8,12 @@ #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include <memory> #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -191,12 +194,24 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelJamps; + + // The super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelWfs; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +220,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr<DeviceBufferSimple2> m_pHelBlasTmp; + + // The array of cuBLAS/hipBLAS handles (one for each good helicity) + gpuBlasHandle_t m_helBlasHandles[CPPProcess::ncomb]; // reserve ncomb handles (but only nGoodHel <= ncomb will be used) +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only
nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessAmplitudes.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessAmplitudes.h index 0d92f69c43..a49f041e05 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessAmplitudes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessAmplitudes_H #define MemoryAccessAmplitudes_H 1 @@ -10,10 +10,6 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_AMPLITUDES 1 - // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -23,120 +19,11 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // A class describing the internal layout of memory buffers for amplitudes - // This implementation uses an AOSOA[npagA][nx2][neppA] where nevt=npagA*neppA - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessAmplitudesBase //_AOSOAv1 - { - public: - - // Number of Events Per Page in the amplitude AOSOA memory buffer layout - static constexpr int neppA = 1; // AOS (just a test...) - - private: - - friend class MemoryAccessHelper; - friend class KernelAccessHelper; - friend class KernelAccessHelper; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) - { - const int ipagA = ievt / neppA; // #event "A-page" - const int ieppA = ievt % neppA; // #event in the current event A-page - constexpr int ix2 = 0; - return &( buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA] ); // AOSOA[ipagA][ix2][ieppA] - } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... 
args" to "const int ix2" and rename "Field" as "Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int ix2 ) - { - constexpr int ipagA = 0; - constexpr int ieppA = 0; - return buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA]; // AOSOA[ipagA][ix2][ieppA] - } - }; - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessAmplitudes : public MemoryAccessAmplitudesBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2 = MemoryAccessHelper::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2Const = - MemoryAccessHelper::template decodeRecordConst; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt, const int ix2 ) <===] - static constexpr auto ieventAccessIx2 = - MemoryAccessHelper::template ieventAccessField; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===] - static constexpr auto ieventAccessIx2Const = - MemoryAccessHelper::template ieventAccessFieldConst; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations + // A class providing trivial access to amplitude memory buffers template class KernelAccessAmplitudes { public: - -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2 = - KernelAccessHelper::template kernelAccessField; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) 
===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2Const = - KernelAccessHelper::template kernelAccessFieldConst; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { @@ -148,8 +35,6 @@ namespace mg5amcCpu { return reinterpret_cast<const cxtype_sv*>( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES }; //---------------------------------------------------------------------------- diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplings.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplings.h index 73ce5b3325..007485ea58 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplings.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplings.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessCouplings_H #define MemoryAccessCouplings_H 1 @@ -235,7 +235,7 @@ namespace mg5amcCpu /* fptype_sv& real = kernelAccessIx2( buffer, 0 ); fptype_sv& imag = kernelAccessIx2( buffer, 1 ); - printf( "C_ACCESS::kernelAccess: pbuffer=%%p pr=%%p pi=%%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccess: pbuffer=%%p pr=%%p pi=%%p\n", buffer, &real, &imag ); return cxtype_sv_ref( real, imag ); */ return cxtype_sv_ref( kernelAccessIx2( buffer, 0 ), @@ -250,7 +250,7 @@ namespace mg5amcCpu /* const fptype_sv& real = kernelAccessIx2Const( buffer, 0 ); const fptype_sv& imag = kernelAccessIx2Const( buffer, 1 ); - printf( "C_ACCESS::kernelAccessConst: pbuffer=%%p pr=%%p pi=%%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccessConst: pbuffer=%%p pr=%%p pi=%%p\n", buffer, &real, &imag ); return cxtype_sv( real, imag ); */ return cxtype_sv( kernelAccessIx2Const( buffer, 0 ), diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessWavefunctions.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessWavefunctions.h index 9f4c620bc7..bbffc1fb36 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessWavefunctions.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin.
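
Aside, not part of the patch: with the non-trivial AOSOA branch deleted above, "trivial access" for amplitudes means the fptype buffer is simply reinterpreted in place as complex values, with no ipagA/ieppA page arithmetic. A minimal sketch, assuming a no-SIMD C++ build where cxtype_sv is plain cxtype and assuming the stripped template parameter of KernelAccessAmplitudes is the usual onDevice bool; the buffer and values are hypothetical.

// Hypothetical sketch of trivial amplitude access on the host
#include "MemoryAccessAmplitudes.h"
void sketchAmpAccess()
{
  using namespace mg5amcCpu;
  fptype buffer[2 * mgOnGpu::nx2] = { 0 }; // two complex amplitudes as interleaved re/im pairs
  cxtype_sv* amps = KernelAccessAmplitudes<false>::kernelAccess( buffer ); // just a reinterpret_cast
  amps[0] = cxmake( 1., 0. ); // first amplitude, no AOSOA decoding needed
}
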
#ifndef MemoryAccessWavefunctions_H #define MemoryAccessWavefunctions_H 1 @@ -10,9 +10,7 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 +#include "CPPProcess.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL @@ -23,147 +21,44 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // A class describing the internal layout of memory buffers for wavefunctions - // This implementation uses an AOSOA[npagW][nw6][nx2][neppW] where nevt=npagW*neppW - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessWavefunctionsBase //_AOSOAv1 +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessWavefunctions { public: - - // Number of Events Per Page in the wavefunction AOSOA memory buffer layout - static constexpr int neppW = 1; // AOS (just a test...) - - private: - - friend class MemoryAccessHelper; - friend class KernelAccessHelper; - friend class KernelAccessHelper; - - // The number of components of a (fermion or vector) wavefunction - static constexpr int nw6 = mgOnGpu::nw6; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) + static __host__ __device__ inline cxtype_sv* + kernelAccess( fptype* buffer ) { - const int ipagW = ievt / neppW; // #event "W-page" - const int ieppW = ievt % neppW; // #event in the current event W-page - constexpr int iw6 = 0; - constexpr int ix2 = 0; - return &( buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW] ); // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast<cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts...
args" to "const int iw6, const int ix2" and rename "Field" as "Iw6Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int iw6, - const int ix2 ) + static __host__ __device__ inline const cxtype_sv* + kernelAccessConst( const fptype* buffer ) { - constexpr int ipagW = 0; - constexpr int ieppW = 0; - return buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW]; // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } }; +#endif //---------------------------------------------------------------------------- - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessWavefunctions : public MemoryAccessWavefunctionsBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2 = MemoryAccessHelper::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2Const = - MemoryAccessHelper::template decodeRecordConst; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIw6Ix2( fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2 = - MemoryAccessHelper::template ieventAccessField; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIw6Ix2Const( const fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2Const = - MemoryAccessHelper::template ieventAccessFieldConst; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations - template - class KernelAccessWavefunctions + class HostAccessWavefunctions { public: - -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes 
(input) - // [Signature (non-const) ===> fptype& kernelAccessIw6Ix2( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2 = - KernelAccessHelper::template kernelAccessField; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIw6Ix2Const( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2Const = - KernelAccessHelper::template kernelAccessFieldConst; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { return reinterpret_cast<cxtype_sv*>( buffer ); } static __host__ __device__ inline const cxtype_sv* kernelAccessConst( const fptype* buffer ) { return reinterpret_cast<const cxtype_sv*>( buffer ); } -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS }; //---------------------------------------------------------------------------- - typedef KernelAccessWavefunctions<false> HostAccessWavefunctions; - typedef KernelAccessWavefunctions<true> DeviceAccessWavefunctions; - - //---------------------------------------------------------------------------- - } // end namespace mg5amcGpu/mg5amcCpu #endif // MemoryAccessWavefunctions_H diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h index deddc425f5..47e8acbcfa 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
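
Aside, not part of the patch: the new DeviceAccessWavefunctions above replaces the generic kernel-access templates with a per-event offset, so each GPU thread addresses its own slice of CPPProcess::nw6 * mgOnGpu::nx2 floats. A minimal sketch of a kernel using it; the kernel name and buffer are hypothetical.

// Hypothetical CUDA/HIP sketch: one thread per event, each owning nw6 complex wavefunction components
#include "MemoryAccessWavefunctions.h"
__global__ void sketchZeroFirstWfComponent( fptype* allWfs ) // size nevt * CPPProcess::nw6 * mgOnGpu::nx2
{
  using namespace mg5amcGpu;
  cxtype_sv* wf = DeviceAccessWavefunctions::kernelAccess( allWfs ); // offsets by ievt * nw6 * nx2 internally
  wf[0] = cxmake( 0., 0. ); // zero this event's first component
}
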
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_%(model_name)s_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef 
PinnedHostBuffer PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/color_sum.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/color_sum.cc new file mode 100644 index 0000000000..9e8360023b --- /dev/null +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/color_sum.cc @@ -0,0 +1,374 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
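The jamp buffers added in the MemoryBuffers.h hunk above hold sizePerEventJamps = ncolor * nx2 fptypes per event, i.e. jamp[ncolor*2*nevt] fptypes for one helicity, which is exactly the allJamps layout expected by the color_sum functions in the new files below; in mixed-precision mode the scratch buffer grows to blasTmp[(2*ncolor*2+1)*nevt] fptype2s (two jamp-sized planes plus one MEs-sized plane). A standalone bookkeeping sketch (ncolor and nevt are toy values here; in generated code ncolor is CPPProcess::ncolor and nx2 is mgOnGpu::nx2):

#include <cstdio>
int main()
{
  const int ncolor = 24, nx2 = 2, nevt = 16384;       // toy values
  const int sizePerEventJamps = ncolor * nx2;         // fptypes per event
  const int nJamps = sizePerEventJamps * nevt;        // jamp[ncolor*2*nevt] for one helicity
  const int nBlasTmp = ( 2 * ncolor * 2 + 1 ) * nevt; // mixed mode: two jamp-sized fptype2 buffers plus one nevt-sized fptype2 buffer
  printf( "jamps: %d fptypes, mixed-mode blasTmp: %d fptype2s\n", nJamps, nBlasTmp );
  return 0;
}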
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** +%(color_matrix_lines)s + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
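The rewrite described in this comment can be checked with a few lines of standalone C++: for a real symmetric M, the quadratic form conj(J).M.J with J = A + iB reduces to A.M.A + B.M.B, and each of these can be folded onto the upper triangle with doubled off-diagonal terms (toy matrix and toy jamps below, not a generated color matrix):

#include <cassert>
#include <cmath>
int main()
{
  const double M[2][2] = { { 3, 1 }, { 1, 2 } };           // toy real symmetric "color matrix"
  const double A[2] = { 0.5, -1.0 }, B[2] = { 2.0, 0.25 }; // toy Re(jamp), Im(jamp)
  double full = 0; // full quadratic form A.M.A + B.M.B over all (i,j)
  for( int i = 0; i < 2; i++ )
    for( int j = 0; j < 2; j++ )
      full += A[i] * M[i][j] * A[j] + B[i] * M[i][j] * B[j];
  double tri = 0; // triangular form: diagonal once, off-diagonal terms doubled
  for( int i = 0; i < 2; i++ )
  {
    double ztR = M[i][i] * A[i], ztI = M[i][i] * B[i];
    for( int j = i + 1; j < 2; j++ )
    {
      ztR += 2 * M[i][j] * A[j];
      ztI += 2 * M[i][j] * B[j];
    }
    tri += A[i] * ztR + B[i] * ztI;
  }
  assert( std::abs( full - tri ) < 1e-12 ); // the two forms agree
  return 0;
}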
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 
s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = 
allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/color_sum.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/color_sum.h new file mode 100644 index 0000000000..71b5b1eaad --- /dev/null +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/color_sum.h @@ -0,0 +1,105 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
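Steps 1 and 2 in color_sum_blas above amount, per event, to ME[ievt] += Jamps(ievt,:) . ( NormColMat * Jamps(ievt,:)^T ), evaluated once for the real plane and once for the imaginary plane. A minimal standalone sketch of the same two cuBLAS calls (raw single-precision cublasSgemm/cublasSgemmStridedBatched instead of the gpuBlasTgemm* wrappers, a toy 2-color matrix, real plane only; all names and sizes are illustrative):

#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <cstdio>
int main()
{
  const int ncolor = 2, nevt = 4;
  float hColMat[ncolor * ncolor] = { 3, 1, 1, 2 }; // toy normalized color matrix (column-major ncolor x ncolor)
  float hJampsR[ncolor * nevt];                    // real jamp plane (column-major nevt x ncolor, ievt fastest as in "new1")
  for( int icol = 0; icol < ncolor; icol++ )
    for( int ievt = 0; ievt < nevt; ievt++ )
      hJampsR[icol * nevt + ievt] = 0.1f * ( icol + 1 ) * ( ievt + 1 );
  float *dColMat, *dJampsR, *dZtempR, *dMEs;
  cudaMalloc( (void**)&dColMat, sizeof( hColMat ) );
  cudaMalloc( (void**)&dJampsR, sizeof( hJampsR ) );
  cudaMalloc( (void**)&dZtempR, ncolor * nevt * sizeof( float ) );
  cudaMalloc( (void**)&dMEs, nevt * sizeof( float ) );
  cudaMemcpy( dColMat, hColMat, sizeof( hColMat ), cudaMemcpyHostToDevice );
  cudaMemcpy( dJampsR, hJampsR, sizeof( hJampsR ), cudaMemcpyHostToDevice );
  cudaMemset( dMEs, 0, nevt * sizeof( float ) ); // beta=1 in step 2 accumulates into MEs
  cublasHandle_t handle;
  cublasCreate( &handle );
  float alpha = 1, beta0 = 0, beta1 = 1;
  // Step 1: Ztemp(ncolor x nevt) = ColMat(ncolor x ncolor) * JampsR^T (JampsR is nevt x ncolor, hence OP_T)
  cublasSgemm( handle, CUBLAS_OP_N, CUBLAS_OP_T, ncolor, nevt, ncolor,
               &alpha, dColMat, ncolor, dJampsR, nevt, &beta0, dZtempR, ncolor );
  // Step 2: nevt batched 1x1 "gemms", i.e. per-event dot products MEs[ievt] += JampsR(ievt,:) . Ztemp(:,ievt)
  cublasSgemmStridedBatched( handle, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, ncolor,
                             &alpha, dJampsR, nevt, 1, dZtempR, ncolor, ncolor,
                             &beta1, dMEs, 1, 1, nevt );
  float hMEs[nevt];
  cudaMemcpy( hMEs, dMEs, sizeof( hMEs ), cudaMemcpyDeviceToHost );
  for( int ievt = 0; ievt < nevt; ievt++ ) printf( "ME[%d] = %f\n", ievt, hMEs[ievt] );
  cublasDestroy( handle );
  cudaFree( dColMat ); cudaFree( dJampsR ); cudaFree( dZtempR ); cudaFree( dMEs );
  return 0;
}

In the generated code the same pattern runs twice (real and imaginary planes) with beta=1 so both accumulate into the same MEs, and in mixed mode the jamps are first converted to fptype2 by convertD2F_Jamps.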
+ +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype_ref( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + static __device__ inline const cxtype + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + }; +#else + class HostAccessJamp + { + public: + static inline cxtype_sv& + kernelAccessIcol( cxtype_sv* buffer, const int icol ) + { + return buffer[icol]; + } + static inline cxtype_sv& + kernelAccessIcol( fptype* buffer, const int icol ) + { + return reinterpret_cast( buffer )[icol]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + 
//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_cc.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_cc.inc index cace65e4b8..aa32fdc2a6 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_cc.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_cc.inc @@ -2,10 +2,10 @@ ! Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. ! Created by: J. Alwall (Sep 2010) for the MG5aMC CPP backend. !========================================================================== -! Copyright (C) 2020-2024 CERN and UCLouvain. +! Copyright (C) 2020-2025 CERN and UCLouvain. ! Licensed under the GNU Lesser General Public License (version 3 or later). ! Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -! Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +! Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. !========================================================================== %(function_definitions)s } // end namespace diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_h.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_h.inc index 7175e85bb2..006405432d 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_h.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_h.inc @@ -2,10 +2,10 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Sep 2010) for the MG5aMC backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. 
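The "new1" striding hard-coded in DeviceAccessJamp above stores, for one helicity, the real parts of all colors for all events first and then all the imaginary parts, with the event index running fastest; this is what lets cuBLAS treat each plane directly as a column-major nevt x ncolor matrix. A worked index calculation (toy sizes; in the kernels nevt and ievt come from the CUDA grid):

#include <cstdio>
int main()
{
  const int ncolor = 3, nevt = 8; // toy sizes
  const int icol = 1, ievt = 2;
  const int iReal = 0 * ncolor * nevt + icol * nevt + ievt; // real plane, ievt fastest: 10
  const int iImag = 1 * ncolor * nevt + icol * nevt + ievt; // imag plane follows all reals: 34
  printf( "jamp(icol=%d,ievt=%d): real at [%d], imag at [%d]\n", icol, ievt, iReal, iImag );
  return 0;
}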
//========================================================================== // This file has been automatically generated for %(output_name)s by %(info_lines)s diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk index 32d12a5bba..22acd3abe9 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -479,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -599,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -782,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -801,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -834,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %%bin/nvc++,%%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -878,6 +931,7 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 @@ -979,6 +1033,7 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_test.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_test.mk index 4934c9a53f..1f9f8bbc46 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_test.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_test.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. 
THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/diagram_boilerplate.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/diagram_boilerplate.h new file mode 100644 index 0000000000..96a34fb1bf --- /dev/null +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/diagram_boilerplate.h @@ -0,0 +1,103 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin. + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-local-typedefs" // for CI_ACCESS and CD_ACCESS + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + + //------------- + // GPU only + //------------- + + //using namespace mg5amcGpu; + using W_ACCESS = DeviceAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = DeviceAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( channelIds ); +#endif + + // Wavefunctions + // Buffer wfs for one helicity and nevt events is a DeviceBufferSimple with ( nwf * nevt * nw6 * nx2 ) fptypes + // The striding between the nwf wavefunction buffers is ( nevt * nw6 * nx2 ) fptypes + // Internally diagramXXX methods pass a w_fp[iwf] to ixx/FFV methods (as argument 'fptype wavefunctions[]') + // Internally ixx/FFV methods call 'cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions )' and then use fi[iw6] + // This means that the fi pointer must point to a [RIRIRIRIRIRI] contiguous buffer of size nw6*nx2=12 + // The striding between events is nw6*nx2=12 and this is what W_ACCESS::kernelAccess must respect + // (En passant, note that this means that events cannot be contiguous in the present code, memory is not coalesced) + const int nevt = gridDim.x * blockDim.x; + fptype* w_fp[nwf]; + for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = wfs + iwf * nevt * nw6 * mgOnGpu::nx2; + + // Couplings + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic push +#pragma nv_diag_suppress 186 // e.g. 
<> +#endif + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic pop +#endif + const fptype* COUPs[nxcoup]; + for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; + +#else + + //------------- + // C++ only + //------------- + + //using namespace mg5amcCpu; + using W_ACCESS = HostAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = HostAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current SIMD event page (C++) + unsigned int channelId = *channelIds; +#endif + + // Wavefunctions + // Reinterpret wfs as "cxtype_sv w_sv[nwf][nw6]" and build "fptype* w_fp[nwf]" where "w_fp[iwf] = (fptype*)( w_sv[iwf] )" + fptype (*w_fp)[nw6 * neppV * mgOnGpu::nx2] = (fptype (*)[nw6 * neppV * mgOnGpu::nx2])(wfs); + +#endif + + //------------- + // GPU or C++ + //------------- + + // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) + cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram + fptype* amp_fp; // proof of concept for using fptype* in the interface + amp_fp = reinterpret_cast( amp_sv ); + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) + fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); +#else + assert( channelIds == nullptr ); + assert( numerators == nullptr ); + assert( denominators == nullptr ); +#endif /* clang-format on */ + +#pragma GCC diagnostic pop diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/diagram_h.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/diagram_h.inc new file mode 100644 index 0000000000..7aee564187 --- /dev/null +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/diagram_h.inc @@ -0,0 +1,11 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
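The wavefunction layout described in the diagram_boilerplate.h comments above (per-wavefunction blocks of nevt event records, each record a contiguous [RIRIRIRIRIRI] run of nw6*nx2 = 12 fptypes) can be made concrete with a standalone index calculation (toy sizes; in generated code nwf is process-specific and nevt comes from the CUDA grid):

#include <cstdio>
int main()
{
  const int nevt = 4, nw6 = 6, nx2 = 2, nwf = 5; // toy sizes
  const int iwf = 3, ievt = 1, iw6 = 2;
  const int wfBase = iwf * nevt * nw6 * nx2;     // start of wavefunction iwf (stride nevt*nw6*nx2 between wavefunctions)
  const int evBase = wfBase + ievt * nw6 * nx2;  // start of this event's 12-fptype [RIRI...] record
  printf( "wf %d of %d, component iw6=%d: real at [%d], imag at [%d]\n", iwf, nwf, iw6, evBase + iw6 * nx2, evBase + iw6 * nx2 + 1 );
  return 0;
}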
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- +%(code)s + +/* clang-format on */ diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h index 68bbf1b934..dd695e591a 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose whether cuBLAS and hipBLAS are supported for computing color sums +// For both CUDA and HIP, by default, BLAS is enabled, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!]
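The MGONGPU_HAS_NO_BLAS defaults above mean that nvcc and hipcc builds compile the BLAS color-sum path in unless it is explicitly switched off, while C++-only builds never do. A standalone sketch mirroring that #ifdef chain (compiled as plain C++ it prints the no-BLAS branch; __CUDACC__ and __HIPCC__ are defined by nvcc and hipcc respectively):

#include <cstdio>
#ifdef __CUDACC__
// nvcc default: MGONGPU_HAS_NO_BLAS left undefined (cuBLAS available) unless -DMGONGPU_HAS_NO_BLAS is passed
#elif defined __HIPCC__
// hipcc default: MGONGPU_HAS_NO_BLAS left undefined (hipBLAS available) unless -DMGONGPU_HAS_NO_BLAS is passed
#else
#define MGONGPU_HAS_NO_BLAS 1 // C++-only builds: no BLAS color sums
#endif
int main()
{
#ifdef MGONGPU_HAS_NO_BLAS
  printf( "color sums: plain kernels only\n" );
#else
  printf( "color sums: BLAS path compiled in (kernel fallback still selectable at run time)\n" );
#endif
  return 0;
}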
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h index 92d74fd6db..e98e925f2a 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -717,12 +717,24 @@ namespace mg5amcCpu : m_preal( &r ), m_pimag( &i ) {} // copy (create from) const refs cxtype_ref& operator=( const cxtype_ref& ) = delete; //__host__ __device__ cxtype_ref& operator=( cxtype_ref&& c ) {...} // REMOVED! Should copy refs or copy values? 
No longer needed in cxternary - __host__ __device__ cxtype_ref& operator=( const cxtype& c ) + __host__ __device__ cxtype_ref& operator=( const cxtype& c ) // copy (assign) const values { *m_preal = cxreal( c ); *m_pimag = cximag( c ); return *this; - } // copy (assign) non-const values + } + __host__ __device__ cxtype_ref& operator+=( const cxtype& c ) + { + *m_preal += cxreal( c ); + *m_pimag += cximag( c ); + return *this; + } + __host__ __device__ cxtype_ref& operator-=( const cxtype& c ) + { + *m_preal -= cxreal( c ); + *m_pimag -= cximag( c ); + return *this; + } __host__ __device__ operator cxtype() const { return cxmake( *m_preal, *m_pimag ); } private: fptype* const m_preal; // const pointer to non-const fptype R diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc index 444c848e10..43fa9db7d8 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc @@ -1,10 +1,10 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by %(info_lines)s @@ -14,6 +14,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" %(hel_amps_h)s #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -23,6 +24,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_class.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_class.inc index 4e5e942a41..6d9568490d 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_class.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_class.inc @@ -1,10 +1,10 @@ ! Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. ! Created by: J. Alwall (Jul 2010) for the MG5aMC CPP backend. !========================================================================== -! Copyright (C) 2020-2024 CERN and UCLouvain. +! Copyright (C) 2020-2025 CERN and UCLouvain. ! Licensed under the GNU Lesser General Public License (version 3 or later). ! Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -! Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +! Further modified by: A. 
Valassi (2021-2025) for the MG5aMC CUDACPP plugin. !========================================================================== //========================================================================== // A class for calculating the matrix elements for @@ -50,17 +50,17 @@ static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = %(nbhel)d; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = %(ndiagrams)d; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = %(ncolor)s; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = __NWF__; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = %(nexternal)d; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = %(nwavefuncs)d; // (?!?! this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = %(namp)d; //static const int ncomb = %(ncomb)d; // CPPProcess::ncomb diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc index 76b6e773bd..043f311587 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc @@ -1,7 +1,7 @@ ! Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. ! Created by: J. Alwall (Jul 2010) for the MG5aMC CPP backend. !========================================================================== -! Copyright (C) 2020-2024 CERN and UCLouvain. +! Copyright (C) 2020-2025 CERN and UCLouvain. ! Licensed under the GNU Lesser General Public License (version 3 or later). ! Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. ! Further modified by: J. Teig, A. Valassi, Z. Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. @@ -16,20 +16,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_%(model_name)s_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_%(model_name)s_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = %(ncolor)s; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -88,12 +84,58 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // 
SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- @@ -117,8 +159,10 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif + + // Enable SIGFPE traps for Floating Point Exceptions #ifdef MGONGPUCPP_DEBUG - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + fpeEnable(); #endif } @@ -148,6 +192,10 @@ namespace mg5amcCpu //m_pars->printDependentCouplings(); // now computed event-by-event (running alphas #373) } %(initProc_lines)s +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory %(cipdassign)s @@ -183,6 +231,10 @@ namespace mg5amcCpu //Parameters_%(model_name)s::printDependentCouplings(); // now computed event-by-event (running alphas #373) } %(hardcoded_initProc_lines)s +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -285,26 +337,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings, bsmIndepParam ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -312,25 +364,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at 
+#ifdef MGONGPUCPP_GPUIMPL + __global__ void + add_and_select_hel( int* allselhel, // output: helicity selection[nevt] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection + fptype* ghelAllMEs, // input/tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* allMEs, // output: allMEs[nevt], sum of |M|^2 over good helicities + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + // Sum the MEs of all good helicities for this event + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%%4d rndhel=%%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%%4d ihel=%%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d which is greater than nchannels=%%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
+ const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%%d (invalid SDE iconfig=%%d > nconfig=%%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%%4d rndcol=%%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%%d icol=%%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //--------------------------------------------------------------------------
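Both select_col above and the helicity choice in add_and_select_hel rely on the same inverse-CDF sampling idea: accumulate non-negative weights into a running sum (targetamp above), then return the first bin whose normalised cumulative value exceeds a uniform random number in [0,1). A self-contained host-side illustration with hypothetical names, not plugin code:

  // Inverse-CDF selection over n weights (illustrative helper only).
  #include <cassert>
  int selectFromWeights( const double* weights, int n, double u ) // u uniform in [0,1)
  {
    assert( n > 0 && n <= 64 );
    double cumul[64]; // fixed illustrative bound; select_col uses ncolor instead
    double running = 0;
    for( int i = 0; i < n; i++ ) { running += weights[i]; cumul[i] = running; }
    for( int i = 0; i < n; i++ )
      if( u < cumul[i] / cumul[n - 1] ) return i; // first bin whose normalised CDF exceeds u
    return n - 1; // guard against rounding when u is very close to 1
  }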
// Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -466,22 +722,16 @@ namespace mg5amcCpu // These variable are not used anywhere else in the code and their scope is limited to this sanity check { // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) - constexpr int nprocesses = %(nproc)i; + constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = %(proc_id)i; // code generation source: %(proc_id_source)s + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { %(den_factors)s }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) %% mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) %% mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc 
b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc index 895b7ec1d6..a9bd9c7728 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc @@ -1,10 +1,10 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by %(info_lines)s @@ -17,6 +17,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_%(model_name)s.h" #include @@ -46,23 +47,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -76,34 +80,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // 
input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // 
tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc index 2700d7e7da..e0603558aa 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc @@ -1,152 +1,15 @@ ! Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. ! Created by: J. Alwall (Jul 2010) for the MG5aMC CPP backend. !========================================================================== -! Copyright (C) 2020-2024 CERN and UCLouvain. +! Copyright (C) 2020-2025 CERN and UCLouvain. ! Licensed under the GNU Lesser General Public License (version 3 or later). ! Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -! Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +! Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. !========================================================================== + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called %(process_class_name)s::matrix_%(proc_name)s(%(matrix_args)s)?) -%(color_matrix_lines)s - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! 
skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; -#else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); -#endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; -#else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); -#endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); -#else - deltaMEs += deltaMEs2; -#endif - // === C++ END === -#else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + 
ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === -#endif - } - - // *** STORE THE RESULTS *** - - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* -#ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%%6d ihel=%%2d me_running=%%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%%6d ihel=%%2d me_running=%%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%%6d ihel=%%2d me_running=%%f\n", ievt0, ihel, MEs_sv ); -#endif -#endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); return; } diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc index d49047a623..32b552d101 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc @@ -1,22 +1,25 @@ ! Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. ! Created by: J. Alwall (Jul 2010) for the MG5aMC CPP backend. !========================================================================== -! Copyright (C) 2020-2024 CERN and UCLouvain. +! Copyright (C) 2020-2025 CERN and UCLouvain. ! Licensed under the GNU Lesser General Public License (version 3 or later). ! Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -! Further modified by: O. Mattelaer, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +! Further modified by: O. Mattelaer, J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. 
!========================================================================== - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -42,93 +45,63 @@ #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamps for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators and denominators for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - }
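The scheduling idea behind this rewritten sigmaKin is one GPU stream per good helicity, so that kernels for different helicities may overlap on the device, with a single synchronisation point before the combined helicity/color choice. The bare pattern is sketched below, stripped of all plugin specifics: the work kernel and buffer layout are invented for the example, and only standard CUDA API calls are used.

  #include <cuda_runtime.h>
  __global__ void work( float* buf ) { buf[blockDim.x * blockIdx.x + threadIdx.x] += 1.f; }
  void launchPerHelicity( float** bufs, cudaStream_t* streams, int nGoodHel, int blocks, int threads )
  {
    for( int ighel = 0; ighel < nGoodHel; ighel++ )
      work<<<blocks, threads, 0, streams[ighel]>>>( bufs[ighel] ); // kernels in distinct streams may overlap
    cudaDeviceSynchronize(); // wait for all helicity streams before the helicity/color selection
  }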
// Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%%4d rndhel=%%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%%4d ihel=%%4d\n", ievt, ihelF ); - break; - } } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) Then, in multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%%d (invalid SDE iconfig=%%d\n > nconfig=%%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%%4d rndcol=%%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%%d icol=%%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? 
&( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -170,7 +143,7 @@ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -193,7 +166,7 @@ // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -202,25 +175,31 @@ } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -230,8 +209,10 @@ for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%%4d ighel=%%d MEs_ighel=%%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%%4d ighel=%%d MEs_ighel=%%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -247,11 +228,12 @@ //printf( "sigmaKin: ievt=%%4d rndhel=%%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%%4d ighel=%%d MEs_ighel=%%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%%4d ihel=%%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%%4d ihel=%%4d\n", ievt2, ihelF ); break; } } @@ -353,14 +335,15 @@ #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc index 4eec5db13c..216a90a302 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc @@ -1,7 +1,7 
@@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py index 554c97974b..ec4c6fab01 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Sep 2021) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. import os import sys @@ -211,7 +211,7 @@ def get_header_txt(self, name=None, couplings=None,mode=''): output = '%(doublec)s allvertexes[]' % { 'doublec': self.type2def['double']} comment_output = 'amplitude \'vertex\'' - template = 'template' + template = 'template' else: output = '%(doublec)s all%(spin)s%(id)d[]' % { 'doublec': self.type2def['double'], @@ -219,7 +219,7 @@ def get_header_txt(self, name=None, couplings=None,mode=''): 'id': self.outgoing} ###self.declaration.add(('list_complex', output)) # AV BUG FIX - THIS IS NOT NEEDED AND IS WRONG (adds name 'cxtype_sv V3[]') comment_output = 'wavefunction \'%s%d[6]\'' % ( self.particles[self.outgoing -1], self.outgoing ) # AV (wavefuncsize=6) - template = 'template' + template = 'template' comment = '// Compute the output %s from the input wavefunctions %s' % ( comment_output, ', '.join(comment_inputs) ) # AV indent = ' ' * len( ' %s( ' % name ) out.write(' %(comment)s\n %(template)s\n %(prefix)s void\n %(name)s( const %(args)s,\n%(indent)s%(output)s )%(suffix)s' % @@ -258,7 +258,7 @@ def get_declaration_txt(self, add_i=True): if type.startswith('list'): out.write(' const %s* %s = W_ACCESS::kernelAccessConst( all%s );\n' % ( self.type2def[type[5:]+'_v'], name, name ) ) if name.startswith('COUP'): # AV from cxtype_sv to fptype array (running alphas #373) - out.write(' const cxtype_sv %s = C_ACCESS::kernelAccessConst( all%s );\n' % ( name, name ) ) + out.write(' const cxtype_sv %s = CD_ACCESS::kernelAccessConst( all%s );\n' % ( name, name ) ) if not self.offshell: vname = 'vertex' access = 'A_ACCESS' @@ -961,9 +961,9 @@ def super_generate_parameters_class_files(self): replace_dict['dcoupsetdpar'] = '\n'.join( dcoupsetdpar ) dcoupsetdcoup = [ ' ' + line.replace('constexpr cxsmpl ','out.').replace('mdl_complexi', 'cI') for line in self.write_hardcoded_parameters(list(self.coups_dep.values())).split('\n') if line != '' ] 
replace_dict['dcoupsetdcoup'] = ' ' + '\n'.join( dcoupsetdcoup ) - dcoupaccessbuffer = [ ' fptype* %ss = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_%s );'%( name, name ) for name in self.coups_dep ] + dcoupaccessbuffer = [ ' fptype* %ss = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_%s );'%( name, name ) for name in self.coups_dep ] replace_dict['dcoupaccessbuffer'] = '\n'.join( dcoupaccessbuffer ) + '\n' - dcoupkernelaccess = [ ' cxtype_sv_ref %ss_sv = C_ACCESS::kernelAccess( %ss );'%( name, name ) for name in self.coups_dep ] + dcoupkernelaccess = [ ' cxtype_sv_ref %ss_sv = CD_ACCESS::kernelAccess( %ss );'%( name, name ) for name in self.coups_dep ] replace_dict['dcoupkernelaccess'] = '\n'.join( dcoupkernelaccess ) + '\n' dcoupcompute = [ ' %ss_sv = couplings_sv.%s;'%( name, name ) for name in self.coups_dep ] replace_dict['dcoupcompute'] = '\n'.join( dcoupcompute ) @@ -1300,52 +1300,62 @@ def get_all_sigmaKin_lines(self, color_amplitudes, class_name): color_amplitudes[0], multi_channel_map = multi_channel ) + self.diagram_code = self.helas_call_writer.diagram_code # hack? get code in helascallwriter, write it to diagrams.h in oneprocessexporter ###misc.sprint( 'after get_matrix_element_calls', self.matrix_elements[0].get_number_of_wavefunctions() ) # CORRECT value of nwf, eg 5 for gg_tt assert len(self.matrix_elements) == 1 # how to handle if this is not true? self.couplings2order = self.helas_call_writer.couplings2order self.params2order = self.helas_call_writer.params2order ret_lines.append(""" - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#include \"diagrams.h\" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! 
in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes 
all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -1353,50 +1363,124 @@ def get_all_sigmaKin_lines(self, color_amplitudes, class_name): using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; + + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- #ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\\n", ihel );""") nwavefuncs = self.matrix_elements[0].get_number_of_wavefunctions() ret_lines.append(""" - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = %i; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z)"""%nwavefuncs ) ret_lines.append(""" // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! - //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) 
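The "= {}" remark in the last line above is worth spelling out: for an array whose element type has no user-provided constructor, omitting the initialiser leaves scalar members indeterminate, while "= {}" value-initialises every element to zero. A tiny standalone illustration, with a two-double struct standing in for a scalar cxtype:

  // Minimal illustration of the "= {}" note above (mycx stands in for cxtype).
  struct mycx { double r; double i; }; // no user-provided ctor: members are not zeroed by default
  int main()
  {
    mycx a[4];      // default-initialised: a[i].r and a[i].i are indeterminate (reading them is UB)
    mycx b[4] = {}; // value-initialised: every r and i is exactly 0.0
    (void)a;
    return ( b[0].r == 0. && b[3].i == 0. ) ? 0 : 1;
  }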
+ fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; -#endif +#endif + + // ----------------- + // --- COUPLINGS --- + // ----------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; +#else + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; + const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; +#endif + + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); +#endif + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( jamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); +#endif
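The reinterpret_cast used for 'jamps' just above relies on the complex type being laid out as consecutive real values (real part then imaginary part), so an array of ncolor complex amplitudes can be walked as 2*ncolor reals. A standalone sketch of that aliasing, under the stated layout assumption and with a plain two-double struct standing in for cxtype_sv:

  // Sketch of the jamps pointer aliasing, with an invented stand-in type cx.
  #include <cassert>
  struct cx { double r, i; };
  static_assert( sizeof( cx ) == 2 * sizeof( double ), "layout assumption: cx is two consecutive doubles" );
  int main()
  {
    cx jamp[3] = { { 1, 2 }, { 3, 4 }, { 5, 6 } };
    double* flat = reinterpret_cast<double*>( jamp ); // flat[2*icol] = Re, flat[2*icol+1] = Im
    assert( flat[4] == 5. && flat[5] == 6. );
    return 0;
  }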
jamp_sv : &( jamp_sv[ncolor] ) ); +#endif + + // ------------------ + // --- CHANNELIDS --- + // ------------------ +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; +#else + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; +#endif +#else + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + const unsigned int* channelIds = nullptr; +#endif + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; +#else + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); +#endif +#else + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; +#endif + + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------""") + self.nwavefuncs = self.matrix_elements[0].get_number_of_wavefunctions() # this was for nwf in CPPProcess.cc: now keep it for CPPProcess.h ret_lines += helas_calls else: ret_lines.extend([self.get_sigmaKin_single_process(i, me) \ @@ -1410,6 +1494,7 @@ def get_all_sigmaKin_lines(self, color_amplitudes, class_name): file = self.get_matrix_single_process( i, me, color_amplitudes[i], class_name ) file = '\n'.join( file.split('\n')[8:] ) # skip first 8 lines in process_matrix.inc (copyright) file_extend.append( file ) + assert i == 0, "more than one ME in get_all_sigmaKin_lines" # AV sanity check (added for color_sum.cc but valid independently) ret_lines.extend( file_extend ) return '\n'.join(ret_lines) @@ -1439,7 +1524,7 @@ def generate_process_files(self): self.edit_check_sa() self.edit_mgonGPU() self.edit_processidfile() # AV new file (NB this is Sigma-specific, should not be a symlink to Subprocesses) - + self.edit_colorsum() # AV new file (NB this is Sigma-specific, should not be a symlink to Subprocesses) self.edit_testxxx() # AV new file (NB this is generic in Subprocesses and then linked in Sigma-specific) self.edit_memorybuffers() # AV new file (NB this is generic in Subprocesses and then linked in Sigma-specific) self.edit_memoryaccesscouplings() # AV new file (NB this is generic in Subprocesses and then linked in Sigma-specific) @@ -1459,6 +1544,12 @@ def generate_process_files(self): PLUGIN_export_cpp.cp( ref, self.path + '/../../test/ref' ) ###else: ###misc.sprint( 'Test reference file does not exist and will not be copied: ', ref ) + # Set the value of nwf in CPPProcess.h after generating CPPProcess.cc (workaround for #644) + 
cppprocess_h = os.path.join(self.path, self.include_dir, '%s.h' % self.process_class) + with open(cppprocess_h, 'r') as file: data = file.read().replace('__NWF__', '%d'%self.nwavefuncs) + with open(cppprocess_h, 'w') as file: file.write(data) + # Generate diagram headers after generating CPPProcess.cc + self.edit_diagrams(self.diagram_code) # SR - generate CMakeLists.txt file inside the P* directory def edit_CMakeLists(self): @@ -1518,6 +1609,28 @@ def edit_processidfile(self): ff.write(template % replace_dict) ff.close() + # AV - new method + def edit_colorsum(self): + """Generate color_sum.cc""" + ###misc.sprint('Entering PLUGIN_OneProcessExporter.edit_colorsum') + template = open(pjoin(self.template_path,'gpu','color_sum.cc'),'r').read() + replace_dict = {} + # Extract color matrix again (this was also in get_matrix_single_process called within get_all_sigmaKin_lines) + replace_dict['color_matrix_lines'] = self.get_color_matrix_lines(self.matrix_elements[0]) + ff = open(pjoin(self.path, 'color_sum.cc'),'w') + ff.write(template % replace_dict) + ff.close() + + # AV - new method + def edit_diagrams(self, diagrams): + """Generate diagrams.h""" + ###misc.sprint('Entering PLUGIN_OneProcessExporter.edit_diagrams') + template = open(pjoin(self.template_path,'gpu','diagram_h.inc'),'r').read() + replace_dict = {} + replace_dict['code'] = ''.join(diagrams) # all diagrams to a single file + ff = open(pjoin(self.path, 'diagrams.h'),'w') + ff.write(template % replace_dict) + ff.close() def generate_subprocess_directory_end(self, **opt): """ opt contain all local variable of the fortran original function""" @@ -1688,11 +1801,11 @@ def get_color_matrix_lines(self, matrix_element): """Return the color matrix definition lines for this matrix element. Split rows in chunks of size n.""" import madgraph.core.color_algebra as color if not matrix_element.get('color_matrix'): - return '\n'.join([' static constexpr fptype2 denom[1] = {1.};', 'static const fptype2 cf[1][1] = {1.};']) + return '\n'.join([' static constexpr fptype2 colorDenom[1] = {1.};', 'static const fptype2 cf[1][1] = {1.};']) else: color_denominators = matrix_element.get('color_matrix').\ get_line_denominators() - denom_string = ' static constexpr fptype2 denom[ncolor] = { %s }; // 1-D array[%i]' \ + denom_string = ' static constexpr fptype2 colorDenom[ncolor] = { %s }; // 1-D array[%i]' \ % ( ', '.join(['%i' % denom for denom in color_denominators]), len(color_denominators) ) matrix_strings = [] my_cs = color.ColorString() @@ -1700,12 +1813,12 @@ def get_color_matrix_lines(self, matrix_element): # Then write the numerators for the matrix elements num_list = matrix_element.get('color_matrix').get_line_numerators(index, denominator) matrix_strings.append('{ %s }' % ', '.join(['%d' % i for i in num_list])) - matrix_string = ' static constexpr fptype2 cf[ncolor][ncolor] = ' - if len( matrix_strings ) > 1 : matrix_string += '{\n ' + ',\n '.join(matrix_strings) + ' };' + matrix_string = ' static constexpr fptype2 colorMatrix[ncolor][ncolor] = ' + if len( matrix_strings ) > 1 : matrix_string += '{\n ' + ',\n '.join(matrix_strings) + ' };' else: matrix_string += '{ ' + matrix_strings[0] + ' };' matrix_string += ' // 2-D array[%i][%i]' % ( len(color_denominators), len(color_denominators) ) - denom_comment = '\n // The color denominators (initialize all array elements, with ncolor=%i)\n // [NB do keep \'static\' for these constexpr arrays, see issue #283]\n' % len(color_denominators) - matrix_comment = '\n // The color matrix (initialize all array 
elements, with ncolor=%i)\n // [NB do keep \'static\' for these constexpr arrays, see issue #283]\n' % len(color_denominators) + denom_comment = '\n // The color denominators (initialize all array elements, with ncolor=%i)\n // [NB do keep \'static\' for these constexpr arrays, see issue #283]\n' % len(color_denominators) + matrix_comment = '\n // The color matrix (initialize all array elements, with ncolor=%i)\n // [NB do keep \'static\' for these constexpr arrays, see issue #283]\n' % len(color_denominators) denom_string = denom_comment + denom_string matrix_string = matrix_comment + matrix_string return '\n'.join([denom_string, matrix_string]) @@ -1857,6 +1970,103 @@ def format_coupling(self, call): def format_call(call): return call.replace('(','( ').replace(')',' )').replace(',',', ') + # AV - new method + def get_one_diagram_code(self, diagram, id_amp, multi_channel_map, diag_to_config, color, ndiagrams): + res = [] + idiagram = diagram.get('number') + ###print('DIAGRAM %3d: #wavefunctions=%3d, #diagrams=%3d' % + ### (diagram.get('number'), len(diagram.get('wavefunctions')), len(diagram.get('amplitudes')) )) # AV - FOR DEBUGGING + # 1 - Header + if idiagram == 1: + res.append(""" + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include \"diagram_boilerplate.h\" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif""") + else: + sidiag = '%i'%idiagram + indent = ' '*(len(sidiag)-1) + res.append(""" + + __global__ void + diagram%s( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] +%s fptype* jamps, // output jamps[ncolor*2*nevtORneppV] +%s const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL +%s const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else +%s const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif +%s fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel +%s fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three 
pointers are nullptr as a sanity check +#include \"diagram_boilerplate.h\""""%(sidiag,indent,indent,indent,indent,indent,indent)) + # 2 - Core code + res.append(' // *** DIAGRAM %i OF %i ***' % ( idiagram, ndiagrams ) ) # AV + res.append(' // Wavefunction(s) for diagram number %d' % idiagram) # AV + for wf in diagram.get('wavefunctions'): + wfline = ' '+self.get_wavefunction_call(wf) # AV new: add formatting + if wfline[-1] == '\n': wfline = wfline[:-1] + res.append( wfline ) + if len(diagram.get('wavefunctions')) == 0 : res.append(' // (none)') # AV + res.append(' // Amplitude(s) for diagram number %d' % idiagram) + for amplitude in diagram.get('amplitudes'): + id_amp +=1 + namp = amplitude.get('number') + amplitude.set('number', 1) + res.append(' '+self.get_amplitude_call(amplitude)) # AV new: add formatting + if multi_channel_map: # different code bases #473 (assume this is the same as self.include_multi_channel...) + if id_amp in diag_to_config: + ###res.append("if( channelId == %i ) numerators_sv += cxabs2( amp_sv[0] );" % diag_to_config[id_amp]) # BUG #472 + ###res.append("if( channelId == %i ) numerators_sv += cxabs2( amp_sv[0] );" % id_amp) # wrong fix for BUG #472 + res.append("#ifdef MGONGPU_SUPPORTS_MULTICHANNEL") + res.append(" if( channelId == %i ) numerators_sv += cxabs2( amp_sv[0] );" % idiagram) + res.append(" if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );") + res.append("#endif") + else: + res.append("#ifdef MGONGPU_SUPPORTS_MULTICHANNEL") + res.append(" // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)") + res.append("#endif") + for njamp, coeff in color[namp].items(): + scoeff = PLUGIN_OneProcessExporter.coeff(*coeff) # AV + if scoeff[0] == '+' : scoeff = scoeff[1:] + scoeff = scoeff.replace('(','( ') + scoeff = scoeff.replace(')',' )') + scoeff = scoeff.replace(',',', ') + scoeff = scoeff.replace('*',' * ') + scoeff = scoeff.replace('/',' / ') + if scoeff.startswith('-'): + res.append(' J_ACCESS::kernelAccessIcol( jamps, %s ) -= %samp_sv[0];' % (njamp, scoeff[1:])) + else: + res.append(' J_ACCESS::kernelAccessIcol( jamps, %s ) += %samp_sv[0];' % (njamp, scoeff)) + if len(diagram.get('amplitudes')) == 0 : res.append(' // (none)') # AV + # 3 - Footer + res.append(""" } + + //--------------------------------------------------------------------------""") + # Return + return res, id_amp + # AV - replace helas_call_writers.GPUFOHelasCallWriter method (improve formatting) def super_get_matrix_element_calls(self, matrix_element, color_amplitudes, multi_channel_map=False): """Return a list of strings, corresponding to the Helas calls for the matrix element""" @@ -1880,55 +2090,6 @@ def super_get_matrix_element_calls(self, matrix_element, color_amplitudes, multi ###misc.sprint(multi_channel_map) res = [] ###res.append('for(int i=0;i<%s;i++){jamp[i] = cxtype(0.,0.);}' % len(color_amplitudes)) - res.append("""//constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g. 
<> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events -#ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif -#else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); - const fptype* COUPs[nxcoup]; - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif""") diagrams = matrix_element.get('diagrams') diag_to_config = {} if multi_channel_map: @@ -1938,45 +2099,22 @@ def super_get_matrix_element_calls(self, matrix_element, color_amplitudes, multi idiag in multi_channel_map[config]], [])] diag_to_config[amp[0]] = config ###misc.sprint(diag_to_config) + res.append('\n // *** DIAGRAMS 1 TO %d ***' % (len(matrix_element.get('diagrams'))) ) # AV + res.append('#ifdef MGONGPUCPP_GPUIMPL') + for idiagram in range(1,len(matrix_element.get('diagrams'))+1): + if idiagram == 1: res.append('gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel );') + else: res.append('gpuLaunchKernelStream( diagram%i, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );'%idiagram) + res.append('#else') + for idiagram in range(1,len(matrix_element.get('diagrams'))+1): + if idiagram == 1: res.append('diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel );') + else: res.append('diagram%i( wfs, jamps, channelIds, COUPs, numerators, denominators );'%idiagram) + res.append('#endif') + # Generate diagram code + self.diagram_code = [] 
id_amp = 0 for diagram in matrix_element.get('diagrams'): - ###print('DIAGRAM %3d: #wavefunctions=%3d, #diagrams=%3d' % - ### (diagram.get('number'), len(diagram.get('wavefunctions')), len(diagram.get('amplitudes')) )) # AV - FOR DEBUGGING - res.append('\n // *** DIAGRAM %d OF %d ***' % (diagram.get('number'), len(matrix_element.get('diagrams'))) ) # AV - res.append('\n // Wavefunction(s) for diagram number %d' % diagram.get('number')) # AV - res.extend([ self.get_wavefunction_call(wf) for wf in diagram.get('wavefunctions') ]) # AV new: avoid format_call - if len(diagram.get('wavefunctions')) == 0 : res.append('// (none)') # AV - if res[-1][-1] == '\n' : res[-1] = res[-1][:-1] - res.append('\n // Amplitude(s) for diagram number %d' % diagram.get('number')) - for amplitude in diagram.get('amplitudes'): - id_amp +=1 - namp = amplitude.get('number') - amplitude.set('number', 1) - res.append(self.get_amplitude_call(amplitude)) # AV new: avoid format_call - if multi_channel_map: # different code bases #473 (assume this is the same as self.include_multi_channel...) - if id_amp in diag_to_config: - ###res.append("if( channelId == %i ) numerators_sv += cxabs2( amp_sv[0] );" % diag_to_config[id_amp]) # BUG #472 - ###res.append("if( channelId == %i ) numerators_sv += cxabs2( amp_sv[0] );" % id_amp) # wrong fix for BUG #472 - res.append("#ifdef MGONGPU_SUPPORTS_MULTICHANNEL") - res.append("if( channelId == %i ) numerators_sv += cxabs2( amp_sv[0] );" % diagram.get('number')) - res.append("if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );") - res.append("#endif") - else: - res.append("#ifdef MGONGPU_SUPPORTS_MULTICHANNEL") - res.append("// Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)") - res.append("#endif") - for njamp, coeff in color[namp].items(): - scoeff = PLUGIN_OneProcessExporter.coeff(*coeff) # AV - if scoeff[0] == '+' : scoeff = scoeff[1:] - scoeff = scoeff.replace('(','( ') - scoeff = scoeff.replace(')',' )') - scoeff = scoeff.replace(',',', ') - scoeff = scoeff.replace('*',' * ') - scoeff = scoeff.replace('/',' / ') - if scoeff.startswith('-'): res.append('jamp_sv[%s] -= %samp_sv[0];' % (njamp, scoeff[1:])) # AV - else: res.append('jamp_sv[%s] += %samp_sv[0];' % (njamp, scoeff)) # AV - if len(diagram.get('amplitudes')) == 0 : res.append('// (none)') # AV - ###res.append('\n // *** END OF DIAGRAMS ***' ) # AV - no longer needed ('COLOR MATRIX BELOW') + res_diagram, id_amp = self.get_one_diagram_code(diagram, id_amp, multi_channel_map, diag_to_config, color, len(matrix_element.get('diagrams'))) + self.diagram_code.append( '\n'.join(res_diagram) ) return res # AV - overload helas_call_writers.GPUFOHelasCallWriter method (improve formatting) @@ -2148,8 +2286,8 @@ def generate_helas_call(self, argument): if usesdepcoupl is None: raise Exception('PANIC! 
could not determine if this call uses aS-dependent or aS-independent couplings?') elif usesdepcoupl: caccess = 'CD_ACCESS' else: caccess = 'CI_ACCESS' - ###if arg['routine_name'].endswith( '_0' ) : arg['routine_name'] += '' - ###else : arg['routine_name'] += '' + ###if arg['routine_name'].endswith( '_0' ) : arg['routine_name'] += '' + ###else : arg['routine_name'] += '' if arg['routine_name'].endswith( '_0' ) : arg['routine_name'] += ''%caccess else : arg['routine_name'] += ''%caccess if isinstance(argument, helas_objects.HelasWavefunction): diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py index 209f088314..ff0d1f10d4 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2021-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. import os import sys @@ -103,6 +103,8 @@ class PLUGIN_ProcessExporter(PLUGIN_export_cpp.ProcessExporterGPU): s+'CMake/src/CMakeLists.txt' ], 'SubProcesses': [s+'gpu/nvtx.h', s+'gpu/timer.h', s+'gpu/timermap.h', s+'gpu/ompnumthreads.h', s+'gpu/GpuRuntime.h', s+'gpu/GpuAbstraction.h', + s+'gpu/color_sum.h', + s+'gpu/diagram_boilerplate.h', s+'gpu/MemoryAccessHelpers.h', s+'gpu/MemoryAccessVectors.h', s+'gpu/MemoryAccessMatrixElements.h', s+'gpu/MemoryAccessMomenta.h', s+'gpu/MemoryAccessRandomNumbers.h', s+'gpu/MemoryAccessWeights.h', @@ -126,6 +128,8 @@ class PLUGIN_ProcessExporter(PLUGIN_export_cpp.ProcessExporterGPU): to_link_in_P = ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', + 'color_sum.h', + 'diagram_boilerplate.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', @@ -222,6 +226,7 @@ def generate_subprocess_directory(self, subproc_group, fortran_model, me=None): misc.sprint("need to link", self.to_link_in_P) out = super().generate_subprocess_directory(subproc_group, fortran_model, me) return out + # AV (default from OM's tutorial) - add a debug printout def convert_model(self, model, wanted_lorentz=[], wanted_coupling=[]): if hasattr(model , 'cudacpp_wanted_ordered_couplings'): diff --git a/epochX/cudacpp/CODEGEN/allGenerateAndCompare.sh b/epochX/cudacpp/CODEGEN/allGenerateAndCompare.sh index 097935efc8..acb186f448 100755 --- a/epochX/cudacpp/CODEGEN/allGenerateAndCompare.sh +++ b/epochX/cudacpp/CODEGEN/allGenerateAndCompare.sh @@ -1,45 +1,76 @@ #!/bin/bash -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: A. Valassi (Oct 2023) for the MG5aMC CUDACPP plugin. -# Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. set -e # fail on error cd $(dirname $0)/.. 
-./CODEGEN/generateAndCompare.sh -q ee_mumu -./CODEGEN/generateAndCompare.sh -q ee_mumu --mad +bsm= +while [ "$1" != "" ]; do + if [ "$1" == "-bsmonly" ] && [ "$bsm" != "-nobsm" ]; then + bsm=$1 + shift + elif [ "$1" == "-nobsm" ] && [ "$bsm" != "-bsmonly" ]; then + bsm=$1 + shift + else + echo "Usage: $0 [-bsmonly|-nobsm]" + exit 1 + fi +done -./CODEGEN/generateAndCompare.sh -q gg_tt -./CODEGEN/generateAndCompare.sh -q gg_tt --mad +# SM processes (both mad and sa) -./CODEGEN/generateAndCompare.sh -q gg_ttg -./CODEGEN/generateAndCompare.sh -q gg_ttg --mad +if [ "${bsm}" != "-bsmonly" ]; then -./CODEGEN/generateAndCompare.sh -q gg_ttgg -./CODEGEN/generateAndCompare.sh -q gg_ttgg --mad + ./CODEGEN/generateAndCompare.sh -q ee_mumu + ./CODEGEN/generateAndCompare.sh -q ee_mumu --mad -./CODEGEN/generateAndCompare.sh -q gg_ttggg -./CODEGEN/generateAndCompare.sh -q gg_ttggg --mad + ./CODEGEN/generateAndCompare.sh -q gg_tt + ./CODEGEN/generateAndCompare.sh -q gg_tt --mad -./CODEGEN/generateAndCompare.sh -q gq_ttq -./CODEGEN/generateAndCompare.sh -q gq_ttq --mad + ./CODEGEN/generateAndCompare.sh -q gg_ttg + ./CODEGEN/generateAndCompare.sh -q gg_ttg --mad -./CODEGEN/generateAndCompare.sh -q heft_gg_bb -./CODEGEN/generateAndCompare.sh -q heft_gg_bb --mad + ./CODEGEN/generateAndCompare.sh -q gg_ttgg + ./CODEGEN/generateAndCompare.sh -q gg_ttgg --mad -./CODEGEN/generateAndCompare.sh -q susy_gg_tt -./CODEGEN/generateAndCompare.sh -q susy_gg_tt --mad + ./CODEGEN/generateAndCompare.sh -q gg_ttggg + ./CODEGEN/generateAndCompare.sh -q gg_ttggg --mad -./CODEGEN/generateAndCompare.sh -q susy_gg_t1t1 -./CODEGEN/generateAndCompare.sh -q susy_gg_t1t1 --mad + ./CODEGEN/generateAndCompare.sh -q gq_ttq + ./CODEGEN/generateAndCompare.sh -q gq_ttq --mad -./CODEGEN/generateAndCompare.sh -q smeft_gg_tttt -./CODEGEN/generateAndCompare.sh -q smeft_gg_tttt --mad +fi -./CODEGEN/generateAndCompare.sh -q nobm_pp_ttW --mad +# BSM processes -./CODEGEN/generateAndCompare.sh -q gg_tt01g --mad +if [ "${bsm}" != "-nobsm" ]; then -./CODEGEN/generateAndCompare.sh -q pp_tt012j --mad + ./CODEGEN/generateAndCompare.sh -q heft_gg_bb + ./CODEGEN/generateAndCompare.sh -q heft_gg_bb --mad + + ./CODEGEN/generateAndCompare.sh -q susy_gg_tt + ./CODEGEN/generateAndCompare.sh -q susy_gg_tt --mad + + ./CODEGEN/generateAndCompare.sh -q susy_gg_t1t1 + ./CODEGEN/generateAndCompare.sh -q susy_gg_t1t1 --mad + + ./CODEGEN/generateAndCompare.sh -q smeft_gg_tttt + ./CODEGEN/generateAndCompare.sh -q smeft_gg_tttt --mad + + ./CODEGEN/generateAndCompare.sh -q nobm_pp_ttW --mad + +fi + +# SM processes (mad only) + +if [ "${bsm}" != "-bsmonly" ]; then + + ./CODEGEN/generateAndCompare.sh -q gg_tt01g --mad + + ./CODEGEN/generateAndCompare.sh -q pp_tt012j --mad + +fi diff --git a/epochX/cudacpp/CODEGEN/generateAndCompare.sh b/epochX/cudacpp/CODEGEN/generateAndCompare.sh index 6221b1cfee..8e36c0eb3e 100755 --- a/epochX/cudacpp/CODEGEN/generateAndCompare.sh +++ b/epochX/cudacpp/CODEGEN/generateAndCompare.sh @@ -1,8 +1,8 @@ #!/bin/bash -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -# Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. 
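Usage note: the option-parsing loop added to allGenerateAndCompare.sh above makes the two modes mutually exclusive (hence the cross-checks on "$bsm" in each branch). Running './CODEGEN/allGenerateAndCompare.sh -bsmonly' regenerates only the BSM processes (heft, susy, smeft, nobm), './CODEGEN/allGenerateAndCompare.sh -nobsm' regenerates only the SM processes, and running the script with no arguments regenerates everything.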
set -e # fail on error @@ -358,10 +358,12 @@ function codeGenAndDiff() fi done fi - # Remove card.jpg, diagrams.html and matrix*.jpg files (NB: these are only created if ghostscript is installed) + # Remove card.jpg/png, diagrams.html and matrix*.jpg/png files (NB: these are only created if ghostscript is installed) \rm -f ${outproc}/SubProcesses/P*/card.jpg + \rm -f ${outproc}/SubProcesses/P*/card.png \rm -f ${outproc}/SubProcesses/P*/diagrams.html \rm -f ${outproc}/SubProcesses/P*/matrix*jpg + \rm -f ${outproc}/SubProcesses/P*/matrix*png # Cleanup \rm -f ${outproc}/crossx.html \rm -f ${outproc}/index.html diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt index db84a9053c..3226606748 100644 --- a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.3 2025-06-12 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -49,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -58,7 +58,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006200551986694336  +DEBUG: model prefixing takes 0.005434751510620117  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,16 +155,16 @@ Total: 1 processes with 2 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_ee_mumu --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4334]  +DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4166]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  INFO: initialize a new directory: CODEGEN_mad_ee_mumu INFO: remove old information in CODEGEN_mad_ee_mumu -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 @@ -176,22 +176,22 @@ FileWriter mu+ mu- WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group epem_mupmum -DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1665]  Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s -Wrote files for 8 helas calls in 0.070 s +Wrote files for 8 helas calls in 0.073 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines -ALOHA: aloha creates 3 routines in 0.201 s +ALOHA: aloha creates 3 routines in 0.194 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 7 routines in 0.253 s +ALOHA: aloha creates 7 routines in 0.243 s FFV1 FFV1 FFV2 @@ -200,37 +200,37 @@ ALOHA: aloha creates 7 routines in 0.253 s FFV4 FFV2_4 FFV2_4 -FileWriter for 
/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu; patch -p4 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file SubProcesses/makefile -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/SubProcesses/P1_epem_mupmum; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/SubProcesses/P1_epem_mupmum; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #2 succeeded at 236 (offset 9 lines). -DEBUG: p.returncode =  0 [output.py at line 263]  -Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu done. +DEBUG: p.returncode =  0 [output.py at line 268]  +Output to directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu done. 
Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/README +/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/README Run "open index.html" to see more information about this process. quit -real 0m2.054s -user 0m1.767s -sys 0m0.275s +real 0m2.060s +user 0m1.796s +sys 0m0.262s Code generation completed in 2 seconds ************************************************************ * * @@ -244,7 +244,7 @@ Code generation completed in 2 seconds * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.3 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -252,9 +252,9 @@ Code generation completed in 2 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt @@ -274,7 +274,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.3 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -282,9 +282,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt diff --git a/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt b/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt index 68b4c46295..07d8d59d1b 100644 --- a/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat index bb623f867a..7f8313745d 100644 --- a/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.6.0 2024-09-30 * +#* VERSION 3.6.3 2025-06-12 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/ee_mumu.mad/Cards/run_card.dat b/epochX/cudacpp/ee_mumu.mad/Cards/run_card.dat index 74f70b567b..c1037c83d7 100644 --- a/epochX/cudacpp/ee_mumu.mad/Cards/run_card.dat +++ b/epochX/cudacpp/ee_mumu.mad/Cards/run_card.dat @@ -112,6 +112,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/ee_mumu.mad/Cards/run_card_default.dat b/epochX/cudacpp/ee_mumu.mad/Cards/run_card_default.dat index 68ee164d00..4ba7540657 100644 --- a/epochX/cudacpp/ee_mumu.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/ee_mumu.mad/Cards/run_card_default.dat @@ -112,6 +112,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! 
maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/ee_mumu.mad/MGMEVersion.txt b/epochX/cudacpp/ee_mumu.mad/MGMEVersion.txt index 084e244cea..1ac53bb4bd 100644 --- a/epochX/cudacpp/ee_mumu.mad/MGMEVersion.txt +++ b/epochX/cudacpp/ee_mumu.mad/MGMEVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.3 \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/Source/.make_opts b/epochX/cudacpp/ee_mumu.mad/Source/.make_opts index de3864242b..56ba259c56 100644 --- a/epochX/cudacpp/ee_mumu.mad/Source/.make_opts +++ b/epochX/cudacpp/ee_mumu.mad/Source/.make_opts @@ -102,6 +102,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf + alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -113,10 +114,11 @@ ifneq ($(lhapdf),) endif else alfas_functions=alfas_functions + alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif # Helper function to check MG5 version define CHECK_MG5AMC_VERSION python -c 'import re; from distutils.version import StrictVersion; print StrictVersion("$(MG5AMC_VERSION)") >= StrictVersion("$(1)") if re.match("^[\d\.]+$$","$(MG5AMC_VERSION)") else True;' -endef \ No newline at end of file +endef diff --git a/epochX/cudacpp/ee_mumu.mad/Source/alfas_functions.f b/epochX/cudacpp/ee_mumu.mad/Source/alfas_functions.f index bb69a6384e..84aeff369c 100644 --- a/epochX/cudacpp/ee_mumu.mad/Source/alfas_functions.f +++ b/epochX/cudacpp/ee_mumu.mad/Source/alfas_functions.f @@ -188,6 +188,10 @@ SUBROUTINE NEWTON1(T,A_IN,A_OUT,NLOOP,NF) A_OUT=A_IN/(1D0+A_IN*B0(NF)*T) IF (NLOOP .EQ. 1) RETURN + if (1D0+A_IN*B0(NF)*T.le.0d0)THEN + A_OUT = 9d98 + RETURN + ENDIF A_OUT=A_IN/(1D0+B0(NF)*A_IN*T+C1(NF)*A_IN*LOG(1D0+A_IN*B0(NF)*T)) IF (A_OUT .LT. 
0D0) AS=0.3D0 30 AS=A_OUT diff --git a/epochX/cudacpp/ee_mumu.mad/Source/cuts.inc b/epochX/cudacpp/ee_mumu.mad/Source/cuts.inc index 23d099e5f7..a8ccc7420d 100644 --- a/epochX/cudacpp/ee_mumu.mad/Source/cuts.inc +++ b/epochX/cudacpp/ee_mumu.mad/Source/cuts.inc @@ -37,7 +37,7 @@ C REAL*8 misset,missetmax,ptheavy REAL*8 ptllmin,ptllmax integer maxjetflavor - REAl*8 dsqrt_shat + REAl*8 dsqrt_shat,dsqrt_shatmax COMMON /to_min_max_cuts/ & PTJmax,PTBmax,PTAmax,PTLmax, @@ -60,7 +60,7 @@ C & ht2max,ht3max,ht4max, & htjmin,htjmax,ihtmin,ihtmax, & misset,missetmax,ptheavy, - & ptllmin,ptllmax,dsqrt_shat, + & ptllmin,ptllmax,dsqrt_shat,dsqrt_shatmax, & maxjetflavor C diff --git a/epochX/cudacpp/ee_mumu.mad/Source/make_opts b/epochX/cudacpp/ee_mumu.mad/Source/make_opts index e4b87ee6ad..f10336e42e 100644 --- a/epochX/cudacpp/ee_mumu.mad/Source/make_opts +++ b/epochX/cudacpp/ee_mumu.mad/Source/make_opts @@ -103,6 +103,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf +alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -114,6 +115,7 @@ endif endif else alfas_functions=alfas_functions +alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif diff --git a/epochX/cudacpp/ee_mumu.mad/Source/makefile b/epochX/cudacpp/ee_mumu.mad/Source/makefile index 291ca907ee..87a9e61723 100644 --- a/epochX/cudacpp/ee_mumu.mad/Source/makefile +++ b/epochX/cudacpp/ee_mumu.mad/Source/makefile @@ -37,10 +37,12 @@ all: $(LIBRARIES) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDI $(LIBDIR)libdsample.$(libext): $(DSAMPLE) $(call CREATELIB, $@, $^) $(LIBDIR)libgeneric.$(libext): $(GENERIC) + rm -f $@ 2>/dev/null $(call CREATELIB, $@, $^) + rm -f $(alfas_to_clean) 2>/dev/null $(LIBDIR)libdhelas.$(libext): DHELAS cd DHELAS; make; cd .. -$(LIBDIR)libpdf.$(libext): PDF make_opts +$(LIBDIR)libpdf.$(libext): PDF $(alfas_functions).o cd PDF; make; cd .. ifneq (,$(filter edff chff, $(pdlabel1) $(pdlabel2))) $(LIBDIR)libgammaUPC.$(libext): PDF/gammaUPC @@ -73,6 +75,7 @@ $(BINDIR)gensudgrid: $(GENSUDGRID) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUP # Dependencies dsample.o: DiscreteSampler.o dsample.f genps.inc StringCast.o vector.inc +pawgraph.o: vector.inc DiscreteSampler.o: StringCast.o invarients.o: invarients.f genps.inc gen_ximprove.o: gen_ximprove.f run_config.inc run_card.inc diff --git a/epochX/cudacpp/ee_mumu.mad/Source/run_card.inc b/epochX/cudacpp/ee_mumu.mad/Source/run_card.inc index 80d5ae41aa..83061d9ae9 100644 --- a/epochX/cudacpp/ee_mumu.mad/Source/run_card.inc +++ b/epochX/cudacpp/ee_mumu.mad/Source/run_card.inc @@ -88,6 +88,8 @@ DSQRT_SHAT = 0.000000000000000D+00 + DSQRT_SHATMAX = -1 + LIMHEL = 0.000000000000000D+00 PTJ = 2.000000000000000D+01 diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h index 87aa648dd2..a45024704a 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. 
Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -255,11 +255,15 @@ namespace mg5amcCpu throw std::logic_error( "Bridge constructor: FIXME! cannot choose gputhreads" ); // this should never happen! m_gpublocks = m_nevt / m_gputhreads; } +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters @@ -290,8 +294,10 @@ namespace mg5amcCpu throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -347,7 +353,9 @@ namespace mg5amcCpu if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> ) @@ -400,7 +408,9 @@ namespace mg5amcCpu } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
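All the Bridge.h hunks above follow one pattern: informational banners and the abnormal-ME check are now compiled in only when MGONGPUCPP_VERBOSE is defined. A minimal standalone sketch of the same gating (the function name and arguments are illustrative, not part of the patch):

  #include <iostream>
  // Banner is compiled in only for MGONGPUCPP_VERBOSE builds (as in Bridge.h above)
  inline void printGridBanner( int nevt, int gpublocks, int gputhreads )
  {
  #ifdef MGONGPUCPP_VERBOSE
    std::cout << "WARNING! Instantiate device Bridge (nevt=" << nevt
              << ", gpublocks=" << gpublocks << ", gputhreads=" << gputhreads
              << ", gpublocks*gputhreads=" << gpublocks * gputhreads << ")" << std::endl;
  #else
    (void)nevt; (void)gpublocks; (void)gputhreads; // avoid unused-parameter warnings
  #endif
  }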
#ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, 
type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. 
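For reference, the new gpuLaunchKernelStream macro in GpuAbstraction.h above expands to a standard triple-chevron launch with zero bytes of dynamic shared memory on the given stream, identically under CUDA and HIP. A minimal standalone CUDA sketch of that expansion (kernel and buffer names are illustrative, not from the patch):

  #include <cuda_runtime.h>
  __global__ void dummyDiagram( float* out ) // stand-in for one generated diagramN kernel
  {
    const int ievt = blockIdx.x * blockDim.x + threadIdx.x;
    out[ievt] += 1.f;
  }
  int main()
  {
    const int gpublocks = 2, gputhreads = 32;
    float* devOut;
    cudaMalloc( &devOut, gpublocks * gputhreads * sizeof( float ) ); // gpuMalloc in the abstraction layer
    cudaStream_t stream;
    cudaStreamCreate( &stream ); // gpuStreamCreate
    // gpuLaunchKernelStream( dummyDiagram, gpublocks, gputhreads, stream, devOut ) expands to:
    dummyDiagram<<<gpublocks, gputhreads, 0, stream>>>( devOut );
    cudaStreamSynchronize( stream );
    cudaStreamDestroy( stream ); // gpuStreamDestroy
    cudaFree( devOut );
    return 0;
  }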
#ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MGVersion.txt index 084e244cea..1ac53bb4bd 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.3 \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc index f463977c1a..a68ae314eb 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
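The checkGpuBlas/assertGpuBlas helpers added to GpuRuntime.h above mirror the existing checkGpu/assertGpu pattern for BLAS status codes. A hedged sketch of how they combine with the gpuBlas* aliases from GpuAbstraction.h, assuming a build without MGONGPU_HAS_NO_BLAS (the GEMM shapes and buffers are placeholders, not the actual color-sum call):

  #include "GpuAbstraction.h"
  #include "GpuRuntime.h"
  // One double-precision GEMM on a given stream, status-checked at every step;
  // gpuBlasTgemm resolves to cublasDgemm/hipblasDgemm unless MGONGPU_FPTYPE2_FLOAT is defined.
  void exampleBlasGemm( gpuStream_t stream, const double* dA, const double* dB, double* dC, int n )
  {
    gpuBlasHandle_t handle;
    checkGpuBlas( gpuBlasCreate( &handle ) );
    checkGpuBlas( gpuBlasSetStream( handle, stream ) ); // e.g. run on a per-helicity stream
    const double alpha = 1., beta = 0.;
    checkGpuBlas( gpuBlasTgemm( handle, GPUBLAS_OP_N, GPUBLAS_OP_N,
                                n, n, n, &alpha, dA, n, dB, n, &beta, dC, n ) );
    checkGpuBlas( gpuBlasDestroy( handle ) );
  }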
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,28 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() + , m_pHelWfs() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_helBlasHandles() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +353,82 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); + // Create the "one-helicity" wavefunction buffer that will be used for helicity filtering + m_pHelWfs.reset( new DeviceBufferSimple( CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? 
+ std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { +#ifndef MGONGPU_HAS_NO_BLAS + if( m_helBlasHandles[ihel] ) gpuBlasDestroy( m_helBlasHandles[ihel] ); +#endif + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +445,61 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. 
Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity + // Attach a different stream to each cuBLAS/hipBLAS handle + if( m_blasColorSum ) + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + checkGpuBlas( gpuBlasCreate( &m_helBlasHandles[ighel] ) ); + checkGpuBlas( gpuBlasSetStream( m_helBlasHandles[ighel], m_helStreams[ighel] ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_helBlasHandles[ighel], CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelWfs.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +507,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* ghelBlasHandles = ( m_blasColorSum ? m_helBlasHandles : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* ghelBlasHandles = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +527,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? 
m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h index 7acff4b308..c901874333 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] - static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,24 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelJamps; + + // The super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelWfs; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +220,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr<DeviceBufferSimple2> m_pHelBlasTmp; + + // The array of cuBLAS/hipBLAS handles (one for each good helicity) + gpuBlasHandle_t m_helBlasHandles[CPPProcess::ncomb]; // reserve ncomb handles (but only nGoodHel <= ncomb will be used) +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessAmplitudes.h index 0d92f69c43..a49f041e05 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessAmplitudes_H #define MemoryAccessAmplitudes_H 1 @@ -10,10 +10,6 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_AMPLITUDES 1 - // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -23,120 +19,11 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // A class describing the internal layout of memory buffers for amplitudes - // This implementation uses an AOSOA[npagA][nx2][neppA] where nevt=npagA*neppA - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessAmplitudesBase //_AOSOAv1 - { - public: - - // Number of Events Per Page in the amplitude AOSOA memory buffer layout - static constexpr int neppA = 1; // AOS (just a test...)
- - private: - - friend class MemoryAccessHelper; - friend class KernelAccessHelper; - friend class KernelAccessHelper; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) - { - const int ipagA = ievt / neppA; // #event "A-page" - const int ieppA = ievt % neppA; // #event in the current event A-page - constexpr int ix2 = 0; - return &( buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA] ); // AOSOA[ipagA][ix2][ieppA] - } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... args" to "const int ix2" and rename "Field" as "Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int ix2 ) - { - constexpr int ipagA = 0; - constexpr int ieppA = 0; - return buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA]; // AOSOA[ipagA][ix2][ieppA] - } - }; - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessAmplitudes : public MemoryAccessAmplitudesBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2 = MemoryAccessHelper::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2Const = - MemoryAccessHelper::template decodeRecordConst; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt, 
const int ix2 ) <===] - static constexpr auto ieventAccessIx2 = - MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessField<int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===] - static constexpr auto ieventAccessIx2Const = - MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessFieldConst<int>; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations + // A class providing trivial access to amplitude memory buffers template<bool onDevice> class KernelAccessAmplitudes { public: - -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2 = - KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessField<int>; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2Const = - KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessFieldConst<int>; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { @@ -148,8 +35,6 @@ namespace mg5amcCpu { return reinterpret_cast<const cxtype_sv*>( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES }; //---------------------------------------------------------------------------- diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessCouplings.h index 55504a2b90..caee99a7fd 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessCouplings.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
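A hedged illustration (not part of the patch) of what "trivial access" means after the amplitude access simplification above: a buffer is simply reinterpreted as complex values, with no AOSOA page arithmetic left. The <false> template argument (host-side access) follows the template<bool onDevice> parameter reconstructed above, and the example only makes sense in a C++ (no-SIMD, no-CUDA) build where cxtype_sv is a scalar complex type:

#include "MemoryAccessAmplitudes.h"
void sketchTrivialAmplitudeAccess()
{
  using namespace mg5amcCpu;
  fptype buf[2] = { 1., -2. }; // one scalar complex amplitude: real and imaginary parts
  cxtype_sv* amp = KernelAccessAmplitudes<false>::kernelAccess( buf );
  // amp[0] now aliases buf as a single complex value, i.e. ( 1., -2. ) in scalar builds
}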
#ifndef MemoryAccessCouplings_H #define MemoryAccessCouplings_H 1 @@ -235,7 +235,7 @@ namespace mg5amcCpu /* fptype_sv& real = kernelAccessIx2( buffer, 0 ); fptype_sv& imag = kernelAccessIx2( buffer, 1 ); - printf( "C_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv_ref( real, imag ); */ return cxtype_sv_ref( kernelAccessIx2( buffer, 0 ), @@ -250,7 +250,7 @@ namespace mg5amcCpu /* const fptype_sv& real = kernelAccessIx2Const( buffer, 0 ); const fptype_sv& imag = kernelAccessIx2Const( buffer, 1 ); - printf( "C_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv( real, imag ); */ return cxtype_sv( kernelAccessIx2Const( buffer, 0 ), diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessWavefunctions.h index 9f4c620bc7..bbffc1fb36 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessWavefunctions_H #define MemoryAccessWavefunctions_H 1 @@ -10,9 +10,7 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 +#include "CPPProcess.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL @@ -23,147 +21,44 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // A class describing the internal layout of memory buffers for wavefunctions - // This implementation uses an AOSOA[npagW][nw6][nx2][neppW] where nevt=npagW*neppW - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessWavefunctionsBase //_AOSOAv1 +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessWavefunctions { public: - - // Number of Events Per Page in the wavefunction AOSOA memory buffer layout - static constexpr int neppW = 1; // AOS (just a test...) 
- - private: - - friend class MemoryAccessHelper<MemoryAccessWavefunctionsBase>; - friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, false>; - friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, true>; - - // The number of components of a (fermion or vector) wavefunction - static constexpr int nw6 = mgOnGpu::nw6; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) + static __host__ __device__ inline cxtype_sv* + kernelAccess( fptype* buffer ) { - const int ipagW = ievt / neppW; // #event "W-page" - const int ieppW = ievt % neppW; // #event in the current event W-page - constexpr int iw6 = 0; - constexpr int ix2 = 0; - return &( buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW] ); // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast<cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... args" to "const int iw6, const int ix2" and rename "Field" as "Iw6Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int iw6, - const int ix2 ) + static __host__ __device__ inline const cxtype_sv* + kernelAccessConst( const fptype* buffer ) { - constexpr int ipagW = 0; - constexpr int ieppW = 0; - return buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW]; // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast<const cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } }; +#endif //---------------------------------------------------------------------------- - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessWavefunctions : public MemoryAccessWavefunctionsBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2 = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2Const = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template decodeRecordConst<int, int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIw6Ix2( fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2 = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessField<int, int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIw6Ix2Const( const fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2Const = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessFieldConst<int, int>; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations - template<bool onDevice> - class KernelAccessWavefunctions + class HostAccessWavefunctions { public: - -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const) ===> fptype& kernelAccessIw6Ix2( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2 = - KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessField<int, int>; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIw6Ix2Const( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2Const = - KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessFieldConst<int, int>; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { return reinterpret_cast<cxtype_sv*>( buffer ); } static __host__ __device__ inline const cxtype_sv* kernelAccessConst( const fptype* buffer ) { return reinterpret_cast<const cxtype_sv*>( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS }; //---------------------------------------------------------------------------- - typedef KernelAccessWavefunctions<false> HostAccessWavefunctions; - typedef KernelAccessWavefunctions<true> DeviceAccessWavefunctions; - - //---------------------------------------------------------------------------- - } // end namespace mg5amcGpu/mg5amcCpu #endif // MemoryAccessWavefunctions_H diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h index 65a101888d..c8db607db6 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
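The new accessors above imply an event-major AOS layout for device wavefunction buffers: within the buffer of one wavefunction, every event owns nw6 complex components, i.e. nw6 * nx2 fptypes, which is exactly the per-event offset used by DeviceAccessWavefunctions. A hedged helper (not part of the patch; the name wfOffset is hypothetical) spelling out that arithmetic:

#include <cstddef>
// AOS index within one wavefunction buffer: event-major, then component, then real/imaginary part
// (defaults nw6=6 and nx2=2 match the values quoted elsewhere in this patch for e+ e- -> mu+ mu-)
inline size_t wfOffset( size_t ievt, size_t iw6, size_t ix2, size_t nw6 = 6, size_t nx2 = 2 )
{
  return ievt * nw6 * nx2 + iw6 * nx2 + ix2; // matches "buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2" above
}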
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase<T>( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase<T>( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template<typename T, size_t sizePerEvent> - class DeviceBuffer : public DeviceBufferBase<T>, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase<T>, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase<T>( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase<T>( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer<fptype, 1> DeviceBufferSimple; + typedef DeviceBuffer<fptype2, 1> DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase<fptype> BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer<fptype, sizePerEventNumerators> HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer<fptype, sizePerEventNumerators> PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer<fptype, sizePerEventNumerators> DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer<fptype, sizePerEventDenominators> HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer<fptype, sizePerEventDenominators> PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer<fptype, sizePerEventDenominators> DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer<fptype, sizePerEventCouplings> HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer<fptype, sizePerEventCouplings> PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer<fptype, sizePerEventCouplings> DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer<fptype, sizePerEventJamps> DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template<class Tdst, class Tsrc> void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc index 7bd57a8dbb..cad6526137 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,20 +98,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 1; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,57 +166,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the 
current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - 
using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -226,281 +279,137 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 5; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) + + // ----------------- + // --- COUPLINGS --- + // ----------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; +#else + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g. 
<<warning #186-D: pointless comparison of unsigned integer with zero>> -#endif + const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events -#ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - // CUDA kernels take input/output buffers with momenta/MEs for all events + + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); - const fptype* COUPs[nxcoup]; - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif #endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e.
zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - // *** DIAGRAM 1 OF 2 *** - - // Wavefunction(s) for diagram number 1 - oxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - ixxxxx( momenta, 0., cHel[ihel][1], +1, w_fp[1], 1 ); - - ixxxxx( momenta, 0., cHel[ihel][2], -1, w_fp[2], 2 ); - - oxxxxx( momenta, 0., cHel[ihel][3], +1, w_fp[3], 3 ); - - FFV1P0_3( w_fp[1], w_fp[0], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[4] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[2], w_fp[3], w_fp[4], COUPs[ndcoup + 0], 1.0, &amp_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= amp_sv[0]; - - // *** DIAGRAM 2 OF 2 *** - - // Wavefunction(s) for diagram number 2 - FFV2_4_3( w_fp[1], w_fp[0], COUPs[ndcoup + 1], 1.0, COUPs[ndcoup + 2], 1.0, cIPD[0], cIPD[1], w_fp[4] ); - - // Amplitude(s) for diagram number 2 - FFV2_4_0( w_fp[2], w_fp[3], w_fp[4], COUPs[ndcoup + 1], 1.0, COUPs[ndcoup + 2], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_epem_mupmum()?) - - // The color denominators (initialize all array elements, with ncolor=1) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1 }; // 1-D array[1] - - // The color matrix (initialize all array elements, with ncolor=1) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { { 1 } }; // 2-D array[1][1] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0!
skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined
MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 2 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -539,7 +448,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif }
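// A minimal glibc-specific sketch (an assumption: this is not the plugin's actual
// fpeEnable implementation) of what enabling SIGFPE traps involves; the change above
// restricts this to MGONGPUCPP_DEBUG builds since trapping alters production behaviour.
#include <fenv.h> // feenableexcept is a GNU extension
#include <cstdio>
int main()
{
  // Trap invalid operations, division by zero and overflow; leave underflow untrapped
  // (small jamp2/ME contributions may legitimately underflow, see #831).
  if( feenableexcept( FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW ) == -1 )
    printf( "WARNING! could not enable FPE traps\n" );
  return 0;
}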
//-------------------------------------------------------------------------- @@ -572,6 +485,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MZ, (fptype)m_pars->mdl_WZ }; @@ -613,6 +530,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -715,26 +636,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -742,25 +663,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; fptype hstMEs[maxtry0]; const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) // SANITY CHECK (nevt >= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + }
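// A small standalone sketch (plain C++, hypothetical buffers) of the in-place trick
// used just above and below: first overwrite the per-helicity MEs with their running
// (cumulative) sum, then pick the first helicity whose normalized cumulative value
// exceeds a uniform random number in [0,1) (inverse-CDF sampling, cf. #403).
#include <cassert>
int main()
{
  const int nGoodHel = 3;
  double mes[3] = { 1.0, 3.0, 6.0 }; // per-helicity |M|^2 contributions for one event
  double total = 0;
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
  {
    total += mes[ighel];
    mes[ighel] = total; // reuse the buffer to store the running sum
  }
  const double rndhel = 0.35; // uniform random number for this event
  int selhel = -1;
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
    if( rndhel < mes[ighel] / total ) { selhel = ighel; break; }
  assert( selhel == 1 ); // cumulative fractions are 0.1, 0.4, 1.0
  return 0;
}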
selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -898,20 +1023,14 @@ namespace mg5amcCpu // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 4 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -923,17 +1042,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) );
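// A minimal sketch (simplified types, not the plugin API) of the per-helicity
// "super-buffer" convention behind the ghelAll* pointers zeroed above: one
// contiguous allocation holds cNGoodHel slices of nevt elements each, and the
// slice for good-helicity index ighel simply starts at offset ighel*nevt.
#include <cassert>
#include <vector>
using fptype = double;
inline fptype* helicitySlice( fptype* superBuffer, int ighel, int nevt ) { return superBuffer + ighel * nevt; }
int main()
{
  const int nGoodHel = 3, nevt = 8;
  std::vector<fptype> ghelAllMEs( nGoodHel * nevt, 0 ); // zeroed like the gpuMemset above
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
  {
    fptype* hAllMEs = helicitySlice( ghelAllMEs.data(), ighel, nevt ); // one helicity's view
    for( int ievt = 0; ievt < nevt; ievt++ ) hAllMEs[ievt] = ighel;
  }
  assert( ghelAllMEs[2 * nevt] == 2 ); // slice 2 starts at offset 2*nevt
  return 0;
}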
#else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -959,93 +1081,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamps for each helicity + // In multichannel mode, also compute the numerators and denominators for each helicity for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) Then, in multichannel mode, also compute the running sums over helicities of the squared jamp2s within each helicity stream + for( int ighel = 0;
ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ?
ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1087,7 +1179,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes jamps for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1110,7 +1202,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes jamps for TWO neppV pages with a single channelId! #924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1119,25 +1211,31 @@ } #endif
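// A standalone sketch (plain arrays, not the plugin's SIMD vector types) of the
// "mixed" fptypes idea referenced in the NB comments above (#537, #924): two
// double-precision event pages are merged into one single-precision page for the
// color algebra, then the per-page results are split back (fpvmerge/fpvsplit analogue).
#include <cstdio>
int main()
{
  const int neppV = 4; // double-precision SIMD page size (an assumption)
  double pageA[4] = { 1.0, 2.0, 3.0, 4.0 }, pageB[4] = { 5.0, 6.0, 7.0, 8.0 };
  float merged[8]; // one single-precision vector covering 2*neppV events
  for( int i = 0; i < neppV; i++ )
  {
    merged[i] = (float)pageA[i];         // fpvmerge: first page in the low half
    merged[neppV + i] = (float)pageB[i]; // fpvmerge: second page in the high half
  }
  // ... the float-precision color algebra would run once on 'merged' here ...
  double outA[4], outB[4];
  for( int i = 0; i < neppV; i++ )
  {
    outA[i] = merged[i];         // fpvsplit0 analogue
    outB[i] = merged[neppV + i]; // fpvsplit1 analogue
  }
  printf( "outA[0]=%f outB[0]=%f\n", outA[0], outB[0] );
  return 0;
}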
// Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes jamps for TWO neppV pages with a single channelId! #924
+ calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1147,8 +1245,10 @@ for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1164,11 +1264,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1270,14 +1371,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // === PART 2 - FINALISATION (after calculate_jamps) === // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h index 159826a904..50da8f60b2 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO
development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include <vector> @@ -75,17 +76,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 16; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 2; // #Feynman diagrams: e.g. 2 for e+ e- -> mu+ mu- + static constexpr int ncolor = 1; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 5; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 4; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?!
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 2; //static const int ncomb = 16; // CPPProcess::ncomb @@ -122,23 +123,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -152,34 +156,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig.f index 70fe04e4d8..27da02d9c0 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig1.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig1.f index 280eff025e..a453b7c2b6 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig1.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,7 +140,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF EP1=PDG2PDF(LPP(IB(1)),-11, IB(1),XBK(IB(1)), QSCALE) IF (PDLABEL.EQ.'dressed') EP1_COMPONENTS(1:4 ) = @@ -149,7 +149,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF EM2=PDG2PDF(LPP(IB(2)),11, IB(2),XBK(IB(2)), QSCALE) IF (PDLABEL.EQ.'dressed') EM2_COMPONENTS(1:4 ) = @@ -228,7 +228,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -302,6 +302,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -385,14 +389,14 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) EP1(IVEC)=PDG2PDF(LPP(IB(1)),-11, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) IF (PDLABEL.EQ.'dressed') EP1_COMPONENTS(1:4 , IVEC) = $ EE_COMPONENTS(1:4) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) EM2(IVEC)=PDG2PDF(LPP(IB(2)),11, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) IF (PDLABEL.EQ.'dressed') EM2_COMPONENTS(1:4 , IVEC) = $ EE_COMPONENTS(1:4) ENDIF @@ -502,6 +506,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/color_sum.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/color_sum.cc new file mode 100644 index 0000000000..8fbdb5c7fb --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/color_sum.cc @@ -0,0 +1,381 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
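// A minimal reference sketch (standalone, not the plugin code) of the color sum that
// this new file implements: |M|^2 += sum_{i,j} Re( conj(jamp[i]) * jamp[j] ) * cf[i][j] / denom[i],
// which is real because the color matrix cf is real and symmetric.
#include <complex>
double colorSumReference( const std::complex<double>* jamp, const double* cf, const double* denom, int ncolor )
{
  double me2 = 0;
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ )
      me2 += ( cf[i * ncolor + j] / denom[i] ) * ( jamp[i].real() * jamp[j].real() + jamp[i].imag() * jamp[j].imag() );
  return me2;
}
// For ncolor=1 with cf={{1}} and denom={1}, as in this e+ e- process, it reduces to |jamp[0]|^2.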
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=1) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1 }; // 1-D array[1] + + // The color matrix (initialize all array elements, with ncolor=1) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { { 1 } }; // 2-D array[1][1] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each row by denom) + template<typename T> + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the upper triangular part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
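// A small standalone check (assuming a real symmetric M, as for the color matrix) that
// the folded upper-triangular evaluation used above matches the full quadratic form:
// sum_{i,j} x_i M_ij x_j == sum_i ( M_ii x_i + sum_{j>i} 2 M_ij x_j ) x_i.
#include <cassert>
#include <cmath>
int main()
{
  const int n = 3;
  const double M[3][3] = { { 2, 1, 0 }, { 1, 3, 1 }, { 0, 1, 4 } }; // symmetric
  const double x[3] = { 0.5, -1.0, 2.0 };
  double full = 0, folded = 0;
  for( int i = 0; i < n; i++ )
    for( int j = 0; j < n; j++ ) full += x[i] * M[i][j] * x[j];
  for( int i = 0; i < n; i++ )
  {
    double ztemp = M[i][i] * x[i]; // diagonal term counted once
    for( int j = i + 1; j < n; j++ ) ztemp += 2 * M[i][j] * x[j]; // off-diagonal terms doubled
    folded += ztemp * x[i];
  }
  assert( std::abs( full - folded ) < 1e-12 );
  return 0;
}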
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 
s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = 
allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer
+    fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer
+    fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer
+    // Convert jamps from double to float
+    gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps );
+    // Real and imaginary components
+    const fptype2* allJampsReal = allJampsFpt2;
+    const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt;
+#else
+    static_assert( std::is_same<fptype, fptype2>::value ); // sanity check
+    // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer
+    fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer
+    fptype2* allMEsFpt2 = allMEs;
+    // Real and imaginary components
+    const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical)
+    const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical)
+#endif
+    // Real and imaginary components
+    fptype2* allZtempReal = allZtempBoth;
+    fptype2* allZtempImag = allZtempBoth + ncolor * nevt;
+
+    // Note, new striding for cuBLAS from DeviceAccessJamp:
+    // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1"
+    // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1"
+
+    // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag
+    // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp
+    fptype2 alpha1 = 1;
+    fptype2 beta1 = 0;
+    const int ncolorM = ncolor;
+    const int nevtN = nevt;
+    const int ncolorK = ncolor;
+    checkGpuBlas( gpuBlasTgemm( *pBlasHandle,
+                                GPUBLAS_OP_N, // do not transpose ColMat
+                                GPUBLAS_OP_T, // transpose JampsV (new1)
+                                ncolorM, nevtN, ncolorK,
+                                &alpha1,
+                                devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK
+                                allJampsReal, nevtN,    // JampsV is nevtN x ncolorK (new1)
+                                &beta1,
+                                allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN
+    checkGpuBlas( gpuBlasTgemm( *pBlasHandle,
+                                GPUBLAS_OP_N, // do not transpose ColMat
+                                GPUBLAS_OP_T, // transpose JampsV (new1)
+                                ncolorM, nevtN, ncolorK,
+                                &alpha1,
+                                devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK
+                                allJampsImag, nevtN,    // JampsV is nevtN x ncolorK (new1)
+                                &beta1,
+                                allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN
+
+    // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt]
+    // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME
+    // Use cublasSgemmStridedBatched to perform these batched dot products in one call
+    fptype2 alpha2 = 1;
+    fptype2 beta2 = 1;
+    checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle,
+                                              GPUBLAS_OP_N, // do not transpose JampsV (new1)
+                                              GPUBLAS_OP_N, // do not transpose Tmp
+                                              1, 1, ncolor, // result is 1x1 (dot product)
+                                              &alpha2,
+                                              allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1)
+                                              allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column
+                                              &beta2,
+                                              allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e.
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/color_sum.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/diagram_boilerplate.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/diagrams.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/diagrams.h new file mode 100644 index 0000000000..2fca11bf87 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/diagrams.h @@ -0,0 +1,79 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+
+/* clang-format off */
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators,           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+            const fptype* momenta,          // input: momenta[npar*4*nevtORneppV]
+            const int ihel )                // input: helicity (0 to ncomb)
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+#ifdef MGONGPUCPP_GPUIMPL
+    using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events
+#else
+    using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events
+#endif
+    // *** DIAGRAM 1 OF 2 ***
+    // Wavefunction(s) for diagram number 1
+    oxxxxx<M_ACCESS, W_ACCESS>( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 );
+    ixxxxx<M_ACCESS, W_ACCESS>( momenta, 0., cHel[ihel][1], +1, w_fp[1], 1 );
+    ixxxxx<M_ACCESS, W_ACCESS>( momenta, 0., cHel[ihel][2], -1, w_fp[2], 2 );
+    oxxxxx<M_ACCESS, W_ACCESS>( momenta, 0., cHel[ihel][3], +1, w_fp[3], 3 );
+    FFV1P0_3<W_ACCESS, CI_ACCESS>( w_fp[1], w_fp[0], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[4] );
+    // Amplitude(s) for diagram number 1
+    FFV1_0<W_ACCESS, A_ACCESS, CI_ACCESS>( w_fp[2], w_fp[3], w_fp[4], COUPs[ndcoup + 0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram2( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 2 OF 2 ***
+    // Wavefunction(s) for diagram number 2
+    FFV2_4_3<W_ACCESS, CI_ACCESS>( w_fp[1], w_fp[0], COUPs[ndcoup + 1], 1.0, COUPs[ndcoup + 2], 1.0, cIPD[0], cIPD[1], w_fp[4] );
+    // Amplitude(s) for diagram number 2
+    FFV2_4_0<W_ACCESS, A_ACCESS, CI_ACCESS>( w_fp[2], w_fp[3], w_fp[4], COUPs[ndcoup + 1], 1.0, COUPs[ndcoup + 2], 1.0, &amp_fp[0] );
+#ifdef
MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/driver.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/driver.f index ec5722702a..c08048ad0e 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/driver.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/matrix1.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/matrix1.f index 1a2e5df4e6..ad813c359c 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/matrix1.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -325,7 +325,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -368,7 +368,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(0) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -411,17 +412,22 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 1) /1.000000000000000D+00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 1) /1/ C 1 ColorOne() C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WZ.NE.0D0) FK_MDL_WZ = SIGN(MAX(ABS(MDL_WZ), ABS(MDL_MZ - $ *SMALL_WIDTH_TREATMENT)), MDL_WZ) + FK_ZERO = 0D0 + IF(MDL_WZ.NE.0D0) THEN + FK_MDL_WZ = SIGN(MAX(ABS(MDL_WZ), ABS(MDL_MZ + $ *SMALL_WIDTH_TREATMENT)), MDL_WZ) + ELSE + FK_MDL_WZ = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. 
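The FK_MDL_WZ rewrite in the hunk above guards the small-width regulator against a vanishing width instead of evaluating SIGN with a zero argument. A minimal C++ sketch of the same guard, with hypothetical names (illustration only, not code from the patch):

  #include <algorithm>
  #include <cmath>
  // Fortran: FK = SIGN( MAX( ABS(W), ABS(M*SMALL_WIDTH_TREATMENT) ), W ), or 0 when W == 0
  double regulatedWidth( double mass, double width, double smallWidthTreatment )
  {
    if( width == 0. ) return 0.; // the new ELSE branch: FK_MDL_WZ = 0D0
    return std::copysign( std::max( std::abs( width ), std::abs( mass * smallWidthTreatment ) ), width );
  }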
@@ -455,10 +461,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -467,6 +475,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/addmothers.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/addmothers.f index 9a31ed201d..6f32477b9e 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/addmothers.f @@ -21,7 +21,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, integer icol ! color selected integer isym(nexternal,99), jsym - integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,nc,ic + integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,ic integer mo_color,da_color(2),itmp integer ito(-nexternal+3:nexternal),iseed,maxcolor,maxorg integer icolalt(2,-nexternal+2:2*nexternal-3) @@ -113,14 +113,15 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif lconfig = vec_igraph1(ivec) endif - + is_LC=.true. + maxcolor=0 c c Choose a color flow which is certain to work with the propagator c structure of the chosen diagram and use that as an alternative c if (icol.eq.0) then do i=1,nexternal - icolalt(1,i)=0 + icolalt(1,i)=0 icolalt(2,i)=0 enddo else diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cluster.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cluster.f index b8995283ed..907894ea89 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cluster.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cluster.f @@ -556,6 +556,8 @@ logical function cluster(p, ivec) jwin = 0 cluster=.false. clustered=.false. + iwin =0 + jwin =0 do i=0,3 pcmsp(i)=0 enddo @@ -665,8 +667,11 @@ logical function cluster(p, ivec) c initialize graph storage igraphs(0)=0 nleft=nexternal -c cluster - if (iwin.eq.0.or.jwin.eq.0) stop 21 + if(iwin.eq.0.or.jwin.eq.0)then + cluster=.false. + return + endif +c cluster do n=1,nexternal-2 c combine winner imocl(n)=imap(iwin,2)+imap(jwin,2) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/color_sum.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/color_sum.h new file mode 100644 index 0000000000..71b5b1eaad --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/color_sum.h @@ -0,0 +1,105 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
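In the matrix1.f hunks above, the dense REAL*8 CF(NCOLOR,NCOLOR) color matrix becomes an INTEGER array holding only the upper triangle, scanned row by row (J >= I), with a common DENOM divided out once after the loop. A short C++ sketch of the equivalent packing (hypothetical names; the off-diagonal entries are assumed to absorb the symmetry factor 2, consistent with TriangularNormalizedColorMatrix in color_sum.cc above):

  #include <vector>
  std::vector<int> packUpperTriangle( const std::vector<std::vector<int>>& dense )
  {
    const int n = (int)dense.size();
    std::vector<int> packed;
    packed.reserve( n * ( n + 1 ) / 2 );
    for( int i = 0; i < n; i++ )
      for( int j = i; j < n; j++ ) // same traversal as the Fortran 'DO J = I, NCOLOR' loop with CF_INDEX
        packed.push_back( i == j ? dense[i][j] : 2 * dense[i][j] ); // off-diagonals carry the symmetry factor
    return packed;
  }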
+
+#ifndef COLOR_SUM_H
+#define COLOR_SUM_H 1
+
+#include "mgOnGpuConfig.h"
+
+#include "mgOnGpuVectors.h"
+
+#include "CPPProcess.h"
+#include "GpuAbstraction.h"
+
+#ifdef MGONGPUCPP_GPUIMPL
+namespace mg5amcGpu
+#else
+namespace mg5amcCpu
+#endif
+{
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  class DeviceAccessJamp
+  {
+  public:
+    static __device__ inline cxtype_ref
+    kernelAccessIcol( fptype* buffer, const int icol )
+    {
+      const int ncolor = CPPProcess::ncolor; // the number of leading colors
+      const int nevt = gridDim.x * blockDim.x;
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last)
+      //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old"
+      // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last)
+      // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS
+      return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1"
+      // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last)
+      //return cxtype_ref( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2"
+    }
+    static __device__ inline const cxtype
+    kernelAccessIcolConst( const fptype* buffer, const int icol )
+    {
+      const int ncolor = CPPProcess::ncolor; // the number of leading colors
+      const int nevt = gridDim.x * blockDim.x;
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last)
+      //return cxtype( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old"
+      // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last)
+      // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS
+      return cxtype( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1"
+      // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last)
+      //return cxtype( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2"
+    }
+  };
+#else
+  class HostAccessJamp
+  {
+  public:
+    static inline cxtype_sv&
+    kernelAccessIcol( cxtype_sv* buffer, const int icol )
+    {
+      return buffer[icol];
+    }
+    static inline cxtype_sv&
+    kernelAccessIcol( fptype* buffer, const int icol )
+    {
+      return reinterpret_cast<cxtype_sv*>( buffer )[icol];
+    }
+  };
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  void createNormalizedColorMatrix();
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifndef MGONGPUCPP_GPUIMPL
+  void
+  color_sum_cpu( fptype* allMEs,              // output: allMEs[nevt], add |M|^2 for this specific helicity
+                 const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity
+                 const int ievt0 );           // input: first event number in current C++ event page (for CUDA, ievt depends on threadid)
+#endif
+
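+  // Reference for the "new1" jamp layout used above (restating the index formulas for clarity):
+  // the buffer holds two contiguous ncolor*nevt planes, all real parts first and all imaginary
+  // parts second, with the event index running fastest within each color row:
+  //   real( icol, ievt ) -> buffer[ 0 * ncolor * nevt + icol * nevt + ievt ]
+  //   imag( icol, ievt ) -> buffer[ 1 * ncolor * nevt + icol * nevt + ievt ]
+  // e.g. for ncolor=3 and nevt=4, real(icol=2,ievt=1) is element 9 and imag(icol=2,ievt=1) is element 21.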
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  void
+  color_sum_gpu( fptype* allMEs,               // output: allMEs[nevt], add |M|^2 for this specific helicity
+                 const fptype* allJamps,       // input: jamp[ncolor*2*nevt] for one specific helicity
+                 fptype2* allBlasTmp,          // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity
+                 gpuStream_t stream,           // input: cuda stream (nullptr indicates the default stream)
+                 gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+                 const int gpublocks,          // input: cuda gpublocks
+                 const int gputhreads );       // input: cuda gputhreads
+#endif
+
+  //--------------------------------------------------------------------------
+}
+
+#endif // COLOR_SUM_H
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk
index 20d8ded718..e7360b29e2 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk
@@ -1,7 +1,7 @@
-# Copyright (C) 2020-2024 CERN and UCLouvain.
+# Copyright (C) 2020-2025 CERN and UCLouvain.
 # Licensed under the GNU Lesser General Public License (version 3 or later).
 # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin.
-# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.

 #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html)
 #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts
@@ -114,7 +114,7 @@ export CXXFLAGS
 override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null))

 # Set HIP_HOME from the path to hipcc, if it exists
-override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null))
+override HIP_HOME = $(shell hipconfig --rocmpath)

 # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965)
 ifeq ($(CUDA_HOME),)
@@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda)

 else ifeq ($(BACKEND),hip)

+  # example architecture values MI200:gfx90a, MI300X:gfx942
+  MADGRAPH_HIP_ARCHITECTURE ?= gfx942
   # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists)
   GPUCC = $(HIP_HOME)/bin/hipcc
   XCOMPILERFLAG =
@@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip)
   ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY

   # AMD HIP architecture flags
-  GPUARCHFLAGS = --offload-arch=gfx90a
+  GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE}
   GPUFLAGS += $(GPUARCHFLAGS)

   # Other AMD-specific flags
@@ -477,6 +479,34 @@ endif

 #-------------------------------------------------------------------------------

+#=== Configure defaults and check if user-defined choices exist for HASBLAS
+
+# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS
+
+ifeq ($(HASBLAS),)
+  ifeq ($(GPUCC),) # CPU-only build
+    override HASBLAS = hasNoBlas
+  else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+    ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),)
+      # cuBLAS headers do not exist??
+      override HASBLAS = hasNoBlas
+    else
+      override HASBLAS = hasBlas
+    endif
+  else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build
+    ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),)
+      # hipBLAS headers do not exist??
+      override HASBLAS = hasNoBlas
+    else
+      override HASBLAS = hasBlas
+    endif
+  else
+    override HASBLAS = hasNoBlas
+  endif
+endif
+
+#-------------------------------------------------------------------------------
+
 #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD

 # Set the build flags appropriate to OMPFLAGS
@@ -597,6 +627,30 @@ endif
 #$(info RNDCXXFLAGS=$(RNDCXXFLAGS))
 #$(info RNDLIBFLAGS=$(RNDLIBFLAGS))

+#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS
+
+$(info HASBLAS=$(HASBLAS))
+override BLASCXXFLAGS=
+override BLASLIBFLAGS=
+
+# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas")
+ifeq ($(HASBLAS),hasNoBlas)
+  override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS
+else ifeq ($(HASBLAS),hasBlas)
+  ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+    override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas
+  else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build
+    override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas
+  endif
+else
+  $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported)
+endif
+CXXFLAGS += $(BLASCXXFLAGS)
+GPUFLAGS += $(BLASCXXFLAGS)
+
+#$(info BLASCXXFLAGS=$(BLASCXXFLAGS))
+#$(info BLASLIBFLAGS=$(BLASLIBFLAGS))
+
 #-------------------------------------------------------------------------------

 #=== Configure Position-Independent Code
@@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))

 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o

 ifneq ($(GPUCC),)
 MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX)
-gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o
+gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o
 gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o
 endif
@@ -799,7 +853,7 @@ ifneq ($(GPUCC),)
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib)
-	$(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB)
+	$(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS)
 # Bypass std::filesystem completely to ease portability on LUMI #803
 #ifneq ($(findstring hipcc,$(GPUCC)),)
 #	$(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs
@@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cuts.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cuts.f index 7898714201..bd50ab1357 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cuts.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cuts.f @@ -307,12 +307,18 @@ LOGICAL FUNCTION PASSCUTS(P, VECSIZE_USED) c c Limit S_hat c - if (dsqrt_shat.ne.0d0)then - if (nincoming.eq.2.and.sumdot(p(0,1),p(0,2),1d0) .lt. dsqrt_shat**2) then - passcuts=.false. - return - endif - endif + if(nincoming.eq.2) then + if (dsqrt_shat.ne.0d0.or.dsqrt_shatmax.ne.-1d0)then + xvar = sumdot(p(0,1),p(0,2),1d0) + if (xvar .lt. dsqrt_shat**2)then + passcuts=.false. 
+ return + else if (dsqrt_shatmax.ne.-1d0 .and. xvar .gt. dsqrt_shatmax**2)then + passcuts = .false. + return + endif + endif + endif C $B$ DESACTIVATE_CUT $E$ !This is a tag for MadWeight if(debug) write (*,*) '=============================' diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/diagram_boilerplate.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/diagram_boilerplate.h new file mode 100644 index 0000000000..96a34fb1bf --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/diagram_boilerplate.h @@ -0,0 +1,103 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin. + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-local-typedefs" // for CI_ACCESS and CD_ACCESS + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + + //------------- + // GPU only + //------------- + + //using namespace mg5amcGpu; + using W_ACCESS = DeviceAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = DeviceAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( channelIds ); +#endif + + // Wavefunctions + // Buffer wfs for one helicity and nevt events is a DeviceBufferSimple with ( nwf * nevt * nw6 * nx2 ) fptypes + // The striding between the nwf wavefunction buffers is ( nevt * nw6 * nx2 ) fptypes + // Internally diagramXXX methods pass a w_fp[iwf] to ixx/FFV methods (as argument 'fptype wavefunctions[]') + // Internally ixx/FFV methods call 'cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions )' and then use fi[iw6] + // This means that the fi pointer must point to a [RIRIRIRIRIRI] contiguous buffer of size nw6*nx2=12 + // The striding between events is nw6*nx2=12 and this is what W_ACCESS::kernelAccess must respect + // (En passant, note that this means that events cannot be contiguous in the present code, memory is not coalesced) + const int nevt = gridDim.x * blockDim.x; + fptype* w_fp[nwf]; + for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = wfs + iwf * nevt * nw6 * mgOnGpu::nx2; + + // Couplings + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic push +#pragma nv_diag_suppress 186 // e.g. 
<<warning #186-D: pointless comparison of unsigned integer with zero>>
+#endif
+  // Dependent couplings, vary event-by-event
+  for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
+    allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup );
+  // Independent couplings, fixed for all events
+  for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup)
+    allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup );
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
+#pragma nv_diagnostic pop
+#endif
+  const fptype* COUPs[nxcoup];
+  for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
+
+#else
+
+  //-------------
+  // C++ only
+  //-------------
+
+  //using namespace mg5amcCpu;
+  using W_ACCESS = HostAccessWavefunctions;   // non-trivial access (with kernel splitting): buffer includes all events
+  using A_ACCESS = HostAccessAmplitudes;      // TRIVIAL ACCESS (local variable for one event): buffer for one event
+  using CD_ACCESS = HostAccessCouplings;      // non-trivial access (dependent couplings): buffer includes all events
+  using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event
+  using J_ACCESS = HostAccessJamp;
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  using NUM_ACCESS = HostAccessNumerators;    // non-trivial access: buffer includes all events
+  using DEN_ACCESS = HostAccessDenominators;  // non-trivial access: buffer includes all events
+  // SCALAR channelId for the current SIMD event page (C++)
+  unsigned int channelId = *channelIds;
+#endif
+
+  // Wavefunctions
+  // Reinterpret wfs as "cxtype_sv w_sv[nwf][nw6]" and build "fptype* w_fp[nwf]" where "w_fp[iwf] = (fptype*)( w_sv[iwf] )"
+  fptype (*w_fp)[nw6 * neppV * mgOnGpu::nx2] = (fptype (*)[nw6 * neppV * mgOnGpu::nx2])(wfs);
+
+#endif
+
+  //-------------
+  // GPU or C++
+  //-------------
+
+  // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV)
+  cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram
+  fptype* amp_fp;      // proof of concept for using fptype* in the interface
+  amp_fp = reinterpret_cast<fptype*>( amp_sv );
+
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
+  fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+  fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
+#else
+  assert( channelIds == nullptr );
+  assert( numerators == nullptr );
+  assert( denominators == nullptr );
+#endif /* clang-format on */
+
+#pragma GCC diagnostic pop
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/genps.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/genps.f
index 1c32e93f5d..8503bdbec8 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/genps.f
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/genps.f
@@ -1373,6 +1373,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass)
       double precision smin,smax,spole,swidth,s,jac
       double precision x
       logical pass
+      include 'maxparticles.inc'
+      include '../../Source/vector.inc'
+      include 'run.inc'
+      include 'cuts.inc'
 c
 c     Local
 c
@@ -1384,6 +1388,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass)
 c-----
 c  Begin Code
 c-----
+      if (dsqrt_shatmax.ne.-1d0)then
+         smax = min(smax, dsqrt_shatmax**2)
+      endif
+
       pass=.true.
       if (jac .eq. 0 .and. .not.
warned0) then
         print*,'Input jacobian 0 in genps'
@@ -1628,7 +1636,10 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI)
       DOUBLE PRECISION ETA,ETAMIN,ETAMAX
       logical warned
       data warned/.false./
-
+      include 'maxparticles.inc'
+      include '../../Source/vector.inc'
+      include 'run.inc'
+      include 'cuts.inc'
 C------------
 C BEGIN CODE
 C------------
@@ -1645,7 +1656,11 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI)

 C     IF THERE IS NO S CHANNEL POLE USE BELOW:
       TAUMIN = 0d0 !SMIN/S !keep scale fix
-      TAUMAX = 1D0
+      if (dsqrt_shatmax.ne.-1d0)then
+         TAUMAX=dsqrt_shatmax**2/S
+      else
+         TAUMAX = 1D0
+      endif
       TAU = (TAUMAX-TAUMIN)*X(1)+TAUMIN
       SJACOBI= sjacobi*(TAUMAX-TAUMIN)
@@ -1915,7 +1930,7 @@ double precision function get_channel_cut(p, config)
             if(sde_strat.eq.2)then
               t = dot(ptemp(0,-i), ptemp(0,-i))
               Mass = prmass(-i, config)
-              get_channel_cut = get_channel_cut / ((t-Mass)*(t+Mass)+stot*1d-10)**2
+              get_channel_cut = get_channel_cut / (t-Mass**2+stot*1d-10)**2
             endif
 c           write(*,*) i, "t, Mass, fact", t, Mass, ((t-Mass)*(t+Mass))**2,get_channel_cut
             t = t/stot
@@ -1930,9 +1945,9 @@
             t = dot(ptemp(0,-i), ptemp(0,-i))
             Mass = prmass(-i, config)
             Width = prwidth(-i, config)
-            tmp = (t-Mass)*(t+Mass)
+            tmp = (t-Mass**2)
             tmp2 = Mass*Width
-            get_channel_cut = get_channel_cut* (tmp**2 - tmp2**2)/(tmp**2 + tmp2**2)**2
+            get_channel_cut = get_channel_cut/(tmp**2 + tmp2**2)
             endif
 c           write(*,*) i, "s, Mass, Width, fact", t, Mass, Width, (((t-Mass)*(t+Mass) )**2 + Width**2*Mass**2), get_channel_cut
           endif
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/myamp.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/myamp.f
index 9e5f8d44dd..5360566ef4 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/myamp.f
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/myamp.f
@@ -231,6 +231,7 @@ subroutine set_peaks
       double precision x1,x2,xk(nexternal)
       double precision dr,mtot,etot,xqfact
       double precision spmass
+      double precision stot ! technically the min of dsqrt_shatmax**2 and the physical stot
       integer i, iconfig, l1, l2, j, nt, nbw, iproc, k
       integer iden_part(-nexternal+1:nexternal)
@@ -285,8 +286,8 @@ subroutine set_peaks
       integer lbw(0:nexternal) !Use of B.W.
       common /to_BW/ lbw
-      double precision stot,m1,m2
-      common/to_stot/stot,m1,m2
+      double precision real_stot,m1,m2
+      common/to_stot/real_stot,m1,m2
       include 'coupl.inc' !
needs VECSIZE_MEMMAX (defined in vector.inc) include 'cuts.inc' @@ -309,6 +310,12 @@ subroutine set_peaks c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1)then + stot = min(real_stot, dsqrt_shatmax**2) + else + stot = real_stot + endif + iconfig = this_config c needs to be initialise to avoid segfault do i = -nexternal,-1 diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/reweight.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/reweight.f index 0a0bafa7c1..189ba41449 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/reweight.f @@ -1186,7 +1186,7 @@ logical function setclscales(p, keepq2bck, ivec) if(nexternal.gt.3) pt2ijcl(nexternal-3)=q2fact(2) else if(.not.fixed_fac_scale1) q2fact(1)=scalefact**2*pt2ijcl(nexternal-2) - if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*q2fact(1) + if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*pt2ijcl(nexternal-2) endif elseif(jcentral(1).eq.0)then if(.not.fixed_fac_scale1) q2fact(1) = scalefact**2*pt2ijcl(jfirst(1)) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc index 4eec5db13c..216a90a302 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/symmetry.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/symmetry.f index 309540a0a2..2afd9e9f75 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/symmetry.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/symmetry.f @@ -232,7 +232,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, c write(*,*) 'mapping',ic,mapconfig(i),icode if (icode .eq. 
0) then c Create format string based on number of digits - write(formstr,'(a,i1,a)') '(I',nconf,'$)' + write(formstr,'(a,i1,a)') '(I',nconf,',$)' write(*,formstr) mapconfig(i) c Write symmetry factors write(formstr2,'(a,i2,a)') '(2i',nsym,')' @@ -242,10 +242,10 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode if(nconf+ncode+1.lt.10) then write(formstr,'(a,i1,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' else write(formstr,'(a,i2,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' endif write(*,formstr) dconfig c Write symmetry factors @@ -260,7 +260,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode write(27,formstr2) dconfig,use_config(i) endif - write(*,'(a$)') ' ' + write(*,'(a,$)') ' ' 100 call bw_increment_array(iarray,imax,ibase,done) enddo else diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/unwgt.f b/epochX/cudacpp/ee_mumu.mad/SubProcesses/unwgt.f index f602511c94..d1247f1849 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/unwgt.f +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/unwgt.f @@ -497,6 +497,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer ip, np, ic, nc integer ida(2),ito(-nexternal+3:nexternal),ns,nres,ires,icloop integer iseed + double precision beam_mass double precision pboost(0:3) double precision beta, get_betaz double precision ebi(0:3), ebo(0:3) @@ -506,7 +507,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer idup(nexternal,maxproc,maxsproc) integer mothup(2,nexternal) integer icolup(2,nexternal,maxflow,maxsproc) - + double precision eta integer nsym integer ievent @@ -638,21 +639,20 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) if (nincoming.eq.2) then if (xbk(1) .gt. 0d0 .and. xbk(1) .le. 1d0 .and. $ xbk(2) .gt. 0d0 .and. xbk(2) .le. 1d0) then - if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0).and.xbk(2).ne.1d0) then - ! construct the beam momenta in each frame and compute the related (z)boost - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4).and.ebeam(1).gt.10d0*m1)then - local_mass = 0d0 - else - local_mass = m1 - endif + if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0)) then + if((abs(lpp(1)).gt.2.and.abs(lpp(1)).ne.9).or.xbk(1).eq.1d0)then + beam_mass = pmass(1) + else + beam_mass = m1 + endif ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(1) ebo(1) = 0 ebo(2) = 0 - ebo(3) = DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(1).eq.1d0) then pb(0,isym(1,jsym)) = ebo(0) @@ -668,20 +668,19 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo else - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4.and.ebeam(2).gt.10d0*m2))then - local_mass = 0d0 - else - local_mass = m2 - endif - ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam + if((abs(lpp(2)).gt.2.and.abs(lpp(2)).ne.9).or.xbk(2).eq.1d0)then + beam_mass = pmass(2) + else + beam_mass = m2 + endif ebi(0) = p(0,2)/xbk(2) ! 
this assumes that particle 2 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = -1d0*DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = -1d0*DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(2) ebo(1) = 0 ebo(2) = 0 - ebo(3) = -1d0*DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = -1d0*DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(2).eq.1d0) then pb(0,isym(2,jsym)) = ebo(0) @@ -701,6 +700,21 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) write(*,*) 'Warning bad x1 or x2 in write_leshouche', $ xbk(1),xbk(2) endif + do j=1,nexternal + call zboost_with_beta(p(0,j),beta,pb(0,isym(j,jsym))) + pb(4,isym(j,jsym))=pmass(j) + enddo + + ! check for numerical_accuracy + if (pb(0,1).gt.ebeam(1).or.pb(0,2).gt.ebeam(2))then + ! go back to old method --more accurate when boosting with xbk close to one-- + eta = sqrt(xbk(1)*ebeam(1)/(xbk(2)*ebeam(2))) + pboost(0)=p(0,1)*(eta + 1d0/eta) + pboost(3)=p(0,1)*(eta - 1d0/eta) + do j=1,nexternal + call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) + enddo + endif else do j=1,nexternal call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) @@ -709,6 +723,8 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo endif + + if (IMIRROR.eq.2.and.pmass(1).ne.pmass(2)) then c Note that in this context isym(1,jsym) should never be "2" since the mass differ pb(4,isym(1,jsym))=pmass(2) diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/Gridpack/gridrun b/epochX/cudacpp/ee_mumu.mad/bin/internal/Gridpack/gridrun index 8c8f7d3940..01d4ab53f5 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/Gridpack/gridrun +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/Gridpack/gridrun @@ -91,7 +91,7 @@ import internal.madevent_interface as cmd_interface try: - cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2]) + cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2], nprocs=args[3], maxevts=args[4]) except KeyboardInterrupt: print('Quit on KeyboardInterrupt') diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/Gridpack/run.sh b/epochX/cudacpp/ee_mumu.mad/bin/internal/Gridpack/run.sh index 20adf572c2..2d149f96be 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/Gridpack/run.sh +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/Gridpack/run.sh @@ -14,6 +14,18 @@ # USAGE : run [num_events] [iseed] ## ############################################################################# +function usage() { + local retcode="${1:-1}" # default return code is 1 + echo "Usage:" + echo " run.sh [options] [num events] [seed]" + echo " run.sh [options] [num events] [seed] [granularity]" + echo "Options:" + echo " -h, --help print this message and exit" + echo " -p, --parallel [num procs] number of processes to run in parallel" + echo " -m, --maxevts [num events] maximum number of unweighted events per job" + exit $retcode +} + if [[ -d ./madevent ]]; then DIR='./madevent' else @@ -32,23 +44,46 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib # For Mac OS X export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib +pos_args=() +nprocs=1 +maxevts=2500 -if [[ ($1 != "") && ("$2" != "") && ("$3" == "") ]]; then - num_events=$1 - seed=$2 - gran=1 -elif [[ ($1 != "") && ("$2" != "") && ("$3" != "") ]]; then - num_events=$1 - seed=$2 - gran=$3 -else - echo "Warning: input is not correct. 
script requires two arguments: NB_EVENT SEED" -fi +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage 0 ;; + -p|--parallel) + nprocs="$2" && shift && shift ;; + -m|--maxevts) + maxevts="$2" && shift && shift ;; + -*) + echo "Error: Unknown option $1" && usage ;; + *) + pos_args+=("$1") && shift ;; + esac +done + +case `echo "${pos_args[@]}" | wc -w | tr -d " "` in + "2") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=1 + ;; + "3") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=${pos_args[2]} + ;; + *) + echo "Error: number of arguments is not correct" + usage + ;; +esac -echo "Now generating $num_events events with random seed $seed and granularity $gran" +echo "Now generating $num_events events with random seed $seed and granularity $gran using $nprocs processes" ############ RUN THE PYTHON CODE ##################### -${DIR}/bin/gridrun $num_events $seed $gran +${DIR}/bin/gridrun $num_events $seed $gran $nprocs $maxevts ######################################################## ########### POSTPROCESSING ##################### diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py index 42d82818d0..2efb5954a6 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py @@ -353,7 +353,7 @@ def modify_init_cross(self, cross, allow_zero=False): assert "init" in self cross = dict(cross) - for key in cross.keys(): + for key in list(cross.keys()): if isinstance(key, str) and key.isdigit() and int(key) not in cross: cross[int(key)] = cross[key] @@ -1991,6 +1991,11 @@ def default_setup(self): self.add_param("PartonLevel:FSRinResonances", True, hidden=True, always_write_to_card=False, comment="Do not allow shower to run from decay product of unstable particle") self.add_param("ProcessLevel:resonanceDecays", True, hidden=True, always_write_to_card=False, comment="Do not allow unstable particle to decay.") + # Parameters only needed for main164 type of run (not pythia8/MG5 interface) + self.add_param("Main:HepMC", True, hidden=True, always_write_to_card=False, + comment="""Specify the type of output to be used by the main164 run. """) + self.add_param("HepMC:output", 'hepmc.gz', hidden=True, always_write_to_card=False, + comment="Specify the HepMC output file to be used by the main164 run.") # Add parameters controlling the subruns execution flow. # These parameters should not be part of PY8SubRun daughter. self.add_default_subruns('parameters') @@ -2087,8 +2092,10 @@ def MadGraphSet(self, name, value, **opts): force = False if name.lower() not in self or (force or name.lower() not in self.user_set): self.__setitem__(name, value, change_userdefine=False, **opts) - self.system_set.add(name.lower()) - + self.system_set.add(name.lower()) + else: + raise Exception("The parameter %s is already set to %s. You can not change it." 
% (name, self[name])) + def defaultSet(self, name, value, **opts): self.__setitem__(name, value, change_userdefine=False, **opts) @@ -2144,9 +2151,19 @@ def pythia8_formatting(value, formatv=None): else: return ','.join([PY8Card.pythia8_formatting(arg) for arg in value]) + # change of naming convention between the old MG5 interface and Pythia8's main164 + interface_to_164 = {'HEPMCoutput:file': 'HepMC:output', + 'SysCalc:fullCutVariation': '!SysCalc:fullCutVariation (not supported with 164)', + 'SysCalc:qCutList': '!SysCalc:qCutList (not supported with 164)', + 'SysCalc:qWeed': '!SysCalc:qWeed (not supported with 164)', + 'SysCalc:tmsList': '!SysCalc:tmsList (not supported with 164)', + 'HEPMCoutput:scaling' : '!HEPMCoutput:scaling (not supported with 164)', + 'LHEFInputs:nSubruns' : 'Main:numberOfSubruns'} + def write(self, output_file, template, read_subrun=False, - print_only_visible=False, direct_pythia_input=False, add_missing=True): + print_only_visible=False, direct_pythia_input=False, add_missing=True, + use_mg5amc_py8_interface=False): """ Write the card to output_file using a specific template. > 'print_only_visible' specifies whether or not the hidden parameters should be written out if they are in the hidden_params_to_always_write @@ -2155,7 +2172,12 @@ def write(self, output_file, template, read_subrun=False, in the self.visible_params_to_always_write list and are not user_set or system_set are commented. > If 'add_missing' is False then parameters that should be written_out but are absent - from the template will not be written out.""" + from the template will not be written out. + > 'use_mg5amc_py8_interface' indicates whether the MG5aMC-PY8 interface is used; + if it is not, some parameters are translated from the old naming convention to the new one. + """ + + self.use_mg5amc_py8_interface = use_mg5amc_py8_interface # First list the visible parameters visible_param = [p for p in self if p.lower() not in self.hidden_param @@ -2297,7 +2319,16 @@ def group_params(params): else: # Just copy parameters which don't need to be specified if param.lower() not in self.params_to_never_write: - output.write(line) + + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param.strip()] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + output.write('%s=%s\n'%(param_entry,new_value)) + else: + output.write(line) else: output.write('! The following parameter was forced to be commented out by MG5aMC.\n') output.write('! 
%s'%line) @@ -2313,6 +2344,7 @@ def group_params(params): if ((not direct_pythia_input) or (param.lower() in self.visible_params_to_always_write) or (param.lower() in self.user_set) or + (param.lower() in self.hidden_params_to_always_write) or (param.lower() in self.system_set)): template = '%s=%s' else: @@ -2321,6 +2353,19 @@ def group_params(params): # then they shouldn't be passed to Pythia template = '!%s=%s' + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + if 'Main:InternalAnalysis'.lower() in self.user_set and \ + self['Main:InternalAnalysis'].lower() == 'on': + output.write('InternalAnalysis:output = ./djrs.dat\n') + + #elif param in self.interface_to_164.values() and not direct_pythia_input: + # misc.sprint(use_mg5amc_py8_interface, direct_pythia_input,param) + # raise Exception('The parameter %s is not supported in the MG5aMC-PY8 interface. Please use the new interface.'%param_entry output.write(template%(param_entry, value_entry.replace(value,new_value))) @@ -2365,6 +2410,8 @@ def group_params(params): comment = '\n'.join('! %s'%c for c in self.comments[param.lower()].split('\n')) output.write(comment+'\n') + if not use_mg5amc_py8_interface and param in self.interface_to_164: + continue output.write('%s=%s\n'%(param,PY8Card.pythia8_formatting(self[param]))) # Don't close the file if we were reading a subrun, but simply write @@ -3318,7 +3365,6 @@ def retro_compatible_custom_fct(lines, mode=None): for i,line in enumerate(lines[:]): if search and re.search(include_pat, line): name = re.findall(include_pat, line)[0] - misc.sprint('DETECTED INCLUDE', name) if 'vector.inc' in name: search = False if 'run.inc' in name: @@ -3326,7 +3372,6 @@ def retro_compatible_custom_fct(lines, mode=None): search = False sol.append(line) if re.search(function_pat, line): - misc.sprint("DETECTED FCT") search = True return sol @@ -4050,8 +4095,8 @@ def post_set_fixed_fac_scale(card, value, change_userdefine, raiseerror, **opt): if 'fixed_fac_scale2' in card.user_set: card.user_set.remove('fixed_fac_scale2') - # #card['pdlabel1'] = value - # #card['pdlabel2'] = value + dict.__setitem__(card, 'fixed_fac_scale1', card['fixed_fac_scale']) + dict.__setitem__(card, 'fixed_fac_scale2', card['fixed_fac_scale']) @staticmethod def post_set(card, value, change_userdefine, raiseerror, name='unknown', **opt): @@ -4201,6 +4246,7 @@ def default_setup(self): self.add_param("bwcutoff", 15.0) self.add_param("cut_decays", False, cut='d') self.add_param('dsqrt_shat',0., cut=True) + self.add_param('dsqrt_shatmax', -1, cut=True) self.add_param("nhel", 0, include=False) self.add_param("limhel", 1e-8, hidden=True, comment="threshold to determine if an helicity contributes when not MC over helicity.") #pt cut @@ -4451,11 +4497,11 @@ def check_validity(self): time.sleep(5) if self['drjj'] != 0: if 'drjj' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjj\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjj\' to 0') self['drjj'] = 0 if self['drjl'] != 0: if 'drjl' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjl\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjl\' to 0') self['drjl'] = 0 if not self['auto_ptj_mjj']: if self['mmjj'] > self['xqcut']: @@ -4753,7 +4799,6 @@ def create_default_for_process(self, 
proc_characteristic, history, proc_def): self['fixed_fac_scale1'] = True self['nhel'] = 1 for i in beam_id_split[1]: - exit if abs(i) == 11: self['lpp1'] = -math.copysign(3,i) self['lpp2'] = math.copysign(3,i) @@ -5577,6 +5622,9 @@ def default_setup(self): #technical self.add_param('folding', [1,1,1], include=False) + + #bias + self.add_param('flavour_bias',[5,1], hidden=True, comment="Example: '5,100' means that the probability to generate an event with a bottom (or anti-bottom) quark is increased by a factor 100, but the weight of those events is reduced by a factor 100. Requires that the 'event_norm' is set to 'bias'.") #merging self.add_param('ickkw', 0, allowed=[-1,0,3,4], comment=" - 0: No merging\n - 3: FxFx Merging : http://amcatnlo.cern.ch/FxFx_merging.htm\n - 4: UNLOPS merging (No interface within MG5aMC)\n - -1: NNLL+NLO jet-veto computation. See arxiv:1412.8408 [hep-ph]") @@ -5790,6 +5838,17 @@ def check_validity(self): if self['mcatnlo_delta'] and not self['parton_shower'].lower() == 'pythia8': raise InvalidRunCard("MC@NLO-DELTA only possible with matching to Pythia8") + # check that the flavour_bias is consistent + if len(self['flavour_bias']) != 2: + raise InvalidRunCard("'flavour_bias' should contain exactly two numbers: the abs(PDG) of the flavour to enhance, and the enhancement multiplication factor.") + for i in self['flavour_bias']: + if i < 0: + raise InvalidRunCard("flavour and multiplication factor should be positive in the flavour_bias parameter") + if self['flavour_bias'][1] != 1 and self['event_norm'] != 'bias': + logger.warning('Non-trivial flavour enhancement factor: setting event normalisation to "bias"') + self['event_norm']='bias' + + # check that ebeam is bigger than the proton mass. for i in [1,2]: # do not for proton mass if not proton PDF (or when scan initialization) diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/check_param_card.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/check_param_card.py index bc785b5de6..a34705f6bc 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/check_param_card.py @@ -1092,11 +1092,11 @@ def write_summary(self, path, order=None, lastline=False, nbcol=20): to_print = self.cross[-1:] for info in to_print: name = info['run_name'] - bench = info['bench'] + bench = [float(x) for x in info['bench']] data = [] for k in keys: if k in info: - data.append(info[k]) + data.append(float(info[k])) else: data.append(0.) 
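The 'flavour_bias' additions above (the add_param plus the check_validity block) introduce a two-element run-card parameter: the abs(PDG) code of the flavour to enhance and the enhancement factor, with the event normalisation forced to 'bias' whenever the factor is non-trivial. A minimal standalone sketch of that validation; the helper name check_flavour_bias is illustrative and a plain ValueError stands in for InvalidRunCard:

    def check_flavour_bias(flavour_bias, event_norm):
        """Mirror of the run_card check added above (sketch only)."""
        if len(flavour_bias) != 2:
            raise ValueError("'flavour_bias' should contain exactly two numbers: "
                             "the abs(PDG) of the flavour and the enhancement factor")
        if any(i < 0 for i in flavour_bias):
            raise ValueError("flavour and multiplication factor should be positive")
        if flavour_bias[1] != 1 and event_norm != 'bias':
            # a non-trivial enhancement rescales event weights, so the
            # normalisation must be 'bias' (the real code also warns)
            event_norm = 'bias'
        return event_norm

    assert check_flavour_bias([5, 100], 'average') == 'bias'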
ff.write(formatting % tuple([name] + bench + data)) diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/cluster.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/cluster.py index 95ef45b5f3..66069293d2 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/cluster.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/cluster.py @@ -602,6 +602,7 @@ def __init__(self, *args, **opt): self.submitted = six.moves.queue.Queue() # one entry by job submitted self.stoprequest = threading.Event() #flag to ensure everything to close self.demons = [] + self.gpus_list = [] self.nb_done =0 if 'nb_core' in opt: self.nb_core = opt['nb_core'] @@ -623,23 +624,46 @@ def __init__(self, *args, **opt): self.done_pid_queue = six.moves.queue.Queue() self.fail_msg = None + mg5_gpu_env_str = 'MG5_GPU_VISIBLE_DEVICES' + gpu_variables = [['NVIDIA_VISIBLE_DEVICES', 'CUDA_VISIBLE_DEVICES'], + ['ROCR_VISIBLE_DEVICES', 'HIP_VISIBLE_DEVICES'],] + if mg5_gpu_env_str in os.environ: + new_var = os.environ[mg5_gpu_env_str].split(',') + if len(new_var) == 2: + gpu_variables.insert(0, new_var) + else: + logger.error('Invalid format for %s=%s, it should be a comma-separated list of two elements' % (mg5_gpu_env_str, os.environ[mg5_gpu_env_str])) + for get_var,set_var in gpu_variables: + if get_var in os.environ: + self.gpus_list = os.environ.get(get_var).split(',') + self.gpu_set_var = set_var + self.gpus_count = len(self.gpus_list) + logger.info('Found %s GPUs: %s' % (self.gpus_count, self.gpus_list)) def start_demon(self): import threading - t = threading.Thread(target=self.worker) + env2 = None + if len(self.gpus_list): + env2 = os.environ.copy() + this_gpu_idx = len(self.demons) % self.gpus_count + env2[self.gpu_set_var] = self.gpus_list[this_gpu_idx] + t = threading.Thread(target=self.worker, kwargs={'env2': env2}) + else: + t = threading.Thread(target=self.worker) t.daemon = True t.start() self.demons.append(t) - def worker(self): + def worker(self, env2=None): import six.moves.queue import six.moves._thread while not self.stoprequest.isSet(): try: args = self.queue.get(timeout=10) tag, exe, arg, opt = args + opt['env'] = env2 try: # check for executable case if isinstance(exe,str): diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/common_run_interface.py index 9ff7390cf5..69291df0d4 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/common_run_interface.py @@ -750,8 +750,8 @@ def __init__(self, me_dir, options, *args, **opts): else: self.ninitial = self.proc_characteristics['ninitial'] - def make_make_all_html_results(self, folder_names = [], jobs=[]): - return sum_html.make_all_html_results(self, folder_names, jobs) + def make_make_all_html_results(self, folder_names = [], jobs=[], get_attr=None): + return sum_html.make_all_html_results(self, folder_names, jobs, get_attr) def write_RunWeb(self, me_dir): @@ -1463,11 +1463,15 @@ def create_plot(self, mode='parton', event_path=None, output=None, tag=None): self.run_name, '%s_pts.dat' % tag) for observable_name, data_path in [('djr',djr_path), ('pt',pt_path)]: - if not self.generate_Pythia8_HwU_plots( + try: + if not self.generate_Pythia8_HwU_plots( PY8_plots_root_path, merging_scale_name, observable_name,data_path): - return False - + return False + except Exception as error: + if os.path.exists(data_path): + logger.info('plot information present in %s' % data_path) + return True if mode == 'Pythia8': plot_files = 
glob.glob(pjoin(PY8_plots_root_path,'*.gnuplot')) if not misc.which('gnuplot'): @@ -1964,12 +1968,16 @@ def do_systematics(self, line): self.cluster.wait(os.path.dirname(output), update_status, update_first=update_status) except Exception: self.cluster.remove() + for i in range(nb_submit): + os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) old_run_mode = self.options['run_mode'] self.options['run_mode'] =0 + out =False try: out = self.do_systematics(line) finally: self.options['run_mode'] = old_run_mode + return out #collect the data all_cross = [] for i in range(nb_submit): @@ -1995,18 +2003,21 @@ def do_systematics(self, line): self.run_card['event_norm'] in ['unity']: all_cross= [cross/nb_event for cross in all_cross] - sys_obj = systematics.call_systematics([input, None] + opts, - log=lambda x: logger.info(str(x)), - result=result_file, - running=False - ) + + sys_obj = systematics.call_systematics([input, None] + opts, + log=lambda x: logger.info(str(x)), + result=result_file, + running=False + ) + sys_obj.print_cross_sections(all_cross, nb_event, result_file) - + #concatenate the output file subprocess.call(['cat']+\ ['./tmp_%s_%s' % (i, os.path.basename(output)) for i in range(nb_submit)], stdout=open(output,'w'), cwd=os.path.dirname(output)) + for i in range(nb_submit): os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) # os.remove('%s/log_sys_%s.txt' % (os.path.dirname(output),i)) @@ -3831,7 +3842,7 @@ def store_scan_result(self): """return the information that need to be kept for the scan summary. Auto-width are automatically added.""" - return {'cross': self.results.current['cross']} + return {'cross': self.results.current['cross'], 'error': self.results.current['error']} def add_error_log_in_html(self, errortype=None): @@ -5135,10 +5146,10 @@ def init_run(self, cards): self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), - 'lhc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), - 'lcc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), @@ -6740,7 +6751,15 @@ def postcmd(self, stop, line): return ending_question - + def help_update(self): + logger.info(""" syntax: update dependent: Change the mass/width of particles which are not free parameter for the model. + update missing: add to the current param_card missing blocks/parameters. + update to_slha1: pass SLHA2 card to SLHA1 convention. (beta) + update to_slha2: pass SLHA1 card to SLHA2 convention. 
(beta) + update to_full [run_card] + update XXX [where XXX correspond to a hidden block of the run_card]: + supported block are %s + """, ', '.join(self.update_block)) def do_update(self, line, timer=0): @@ -6756,6 +6775,8 @@ def do_update(self, line, timer=0): logger.warning('miss an argument (dependent or missing). Please retry') return + args[0] = args[0].lower() + if args[0] == 'dependent': if not self.mother_interface: logger.warning('Failed to update dependent parameter. This might create trouble for external program (like MadSpin/shower/...)') @@ -6805,10 +6826,11 @@ def do_update(self, line, timer=0): self.modified_card.add('run') # delay writting of the run_card logger.info('add optional block %s to the run_card', args[0]) else: - self.help_update() + self.do_help('update') logger.warning('unvalid options for update command. Please retry') + def update_to_full(self, line): """ trigger via update to_full LINE""" diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/extended_cmd.py index 789976beee..c321fd88e5 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/extended_cmd.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/extended_cmd.py @@ -1317,6 +1317,8 @@ def nice_error_handling(self, error, line): debug_file = open(self.debug_output, 'a') traceback.print_exc(file=debug_file) + if __debug__: + traceback.print_exc() if hasattr(error, 'filename'): debug_file.write("Related File: %s\n" % error.filename) # Create a nice error output @@ -1928,7 +1930,8 @@ def do_display(self, line, output=sys.stdout): for i, name in enumerate(split): try: __import__('.'.join(split[:i+1])) - exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1]))) + tmp = {} + exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1])), globals(),tmp) except ImportError: try: var = eval(args[1]) @@ -1939,7 +1942,7 @@ def do_display(self, line, output=sys.stdout): outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) else: - var = eval(args[1]) + var = eval(args[1], globals(), tmp) outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/file_writers.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/file_writers.py index 526756129f..74ba0d195c 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/file_writers.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/file_writers.py @@ -140,10 +140,6 @@ def preprocess_template(self, input_lines, context={}): else: raise self.FileWriterError("%s not string" % repr(input_lines)) - # Setup the contextual environment - for contextual_variable, value in context.items(): - exec('%s=%s'%(str(contextual_variable),repr(value))) - res = [] # The variable below tracks the conditional statements structure if_stack = [] @@ -166,7 +162,7 @@ def preprocess_template(self, input_lines, context={}): # Treat an if statement elif preproc_command.group('command')=='if': try: - if_stack.append(eval(preproc_command.group('body'))==True) + if_stack.append(eval(preproc_command.group('body'), globals(), context)==True) except Exception as e: raise self.FilePreProcessingError('Could not evaluate'+\ "python expression '%s' given the context %s provided."%\ diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/files.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/files.py index 551b71ddb6..3061b007e7 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/files.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/files.py @@ -147,9 +147,14 @@ def cp(path1, 
path2, log=True, error=False): path2 = format_path(path2) try: shutil.copy(path1, path2) + except shutil.Error as why: + logger.debug('no cp since identical: %s', why) + return except IOError as why: import madgraph.various.misc as misc try: + if 'same file' in str(why): + return if os.path.exists(path2): path2 = os.path.join(path2, os.path.split(path1)[1]) misc.copytree(path1, path2) @@ -157,12 +162,10 @@ def cp(path1, path2, log=True, error=False): if error: raise if log: - logger.warning(why) + logger.warning("fail to cp %s %s: %s", path1, path2, why) else: - misc.sprint("fail to cp", why) - except shutil.Error: - # idetical file - pass + misc.sprint("fail to cp", path1, path2, why) + def rm(path, log=True): """removes path, that can be a single element or a list""" diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_cardhtml-pl b/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_cardhtml-pl index 1810c6c082..6e0e06533d 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_cardhtml-pl +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_cardhtml-pl @@ -137,7 +137,7 @@ until($listpos>$#incard){ print PAGE " Model: $model \n"; print PAGE " \n \n
\n"; print PAGE " \n"; - print PAGE "\"\" \n"; + print PAGE "\"\" \n"; print PAGE "
\n"; print PAGE " \n \n \n"; print PAGE " \n"; diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_crossxhtml.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_crossxhtml.py index 681bf9d09b..3114a4350c 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_crossxhtml.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_crossxhtml.py @@ -133,7 +133,7 @@ class AllResults(dict): web = False - _run_entries = ['cross', 'error','nb_event_pythia','run_mode','run_statistics', + _run_entries = ['cross', 'error','axsec','nb_event_pythia','run_mode','run_statistics', 'nb_event','cross_pythia','error_pythia', 'nb_event_pythia8','cross_pythia8','error_pythia8', 'shower_dir'] diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_jpeg-pl b/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_jpeg-pl index 87d03da394..31b7e9fe55 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_jpeg-pl +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_jpeg-pl @@ -1,16 +1,16 @@ #!/usr/bin/perl -w #--------------------------------------------------------------------- -# Run GS to create jpeg files defined as $gs +# Run GS to create PNG files defined as $gs #--------------------------------------------------------------------- -system("/bin/bash -c \"rm -f matrix*.jpg\" "); +system("/bin/bash -c \"rm -f matrix*.png\" "); $imatrix = ""; if (! -e "matrix.ps") {$imatrix = 1;} -$max_jpg = 2; -if ($imatrix eq "") {$max_jpg = 5;} -# add 1 to max_jpg, to get max_jpg pages -$max_jpg += 1; +$max_png = 2; +if ($imatrix eq "") {$max_png = 5;} +# add 1 to max_png, to get max_png pages +$max_png += 1; open(PAGE,"> diagrams.html") || die "Error creating diagrams.html"; print PAGE "\ \n"; print PAGE "\ \n"; @@ -21,22 +21,22 @@ while ( -e "matrix$imatrix.ps"){ open(IN, "< matrix$imatrix.ps") || die "No file matrix$imatrix.ps"; open(OUT, "> matrix-1.ps") || die "Could not open file matrix-1.ps"; while () { - if ($_ =~ m/^%%Page: $max_jpg $max_jpg/) {last;} + if ($_ =~ m/^%%Page: $max_png $max_png/) {last;} else {print OUT $_, "\n";} } close(OUT); close(IN); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=matrix$imatrix\%00d.jpg \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-r150 \-sOutputFile\=matrix$imatrix\%00d.png \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; system "rm -f matrix-1.ps"; -# Determine how many jpg files we have +# Determine how many png files we have $pages=1; - while(-e "matrix$imatrix$pages.jpg"){ + while(-e "matrix$imatrix$pages.png"){ $pages++; }#end of while #reduce it by one - if ($pages > $max_jpg){ + if ($pages > $max_png){ $pages -= 1; } # Find name of process @@ -45,24 +45,24 @@ while ( -e "matrix$imatrix.ps"){ if ($proc =~ /Process: (.+?)(\s\w+=\d+)*$/) { $proc = $1; } print PAGE "

\ Postscript Diagrams for $proc\<\/A\> \ \n"; for($j=1;$j<$pages;$j++){ - print PAGE "\\"Page \ \n"; + print PAGE "\\"Page \ \n"; }#end of for # -# In case I didn't include all of the diagrams as jpeg, warn user +# In case I didn't include all of the diagrams as PNG, warn user # - if (-e "matrix$imatrix$max_jpg.jpg" ) { - print PAGE "

To save bandwidth not all diagrams were converted to jpeg."; + if (-e "matrix$imatrix$max_png.png" ) { + print PAGE "

To save bandwidth not all diagrams were converted to PNG."; print PAGE "

To view all diagrams click on "; print PAGE "\ postscript. \<\/A\> \ \n"; # # Delete files which aren't included in diagrams.html # - system ("/bin/bash -c \"rm -f matrix$max_jpg.jpg\" "); + system ("/bin/bash -c \"rm -f matrix$max_png.png\" "); } # -# Now create jpeg file for card +# Now create PNG file for card # - if (! -e "../../HTML/card.jpg") { + if (! -e "../../HTML/card.png") { system ("/bin/bash -c \"head -352 matrix$imatrix.ps >& junk.ps\" "); open(JUNK,">> junk.ps") || die "Error opening junk.ps"; @@ -72,7 +72,7 @@ while ( -e "matrix$imatrix.ps"){ system ("/bin/bash -c \"cat matrix$imatrix.ps | sed 1,352d >> junk.ps\" "); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=card.jpg \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.jpg ../../HTML/card.jpg > /dev/null\" "; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-sOutputFile\=card.png \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.png ../../HTML/card.png > /dev/null\" "; } if ($imatrix eq "") {$imatrix = 0;} $imatrix = $imatrix + 1; @@ -82,3 +82,4 @@ print PAGE "\n"; print PAGE "\<\/BODY\> \n"; print PAGE "\<\/HTML\> \n"; close(PAGE); + diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_ximprove.py index 415ecc9de0..d5d7fc8faf 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/gen_ximprove.py @@ -30,6 +30,7 @@ import stat import sys import six +import time from six.moves import range from six.moves import zip @@ -304,6 +305,7 @@ def get_helicity(self, to_submit=True, clean=True): logger.debug('(%s) nb_hel: %s zero amp: %s bad_amps_hel: %s/%s', split_file[-1], len(good_hels),len(bad_amps),len(bad_amps_perhel), len(good_hels)*nb_amp ) if len(good_hels) == 1: files.cp(matrix_file, matrix_file.replace('orig','optim')) + files.cp(matrix_file.replace('.f','.o'), matrix_file.replace('orig','optim').replace('.f','.o')) continue # avoid optimization if onlye one helicity gauge = self.cmd.proc_characteristics['gauge'] @@ -1059,6 +1061,7 @@ def __init__(self, cmd, opt=None): # parameter for the gridpack run self.nreq = 2000 self.iseed = 4321 + self.maxevts = 2500 # placeholder for information self.results = 0 #updated in launch/update_html @@ -1200,6 +1203,10 @@ def reset_multijob(self): def write_multijob(self, Channel, nb_split): """ """ if nb_split <=1: + try: + os.remove(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat')) + except OSError: + pass return f = open(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat'), 'w') f.write('%i\n' % nb_split) @@ -1828,17 +1835,17 @@ class gen_ximprove_gridpack(gen_ximprove_v4): max_request_event = 1e12 # split jobs if a channel if it needs more than that max_event_in_iter = 4000 min_event_in_iter = 500 - combining_job = sys.maxsize gen_events_security = 1.00 - def __new__(cls, *args, **opts): + def __new__(cls, cmd, opts): cls.force_class = 'gridpack' - return super(gen_ximprove_gridpack, cls).__new__(cls, *args, **opts) + return super(gen_ximprove_gridpack, cls).__new__(cls, cmd, opts) - def __init__(self, *args, **opts): + def __init__(self, cmd, opts): self.ngran = -1 + self.nprocs = 1 self.gscalefact = {} self.readonly = False if 'ngran' in opts: @@ -1846,9 +1853,18 @@ def __init__(self, *args, **opts): # del opts['ngran'] if 'readonly' in opts: self.readonly = opts['readonly'] - super(gen_ximprove_gridpack,self).__init__(*args, **opts) + if 'nprocs' in opts: + 
self.nprocs = int(opts['nprocs']) + if 'maxevts' in opts and self.nprocs > 1: + self.max_request_event = int(opts['maxevts']) + super(gen_ximprove_gridpack,self).__init__(cmd, opts) if self.ngran == -1: self.ngran = 1 + + if self.nprocs > 1: + self.combining_job = 0 + else: + self.combining_job = sys.maxsize def find_job_for_event(self): """return the list of channel that need to be improved""" @@ -1876,8 +1892,8 @@ def find_job_for_event(self): continue # no event to generate events self.gscalefact[tag] = max(1, 1/(goal_lum * C.get('axsec')/ self.ngran)) #need to generate events - logger.debug('request events for ', C.get('name'), 'cross=', - C.get('axsec'), 'needed events = ', goal_lum * C.get('axsec')) + logger.debug('request events for %s cross=%d needed events = %d', + C.get('name'), C.get('axsec'), goal_lum * C.get('axsec')) to_refine.append(C) logger.info('need to improve %s channels' % len(to_refine)) @@ -1897,8 +1913,13 @@ def get_job_for_event(self): for C in to_refine: #1. Compute the number of points are needed to reach target needed_event = max(goal_lum*C.get('axsec'), self.ngran) - nb_split = 1 - + nb_split = int(max(1,((needed_event-1)// self.max_request_event) +1)) + if not self.split_channels: + nb_split = 1 + if nb_split > self.max_splitting: + nb_split = self.max_splitting + nb_split=max(1, nb_split) + #2. estimate how many points we need in each iteration if C.get('nunwgt') > 0: nevents = needed_event / nb_split * (C.get('nevents') / C.get('nunwgt')) @@ -1908,13 +1929,16 @@ def get_job_for_event(self): nevents = self.max_event_in_iter if nevents < self.min_event_in_iter: + nb_split = int(nb_split * nevents / self.min_event_in_iter) + 1 # sr dangerous? nevents = self.min_event_in_iter # # forbid too low/too large value nevents = max(self.min_event_in_iter, min(self.max_event_in_iter, nevents)) logger.debug("%s : need %s event. Need %s split job of %s points", C.name, needed_event, nb_split, nevents) - + # write the multi-job information + self.write_multijob(C, nb_split) + #create the info dict assume no splitting for the default info = {'name': self.cmd.results.current['run_name'], 'script_name': 'unknown', @@ -1925,7 +1949,7 @@ def get_job_for_event(self): 'nevents': nevents, #int(nevents*self.gen_events_security)+1, 'maxiter': self.max_iter, 'miniter': self.min_iter, - 'precision': -1*int(needed_event)/C.get('axsec'), + 'precision': -goal_lum/nb_split, # -1*int(needed_event)/C.get('axsec'), 'requested_event': needed_event, 'nhel': self.run_card['nhel'], 'channel': C.name.replace('G',''), @@ -1938,27 +1962,59 @@ def get_job_for_event(self): basedir = pjoin(os.path.dirname(__file__), '..','..','SubProcesses', info['P_dir'], info['directory']) info['base_directory'] = basedir - jobs.append(info) - + if nb_split == 1: + jobs.append(info) + else: + for i in range(nb_split): + new_info = dict(info) + new_info['offset'] = i+1 + new_info['directory'] += self.alphabet[i % 26] + str((i+1)//26) + new_info['base_directory'] = info['directory'] + jobs.append(new_info) write_dir = '.' 
if self.readonly else None self.create_ajob(pjoin(self.me_dir, 'SubProcesses', 'refine.sh'), jobs, write_dir) + if self.nprocs > 1: + nprocs_cluster = cluster.MultiCore(nb_core=self.nprocs) + gridpack_start = time.time() + def gridpack_wait_monitoring(Idle, Running, Done): + if Idle+Running+Done == 0: + return + logger.info("Gridpack event generation: %s Idle, %s Running, %s Done [%s]" + % (Idle, Running, Done, misc.format_time(time.time()-gridpack_start))) + done = [] for j in jobs: - if j['P_dir'] in done: - continue - done.append(j['P_dir']) + if self.nprocs == 1: + if j['P_dir'] in done: + continue + done.append(j['P_dir']) + # Give a little status. Sometimes these jobs run very long, and having hours without any + # console output can be a bit frightening and make users think we are looping. + if len(done)%5==0: + logger.info(f"Working on job {len(done)} of {len(jobs)}") + # set the working directory path. pwd = pjoin(os.getcwd(),j['P_dir']) if self.readonly else pjoin(self.me_dir, 'SubProcesses', j['P_dir']) - exe = pjoin(pwd, 'ajob1') + exe = pjoin(pwd, j['script_name']) st = os.stat(exe) os.chmod(exe, st.st_mode | stat.S_IEXEC) # run the code\ - cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + if self.nprocs == 1: + cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + else: + nprocs_cluster.cluster_submit(exe, cwd=pwd, packet_member=j['packet']) write_dir = '.' if self.readonly else pjoin(self.me_dir, 'SubProcesses') + if self.nprocs > 1: + nprocs_cluster.wait(self.me_dir, gridpack_wait_monitoring) + + if self.readonly: + combine_runs.CombineRuns(write_dir) + else: + combine_runs.CombineRuns(self.me_dir) self.check_events(goal_lum, to_refine, jobs, write_dir) def check_events(self, goal_lum, to_refine, jobs, Sdir): diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/hel_recycle.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/hel_recycle.py index 1471de4bcb..978ba6575e 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/hel_recycle.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/hel_recycle.py @@ -550,7 +550,7 @@ def get_jamp_lines(self, line): def get_amp2_lines(self, line): if line.startswith(' DO I = 1, NCOLOR'): self.in_amp2 = False - elif not line.isspace(): + elif not line.isspace() and 'DENOM' not in line: self.template_dict['amp2_lines'] += f'{line[0:6]} {self.add_indices(line[6:])}' def prepare_bools(self): diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/histograms.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/histograms.py index 51ae2914fc..6fbdd98100 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/histograms.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/histograms.py @@ -1149,11 +1149,8 @@ def parse_one_histo_from_stream(self, stream, all_weight_header, boundaries = [0.0,0.0] for j, weight in \ enumerate(HwU.histo_bin_weight_re.finditer(line_bin)): - if (j == len(weight_header)): - continue - if j == len(all_weight_header): - raise HwU.ParseError("There is more bin weights"+\ - " specified than expected (%i)"%len(weight_header)) + #if (j == len(weight_header)): + # continue if selected_central_weight == all_weight_header[j]: bin_weights['central'] = float(weight.group('weight')) if all_weight_header[j] == 'boundary_xmin': @@ -1858,6 +1855,8 @@ def parse_histos_from_PY8_XML_stream(self, stream, run_id=None, # If merging cut is negative, then pick only the one of the central scale # If not specified, then take them all but use the PDF and scale weight # of the central merging_scale for the variation. 
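The get_job_for_event hunk above now splits a gridpack channel into several jobs whenever the requested number of events exceeds max_request_event (set from --maxevts when nprocs > 1), then suffixes the split directories with letters. The splitting itself is a ceiling division; a worked sketch with illustrative numbers (the max_splitting cap is defined elsewhere in gen_ximprove, so its value here is an assumption):

    def n_split(needed_event, max_request_event, split_channels=True, max_splitting=130):
        """Ceiling division as in the hunk above: one job per block of
        max_request_event events, capped at max_splitting (sketch only)."""
        if not split_channels:
            return 1
        nb_split = int(max(1, (needed_event - 1) // max_request_event + 1))
        return max(1, min(nb_split, max_splitting))

    assert n_split(12000, 2500) == 5   # 12000 requested events in blocks of 2500
    assert n_split(100, 2500) == 1     # small channels keep a single job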
+ if not all_weights: + raise MadGraph5Error('No weights were found in the HwU XML source.') if merging_scale is None or merging_scale < 0.0: merging_scale_chosen = all_weights[2]['MERGING'] else: @@ -2480,14 +2479,14 @@ def get_main_central_plot_lines(HwU_name, block_position, color_index, # return [template_no_stat%rep_dic]+\ # ([template%rep_dic] if show_mc_uncertainties else []) - # The use of sqrt(-1) is just a trick to prevent the line to display + # The use of 1/0 is just a trick to prevent the line to display res = [] - rep_dic['data'] = '($3 < 0 ? sqrt(-1) : $3)' + rep_dic['data'] = '($3 < 0 ? 1/0 : $3)' res.append(template_no_stat%rep_dic) rep_dic['title'] = " title ''" if show_mc_uncertainties: res.append(template%rep_dic) - rep_dic['data'] = '($3 >= 0 ? sqrt(-1) : abs($3))' + rep_dic['data'] = '($3 >= 0 ? 1/0 : abs($3))' rep_dic['ls'] = ' ls %d'%(100+color_index) res.append(template_no_stat%rep_dic) if show_mc_uncertainties: @@ -2739,13 +2738,13 @@ def ratio_no_correlations(wgtsA, wgtsB): """#-- rendering subhistograms '%(subhistogram_type)s' %(unset label)s %(set_format_y)s +%(set_yscale)s set yrange [%(ymin).4e:%(ymax).4e] set origin %(origin_x).4e, %(origin_y).4e set size %(size_x).4e, %(size_y).4e set mytics %(mytics)d %(set_ytics)s %(set_format_x)s -%(set_yscale)s %(set_ylabel)s %(set_histo_label)s plot \\""" @@ -2878,7 +2877,7 @@ def ratio_no_correlations(wgtsA, wgtsB): # We decide to show uncertainties in the main plot only if they # are part of a monocolor band. Otherwise, they will only be - # shown in the first subplot. Notice that plotting 'sqrt(-1)' + # shown in the first subplot. Notice that plotting '1/0' # is just a trick so as to have only the key printed with no # line @@ -2890,7 +2889,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, scale variation'%title, band='scale' in use_band) else: uncertainty_plot_lines[-1]['scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] # And now PDF_variation if available if not PDF_var_pos is None and len(PDF_var_pos)>0: if 'pdf' in use_band: @@ -2899,7 +2898,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, PDF variation'%title, band='pdf' in use_band) else: uncertainty_plot_lines[-1]['pdf'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] # And now merging variation if available if not merging_var_pos is None and len(merging_var_pos)>0: if 'merging_scale' in use_band: @@ -2908,7 +2907,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, merging scale variation'%title, band='merging_scale' in use_band) else: uncertainty_plot_lines[-1]['merging_scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] # And now alpsfact variation if available if not alpsfact_var_pos is None and len(alpsfact_var_pos)>0: if 'alpsfact' in use_band: @@ -2917,7 +2916,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, alpsfact variation'%title, band='alpsfact' in use_band) else: uncertainty_plot_lines[-1]['alpsfact'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] # plot_lines.append( # "'%s' index %d using (($1+$2)/2):3 ls %d title '%s'"\ diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py 
b/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py index 0924927785..262d39a736 100644 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Aug 2023) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. import logging import os @@ -33,7 +33,7 @@ def compile(self, *args, **opts): if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') common_run_interface.CommonRunCmd.update_make_opts_full(path, - {'FPTYPE': self.run_card['floating_type'] }) + {'override FPTYPE': self.run_card['floating_type'] }) misc.sprint('FPTYPE checked') cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): @@ -76,7 +76,7 @@ def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + common_run_interface.CommonRunCmd.update_make_opts_full({'override FPTYPE': new_value}) else: raise Exception Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') @@ -133,7 +133,8 @@ def default_setup(self): super().default_setup() # change default value: self['cudacpp_backend'] = 'cuda' - self['vector_size'] = 16384 # already setup in default class (just change value) + self['vector_size'] = 32 # ZW: default to 32, might want to change to 64 to utilise AMD GPUs better as well # 16384 # already setup in default class (just change value) + self['nb_warp'] = 512 # number of warps per kernel call, for now setting to 16 384 / vector_size MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/lhe_parser.py index f6e47956cd..81d17f7cb1 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/lhe_parser.py @@ -1035,12 +1035,12 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): from_init = True if not from_init: - if group in grouped_cross: - grouped_cross[group] += self.allcross[i] - grouped_error[group] += self.error[i]**2 + if int(group) in grouped_cross: + grouped_cross[int(group)] += self.allcross[i] + grouped_error[int(group)] += self.error[i]**2 else: - grouped_cross[group] = self.allcross[i] - grouped_error[group] = self.error[i]**2 + grouped_cross[int(group)] = self.allcross[i] + grouped_error[int(group)] = self.error[i]**2 else: ban = banner_mod.Banner(ff.banner) for line in ban['init'].split('\n'): @@ -1048,11 +1048,11 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): if len(splitline)==4: cross, error, _, group = splitline if int(group) in grouped_cross: - grouped_cross[group] += float(cross) - grouped_error[group] += float(error)**2 + grouped_cross[int(group)] += float(cross) + grouped_error[int(group)] += float(error)**2 else: - grouped_cross[group] = float(cross) - grouped_error[group] = float(error)**2 + 
grouped_cross[int(group)] = float(cross) + grouped_error[int(group)] = float(error)**2 nb_group = len(grouped_cross) # compute the information for the first line @@ -1086,6 +1086,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): self.seek(0) if init_information["idbmup2"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup2"] = event[1].pdg self.seek(0) @@ -2166,10 +2168,13 @@ def check(self): abspz += abs(particle.pz) # check mass fourmass = FourMomentum(particle).mass - - if particle.mass and (abs(particle.mass) - fourmass)/ abs(particle.mass) > threshold: - raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) - + if particle.mass: + expected = (particle.E - math.sqrt(particle.E**2 -particle.mass**2))/particle.E + if expected > 1e-8: + mass_threshold = particle.E**2 - (particle.E-threshold)**2 + if (abs(particle.mass) - fourmass)/ mass_threshold > 5: + raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) + if E/absE > threshold: logger.critical(self) @@ -2953,8 +2958,8 @@ def pt(self): @property def pseudorapidity(self): - norm = math.sqrt(self.px**2 + self.py**2+self.pz**2) - return 0.5* math.log((norm - self.pz) / (norm + self.pz)) + norm = math.sqrt(self.px**2 + self.py**2 + self.pz**2) + return 0.5* math.log((norm + self.pz) / (norm - self.pz)) @property def rapidity(self): diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/madevent_interface.py index 85e5bcf5e3..4d5597c722 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/madevent_interface.py @@ -1171,10 +1171,10 @@ def check_survey(self, args, cmd='survey'): for opt,value in self._survey_options.items(): if arg.startswith('--%s=' % opt): exec('self.opts[\'%s\'] = %s(arg.split(\'=\')[-1])' % \ - (opt, value[0])) + (opt, value[0]), globals(), {'self':self, 'arg':arg}) arg = "" if arg != "": raise Exception - except Exception: + except Exception as error: self.help_survey() raise self.InvalidCmd('invalid %s argument'% arg) @@ -2827,10 +2827,10 @@ def print_results_in_shell(self, data): logger.info(" Nb of events after matching/merging : %d" % int(data['nb_event_pythia'])) if self.run_card['use_syst'] in self.true and \ (int(self.run_card['ickkw'])==1 or self.run_card['ktdurham']>0.0 - or self.run_card['ptlund']>0.0): + or self.run_card['ptlund']>0.0) and data['cross_pythia'] == -1: logger.info(" Notice that because Systematics computation is turned on, the merging did not veto events but modified their weights instead.\n"+\ " The resulting hepmc/stdhep file should therefore be use with those weights.") - else: + elif data['cross_pythia'] == -1: logger.info(" Nb of events after merging : %s" % data['nb_event_pythia']) logger.info(" " ) @@ -3055,6 +3055,7 @@ def do_multi_run(self, line): crossoversig = 0 inv_sq_err = 0 nb_event = 0 + madspin = False for i in range(nb_run): self.nb_refine = 0 self.exec_cmd('generate_events %s_%s -f' % (main_name, i), postcmd=False) @@ -3067,6 +3068,8 @@ def do_multi_run(self, line): inv_sq_err+=1.0/error**2 self.results[main_name][-1]['cross'] = crossoversig/inv_sq_err self.results[main_name][-1]['error'] = math.sqrt(1.0/inv_sq_err) + if 'decayed' in self.run_name: + madspin = True 
self.results.def_current(main_name) self.run_name = main_name self.update_status("Merging LHE files", level='parton') @@ -3074,9 +3077,12 @@ def do_multi_run(self, line): os.mkdir(pjoin(self.me_dir,'Events', self.run_name)) except Exception: pass - os.system('%(bin)s/merge.pl %(event)s/%(name)s_*/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' + + os.system('%(bin)s/merge.pl %(event)s/%(name)s_*%(madspin)s/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' % {'bin': self.dirbin, 'event': pjoin(self.me_dir,'Events'), - 'name': self.run_name}) + 'name': self.run_name, + 'madspin': '_decayed_*' if madspin else '' + }) eradir = self.options['exrootanalysis_path'] if eradir and misc.is_executable(pjoin(eradir,'ExRootLHEFConverter')): @@ -3656,9 +3662,11 @@ def do_refine(self, line): else: self.refine_mode = "new" - cross, error = self.make_make_all_html_results() + cross, error, across = self.make_make_all_html_results(get_attr=('xsec','xerru','axsec')) + self.results.add_detail('cross', cross) self.results.add_detail('error', error) + self.results.add_detail('axsec', across) self.results.add_detail('run_statistics', dict(self.results.get_detail('run_statistics'))) @@ -3757,6 +3765,8 @@ def split(a, n): k, m = divmod(len(a), n) return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + Gdirs = self.remove_empty_events(Gdirs) + partials_info = [] if len(Gdirs) >= max_G: start_unweight= time.perf_counter() @@ -3786,7 +3796,7 @@ def split(a, n): for i, local_G in enumerate(split(Gdirs, nb_chunk)): line = [pjoin(self.me_dir, "Events", self.run_name, "partials%d.lhe.gz" % i)] line.append(pjoin(self.me_dir, 'Events', self.run_name, '%s_%s_banner.txt' % (self.run_name, tag))) - line.append(str(self.results.current['cross'])) + line.append(str(self.results.current.get('axsec'))) line += local_G partials_info.append(self.do_combine_events_partial(' '.join(line), preprocess_only=True)) mycluster.submit(sys.executable, @@ -4223,7 +4233,7 @@ def mg5amc_py8_interface_consistency_warning(options): return None - def setup_Pythia8RunAndCard(self, PY8_Card, run_type): + def setup_Pythia8RunAndCard(self, PY8_Card, run_type, use_mg5amc_py8_interface): """ Setup the Pythia8 Run environment and card. In particular all the process and run specific parameters of the card are automatically set here. This function returns the path where HEPMC events will be output, if any.""" @@ -4338,10 +4348,10 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.systemSet('Beams:setProductionScalesFromLHEF',True) # Automatically set qWeed to xqcut if not defined by the user. - if PY8_Card['SysCalc:qWeed']==-1.0: + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qWeed']==-1.0: PY8_Card.MadGraphSet('SysCalc:qWeed',self.run_card['xqcut'], force=True) - if PY8_Card['SysCalc:qCutList']=='auto': + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qCutList']=='auto': if self.run_card['use_syst']: if self.run_card['sys_matchscale']=='auto': qcut = PY8_Card['JetMatching:qCut'] @@ -4368,7 +4378,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): # Specific MLM settings # PY8 should not implement the MLM veto since the driver should do it # if merging scale variation is turned on - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. 
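do_multi_run, patched above to also pick up MadSpin-decayed runs when merging, combines the per-run results with inverse-variance weights: cross = sum(x_i/sigma_i^2) / sum(1/sigma_i^2) and error = sqrt(1 / sum(1/sigma_i^2)). A small numerical sketch of that combination:

    import math

    def combine(results):
        """results: list of (cross, error) pairs, combined as in do_multi_run."""
        crossoversig = sum(x / e**2 for x, e in results)
        inv_sq_err = sum(1.0 / e**2 for x, e in results)
        return crossoversig / inv_sq_err, math.sqrt(1.0 / inv_sq_err)

    cross, error = combine([(10.0, 1.0), (12.0, 2.0)])
    # the more precise run dominates, and the combined error shrinks
    assert abs(cross - 10.4) < 1e-12 and error < 1.0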
PY8_Card.MadGraphSet('JetMatching:doVeto',False) @@ -4444,7 +4454,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.MadGraphSet('SpaceShower:pTmaxMatch',1) PY8_Card.MadGraphSet('SpaceShower:rapidityOrder',False) # PY8 should not implement the CKKW veto since the driver should do it. - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('Merging:applyVeto',False) @@ -4516,6 +4526,12 @@ def do_pythia8(self, line): else: no_default = False + if '--old_interface' in args: + use_mg5amc_py8_interface = True + args.remove('--old_interface') + else: + use_mg5amc_py8_interface = False + if not self.run_name: self.check_pythia8(args) self.configure_directory(html_opening =False) @@ -4545,20 +4561,27 @@ def do_pythia8(self, line): #"Please use 'event_norm = average' in the run_card to avoid this problem.") - - if not self.options['mg5amc_py8_interface_path'] or not \ - os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface')): - raise self.InvalidCmd( -"""The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. -Please install this tool with the following MG5_aMC command: - MG5_aMC> install mg5amc_py8_interface_path""") + if use_mg5amc_py8_interface: + if not self.options['mg5amc_py8_interface_path'] or not \ + os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface')): + raise self.InvalidCmd( + """The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. + Please install this tool with the following MG5_aMC command: + MG5_aMC> install mg5amc_py8_interface_path""") + else: + pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface') + warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) + if warnings: + logger.warning(warnings) else: - pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface') - warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) - if warnings: - logger.warning(warnings) + pythia_main = pjoin(self.options['pythia8_path'], 'share', 'Pythia8', 'examples', 'main164') + if not os.path.exists(pythia_main): + pythia_main = pjoin(self.options['pythia8_path'], 'examples', 'main164') + if not os.path.exists(pythia_main): + logger.warning('main164 not found (or not compiled). Will try the old interface instead.') + return self.do_pythia8(line + ' --old_interface') self.results.add_detail('run_mode', 'madevent') @@ -4583,14 +4606,19 @@ def do_pythia8(self, line): run_type = 'CKKW' # Edit the card and run environment according to the run specification - HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type) + HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type, use_mg5amc_py8_interface=use_mg5amc_py8_interface) + + if not use_mg5amc_py8_interface and (self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1)): + PY8_Card['Main:numberOfEvents']= self.run_card['nevents'] + # Now write the card. 
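The hunk above makes Pythia8's bundled main164 example the default shower driver and keeps the old MG5aMC_PY8_interface available behind --old_interface. A sketch of the executable lookup, with the two candidate paths taken from the hunk; the helper name find_pythia_main is illustrative, and returning None stands in for the fallback that re-runs do_pythia8 with --old_interface:

    import os

    def find_pythia_main(pythia8_path):
        """Locate a compiled main164 under the Pythia8 installation (sketch)."""
        for candidate in (
                os.path.join(pythia8_path, 'share', 'Pythia8', 'examples', 'main164'),
                os.path.join(pythia8_path, 'examples', 'main164')):
            if os.path.exists(candidate):
                return candidate
        return None  # caller falls back to the old MG5aMC-PY8 interface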
pythia_cmd_card = pjoin(self.me_dir, 'Events', self.run_name , '%s_pythia8.cmd' % tag) cmd_card = StringIO.StringIO() PY8_Card.write(cmd_card,pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Now setup the preamble to make sure that everything will use the locally # installed tools (if present) even if the user did not add it to its @@ -4632,7 +4660,7 @@ def do_pythia8(self, line): " command '/usr/bin/env %s' exists and returns a valid path."%shell) exe_cmd = "#!%s\n%s"%(shell_exe,' '.join( - [preamble+pythia_main, + [preamble+pythia_main, '' if use_mg5amc_py8_interface else '-c', os.path.basename(pythia_cmd_card)])) wrapper.write(exe_cmd) @@ -4699,6 +4727,7 @@ def do_pythia8(self, line): n_cores = max(min(min_n_core,n_cores),1) if self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + # No need for parallelization anymore self.cluster = None logger.info('Follow Pythia8 shower by running the '+ @@ -4744,20 +4773,22 @@ def do_pythia8(self, line): ParallelPY8Card.subruns[0].systemSet('Beams:LHEF','events.lhe.gz') ParallelPY8Card.write(pjoin(parallelization_dir,'PY8Card.dat'), pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Write the wrapper wrapper_path = pjoin(parallelization_dir,'run_PY8.sh') wrapper = open(wrapper_path,'w') if self.options['cluster_temp_path'] is None: exe_cmd = \ -"""#!%s -./%s PY8Card.dat >& PY8_log.txt -""" +"""#!%%s +./%%s %s PY8Card.dat >& PY8_log.txt +""" % ('' if use_mg5amc_py8_interface else '-c') + else: exe_cmd = \ -"""#!%s +"""#!%%s ln -s ./events_$1.lhe.gz ./events.lhe.gz -./%s PY8Card_$1.dat >& PY8_log.txt +./%%s %s PY8Card_$1.dat >& PY8_log.txt mkdir split_$1 if [ -f ./events.hepmc ]; then @@ -4776,7 +4807,7 @@ def do_pythia8(self, line): mv ./PY8_log.txt ./split_$1/ fi tar -czf split_$1.tar.gz split_$1 -""" +""" % ('' if use_mg5amc_py8_interface else '-c') exe_cmd = exe_cmd%(shell_exe,os.path.basename(pythia_main)) wrapper.write(exe_cmd) wrapper.close() @@ -4812,19 +4843,27 @@ def do_pythia8(self, line): pjoin(parallelization_dir,split_files[-1])) logger.info('Submitting Pythia8 jobs...') + for i, split_file in enumerate(split_files): # We must write a PY8Card tailored for each split so as to correct the normalization # HEPMCoutput:scaling of each weight since the lhe showered will not longer contain the # same original number of events - split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat')) + split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat'), setter='user') + assert split_PY8_Card['JetMatching:nJetMax'] == PY8_Card['JetMatching:nJetMax'] + + + # Make sure to sure the number of split_events determined during the splitting. - split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i]) + split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i], force=True) + assert split_PY8_Card['Main:numberOfEvents'] == partition_for_PY8[i] split_PY8_Card.systemSet('HEPMCoutput:scaling',split_PY8_Card['HEPMCoutput:scaling']* - (float(partition_for_PY8[i]))) + (float(partition_for_PY8[i])), force=True) # Add_missing set to False so as to be sure not to add any additional parameter w.r.t # the ones in the original PY8 param_card copied. 
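The run_PY8.sh wrapper templates above are filled in two passes: the inner % ('' if use_mg5amc_py8_interface else '-c') resolves the optional -c flag first, with %%s escapes protecting the shell and executable slots, and the later exe_cmd%(shell_exe, os.path.basename(pythia_main)) fills those slots. A condensed illustration of the same two-stage substitution:

    template = '#!%%s\n./%%s %s PY8Card.dat >& PY8_log.txt\n' % '-c'
    # pass 1: each '%%s' survives as '%s'; the single '%s' becomes '-c'
    script = template % ('/bin/bash', 'main164')  # pass 2 fills shell and exe
    assert script.splitlines() == ['#!/bin/bash',
                                   './main164 -c PY8Card.dat >& PY8_log.txt']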
split_PY8_Card.write(pjoin(parallelization_dir,'PY8Card_%d.dat'%i), - pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False) + pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False, + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) in_files = [pjoin(parallelization_dir,os.path.basename(pythia_main)), pjoin(parallelization_dir,'PY8Card_%d.dat'%i), pjoin(parallelization_dir,split_file)] @@ -5073,7 +5112,7 @@ def wait_monitoring(Idle, Running, Done): # works both for fixed number of generated events and fixed accepted events self.results.add_detail('error_pythia', error_m) - if self.run_card['use_syst']: + if self.run_card['use_syst'] and use_mg5amc_py8_interface: self.results.add_detail('cross_pythia', -1) self.results.add_detail('error_pythia', 0) @@ -6132,7 +6171,102 @@ def get_Gdir(self, Pdir=None, symfact=None): mfactors[pjoin(P, "G%s" % tag)] = mfactor self.Gdirs = (Gdirs, mfactors) return self.get_Gdir(Pdir, symfact=symfact) + + ############################################################################ + def remove_empty_events(self, Gdir): + """return Gdir stripped of the entries providing empty events.lhe files.""" + + reasons = collections.defaultdict(list) + Gdirs = Gdir[:] + for G in Gdirs[:]: + try: + size = os.path.getsize(pjoin(G, 'events.lhe')) + except Exception as error: + size = 0 + if size <10: + Gdirs.remove(G) + try: + log = misc.BackRead(pjoin(G, 'log.txt')) + except Exception as error: + log = misc.BackRead(pjoin(G, 'run1_app.log')) + found = -1 + for line in log: + if 'Deleting file events.lhe' in line: + found = 0 + elif "Impossible BW configuration" in line: + reasons['bwconfig'].append(G) + break + elif found < -150: + reasons['not found'].append(G) + Gdirs.append(G) + break + elif found < 0: + found -= 1 + elif 'Loosen cuts or increase max_events' in line: + reasons['cuts'].append(G) + break + elif 'all returned zero' in line: + reasons['zero'].append(G) + break + elif found > 5: + reasons['unknown'].append(G) + break + else: + found += 1 + + if len(reasons): + logger.debug('Reasons for empty events.lhe:') + if len(reasons['unknown']): + logger.debug(' - unknown: %s' % len(reasons['unknown'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['unknown'][:10]])) + if len(reasons['not found']): + logger.debug(' - not found in log: %s' % len(reasons['not found'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['not found'][:10]])) + if len(reasons['zero']): + logger.debug(' - zero amplitudes: %s' % len(reasons['zero'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit( os.sep)[-2:]) for G in reasons['zero'][:10]])) + if len(reasons['bwconfig']): + critical_bwconfig = set() + for G in reasons['bwconfig']: + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_bwconfig.add(os.sep.join(base.rsplit(os.sep)[-2:])) + for G in critical_bwconfig: + logger.warning('Gdirectory %s has no events.lhe file (no possible BW configuration).' % G) + + logger.debug(' - impossible BW configuration: %s' % len(reasons['bwconfig'])) + logger.debug(' - channel with no possible BW configuration: %s' % len(critical_bwconfig)) + + if len(reasons['cuts']): + critical_nb_cuts = collections.defaultdict(int) + for G in reasons['cuts']: + if '.' 
in os.path.basename(G): + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_nb_cuts[os.sep.join(base.rsplit(os.sep)[-2:])] += 1 + else: + critical_nb_cuts[''] += 1 + logger.warning('Gdirectory %s has no events.lhe file. (no points passed cuts)' % G) + for G, nb in critical_nb_cuts.items(): + if not G: + continue + else: + logger.warning('%s splits of channel %s.XXX have no events.lhe file (no points passed cuts); no %s split with events detected' % (nb, G, G)) + logger.debug(' - no points passed cuts: %s' % len(reasons['cuts'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['cuts'][:10]])) + logger.debug(' - without any BW handling (critical): %s' % critical_nb_cuts['']) + logger.debug(' - with BW but all zero (critical): %s' % sum([nb for v, nb in critical_nb_cuts.items() if v!=''], 0)) + #logger.debug(' - cuts (with BW conflict where other channel contributes): %s' % (len(reasons['cuts'])- critical_nb_cuts)) + + + return Gdirs + + ############################################################################ def set_run_name(self, name, tag=None, level='parton', reload_card=False, allow_new_tag=True): @@ -6749,7 +6883,7 @@ def get_subP_ids(path): class GridPackCmd(MadEventCmd): """The command for the gridpack --Those are not supposed to be used interactively--""" - def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **stdin): + def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, nprocs=1, maxevts=2500, *completekey, **stdin): """Initialize the command and directly run""" # Initialize properly @@ -6759,6 +6893,8 @@ def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **s self.random = seed self.random_orig = self.random self.granularity = gran + self.nprocs = nprocs + self.maxevts = maxevts self.options['automatic_html_opening'] = False #write the grid_card.dat on disk @@ -6874,7 +7010,7 @@ def launch(self, nb_event, seed): #misc.call([pjoin(self.me_dir,'bin','refine4grid'), # str(nb_event), '0', 'Madevent','1','GridRun_%s' % seed], # cwd=self.me_dir) - self.refine4grid(nb_event) + self.gridpack_cross = self.refine4grid(nb_event) # 3) Combine the events/pythia/... self.exec_cmd('combine_events') @@ -6902,6 +7038,8 @@ def refine4grid(self, nb_event): precision = nb_event + across = self.make_make_all_html_results(get_attr='axsec') + self.opts = dict([(key,value[1]) for (key,value) in \ self._survey_options.items()]) @@ -6915,8 +7053,9 @@ def refine4grid(self, nb_event): self.update_status('Refine results to %s' % precision, level=None) logger.info("Using random number seed offset = %s" % self.random) - refine_opt = {'err_goal': nb_event, 'split_channels': False, - 'ngran':self.granularity, 'readonly': self.readonly} + refine_opt = {'err_goal': nb_event, 'split_channels': True, + 'ngran':self.granularity, 'readonly': self.readonly, + 'nprocs': self.nprocs, 'maxevts': self.maxevts} x_improve = gen_ximprove.gen_ximprove_gridpack(self, refine_opt) x_improve.launch() # create the ajobs for the refinement and run those! self.gscalefact = x_improve.gscalefact # store the jacobian associated with the gridpack @@ -6926,7 +7065,7 @@ def refine4grid(self, nb_event): #print 'run combine!!!'
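As an aside on the remove_empty_events hunk above: it classifies each empty G-channel directory by scanning the channel log from the end, with a line budget before giving up. The following standalone C++ sketch (editorial illustration, not plugin code; it simplifies away the 'Deleting file events.lhe' arming step, and the marker strings are simply copied from the patch) shows the same backward scan:

#include <cstdio>
#include <string>
#include <vector>

// Walk the log last-line-first, return the first known failure marker,
// and give up after a fixed line budget ("not found").
static std::string classifyEmptyChannel( const std::vector<std::string>& logLines )
{
  int budget = 150; // comparable to the 'found < -150' cutoff in the patch
  for( auto it = logLines.rbegin(); it != logLines.rend() && budget-- > 0; ++it )
  {
    if( it->find( "Impossible BW configuration" ) != std::string::npos ) return "bwconfig";
    if( it->find( "Loosen cuts or increase max_events" ) != std::string::npos ) return "cuts";
    if( it->find( "all returned zero" ) != std::string::npos ) return "zero";
  }
  return "not found"; // no marker near the end of the log: keep the directory
}

int main()
{
  const std::vector<std::string> log = { "INFO: starting channel",
                                         "Impossible BW configuration",
                                         "Deleting file events.lhe" };
  printf( "reason: %s\n", classifyEmptyChannel( log ).c_str() );
  return 0;
}

Scanning backwards matters here because the relevant diagnostics are printed just before the job deletes its events.lhe, so they sit within a bounded distance of the end of a potentially very long log.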
#combine_runs.CombineRuns(self.me_dir) - return + return across #update html output Presults = sum_html.collect_result(self) cross, error = Presults.xsec, Presults.xerru @@ -7051,10 +7190,13 @@ def do_combine_events(self, line): sum_axsec += result.get('axsec')*gscalefact[Gdir] if len(AllEvent) >= 80: #perform a partial unweighting - if self.results.current['cross'] == 0 and self.run_card['gridpack']: - nb_event= self.nb_event + if not self.results.current.get('axsec'): + if self.run_card['gridpack'] and self.gridpack_cross: + nb_event = min(abs(1.05*self.nb_event*sum_axsec/self.gridpack_cross),self.nb_event) + else: + nb_event= self.nb_event else: - nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current['cross']),self.run_card['nevents']) + nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current.get('axsec')),self.run_card['nevents'], self.nb_event, self.gridpack_cross, sum_axsec) AllEvent.unweight(pjoin(outdir, self.run_name, "partials%s.lhe.gz" % partials), get_wgt, log_level=5, trunc_error=1e-2, event_target=nb_event) AllEvent = lhe_parser.MultiEventFile() @@ -7068,6 +7210,7 @@ def do_combine_events(self, line): for data in partials_info: AllEvent.add(*data) + sum_xsec += data[1] if not hasattr(self,'proc_characteristic'): self.proc_characteristic = self.get_characteristics() diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/restore_data b/epochX/cudacpp/ee_mumu.mad/bin/internal/restore_data index 6205bb9567..407ed7aa91 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/restore_data +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/restore_data @@ -48,8 +48,17 @@ for i in `cat subproc.mg` ; do cd ../ done +# check if we are on a Mac, otherwise assume Linux +if [[ "$OSTYPE" == "darwin"* ]]; then + # no nproc on Mac, so use sysctl instead + # use -S1024 because there is a limit on the length of the command + xargs_opts="-P $(sysctl -n hw.ncpu) -S1024" +else + xargs_opts="-P $(nproc --all)" +fi + find . -mindepth 2 -maxdepth 2 -type d -name 'G*' -print0 \ - | xargs --null -P "$(nproc --all)" -I{} bash -c " + | xargs --null ${xargs_opts} -I{} bash -c " cd {} for j in $1_results.dat ; do if [[ -e \$j ]] ; then diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/sum_html.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/sum_html.py index 9dd5826f71..fb8dd3a74a 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/sum_html.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/sum_html.py @@ -770,7 +770,7 @@ def collect_result(cmd, folder_names=[], jobs=None, main_dir=None): return all -def make_all_html_results(cmd, folder_names = [], jobs=[]): +def make_all_html_results(cmd, folder_names = [], jobs=[], get_attr=None): """ folder_names and jobs have been added for the amcatnlo runs """ run = cmd.results.current['run_name'] if not os.path.exists(pjoin(cmd.me_dir, 'HTML', run)): @@ -794,7 +794,12 @@ def make_all_html_results(cmd, folder_names = [], jobs=[]): fsock.write('%s
<br>' % Presults.get_html(run, unit, cmd.me_dir)) fsock.write('%s <br>
' % P_text) - return Presults.xsec, Presults.xerru + if not get_attr: + return Presults.xsec, Presults.xerru + else: + if isinstance(get_attr, tuple): + return [getattr(Presults, _) for _ in get_attr] + return getattr(Presults, get_attr) diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/ufomodel/write_param_card.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/ufomodel/write_param_card.py index 57a85b0614..33a89259f8 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/ufomodel/write_param_card.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/ufomodel/write_param_card.py @@ -116,9 +116,10 @@ def write_param(self, param, lhablock): def write_dep_param_block(self, lhablock): import cmath from parameters import all_parameters + param_values = {'cmath':cmath} for parameter in all_parameters: try: - exec("%s = %s" % (parameter.name, parameter.value)) + exec("%s = %s" % (parameter.name, parameter.value), globals(), param_values) except Exception: pass text = "## Not dependent parameter.\n" @@ -134,7 +135,7 @@ def write_dep_param_block(self, lhablock): prefix = "DECAY " for part, param in data: if isinstance(param.value, str): - value = complex(eval(param.value)).real + value = complex(eval(param.value, globals(), param_values)).real else: value = param.value diff --git a/epochX/cudacpp/ee_mumu.mad/bin/madevent b/epochX/cudacpp/ee_mumu.mad/bin/madevent index dff9711b73..9c5363e682 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/madevent +++ b/epochX/cudacpp/ee_mumu.mad/bin/madevent @@ -178,6 +178,17 @@ force_run = False if (args and args[0] == 'treatcards'): force_run=True + +# check that madgraph is not in PYTHONPATH +try: + import madgraph +except ImportError: + pass +else: + logging.getLogger('madgraph').error('It looks like you have madgraph in your PYTHONPATH (or you are running this executable from the main MG5aMC directory). This executable will likely not work in that case.') + + + # Call the cmd interface main loop try: if '-h' in args or '--help' in args: diff --git a/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h b/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h index 18f664e0d1..a5438a65b0 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h @@ -2,13 +2,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Sep 2010) for the MG5aMC backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -860,7 +860,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ INLINE void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -872,7 +872,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ INLINE void FFV1P0_3( const fptype allF1[], const fptype allF2[], @@ -885,7 +885,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ INLINE void FFV2_0( const fptype allF1[], const fptype allF2[], @@ -897,7 +897,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ INLINE void FFV2_3( const fptype allF1[], const fptype allF2[], @@ -910,7 +910,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ INLINE void FFV4_0( const fptype allF1[], const fptype allF2[], @@ -922,7 +922,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ INLINE void FFV4_3( const fptype allF1[], const fptype allF2[], @@ -935,7 +935,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ INLINE void FFV2_4_0( const fptype allF1[], const fptype allF2[], @@ -949,7 +949,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ INLINE void FFV2_4_3( const fptype allF1[], const fptype allF2[], @@ -964,7 +964,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -977,7 +977,7 @@ namespace mg5amcCpu const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. 
); const cxtype_sv TMP0 = ( F1[2] * ( F2[4] * ( V3[2] + V3[5] ) + F2[5] * ( V3[3] + cI * V3[4] ) ) + ( F1[3] * ( F2[4] * ( V3[3] - cI * V3[4] ) + F2[5] * ( V3[2] - V3[5] ) ) + ( F1[4] * ( F2[2] * ( V3[2] - V3[5] ) - F2[3] * ( V3[3] + cI * V3[4] ) ) + F1[5] * ( F2[2] * ( -V3[3] + cI * V3[4] ) + F2[3] * ( V3[2] + V3[5] ) ) ) ) ); @@ -989,7 +989,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ void FFV1P0_3( const fptype allF1[], const fptype allF2[], @@ -1002,7 +1002,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V3 = W_ACCESS::kernelAccess( allV3 ); const cxtype cI = cxmake( 0., 1. ); V3[0] = +F1[0] + F2[0]; @@ -1020,7 +1020,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ void FFV2_0( const fptype allF1[], const fptype allF2[], @@ -1033,7 +1033,7 @@ namespace mg5amcCpu const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const cxtype_sv TMP1 = ( F1[2] * ( F2[4] * ( V3[2] + V3[5] ) + F2[5] * ( V3[3] + cI * V3[4] ) ) + F1[3] * ( F2[4] * ( V3[3] - cI * V3[4] ) + F2[5] * ( V3[2] - V3[5] ) ) ); @@ -1045,7 +1045,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ void FFV2_3( const fptype allF1[], const fptype allF2[], @@ -1058,7 +1058,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V3 = W_ACCESS::kernelAccess( allV3 ); const cxtype cI = cxmake( 0., 1. ); const fptype OM3 = ( M3 != 0. ? 1. / ( M3 * M3 ) : 0. ); @@ -1078,7 +1078,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ void FFV4_0( const fptype allF1[], const fptype allF2[], @@ -1091,7 +1091,7 @@ namespace mg5amcCpu const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); constexpr fptype one( 1. 
); @@ -1106,7 +1106,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ void FFV4_3( const fptype allF1[], const fptype allF2[], @@ -1119,7 +1119,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V3 = W_ACCESS::kernelAccess( allV3 ); const cxtype cI = cxmake( 0., 1. ); const fptype OM3 = ( M3 != 0. ? 1. / ( M3 * M3 ) : 0. ); @@ -1142,7 +1142,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ void FFV2_4_0( const fptype allF1[], const fptype allF2[], @@ -1157,8 +1157,8 @@ namespace mg5amcCpu const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP1 = C_ACCESS::kernelAccessConst( allCOUP1 ); - const cxtype_sv COUP2 = C_ACCESS::kernelAccessConst( allCOUP2 ); + const cxtype_sv COUP1 = CD_ACCESS::kernelAccessConst( allCOUP1 ); + const cxtype_sv COUP2 = CD_ACCESS::kernelAccessConst( allCOUP2 ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); constexpr fptype one( 1. ); @@ -1173,7 +1173,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ void FFV2_4_3( const fptype allF1[], const fptype allF2[], @@ -1188,8 +1188,8 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); - const cxtype_sv COUP1 = C_ACCESS::kernelAccessConst( allCOUP1 ); - const cxtype_sv COUP2 = C_ACCESS::kernelAccessConst( allCOUP2 ); + const cxtype_sv COUP1 = CD_ACCESS::kernelAccessConst( allCOUP1 ); + const cxtype_sv COUP2 = CD_ACCESS::kernelAccessConst( allCOUP2 ); cxtype_sv* V3 = W_ACCESS::kernelAccess( allV3 ); const cxtype cI = cxmake( 0., 1. ); const fptype OM3 = ( M3 != 0. ? 1. / ( M3 * M3 ) : 0. ); diff --git a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc index 37676c1d8d..68296642b5 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. 
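A note on the HelAmps_sm.h hunks above: the couplings are now read through a dedicated CD_ACCESS template parameter instead of C_ACCESS, while the kernelAccessConst call sites stay identical, because the accessor is a policy class supplied at instantiation time. A toy C++ sketch of that pattern (AccessFixed, AccessPerEvent and readCoupling are hypothetical names for illustration, not the plugin's real MemoryAccess classes):

#include <cstdio>
using fptype = double;

struct AccessFixed // hypothetical: one process-constant coupling
{
  static fptype kernelAccessConst( const fptype* buf ) { return buf[0]; }
};

struct AccessPerEvent // hypothetical: event-dependent coupling in an SOA buffer
{
  static fptype kernelAccessConst( const fptype* buf ) { return buf[1]; } // pretend: this event's slot
};

template<class CD_ACCESS>
fptype readCoupling( const fptype* buf )
{
  return CD_ACCESS::kernelAccessConst( buf ); // same call site for any policy
}

int main()
{
  const fptype buf[2] = { 0.1, 0.2 };
  printf( "fixed=%f perEvent=%f\n", readCoupling<AccessFixed>( buf ), readCoupling<AccessPerEvent>( buf ) );
  return 0;
}

The policy is resolved at compile time, so swapping the access strategy costs nothing at runtime and the amplitude bodies never need to change.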
//========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h index 5fcde71f6b..edabf077ce 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -283,7 +283,7 @@ namespace mg5amcCpu #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #endif // Compute the output couplings (e.g. gc10 and gc11) from the input gs - template + template __device__ inline void G2COUP( const fptype gs[], fptype couplings[], diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h index 7c6a082392..ca859a602e 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose whether cuBLAS and hipBLAS are supported (e.g. for color sums) +// For both CUDA and HIP, by default, BLAS is enabled, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g.
using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h index 92d74fd6db..e98e925f2a 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -717,12 +717,24 @@ namespace mg5amcCpu : m_preal( &r ), m_pimag( &i ) {} // copy (create from) const refs cxtype_ref& operator=( const cxtype_ref& ) = delete; //__host__ __device__ cxtype_ref& operator=( cxtype_ref&& c ) {...} // REMOVED! Should copy refs or copy values? 
No longer needed in cxternary - __host__ __device__ cxtype_ref& operator=( const cxtype& c ) + __host__ __device__ cxtype_ref& operator=( const cxtype& c ) // copy (assign) const values { *m_preal = cxreal( c ); *m_pimag = cximag( c ); return *this; - } // copy (assign) non-const values + } + __host__ __device__ cxtype_ref& operator+=( const cxtype& c ) + { + *m_preal += cxreal( c ); + *m_pimag += cximag( c ); + return *this; + } + __host__ __device__ cxtype_ref& operator-=( const cxtype& c ) + { + *m_preal -= cxreal( c ); + *m_pimag -= cximag( c ); + return *this; + } __host__ __device__ operator cxtype() const { return cxmake( *m_preal, *m_pimag ); } private: fptype* const m_preal; // const pointer to non-const fptype R diff --git a/epochX/cudacpp/ee_mumu.mad/test/cudacpp_test.mk b/epochX/cudacpp/ee_mumu.mad/test/cudacpp_test.mk index f703a1ae7c..1f9f8bbc46 100644 --- a/epochX/cudacpp/ee_mumu.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/ee_mumu.mad/test/cudacpp_test.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) @@ -19,7 +19,7 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt index f27925604a..da149d8161 100644 --- a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.3 2025-06-12 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -49,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -58,7 +58,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006340742111206055  +DEBUG: model prefixing takes 0.005414009094238281  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
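Stepping back to the mgOnGpuCxtypes.h hunk above: it adds compound assignment to the cxtype_ref proxy, so code holding references into split real/imaginary storage can accumulate in place. A self-contained C++ sketch of that reference-proxy idea (using plain std::complex stand-ins rather than the plugin's cxtype machinery, and omitting the __host__ __device__ decorations so it compiles with a host compiler):

#include <complex>
#include <cstdio>
using fptype = double;
using cxtype = std::complex<fptype>;

// Proxy over a complex number whose real and imaginary parts live in two
// separate fptype locations (SOA layout); += writes through both pointers.
class cxref
{
public:
  cxref( fptype& r, fptype& i ) : m_preal( &r ), m_pimag( &i ) {}
  cxref& operator=( const cxtype& c ) { *m_preal = c.real(); *m_pimag = c.imag(); return *this; }
  cxref& operator+=( const cxtype& c ) { *m_preal += c.real(); *m_pimag += c.imag(); return *this; }
  cxref& operator-=( const cxtype& c ) { *m_preal -= c.real(); *m_pimag -= c.imag(); return *this; }
  operator cxtype() const { return cxtype( *m_preal, *m_pimag ); }
private:
  fptype* const m_preal; // const pointer to the non-const real part
  fptype* const m_pimag; // const pointer to the non-const imaginary part
};

int main()
{
  fptype re[2] = { 0, 0 }, im[2] = { 0, 0 }; // split SOA storage
  cxref j0( re[0], im[0] );
  j0 = cxtype( 1., 2. );
  j0 += cxtype( 0.5, -1. ); // accumulate in place, no temporary complex written back by hand
  printf( "j0 = (%f, %f)\n", re[0], im[0] );
  return 0;
}

Without operator+=, an accumulation like jamp += amp would first have to materialise the value, add, and reassign; the new operators keep the read-modify-write inside the proxy.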
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -154,28 +154,28 @@ INFO: Process has 2 diagrams Total: 1 processes with 2 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_ee_mumu Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 218]  -DEBUG: type(subproc_group)= [output.py at line 219]  -DEBUG: type(fortran_model)= [output.py at line 220]  -DEBUG: type(me)= me=0 [output.py at line 221]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 222]  -INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/. 
-Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  +DEBUG: type(subproc_group)= [output.py at line 223]  +DEBUG: type(fortran_model)= [output.py at line 224]  +DEBUG: type(me)= me=0 [output.py at line 225]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'diagram_boilerplate.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  +INFO: Creating files in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/. +Generated helas calls for 1 subprocesses (2 diagrams) in 0.003 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 4 routines in 0.267 s +ALOHA: aloha creates 4 routines in 0.259 s FFV1 FFV1 FFV2 @@ -184,17 +184,17 @@ ALOHA: aloha creates 4 routines in 0.267 s FFV4 FFV2_4 FFV2_4 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. quit -real 0m0.659s -user 0m0.589s -sys 0m0.056s +real 0m0.701s +user 0m0.597s +sys 0m0.047s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h index 87aa648dd2..a45024704a 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -255,11 +255,15 @@ namespace mg5amcCpu throw std::logic_error( "Bridge constructor: FIXME! cannot choose gputhreads" ); // this should never happen! m_gpublocks = m_nevt / m_gputhreads; } +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters @@ -290,8 +294,10 @@ namespace mg5amcCpu throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! 
Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -347,7 +353,9 @@ namespace mg5amcCpu if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -400,7 +408,9 @@ namespace mg5amcCpu } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc index f463977c1a..a68ae314eb 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
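The checkGpuBlas/assertGpuBlas helper added in GpuRuntime.h above mirrors the existing checkGpu pattern for cuBLAS/hipBLAS status codes. A minimal usage sketch (CUDA flavour, assuming cuBLAS is available, i.e. MGONGPU_HAS_NO_BLAS is not defined; link with -lcublas):

#include <cassert>
#include <cstdio>
#include <cublas_v2.h>

// Same shape as the helper in the hunk above: print file/line and abort on
// any status other than CUBLAS_STATUS_SUCCESS.
#define checkGpuBlas( code ) { assertGpuBlas( code, __FILE__, __LINE__ ); }
inline void assertGpuBlas( cublasStatus_t code, const char* file, int line, bool abort = true )
{
  if( code != CUBLAS_STATUS_SUCCESS )
  {
    printf( "ERROR! assertGpuBlas: '%d' in %s:%d\n", code, file, line );
    if( abort ) assert( code == CUBLAS_STATUS_SUCCESS );
  }
}

int main()
{
  cublasHandle_t handle;
  checkGpuBlas( cublasCreate( &handle ) );  // fail fast if cuBLAS cannot initialise
  checkGpuBlas( cublasDestroy( handle ) );
  printf( "cuBLAS handle created and destroyed\n" );
  return 0;
}

Wrapping every BLAS call this way gives the same fail-fast behaviour as checkGpu for runtime API calls, which is useful because BLAS errors otherwise surface much later as wrong color sums rather than as an immediate failure.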
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,28 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() + , m_pHelWfs() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_helBlasHandles() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +353,82 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); + // Create the "one-helicity" wavefunction buffer that will be used for helicity filtering + m_pHelWfs.reset( new DeviceBufferSimple( CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? 
+ std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { +#ifndef MGONGPU_HAS_NO_BLAS + if( m_helBlasHandles[ihel] ) gpuBlasDestroy( m_helBlasHandles[ihel] ); +#endif + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +445,61 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. 
Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity + // Attach a different stream to each cuBLAS/hipBLAS handle + if( m_blasColorSum ) + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + checkGpuBlas( gpuBlasCreate( &m_helBlasHandles[ighel] ) ); + checkGpuBlas( gpuBlasSetStream( m_helBlasHandles[ighel], m_helStreams[ighel] ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_helBlasHandles[ighel], CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelWfs.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +507,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* ghelBlasHandles = ( m_blasColorSum ? m_helBlasHandles : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* ghelBlasHandles = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +527,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? 
m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h index 7acff4b308..c901874333 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include <map> +#include <memory> #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] - static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,24 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelJamps; + + // The super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelWfs; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +220,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers
+ std::unique_ptr<DeviceBufferSimple2> m_pHelBlasTmp; + + // The array of cuBLAS/hipBLAS handles (one for each good helicity) + gpuBlasHandle_t m_helBlasHandles[CPPProcess::ncomb]; // reserve ncomb handles (but only nGoodHel <= ncomb will be used) +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessAmplitudes.h index 0d92f69c43..a49f041e05 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessAmplitudes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessAmplitudes_H #define MemoryAccessAmplitudes_H 1 @@ -10,10 +10,6 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_AMPLITUDES 1 - // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -23,120 +19,11 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // A class describing the internal layout of memory buffers for amplitudes - // This implementation uses an AOSOA[npagA][nx2][neppA] where nevt=npagA*neppA - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessAmplitudesBase //_AOSOAv1 - { - public: - - // Number of Events Per Page in the amplitude AOSOA memory buffer layout - static constexpr int neppA = 1; // AOS (just a test...)
- - private: - - friend class MemoryAccessHelper<MemoryAccessAmplitudesBase>; - friend class KernelAccessHelper<MemoryAccessAmplitudesBase, false>; - friend class KernelAccessHelper<MemoryAccessAmplitudesBase, true>; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) - { - const int ipagA = ievt / neppA; // #event "A-page" - const int ieppA = ievt % neppA; // #event in the current event A-page - constexpr int ix2 = 0; - return &( buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA] ); // AOSOA[ipagA][ix2][ieppA] - } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... args" to "const int ix2" and rename "Field" as "Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int ix2 ) - { - constexpr int ipagA = 0; - constexpr int ieppA = 0; - return buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA]; // AOSOA[ipagA][ix2][ieppA] - } - }; - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessAmplitudes : public MemoryAccessAmplitudesBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper<MemoryAccessAmplitudesBase>::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper<MemoryAccessAmplitudesBase>::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2 = MemoryAccessHelper<MemoryAccessAmplitudesBase>::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2Const = - MemoryAccessHelper<MemoryAccessAmplitudesBase>::template decodeRecordConst<int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt, const int ix2 ) <===] - static constexpr auto ieventAccessIx2 = - MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessField<int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===] - static constexpr auto ieventAccessIx2Const = - MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessFieldConst<int>; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations + // A class providing trivial access to amplitude memory buffers template<bool onDevice> class KernelAccessAmplitudes { public: - -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2 = - KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessField<int>; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2Const = - KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessFieldConst<int>; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { @@ -148,8 +35,6 @@ namespace mg5amcCpu { return reinterpret_cast<const cxtype_sv*>( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES }; //---------------------------------------------------------------------------- diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessCouplings.h index 55504a2b90..caee99a7fd 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessCouplings.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
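[Editor's note — illustration, not part of the patch. The MemoryAccessAmplitudes.h hunk above deletes the unused AOSOA machinery and keeps only "trivial access", where a per-event amplitude buffer is reinterpreted as an array of complex numbers with no index arithmetic. The standalone sketch below shows that pattern in isolation; fptype and cxtype_sv here are simplified stand-ins for the plugin's real types, and the two-amplitude buffer is invented for the example.]

#include <cassert>
#include <cstdio>

typedef double fptype;             // stand-in for the plugin's floating-point type
struct cxtype_sv { fptype r, i; }; // stand-in for the plugin's (possibly SIMD) complex type

// Trivial access: the buffer for one event is already laid out as [re,im][re,im]...
static inline cxtype_sv* kernelAccess( fptype* buffer )
{
  return reinterpret_cast<cxtype_sv*>( buffer );
}

int main()
{
  fptype buffer[4] = { 1., 2., 3., 4. }; // two complex amplitudes for one event
  cxtype_sv* amps = kernelAccess( buffer );
  assert( amps[1].r == 3. && amps[1].i == 4. );
  printf( "amp[1] = (%f, %f)\n", amps[1].r, amps[1].i );
  return 0;
}

[End of editor's note.]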
#ifndef MemoryAccessCouplings_H #define MemoryAccessCouplings_H 1 @@ -235,7 +235,7 @@ namespace mg5amcCpu /* fptype_sv& real = kernelAccessIx2( buffer, 0 ); fptype_sv& imag = kernelAccessIx2( buffer, 1 ); - printf( "C_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv_ref( real, imag ); */ return cxtype_sv_ref( kernelAccessIx2( buffer, 0 ), @@ -250,7 +250,7 @@ namespace mg5amcCpu /* const fptype_sv& real = kernelAccessIx2Const( buffer, 0 ); const fptype_sv& imag = kernelAccessIx2Const( buffer, 1 ); - printf( "C_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv( real, imag ); */ return cxtype_sv( kernelAccessIx2Const( buffer, 0 ), diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessWavefunctions.h index 9f4c620bc7..bbffc1fb36 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessWavefunctions.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessWavefunctions_H #define MemoryAccessWavefunctions_H 1 @@ -10,9 +10,7 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 +#include "CPPProcess.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL @@ -23,147 +21,44 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // A class describing the internal layout of memory buffers for wavefunctions - // This implementation uses an AOSOA[npagW][nw6][nx2][neppW] where nevt=npagW*neppW - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessWavefunctionsBase //_AOSOAv1 +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessWavefunctions { public: - - // Number of Events Per Page in the wavefunction AOSOA memory buffer layout - static constexpr int neppW = 1; // AOS (just a test...) 
- - private: - - friend class MemoryAccessHelper<MemoryAccessWavefunctionsBase>; - friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, false>; - friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, true>; - - // The number of components of a (fermion or vector) wavefunction - static constexpr int nw6 = mgOnGpu::nw6; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) + static __host__ __device__ inline cxtype_sv* + kernelAccess( fptype* buffer ) { - const int ipagW = ievt / neppW; // #event "W-page" - const int ieppW = ievt % neppW; // #event in the current event W-page - constexpr int iw6 = 0; - constexpr int ix2 = 0; - return &( buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW] ); // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast<cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... args" to "const int iw6, const int ix2" and rename "Field" as "Iw6Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int iw6, - const int ix2 ) + static __host__ __device__ inline const cxtype_sv* + kernelAccessConst( const fptype* buffer ) { - constexpr int ipagW = 0; - constexpr int ieppW = 0; - return buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW]; // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast<const cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } }; +#endif //---------------------------------------------------------------------------- - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessWavefunctions : public MemoryAccessWavefunctionsBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2 = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2Const = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template decodeRecordConst<int, int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIw6Ix2( fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2 = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessField<int, int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIw6Ix2Const( const fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2Const = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessFieldConst<int, int>; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations - template<bool onDevice> - class KernelAccessWavefunctions + class HostAccessWavefunctions { public: - -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const) ===> fptype& kernelAccessIw6Ix2( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2 = - KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessField<int, int>; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIw6Ix2Const( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2Const = - KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessFieldConst<int, int>; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { return reinterpret_cast<cxtype_sv*>( buffer ); } - static __host__ __device__ inline const cxtype_sv* kernelAccessConst( const fptype* buffer ) { return reinterpret_cast<const cxtype_sv*>( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS }; //---------------------------------------------------------------------------- - typedef KernelAccessWavefunctions<false> HostAccessWavefunctions; - typedef KernelAccessWavefunctions<true> DeviceAccessWavefunctions; - - //---------------------------------------------------------------------------- - } // end namespace mg5amcGpu/mg5amcCpu #endif // MemoryAccessWavefunctions_H diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h index 65a101888d..c8db607db6 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
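[Editor's note — illustration, not part of the patch. The new DeviceAccessWavefunctions above addresses a global wavefunction buffer event-major: each event owns nw6*nx2 floats per wavefunction, and the kernel offsets by ievt * CPPProcess::nw6 * mgOnGpu::nx2 before reinterpreting as complex. The host-side sketch below reproduces only that offset arithmetic; the sizes and the tagging loop are invented for the example.]

#include <cstdio>
#include <vector>

typedef double fptype; // stand-in for the plugin's floating-point type

int main()
{
  const int nw6 = 6, nx2 = 2, nevt = 4; // components per wavefunction, floats per complex, mock event count
  std::vector<fptype> buffer( nevt * nw6 * nx2, 0. );
  for( int ievt = 0; ievt < nevt; ievt++ )
  {
    // Same offset arithmetic as the new kernelAccess: skip ievt wavefunctions of nw6*nx2 floats
    // (in the CUDA kernel, ievt would be blockDim.x * blockIdx.x + threadIdx.x)
    fptype* wf = buffer.data() + ievt * nw6 * nx2;
    wf[0] = ievt; // tag the first (real) component of this event's wavefunction
  }
  printf( "event 2 starts at flat index %d and holds %f\n", 2 * nw6 * nx2, buffer[2 * nw6 * nx2] );
  return 0;
}

[End of editor's note.]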
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase<T>( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase<T>( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template<typename T, size_t sizePerEvent> - class DeviceBuffer : public DeviceBufferBase<T>, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase<T>, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase<T>( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase<T>( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer<fptype, 1> DeviceBufferSimple; + typedef DeviceBuffer<fptype2, 1> DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase<fptype> BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer<fptype, sizePerEventNumerators> HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer<fptype, sizePerEventNumerators> PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer<fptype, sizePerEventNumerators> DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer<fptype, sizePerEventDenominators> HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer<fptype, sizePerEventDenominators> PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer<fptype, sizePerEventDenominators> DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer<fptype, sizePerEventCouplings> HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer<fptype, sizePerEventCouplings> PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer<fptype, sizePerEventCouplings> DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer<fptype, sizePerEventJamps> DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template<class Tdst, class Tsrc> void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc index 16a91dd141..cad6526137 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,20 +98,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 1; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,57 +166,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the 
current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - 
using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -226,279 +279,137 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 5; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
- - //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) + + // ----------------- + // --- COUPLINGS --- + // ----------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; +#else + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g.
<> -#endif + const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events -#ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - // CUDA kernels take input/output buffers with momenta/MEs for all events + + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); - const fptype* COUPs[nxcoup]; - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif #endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. 
zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - // *** DIAGRAM 1 OF 2 *** - - // Wavefunction(s) for diagram number 1 - oxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - ixxxxx( momenta, 0., cHel[ihel][1], +1, w_fp[1], 1 ); - - ixxxxx( momenta, 0., cHel[ihel][2], -1, w_fp[2], 2 ); - - oxxxxx( momenta, 0., cHel[ihel][3], +1, w_fp[3], 3 ); - - FFV1P0_3( w_fp[1], w_fp[0], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[4] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[2], w_fp[3], w_fp[4], COUPs[ndcoup + 0], 1.0, &_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= amp_sv[0]; - - // *** DIAGRAM 2 OF 2 *** - - // Wavefunction(s) for diagram number 2 - FFV2_4_3( w_fp[1], w_fp[0], COUPs[ndcoup + 1], 1.0, COUPs[ndcoup + 2], 1.0, cIPD[0], cIPD[1], w_fp[4] ); - - // Amplitude(s) for diagram number 2 - FFV2_4_0( w_fp[2], w_fp[3], w_fp[4], COUPs[ndcoup + 1], 1.0, COUPs[ndcoup + 2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_epem_mupmum()?) - - // The color denominators (initialize all array elements, with ncolor=1) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1 }; // 1-D array[1] - - // The color matrix (initialize all array elements, with ncolor=1) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { { 1 } }; // 2-D array[1][1] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! 
skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined
MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 2 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -537,7 +448,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + +
// Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -570,6 +485,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MZ, (fptype)m_pars->mdl_WZ }; @@ -611,6 +530,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -713,26 +636,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -740,25 +663,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity
selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -903,13 +1030,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 4 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -921,17 +1042,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -957,93 +1081,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for
the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamps for each helicity + // (in multichannel mode, also compute the running sums over helicities of the numerators and denominators) for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } - } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) Then, in multichannel mode, also compute the running sums over helicities of the squared jamp2s within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR!
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); + } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ?
&( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1085,7 +1179,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1108,7 +1202,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1117,25 +1211,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1145,8 +1245,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1162,11 +1264,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1268,14 +1371,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h index 159826a904..50da8f60b2 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 
2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -75,17 +76,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 16; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 2; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 1; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 5; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 4; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 2; //static const int ncomb = 16; // CPPProcess::ncomb @@ -122,23 +123,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -152,34 +156,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/color_sum.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/color_sum.cc new file mode 100644 index 0000000000..8fbdb5c7fb --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/color_sum.cc @@ -0,0 +1,381 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin.
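For context: the new color_sum.cc groups all of the color algebra in one translation unit. For each event and each good helicity, the color sum adds to |M|^2 the quadratic form sum_ij conj(jamp[i]) * CF[i][j] * jamp[j] / denom[i], where CF is the real color matrix and the jamps are the QCD partial amplitudes. A minimal standalone sketch of that reduction, in plain C++ with std::complex and hypothetical names (not the generated kernel itself, which uses the fptype/cxtype types and memory accessors):

// Sketch of the per-event color sum: since CF is real, the real and
// imaginary parts of the jamps can be reduced independently.
#include <complex>
#include <cstdio>

double colorSum( const std::complex<double>* jamp, // QCD partial amplitudes, one per color flow
                 const double* cf,                 // ncolor x ncolor real color matrix (row-major)
                 const double* denom,              // per-row color denominators
                 int ncolor )
{
  double deltaME = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztempR = 0, ztempI = 0;
    for( int j = 0; j < ncolor; j++ )
    {
      ztempR += cf[i * ncolor + j] * jamp[j].real();
      ztempI += cf[i * ncolor + j] * jamp[j].imag();
    }
    deltaME += ( ztempR * jamp[i].real() + ztempI * jamp[i].imag() ) / denom[i];
  }
  return deltaME;
}

int main()
{
  const std::complex<double> jamp[1] = { { 1.5, -0.5 } }; // one color flow, as for e+ e- -> mu+ mu-
  const double cf[1] = { 1 }, denom[1] = { 1 };
  std::printf( "deltaME = %f\n", colorSum( jamp, cf, denom, 1 ) ); // 1.5^2 + 0.5^2 = 2.5
  return 0;
}

The BLAS path further below evaluates the same quadratic form as two gemm calls (Ztemp = CF * Jamps) followed by batched per-event dot products.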
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=1) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1 }; // 1-D array[1] + + // The color matrix (initialize all array elements, with ncolor=1) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { { 1 } }; // 2-D array[1][1] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each row by its denom) + template<typename T> + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA + iAMB - iBMA + BMB = AMA + BMB (the cross terms cancel since M is symmetric) + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the upper triangular part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
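A quick numerical sanity check of the rewrite quoted in the comment above, as hypothetical standalone code (2x2 example, plain doubles): for a real symmetric M the full double loop and the upper-triangular form with the factor 2 folded in give the same quadratic form, which is why the generated code can loop over j > i only.

// Sketch: full symmetric quadratic form vs the upper-triangular form with
// "2*" folded in, as in TriangularNormalizedColorMatrix above.
#include <cassert>
#include <cmath>

int main()
{
  const double m[2][2] = { { 3, 1 }, { 1, 2 } }; // real symmetric "color matrix"
  const double a[2] = { 0.7, -1.2 };             // e.g. the real parts of the jamps
  double full = 0, tri = 0;
  for( int i = 0; i < 2; i++ )
    for( int j = 0; j < 2; j++ )
      full += a[i] * m[i][j] * a[j]; // full double loop over all (i,j)
  for( int i = 0; i < 2; i++ )
  {
    double ztemp = m[i][i] * a[i]; // diagonal term
    for( int j = i + 1; j < 2; j++ )
      ztemp += 2 * m[i][j] * a[j]; // off-diagonal terms counted twice
    tri += ztemp * a[i];
  }
  assert( std::abs( full - tri ) < 1e-12 ); // the two forms agree
  return 0;
}

The same argument applied to the imaginary parts gives the AMA + BMB decomposition used for the complex jamps.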
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 
s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = 
allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] with Ztemp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use gpuBlasTgemmStridedBatched (cublasSgemmStridedBatched or its double/hip equivalents) to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e.
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/color_sum.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/diagram_boilerplate.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/diagrams.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/diagrams.h new file mode 100644 index 0000000000..f72dba0545 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/diagrams.h @@ -0,0 +1,77 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. 
Valassi (2025) for the MG5aMC CUDACPP plugin. + +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 2 *** + // Wavefunction(s) for diagram number 1 + oxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); + ixxxxx( momenta, 0., cHel[ihel][1], +1, w_fp[1], 1 ); + ixxxxx( momenta, 0., cHel[ihel][2], -1, w_fp[2], 2 ); + oxxxxx( momenta, 0., cHel[ihel][3], +1, w_fp[3], 3 ); + FFV1P0_3( w_fp[1], w_fp[0], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[4] ); + // Amplitude(s) for diagram number 1 + FFV1_0( w_fp[2], w_fp[3], w_fp[4], COUPs[ndcoup + 0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 2 *** + // Wavefunction(s) for diagram number 2 + FFV2_4_3( w_fp[1], w_fp[0], COUPs[ndcoup + 1], 1.0, COUPs[ndcoup + 2], 1.0, cIPD[0], cIPD[1], w_fp[4] ); + // Amplitude(s) for diagram number 2 + FFV2_4_0( w_fp[2], w_fp[3], w_fp[4], COUPs[ndcoup + 1], 1.0, COUPs[ndcoup + 2], 1.0,
&amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/color_sum.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/color_sum.h new file mode 100644 index 0000000000..71b5b1eaad --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/color_sum.h @@ -0,0 +1,105 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype_ref( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + static __device__ inline const cxtype + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + }; +#else + class HostAccessJamp + { + public: + static inline cxtype_sv& + kernelAccessIcol( cxtype_sv* buffer, const int
icol ) + { + return buffer[icol]; + } + static inline cxtype_sv& + kernelAccessIcol( fptype* buffer, const int icol ) + { + return reinterpret_cast<cxtype_sv*>( buffer )[icol]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.
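A minimal editorial sketch (not part of the patch; jampRealIdx and jampImagIdx are hypothetical helper names) of the "new1" jamp layout implemented by DeviceAccessJamp above: all real parts are stored first, then all imaginary parts, each as an ncolor-by-nevt block in which ievt runs fastest.

// Index helpers equivalent to the "new1" striding used by DeviceAccessJamp above
#include <cstddef>
inline std::size_t jampRealIdx( int icol, int ievt, int ncolor, int nevt )
{
  return std::size_t( 0 ) * ncolor * nevt + std::size_t( icol ) * nevt + ievt; // real part, "new1"
}
inline std::size_t jampImagIdx( int icol, int ievt, int ncolor, int nevt )
{
  return std::size_t( 1 ) * ncolor * nevt + std::size_t( icol ) * nevt + ievt; // imaginary part, "new1"
}

This layout lets the BLAS path treat the real and imaginary jamps as two contiguous column-major matrices, which is why the same striding can serve both the CUDA kernel and the cuBLAS/hipBLAS code paths.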
#=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI300X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist??
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/diagram_boilerplate.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/diagram_boilerplate.h new file mode 100644 index 0000000000..96a34fb1bf --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/diagram_boilerplate.h @@ -0,0 +1,103 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin. 
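A minimal standalone sketch (batchedDots is a hypothetical name; color_sum_blas earlier in this patch goes through the generic gpuBlasTgemmStridedBatched wrapper and uses beta=1 to accumulate over helicities) of the strided-batched GEMM trick used in "Step 2" of the BLAS color sum: nevt independent length-ncolor dot products are expressed as nevt batched 1x1 GEMMs in a single cuBLAS call.

#include <cublas_v2.h>
// d[ievt] = sum_icol a(icol,ievt) * z(icol,ievt), for ievt = 0..nevt-1
void batchedDots( cublasHandle_t handle, const float* a, const float* z, float* d, int ncolor, int nevt )
{
  const float alpha = 1.f;
  const float beta = 0.f; // the patch uses beta=1 instead, to add onto the running MEs
  // Batch ievt multiplies a 1 x ncolor row by an ncolor x 1 column:
  // - A starts at a+ievt and picks column ievt of the nevt x ncolor matrix a (lda=nevt, strideA=1)
  // - B starts at z+ievt*ncolor and picks column ievt of the ncolor x nevt matrix z (ldb=ncolor, strideB=ncolor)
  cublasStatus_t status = cublasSgemmStridedBatched( handle, CUBLAS_OP_N, CUBLAS_OP_N,
                                                     1, 1, ncolor,
                                                     &alpha,
                                                     a, nevt, 1,
                                                     z, ncolor, ncolor,
                                                     &beta,
                                                     d, 1, 1, // each 1x1 result lands in d[ievt] (ldc=1, strideC=1)
                                                     nevt );  // nevt batches
  (void)status; // the patch checks this via checkGpuBlas
}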
+ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-local-typedefs" // for CI_ACCESS and CD_ACCESS + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + + //------------- + // GPU only + //------------- + + //using namespace mg5amcGpu; + using W_ACCESS = DeviceAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = DeviceAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( channelIds ); +#endif + + // Wavefunctions + // Buffer wfs for one helicity and nevt events is a DeviceBufferSimple with ( nwf * nevt * nw6 * nx2 ) fptypes + // The striding between the nwf wavefunction buffers is ( nevt * nw6 * nx2 ) fptypes + // Internally diagramXXX methods pass a w_fp[iwf] to ixx/FFV methods (as argument 'fptype wavefunctions[]') + // Internally ixx/FFV methods call 'cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions )' and then use fi[iw6] + // This means that the fi pointer must point to a [RIRIRIRIRIRI] contiguous buffer of size nw6*nx2=12 + // The striding between events is nw6*nx2=12 and this is what W_ACCESS::kernelAccess must respect + // (En passant, note that this means that events cannot be contiguous in the present code, memory is not coalesced) + const int nevt = gridDim.x * blockDim.x; + fptype* w_fp[nwf]; + for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = wfs + iwf * nevt * nw6 * mgOnGpu::nx2; + + // Couplings + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic push +#pragma nv_diag_suppress 186 // e.g. 
<> +#endif + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic pop +#endif + const fptype* COUPs[nxcoup]; + for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; + +#else + + //------------- + // C++ only + //------------- + + //using namespace mg5amcCpu; + using W_ACCESS = HostAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = HostAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current SIMD event page (C++) + unsigned int channelId = *channelIds; +#endif + + // Wavefunctions + // Reinterpret wfs as "cxtype_sv w_sv[nwf][nw6]" and build "fptype* w_fp[nwf]" where "w_fp[iwf] = (fptype*)( w_sv[iwf] )" + fptype (*w_fp)[nw6 * neppV * mgOnGpu::nx2] = (fptype (*)[nw6 * neppV * mgOnGpu::nx2])(wfs); + +#endif + + //------------- + // GPU or C++ + //------------- + + // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) + cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram + fptype* amp_fp; // proof of concept for using fptype* in the interface + amp_fp = reinterpret_cast<fptype*>( amp_sv ); + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) + fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); +#else + assert( channelIds == nullptr ); + assert( numerators == nullptr ); + assert( denominators == nullptr ); +#endif /* clang-format on */ + +#pragma GCC diagnostic pop diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc index 4eec5db13c..216a90a302 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.
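A minimal standalone sketch (with hypothetical sizes) of the pointer-to-array cast used in the C++ branch of the diagram boilerplate above: a flat buffer is reinterpreted as a two-dimensional [nwf][stride] array without copying, so w[iwf][j] aliases buf[iwf*stride+j].

#include <cassert>
int main()
{
  constexpr int nwf = 3, stride = 8; // hypothetical counts (the patch uses nwf and nw6*neppV*nx2)
  double buf[nwf * stride] = {};
  double( *w )[stride] = reinterpret_cast<double( * )[stride]>( buf ); // no copy, just a 2D view
  w[2][5] = 1.0;
  assert( buf[2 * stride + 5] == 1.0 ); // same storage, different indexing
  return 0;
}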
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h b/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h index 18f664e0d1..a5438a65b0 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h @@ -2,13 +2,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Sep 2010) for the MG5aMC backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -860,7 +860,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ INLINE void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -872,7 +872,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ INLINE void FFV1P0_3( const fptype allF1[], const fptype allF2[], @@ -885,7 +885,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ INLINE void FFV2_0( const fptype allF1[], const fptype allF2[], @@ -897,7 +897,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ INLINE void FFV2_3( const fptype allF1[], const fptype allF2[], @@ -910,7 +910,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ INLINE void FFV4_0( const fptype allF1[], const fptype allF2[], @@ -922,7 +922,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ INLINE void FFV4_3( const fptype allF1[], const fptype allF2[], @@ -935,7 +935,7 @@ namespace mg5amcCpu 
//-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ INLINE void FFV2_4_0( const fptype allF1[], const fptype allF2[], @@ -949,7 +949,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ INLINE void FFV2_4_3( const fptype allF1[], const fptype allF2[], @@ -964,7 +964,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -977,7 +977,7 @@ namespace mg5amcCpu const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const cxtype_sv TMP0 = ( F1[2] * ( F2[4] * ( V3[2] + V3[5] ) + F2[5] * ( V3[3] + cI * V3[4] ) ) + ( F1[3] * ( F2[4] * ( V3[3] - cI * V3[4] ) + F2[5] * ( V3[2] - V3[5] ) ) + ( F1[4] * ( F2[2] * ( V3[2] - V3[5] ) - F2[3] * ( V3[3] + cI * V3[4] ) ) + F1[5] * ( F2[2] * ( -V3[3] + cI * V3[4] ) + F2[3] * ( V3[2] + V3[5] ) ) ) ) ); @@ -989,7 +989,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ void FFV1P0_3( const fptype allF1[], const fptype allF2[], @@ -1002,7 +1002,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V3 = W_ACCESS::kernelAccess( allV3 ); const cxtype cI = cxmake( 0., 1. ); V3[0] = +F1[0] + F2[0]; @@ -1020,7 +1020,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ void FFV2_0( const fptype allF1[], const fptype allF2[], @@ -1033,7 +1033,7 @@ namespace mg5amcCpu const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. 
); const cxtype_sv TMP1 = ( F1[2] * ( F2[4] * ( V3[2] + V3[5] ) + F2[5] * ( V3[3] + cI * V3[4] ) ) + F1[3] * ( F2[4] * ( V3[3] - cI * V3[4] ) + F2[5] * ( V3[2] - V3[5] ) ) ); @@ -1045,7 +1045,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ void FFV2_3( const fptype allF1[], const fptype allF2[], @@ -1058,7 +1058,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V3 = W_ACCESS::kernelAccess( allV3 ); const cxtype cI = cxmake( 0., 1. ); const fptype OM3 = ( M3 != 0. ? 1. / ( M3 * M3 ) : 0. ); @@ -1078,7 +1078,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ void FFV4_0( const fptype allF1[], const fptype allF2[], @@ -1091,7 +1091,7 @@ namespace mg5amcCpu const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); constexpr fptype one( 1. ); @@ -1106,7 +1106,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ void FFV4_3( const fptype allF1[], const fptype allF2[], @@ -1119,7 +1119,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V3 = W_ACCESS::kernelAccess( allV3 ); const cxtype cI = cxmake( 0., 1. ); const fptype OM3 = ( M3 != 0. ? 1. / ( M3 * M3 ) : 0. ); @@ -1142,7 +1142,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ void FFV2_4_0( const fptype allF1[], const fptype allF2[], @@ -1157,8 +1157,8 @@ namespace mg5amcCpu const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP1 = C_ACCESS::kernelAccessConst( allCOUP1 ); - const cxtype_sv COUP2 = C_ACCESS::kernelAccessConst( allCOUP2 ); + const cxtype_sv COUP1 = CD_ACCESS::kernelAccessConst( allCOUP1 ); + const cxtype_sv COUP2 = CD_ACCESS::kernelAccessConst( allCOUP2 ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); constexpr fptype one( 1. 
); @@ -1173,7 +1173,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ void FFV2_4_3( const fptype allF1[], const fptype allF2[], @@ -1188,8 +1188,8 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); - const cxtype_sv COUP1 = C_ACCESS::kernelAccessConst( allCOUP1 ); - const cxtype_sv COUP2 = C_ACCESS::kernelAccessConst( allCOUP2 ); + const cxtype_sv COUP1 = CD_ACCESS::kernelAccessConst( allCOUP1 ); + const cxtype_sv COUP2 = CD_ACCESS::kernelAccessConst( allCOUP2 ); cxtype_sv* V3 = W_ACCESS::kernelAccess( allV3 ); const cxtype cI = cxmake( 0., 1. ); const fptype OM3 = ( M3 != 0. ? 1. / ( M3 * M3 ) : 0. ); diff --git a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc index 37676c1d8d..68296642b5 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h index 5fcde71f6b..edabf077ce 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -283,7 +283,7 @@ namespace mg5amcCpu #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #endif // Compute the output couplings (e.g. gc10 and gc11) from the input gs - template + template __device__ inline void G2COUP( const fptype gs[], fptype couplings[], diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h index d3c4ca5695..8a2804aa04 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for computing the color sum +// For both CUDA and HIP, by default, assume that BLAS is available, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!]
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h index 92d74fd6db..e98e925f2a 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -717,12 +717,24 @@ namespace mg5amcCpu : m_preal( &r ), m_pimag( &i ) {} // copy (create from) const refs cxtype_ref& operator=( const cxtype_ref& ) = delete; //__host__ __device__ cxtype_ref& operator=( cxtype_ref&& c ) {...} // REMOVED! Should copy refs or copy values? No longer needed in cxternary - __host__ __device__ cxtype_ref& operator=( const cxtype& c ) + __host__ __device__ cxtype_ref& operator=( const cxtype& c ) // copy (assign) const values { *m_preal = cxreal( c ); *m_pimag = cximag( c ); return *this; - } // copy (assign) non-const values + } + __host__ __device__ cxtype_ref& operator+=( const cxtype& c ) + { + *m_preal += cxreal( c ); + *m_pimag += cximag( c ); + return *this; + } + __host__ __device__ cxtype_ref& operator-=( const cxtype& c ) + { + *m_preal -= cxreal( c ); + *m_pimag -= cximag( c ); + return *this; + } __host__ __device__ operator cxtype() const { return cxmake( *m_preal, *m_pimag ); } private: fptype* const m_preal; // const pointer to non-const fptype R diff --git a/epochX/cudacpp/ee_mumu.sa/test/cudacpp_test.mk b/epochX/cudacpp/ee_mumu.sa/test/cudacpp_test.mk index f703a1ae7c..1f9f8bbc46 100644 --- a/epochX/cudacpp/ee_mumu.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/ee_mumu.sa/test/cudacpp_test.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. 
+# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) @@ -19,7 +19,7 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index 453da8d298..9a3da618a4 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.3 2025-06-12 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -49,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -58,7 +58,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0065233707427978516  +DEBUG: model prefixing takes 0.005475044250488281  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
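A minimal standalone sketch (cxref is a hypothetical name) of the reference-proxy pattern behind the cxtype_ref operator+= and operator-= added above: compound assignment writes through two separate pointers, which is what lets expressions like J_ACCESS::kernelAccessIcol( jamps, icol ) -= amp update the split real/imaginary jamp storage in place.

#include <complex>
struct cxref
{
  double* preal; // points into the block of real parts
  double* pimag; // points into the block of imaginary parts
  cxref& operator+=( const std::complex<double>& c )
  {
    *preal += c.real();
    *pimag += c.imag();
    return *this;
  }
  cxref& operator-=( const std::complex<double>& c )
  {
    *preal -= c.real();
    *pimag -= c.imag();
    return *this;
  }
};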
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -156,16 +156,16 @@ Total: 1 processes with 3 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_tt --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4334]  +DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4166]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  INFO: initialize a new directory: CODEGEN_mad_gg_tt INFO: remove old information in CODEGEN_mad_gg_tt -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 @@ -177,53 +177,53 @@ FileWriter t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1665]  Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -Wrote files for 10 helas calls in 0.072 s +Wrote files for 10 helas calls in 0.074 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.144 s +ALOHA: aloha creates 2 routines in 0.142 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.132 s +ALOHA: aloha creates 4 routines in 0.127 s VVV1 FFV1 FFV1 FFV1 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. 
+FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt; patch -p4 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file SubProcesses/makefile -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/SubProcesses/P1_gg_ttx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/SubProcesses/P1_gg_ttx; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f -DEBUG: p.returncode =  0 [output.py at line 263]  -Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt done. +DEBUG: p.returncode =  0 [output.py at line 268]  +Output to directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt done. Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/README +/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/README Run "open index.html" to see more information about this process. 
quit -real 0m1.991s -user 0m1.616s -sys 0m0.275s +real 0m1.923s +user 0m1.671s +sys 0m0.251s Code generation completed in 2 seconds ************************************************************ * * @@ -237,7 +237,7 @@ Code generation completed in 2 seconds * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.3 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -245,9 +245,9 @@ Code generation completed in 2 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt @@ -267,7 +267,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.3 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -275,9 +275,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt diff --git a/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt index 68b4c46295..07d8d59d1b 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat index 66598786f5..8e1283453a 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.6.0 2024-09-30 * +#* VERSION 3.6.3 2025-06-12 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat b/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat index 6b82577032..000832aacd 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat +++ b/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat b/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat index b8db871c35..85e1d39035 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! 
maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/gg_tt.mad/MGMEVersion.txt b/epochX/cudacpp/gg_tt.mad/MGMEVersion.txt index 084e244cea..1ac53bb4bd 100644 --- a/epochX/cudacpp/gg_tt.mad/MGMEVersion.txt +++ b/epochX/cudacpp/gg_tt.mad/MGMEVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.3 \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.mad/Source/.make_opts b/epochX/cudacpp/gg_tt.mad/Source/.make_opts index de3864242b..56ba259c56 100644 --- a/epochX/cudacpp/gg_tt.mad/Source/.make_opts +++ b/epochX/cudacpp/gg_tt.mad/Source/.make_opts @@ -102,6 +102,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf + alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -113,10 +114,11 @@ ifneq ($(lhapdf),) endif else alfas_functions=alfas_functions + alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif # Helper function to check MG5 version define CHECK_MG5AMC_VERSION python -c 'import re; from distutils.version import StrictVersion; print StrictVersion("$(MG5AMC_VERSION)") >= StrictVersion("$(1)") if re.match("^[\d\.]+$$","$(MG5AMC_VERSION)") else True;' -endef \ No newline at end of file +endef diff --git a/epochX/cudacpp/gg_tt.mad/Source/alfas_functions.f b/epochX/cudacpp/gg_tt.mad/Source/alfas_functions.f index bb69a6384e..84aeff369c 100644 --- a/epochX/cudacpp/gg_tt.mad/Source/alfas_functions.f +++ b/epochX/cudacpp/gg_tt.mad/Source/alfas_functions.f @@ -188,6 +188,10 @@ SUBROUTINE NEWTON1(T,A_IN,A_OUT,NLOOP,NF) A_OUT=A_IN/(1D0+A_IN*B0(NF)*T) IF (NLOOP .EQ. 1) RETURN + if (1D0+A_IN*B0(NF)*T.le.0d0)THEN + A_OUT = 9d98 + RETURN + ENDIF A_OUT=A_IN/(1D0+B0(NF)*A_IN*T+C1(NF)*A_IN*LOG(1D0+A_IN*B0(NF)*T)) IF (A_OUT .LT. 
0D0) AS=0.3D0 30 AS=A_OUT diff --git a/epochX/cudacpp/gg_tt.mad/Source/cuts.inc b/epochX/cudacpp/gg_tt.mad/Source/cuts.inc index 23d099e5f7..a8ccc7420d 100644 --- a/epochX/cudacpp/gg_tt.mad/Source/cuts.inc +++ b/epochX/cudacpp/gg_tt.mad/Source/cuts.inc @@ -37,7 +37,7 @@ C REAL*8 misset,missetmax,ptheavy REAL*8 ptllmin,ptllmax integer maxjetflavor - REAl*8 dsqrt_shat + REAl*8 dsqrt_shat,dsqrt_shatmax COMMON /to_min_max_cuts/ & PTJmax,PTBmax,PTAmax,PTLmax, @@ -60,7 +60,7 @@ C & ht2max,ht3max,ht4max, & htjmin,htjmax,ihtmin,ihtmax, & misset,missetmax,ptheavy, - & ptllmin,ptllmax,dsqrt_shat, + & ptllmin,ptllmax,dsqrt_shat,dsqrt_shatmax, & maxjetflavor C diff --git a/epochX/cudacpp/gg_tt.mad/Source/make_opts b/epochX/cudacpp/gg_tt.mad/Source/make_opts index e4b87ee6ad..f10336e42e 100644 --- a/epochX/cudacpp/gg_tt.mad/Source/make_opts +++ b/epochX/cudacpp/gg_tt.mad/Source/make_opts @@ -103,6 +103,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf +alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -114,6 +115,7 @@ endif endif else alfas_functions=alfas_functions +alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif diff --git a/epochX/cudacpp/gg_tt.mad/Source/makefile b/epochX/cudacpp/gg_tt.mad/Source/makefile index 291ca907ee..87a9e61723 100644 --- a/epochX/cudacpp/gg_tt.mad/Source/makefile +++ b/epochX/cudacpp/gg_tt.mad/Source/makefile @@ -37,10 +37,12 @@ all: $(LIBRARIES) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDI $(LIBDIR)libdsample.$(libext): $(DSAMPLE) $(call CREATELIB, $@, $^) $(LIBDIR)libgeneric.$(libext): $(GENERIC) + rm -f $@ 2>/dev/null $(call CREATELIB, $@, $^) + rm -f $(alfas_to_clean) 2>/dev/null $(LIBDIR)libdhelas.$(libext): DHELAS cd DHELAS; make; cd .. -$(LIBDIR)libpdf.$(libext): PDF make_opts +$(LIBDIR)libpdf.$(libext): PDF $(alfas_functions).o cd PDF; make; cd .. ifneq (,$(filter edff chff, $(pdlabel1) $(pdlabel2))) $(LIBDIR)libgammaUPC.$(libext): PDF/gammaUPC @@ -73,6 +75,7 @@ $(BINDIR)gensudgrid: $(GENSUDGRID) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUP # Dependencies dsample.o: DiscreteSampler.o dsample.f genps.inc StringCast.o vector.inc +pawgraph.o: vector.inc DiscreteSampler.o: StringCast.o invarients.o: invarients.f genps.inc gen_ximprove.o: gen_ximprove.f run_config.inc run_card.inc diff --git a/epochX/cudacpp/gg_tt.mad/Source/run_card.inc b/epochX/cudacpp/gg_tt.mad/Source/run_card.inc index 1a1bc782bd..8bd5f73840 100644 --- a/epochX/cudacpp/gg_tt.mad/Source/run_card.inc +++ b/epochX/cudacpp/gg_tt.mad/Source/run_card.inc @@ -88,6 +88,8 @@ DSQRT_SHAT = 0.000000000000000D+00 + DSQRT_SHATMAX = -1 + LIMHEL = 0.000000000000000D+00 PTJ = 2.000000000000000D+01 diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h index 87aa648dd2..a45024704a 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. 
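The NEWTON1 hunk in alfas_functions.f above guards the two-loop alpha_s step: when 1+A_IN*B0(NF)*T is not positive, the LOG in the two-loop formula would be evaluated on a non-positive argument, so the routine now returns the huge sentinel 9d98 instead. A minimal standalone C++ sketch of the same guard (function and parameter names are illustrative, not the plugin's API; b0 and c1 stand in for B0(NF) and C1(NF)):

#include <cmath>
#include <cstdio>

// One step of the alpha_s Newton iteration, with the guard added above
double newton1Step( double t, double aIn, double b0, double c1, int nloop )
{
  double aOut = aIn / ( 1. + aIn * b0 * t ); // one-loop value
  if( nloop == 1 ) return aOut;
  if( 1. + aIn * b0 * t <= 0. ) return 9e98; // guard: log() argument would be <= 0
  return aIn / ( 1. + b0 * aIn * t + c1 * aIn * std::log( 1. + aIn * b0 * t ) ); // two-loop value
}

int main()
{
  printf( "a_out = %g\n", newton1Step( 5., 0.1, 0.7, 0.4, 2 ) );
  return 0;
}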
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -255,11 +255,15 @@ namespace mg5amcCpu throw std::logic_error( "Bridge constructor: FIXME! cannot choose gputhreads" ); // this should never happen! m_gpublocks = m_nevt / m_gputhreads; } +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters @@ -290,8 +294,10 @@ namespace mg5amcCpu throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -347,7 +353,9 @@ namespace mg5amcCpu if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> ) @@ -400,7 +408,9 @@ namespace mg5amcCpu } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.
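The Bridge.h hunks above wrap the "WARNING! Instantiate ..." printouts and the flagAbnormalMEs check in #ifdef MGONGPUCPP_VERBOSE, so they compile away entirely in default builds. A standalone sketch of this compile-time gate (toy function, not the plugin's code):

#include <iostream>

void instantiateHostBridge( int nevt )
{
#ifdef MGONGPUCPP_VERBOSE
  std::cout << "WARNING! Instantiate host Bridge (nevt=" << nevt << ")" << std::endl;
#endif
  // ... kernel and buffer setup would go here ...
}

int main()
{
  instantiateHostBridge( 16 ); // silent unless compiled with -DMGONGPUCPP_VERBOSE
  return 0;
}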
#ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include <cassert> //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1,
type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin.
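GpuAbstraction.h above maps the cuBLAS and hipBLAS entry points onto common gpuBlas* names, then aliases the precision-agnostic gpuBlasT* names to the S (float) or D (double) variants according to MGONGPU_FPTYPE2_FLOAT, so each call site is written once. A toy illustration of the same aliasing in plain C++ (stand-in functions, no GPU or BLAS library required):

#include <cstdio>

// Stand-ins for cublasSgemm/hipblasSgemm and cublasDgemm/hipblasDgemm
void myBlasSgemm() { printf( "single-precision gemm\n" ); }
void myBlasDgemm() { printf( "double-precision gemm\n" ); }

// Same selection mechanism as the gpuBlasT* aliases above
#ifdef MGONGPU_FPTYPE2_FLOAT
#define myBlasTgemm myBlasSgemm
#else
#define myBlasTgemm myBlasDgemm
#endif

int main()
{
  myBlasTgemm(); // one precision-agnostic call site; build with -DMGONGPU_FPTYPE2_FLOAT to switch
  return 0;
}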
#ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/gg_tt.mad/SubProcesses/MGVersion.txt index 084e244cea..1ac53bb4bd 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.3 \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc index f463977c1a..a68ae314eb 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
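The checkGpuBlas/assertGpuBlas wrapper added to GpuRuntime.h above turns a non-SUCCESS BLAS status into a printed file:line location plus an assert. A self-contained sketch of the same status-check pattern (a toy status type stands in for cublasStatus_t/hipblasStatus_t):

#include <cassert>
#include <cstdio>

typedef int myBlasStatus_t; // toy stand-in for the real BLAS status type
const myBlasStatus_t MYBLAS_STATUS_SUCCESS = 0;

#define checkMyBlas( code ) { assertMyBlas( code, __FILE__, __LINE__ ); }
inline void assertMyBlas( myBlasStatus_t code, const char* file, int line, bool abort = true )
{
  if( code != MYBLAS_STATUS_SUCCESS )
  {
    printf( "ERROR! assertMyBlas: '%d' in %s:%d\n", code, file, line ); // same message shape as assertGpuBlas
    if( abort ) assert( code == MYBLAS_STATUS_SUCCESS );
  }
}

int main()
{
  checkMyBlas( MYBLAS_STATUS_SUCCESS ); // passes silently; a nonzero status would print and assert
  return 0;
}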
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,28 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() + , m_pHelWfs() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_helBlasHandles() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +353,82 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); + // Create the "one-helicity" wavefunction buffer that will be used for helicity filtering + m_pHelWfs.reset( new DeviceBufferSimple( CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? 
+ std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { +#ifndef MGONGPU_HAS_NO_BLAS + if( m_helBlasHandles[ihel] ) gpuBlasDestroy( m_helBlasHandles[ihel] ); +#endif + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +445,61 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. 
Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity + // Attach a different stream to each cuBLAS/hipBLAS handle + if( m_blasColorSum ) + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + checkGpuBlas( gpuBlasCreate( &m_helBlasHandles[ighel] ) ); + checkGpuBlas( gpuBlasSetStream( m_helBlasHandles[ighel], m_helStreams[ighel] ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_helBlasHandles[ighel], CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelWfs.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +507,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* ghelBlasHandles = ( m_blasColorSum ? m_helBlasHandles : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* ghelBlasHandles = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +527,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? 
m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h index 7acff4b308..c901874333 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include <cassert> +#include <memory> #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] - static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,24 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelJamps; + + // The super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelWfs; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +220,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr<DeviceBufferSimple2> m_pHelBlasTmp; + + // The array of cuBLAS/hipBLAS handles (one for each good helicity) + gpuBlasHandle_t m_helBlasHandles[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessAmplitudes.h index 0d92f69c43..a49f041e05 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessAmplitudes_H #define MemoryAccessAmplitudes_H 1 @@ -10,10 +10,6 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_AMPLITUDES 1 - // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -23,120 +19,11 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // A class describing the internal layout of memory buffers for amplitudes - // This implementation uses an AOSOA[npagA][nx2][neppA] where nevt=npagA*neppA - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessAmplitudesBase //_AOSOAv1 - { - public: - - // Number of Events Per Page in the amplitude AOSOA memory buffer layout - static constexpr int neppA = 1; // AOS (just a test...)
- - private: - - friend class MemoryAccessHelper<MemoryAccessAmplitudesBase>; - friend class KernelAccessHelper<MemoryAccessAmplitudesBase, true>; - friend class KernelAccessHelper<MemoryAccessAmplitudesBase, false>; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) - { - const int ipagA = ievt / neppA; // #event "A-page" - const int ieppA = ievt % neppA; // #event in the current event A-page - constexpr int ix2 = 0; - return &( buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA] ); // AOSOA[ipagA][ix2][ieppA] - } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... args" to "const int ix2" and rename "Field" as "Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int ix2 ) - { - constexpr int ipagA = 0; - constexpr int ieppA = 0; - return buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA]; // AOSOA[ipagA][ix2][ieppA] - } - }; - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessAmplitudes : public MemoryAccessAmplitudesBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper<MemoryAccessAmplitudesBase>::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper<MemoryAccessAmplitudesBase>::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2 = MemoryAccessHelper<MemoryAccessAmplitudesBase>::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2Const = - MemoryAccessHelper<MemoryAccessAmplitudesBase>::template decodeRecordConst<int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt,
const int ix2 ) <===] - static constexpr auto ieventAccessIx2 = - MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessField<int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===] - static constexpr auto ieventAccessIx2Const = - MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessFieldConst<int>; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations + // A class providing trivial access to amplitude memory buffers template<bool onDevice> class KernelAccessAmplitudes { public: - -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2 = - KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessField<int>; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2Const = - KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessFieldConst<int>; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { @@ -148,8 +35,6 @@ namespace mg5amcCpu { return reinterpret_cast<const cxtype_sv*>( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES }; //---------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplings.h index 55504a2b90..caee99a7fd 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplings.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
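After the cleanup above, KernelAccessAmplitudes keeps only the trivial access path: an amplitude buffer of interleaved (re,im) fptype pairs is reinterpreted in place as an array of complex values. A plain C++ toy of the same reinterpretation (std::complex<double> stands in for the plugin's cxtype_sv):

#include <complex>
#include <cstdio>

int main()
{
  double buffer[4] = { 1., 2., 3., 4. }; // two amplitudes stored as interleaved (re,im) pairs
  std::complex<double>* amps = reinterpret_cast<std::complex<double>*>( buffer ); // the trivial access
  printf( "amp[1] = (%f, %f)\n", amps[1].real(), amps[1].imag() ); // prints (3.000000, 4.000000)
  return 0;
}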
#ifndef MemoryAccessCouplings_H #define MemoryAccessCouplings_H 1 @@ -235,7 +235,7 @@ namespace mg5amcCpu /* fptype_sv& real = kernelAccessIx2( buffer, 0 ); fptype_sv& imag = kernelAccessIx2( buffer, 1 ); - printf( "C_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv_ref( real, imag ); */ return cxtype_sv_ref( kernelAccessIx2( buffer, 0 ), @@ -250,7 +250,7 @@ namespace mg5amcCpu /* const fptype_sv& real = kernelAccessIx2Const( buffer, 0 ); const fptype_sv& imag = kernelAccessIx2Const( buffer, 1 ); - printf( "C_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv( real, imag ); */ return cxtype_sv( kernelAccessIx2Const( buffer, 0 ), diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessWavefunctions.h index 9f4c620bc7..bbffc1fb36 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessWavefunctions_H #define MemoryAccessWavefunctions_H 1 @@ -10,9 +10,7 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 +#include "CPPProcess.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL @@ -23,147 +21,44 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // A class describing the internal layout of memory buffers for wavefunctions - // This implementation uses an AOSOA[npagW][nw6][nx2][neppW] where nevt=npagW*neppW - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessWavefunctionsBase //_AOSOAv1 +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessWavefunctions { public: - - // Number of Events Per Page in the wavefunction AOSOA memory buffer layout - static constexpr int neppW = 1; // AOS (just a test...) 
- - private: - - friend class MemoryAccessHelper<MemoryAccessWavefunctionsBase>; - friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, true>; - friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, false>; - - // The number of components of a (fermion or vector) wavefunction - static constexpr int nw6 = mgOnGpu::nw6; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) + static __host__ __device__ inline cxtype_sv* + kernelAccess( fptype* buffer ) { - const int ipagW = ievt / neppW; // #event "W-page" - const int ieppW = ievt % neppW; // #event in the current event W-page - constexpr int iw6 = 0; - constexpr int ix2 = 0; - return &( buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW] ); // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast<cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts...
args" to "const int iw6, const int ix2" and rename "Field" as "Iw6Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int iw6, - const int ix2 ) + static __host__ __device__ inline const cxtype_sv* + kernelAccessConst( const fptype* buffer ) { - constexpr int ipagW = 0; - constexpr int ieppW = 0; - return buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW]; // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast<const cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } }; +#endif //---------------------------------------------------------------------------- - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessWavefunctions : public MemoryAccessWavefunctionsBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2 = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2Const = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template decodeRecordConst<int, int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIw6Ix2( fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2 = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessField<int, int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIw6Ix2Const( const fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2Const = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessFieldConst<int, int>; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations - template<bool onDevice> - class KernelAccessWavefunctions + class HostAccessWavefunctions { public: - -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes
(input) - // [Signature (non-const) ===> fptype& kernelAccessIw6Ix2( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2 = - KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessField<int, int>; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIw6Ix2Const( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2Const = - KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessFieldConst<int, int>; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { return reinterpret_cast<cxtype_sv*>( buffer ); } - static __host__ __device__ inline const cxtype_sv* kernelAccessConst( const fptype* buffer ) { return reinterpret_cast<const cxtype_sv*>( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS }; //---------------------------------------------------------------------------- - typedef KernelAccessWavefunctions<false> HostAccessWavefunctions; - typedef KernelAccessWavefunctions<true> DeviceAccessWavefunctions; - - //---------------------------------------------------------------------------- - } // end namespace mg5amcGpu/mg5amcCpu #endif // MemoryAccessWavefunctions_H diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h index 65a101888d..c8db607db6 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
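The new DeviceAccessWavefunctions above computes a per-thread offset, buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2, i.e. an AOS layout with one nw6-component complex wavefunction per event, and the per-helicity "super-buffers" allocated in MatrixElementKernels.cc stack nGoodHel such blocks one after another. A host-side sketch of this offset arithmetic (nw6=6 and nx2=2 are the usual cudacpp values, assumed here for illustration; the ighel-major stacking is my reading of the allocation sizes, not a quote from the plugin):

#include <complex>
#include <cstdio>
#include <vector>

int main()
{
  const int nw6 = 6, nx2 = 2, nevt = 4, nGoodHel = 2;
  // One "super-buffer": nGoodHel blocks, each holding nevt wavefunctions of nw6 complex components
  std::vector<double> helWfs( nGoodHel * nevt * nw6 * nx2, 0. );
  const int ighel = 1, ievt = 2; // on the GPU, ievt would be blockDim.x * blockIdx.x + threadIdx.x
  double* block = helWfs.data() + ighel * nevt * nw6 * nx2; // select the per-helicity block
  std::complex<double>* wf = reinterpret_cast<std::complex<double>*>( block + ievt * nw6 * nx2 ); // per-event offset
  wf[0] = { 1., -1. }; // first component of this event's wavefunction
  printf( "wf[0] = (%f, %f)\n", wf[0].real(), wf[0].imag() );
  return 0;
}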
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer 
PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index b32f4b931e..ede8c7f653 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,20 +98,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 2; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,57 +166,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the 
current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - 
using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -226,297 +279,139 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 5; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
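[Editorial sketch, not part of the patch: the non-trivial device accessors above (e.g. DeviceAccessJamp2 earlier, with buffer[icol * nevt + ievt], and the per-event offsets computed from blockIdx/threadIdx) all follow the same transposed, structure-of-arrays convention, chosen so that consecutive GPU threads (consecutive ievt) touch consecutive addresses and global-memory accesses coalesce. A minimal CUDA illustration with a hypothetical kernel name, not a plugin function:]

// Element (icol, ievt) of a device super-buffer lives at buffer[icol * nevt + ievt]:
// adjacent threads then read/write adjacent addresses, so accesses coalesce.
__global__ void scaleAllJamp2s( double* buffer, const int ncol, const double factor )
{
  const int nevt = gridDim.x * blockDim.x; // one GPU thread per event
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  for( int icol = 0; icol < ncol; icol++ )
    buffer[icol * nevt + ievt] *= factor;
}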
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g.
<> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - // *** DIAGRAM 1 OF 
3 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[4] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 2 OF 3 *** - - // Wavefunction(s) for diagram number 2 - FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[4] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[3], w_fp[4], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[0] -= amp_sv[0]; - // *** DIAGRAM 3 OF 3 *** - - // Wavefunction(s) for diagram number 3 - FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[4] ); - - // Amplitude(s) for diagram number 3 - FFV1_0( w_fp[4], w_fp[2], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_ttx()?)
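[Editorial sketch, not part of the patch: the color matrix block removed below computes |M|^2 for one helicity as the quadratic form sum_{i,j} conj(jamp[i]) * cf[i][j]/denom[i] * jamp[j] over the ncolor leading-color flows; since cf is real and symmetric, only the Re*cf*Re + Im*cf*Im terms survive. A standalone illustration of the same arithmetic, using the ncolor=2 denom/cf values quoted below and hypothetical jamp values:]

#include <complex>
#include <cstdio>
int main()
{
  const int ncolor = 2;
  const double denom[ncolor] = { 3, 3 };
  const double cf[ncolor][ncolor] = { { 16, -2 }, { -2, 16 } };
  const std::complex<double> jamp[ncolor] = { { 0., 1. }, { -1., 0.5 } }; // hypothetical amplitudes
  double me2 = 0; // color-summed |M|^2 for this helicity
  for( int icol = 0; icol < ncolor; icol++ )
  {
    double ztR = 0, ztI = 0; // row icol of cf times Re(jamp) and Im(jamp)
    for( int jcol = 0; jcol < ncolor; jcol++ )
    {
      ztR += cf[icol][jcol] * jamp[jcol].real();
      ztI += cf[icol][jcol] * jamp[jcol].imag();
    }
    me2 += ( ztR * jamp[icol].real() + ztI * jamp[icol].imag() ) / denom[icol];
  }
  printf( "|M|^2 (color-summed) = %f\n", me2 );
  return 0;
}

[This mirrors the "CUDA START/END" branch being deleted; the C++ branch instead pre-normalizes a triangular cf/denom matrix at compile time and loops only over the upper triangle.]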
- - // The color denominators (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3 }; // 1-D array[2] - - // The color matrix (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 16, -2 }, - { -2, 16 } }; // 2-D array[2][2] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... 
icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - 
// NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 3 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** return; } @@ -555,7 +450,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -588,6 +487,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -628,6 +531,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -730,26 +637,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const
int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -757,25 +664,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0)
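[Editorial sketch, not part of the patch: the kernel body that follows first completes the running sum of per-helicity |M|^2 contributions into allMEs[ievt], then samples one good helicity by inverse-CDF: the first helicity whose cumulative fraction of the total exceeds the random number in [0,1) is selected. A host-side illustration with a hypothetical helper, using plain doubles instead of device buffers:]

#include <cassert>
// Pick a helicity index in [0, nGoodHel-1] given per-helicity |M|^2 values and rndhel in [0,1)
int selectHel( const double* meByHel, const int nGoodHel, const double rndhel )
{
  double total = 0;
  for( int ighel = 0; ighel < nGoodHel; ighel++ ) total += meByHel[ighel];
  double running = 0;
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
  {
    running += meByHel[ighel];
    if( rndhel < running / total ) return ighel; // first helicity past the cumulative threshold
  }
  return nGoodHel - 1; // unreachable for rndhel < 1
}
int main()
{
  const double mes[3] = { 1., 2., 1. }; // cumulative fractions: 0.25, 0.75, 1.0
  assert( selectHel( mes, 3, 0.5 ) == 1 );
  return 0;
}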
+ atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -913,20 +1024,14 @@ namespace mg5amcCpu // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -938,17 +1043,20 @@ #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype )
); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -974,93 +1082,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // (1b) Then, In multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } - } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 
(fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) 
+#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1102,7 +1180,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1125,7 +1203,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1134,25 +1212,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1162,8 +1246,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1179,11 +1265,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1285,14 +1372,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h index feff1cc6e1..a55660afd2 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and 
contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -75,17 +76,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 16; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 3; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 2; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 5; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 4; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 3; //static const int ncomb = 16; // CPPProcess::ncomb @@ -122,23 +123,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -152,34 +156,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f index bc9bcfeb9b..ce175a75a8 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f index db3c284caa..bd3d520785 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -137,14 +137,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -219,7 +219,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -290,6 +290,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -373,12 +377,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -484,6 +488,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.cc new file mode 100644 index 0000000000..b96d73fb5c --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.cc @@ -0,0 +1,383 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
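Before the body of the new file: color_sum.cc factors the color algebra out of the generated CPPProcess code. For one helicity it computes |M|^2 = sum_{i,j} conj(J_i) (C_ij / d_i) J_j from the color-ordered partial amplitudes J. A minimal standalone sketch of that reduction, using the ncolor=2 matrix and denominators defined just below (an illustration, not the plugin code):

```cpp
#include <array>
#include <complex>

// Sketch of the color-sum reduction for gg -> ttbar (ncolor = 2): jamp holds
// the two color-ordered partial amplitudes for one event and one helicity.
double colorSum( const std::array<std::complex<double>, 2>& jamp )
{
  constexpr double colorDenom[2] = { 3, 3 };
  constexpr double colorMatrix[2][2] = { { 16, -2 }, { -2, 16 } };
  double me2 = 0;
  for( int icol = 0; icol < 2; icol++ )
  {
    std::complex<double> ztemp = 0;
    for( int jcol = 0; jcol < 2; jcol++ )
      ztemp += colorMatrix[icol][jcol] * jamp[jcol];
    me2 += ( std::conj( jamp[icol] ) * ztemp ).real() / colorDenom[icol];
  }
  return me2; // contribution of this helicity to the running |M|^2 sum
}
```

This is the same quadratic form that MATRIX1 in matrix1.f evaluates with its triangular integer CF array further down in this diff.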
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3 }; // 1-D array[2] + + // The color matrix (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 16, -2 }, + { -2, 16 } }; // 2-D array[2][2] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 
s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = 
allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use gpuBlasTgemmStridedBatched (cublasSgemmStridedBatched or its double/hip equivalent) to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e.
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/diagram_boilerplate.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/diagrams.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/diagrams.h new file mode 100644 index 0000000000..962978409f --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/diagrams.h @@ -0,0 +1,109 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
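A note before the generated kernels: each diagramN below computes the amplitude of one Feynman diagram and accumulates it into the color-ordered partial amplitudes jamp[0..1], reusing the wavefunctions that earlier diagrams left in the shared w_fp buffer, so the three kernels must be executed in order (on the same stream in the GPU case). A standalone illustration of the accumulation they perform for gg -> ttbar (not the plugin code; amp1..amp3 stand for the three diagram amplitudes):

```cpp
#include <complex>

// Net effect of diagram1/diagram2/diagram3 on the two jamps, starting from a
// zeroed buffer (see the J_ACCESS::kernelAccessIcol lines in the kernels below):
void accumulateJamps( std::complex<double> jamp[2],
                      const std::complex<double>& amp1,
                      const std::complex<double>& amp2,
                      const std::complex<double>& amp3 )
{
  const std::complex<double> I( 0.0, 1.0 );
  jamp[0] = I * amp1 - amp2;  // diagram1 adds +i*amp, diagram2 subtracts amp
  jamp[1] = -I * amp1 - amp3; // diagram1 adds -i*amp, diagram3 subtracts amp
}
```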
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb-1) + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 3 *** + // Wavefunction(s) for diagram number 1 + vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); + vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[4] ); + // Amplitude(s) for diagram number 1 + FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 3 *** + // Wavefunction(s) for diagram number 2 + FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[4] ); + // Amplitude(s) for diagram number 2 + FFV1_0( w_fp[3], w_fp[4], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef
MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 3 *** + // Wavefunction(s) for diagram number 3 + FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[4] ); + // Amplitude(s) for diagram number 3 + FFV1_0( w_fp[4], w_fp[2], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f index ec5722702a..c08048ad0e 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f index 707ea40323..97656450ad 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -307,7 +307,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -350,7 +350,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(0) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -393,21 +394,24 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 2) /5.333333333333333D+00, - $ -6.666666666666666D-01/ + DATA DENOM/3/ + DATA (CF(I),I= 1, 2) /16,-4/ C 1 T(1,2,3,4) - DATA (CF(I, 2),I= 1, 2) /-6.666666666666666D-01 - $ ,5.333333333333333D+00/ + DATA (CF(I),I= 3, 3) /16/ C 1 T(2,1,3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -446,10 +450,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -458,6 +464,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/addmothers.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/addmothers.f index 9a31ed201d..6f32477b9e 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/addmothers.f @@ -21,7 +21,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, integer icol ! color selected integer isym(nexternal,99), jsym - integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,nc,ic + integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,ic integer mo_color,da_color(2),itmp integer ito(-nexternal+3:nexternal),iseed,maxcolor,maxorg integer icolalt(2,-nexternal+2:2*nexternal-3) @@ -113,14 +113,15 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif lconfig = vec_igraph1(ivec) endif - + is_LC=.true. + maxcolor=0 c c Choose a color flow which is certain to work with the propagator c structure of the chosen diagram and use that as an alternative c if (icol.eq.0) then do i=1,nexternal - icolalt(1,i)=0 + icolalt(1,i)=0 icolalt(2,i)=0 enddo else diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cluster.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/cluster.f index b8995283ed..907894ea89 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cluster.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cluster.f @@ -556,6 +556,8 @@ logical function cluster(p, ivec) jwin = 0 cluster=.false. clustered=.false. + iwin =0 + jwin =0 do i=0,3 pcmsp(i)=0 enddo @@ -665,8 +667,11 @@ logical function cluster(p, ivec) c initialize graph storage igraphs(0)=0 nleft=nexternal -c cluster - if (iwin.eq.0.or.jwin.eq.0) stop 21 + if(iwin.eq.0.or.jwin.eq.0)then + cluster=.false. 
+ return + endif +c cluster do n=1,nexternal-2 c combine winner imocl(n)=imap(iwin,2)+imap(jwin,2) diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/color_sum.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/color_sum.h new file mode 100644 index 0000000000..71b5b1eaad --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/color_sum.h @@ -0,0 +1,105 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype_ref( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + static __device__ inline const cxtype + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + }; +#else + class HostAccessJamp + { + public: + static inline cxtype_sv& + kernelAccessIcol( cxtype_sv* buffer, const int icol ) + { + return buffer[icol]; + } + static inline cxtype_sv& + kernelAccessIcol( fptype* buffer, const int icol ) + { + return reinterpret_cast<cxtype_sv*>( buffer )[icol]; + } + }; +#endif + +
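A note on the striding variants discussed in the comments above: with the retained "new1" layout, the per-helicity jamp buffer is two contiguous ncolor*nevt blocks (all real parts first, then all imaginary parts), with ievt running fastest inside each block, so cuBLAS can view each block as a column-major nevt x ncolor matrix. A small self-contained check of the index arithmetic (illustrative only):

```cpp
#include <cassert>

// "new1" striding as in DeviceAccessJamp above: ix2 selects the real (0) or
// imaginary (1) block; within a block, icol selects the column and ievt runs
// fastest.
inline int jampIndexNew1( int ix2, int icol, int ievt, int ncolor, int nevt )
{
  return ix2 * ncolor * nevt + icol * nevt + ievt;
}

int main()
{
  const int ncolor = 2, nevt = 4;
  assert( jampIndexNew1( 0, 1, 2, ncolor, nevt ) == 6 );  // real part of (icol=1, ievt=2)
  assert( jampIndexNew1( 1, 1, 2, ncolor, nevt ) == 14 ); // imaginary part of the same element
  return 0;
}
```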
//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
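The cudacpp.mk changes that follow add a build-time HASBLAS switch (hasBlas/hasNoBlas, e.g. `make HASBLAS=hasNoBlas`) and link cuBLAS/hipBLAS into the GPU targets; the sigmaKin comments earlier also mention a CUDACPP_RUNTIME_BLASCOLORSUM environment variable that decides at run time whether BLAS handles are created at all. The wiring of that variable is outside this diff, so the following is only a hypothetical sketch of such a gate:

```cpp
#include <cstdlib>

// Hypothetical runtime gate (assumed, not shown in this diff): create the
// cuBLAS/hipBLAS handles only when the environment variable is set; when the
// handle pointer stays null, color_sum_gpu falls back to the plain GPU kernel,
// as the asserts in sigmaKin and color_sum_gpu indicate.
inline bool useBlasColorSum()
{
#ifdef MGONGPU_HAS_NO_BLAS
  return false; // HASBLAS=hasNoBlas builds never use BLAS
#else
  return std::getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ) != nullptr;
#endif
}
```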
#=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cuts.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/cuts.f index 7898714201..bd50ab1357 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cuts.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cuts.f @@ -307,12 +307,18 @@ LOGICAL FUNCTION PASSCUTS(P, VECSIZE_USED) c c Limit S_hat c - if (dsqrt_shat.ne.0d0)then - if (nincoming.eq.2.and.sumdot(p(0,1),p(0,2),1d0) .lt. dsqrt_shat**2) then - passcuts=.false. - return - endif - endif + if(nincoming.eq.2) then + if (dsqrt_shat.ne.0d0.or.dsqrt_shatmax.ne.-1d0)then + xvar = sumdot(p(0,1),p(0,2),1d0) + if (xvar .lt. dsqrt_shat**2)then + passcuts=.false. 
+ return + else if (dsqrt_shatmax.ne.-1d0 .and. xvar .gt. dsqrt_shatmax**2)then + passcuts = .false. + return + endif + endif + endif C $B$ DESACTIVATE_CUT $E$ !This is a tag for MadWeight if(debug) write (*,*) '=============================' diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/diagram_boilerplate.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/diagram_boilerplate.h new file mode 100644 index 0000000000..96a34fb1bf --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/diagram_boilerplate.h @@ -0,0 +1,103 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin. + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-local-typedefs" // for CI_ACCESS and CD_ACCESS + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + + //------------- + // GPU only + //------------- + + //using namespace mg5amcGpu; + using W_ACCESS = DeviceAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = DeviceAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( channelIds ); +#endif + + // Wavefunctions + // Buffer wfs for one helicity and nevt events is a DeviceBufferSimple with ( nwf * nevt * nw6 * nx2 ) fptypes + // The striding between the nwf wavefunction buffers is ( nevt * nw6 * nx2 ) fptypes + // Internally diagramXXX methods pass a w_fp[iwf] to ixx/FFV methods (as argument 'fptype wavefunctions[]') + // Internally ixx/FFV methods call 'cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions )' and then use fi[iw6] + // This means that the fi pointer must point to a [RIRIRIRIRIRI] contiguous buffer of size nw6*nx2=12 + // The striding between events is nw6*nx2=12 and this is what W_ACCESS::kernelAccess must respect + // (En passant, note that this means that events cannot be contiguous in the present code, memory is not coalesced) + const int nevt = gridDim.x * blockDim.x; + fptype* w_fp[nwf]; + for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = wfs + iwf * nevt * nw6 * mgOnGpu::nx2; + + // Couplings + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic push +#pragma nv_diag_suppress 186 // e.g. 
<<warning #186-D: pointless comparison of unsigned integer with zero>> +#endif + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic pop +#endif + const fptype* COUPs[nxcoup]; + for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; + +#else + + //------------- + // C++ only + //------------- + + //using namespace mg5amcCpu; + using W_ACCESS = HostAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = HostAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current SIMD event page (C++) + unsigned int channelId = *channelIds; +#endif + + // Wavefunctions + // Reinterpret wfs as "cxtype_sv w_sv[nwf][nw6]" and build "fptype* w_fp[nwf]" where "w_fp[iwf] = (fptype*)( w_sv[iwf] )" + fptype (*w_fp)[nw6 * neppV * mgOnGpu::nx2] = (fptype (*)[nw6 * neppV * mgOnGpu::nx2])(wfs); + +#endif + + //------------- + // GPU or C++ + //------------- + + // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) + cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram + fptype* amp_fp; // proof of concept for using fptype* in the interface + amp_fp = reinterpret_cast<fptype*>( amp_sv ); + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) + fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); +#else + assert( channelIds == nullptr ); + assert( numerators == nullptr ); + assert( denominators == nullptr ); +#endif /* clang-format on */ + +#pragma GCC diagnostic pop diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/genps.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/genps.f index 1c32e93f5d..8503bdbec8 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/genps.f @@ -1373,6 +1373,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) double precision smin,smax,spole,swidth,s,jac double precision x logical pass + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' c c Local c @@ -1384,6 +1388,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1d0)then + smax = min(smax, dsqrt_shatmax**2) + endif + pass=.true. if (jac .eq. 0 .and. .not.
warned0) then print*,'Input jacobian 0 in genps' @@ -1628,7 +1636,10 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) DOUBLE PRECISION ETA,ETAMIN,ETAMAX logical warned data warned/.false./ - + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' C------------ C BEGIN CODE C------------ @@ -1645,7 +1656,11 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) C IF THERE IS NO S CHANNEL POLE USE BELOW: TAUMIN = 0d0 !SMIN/S !keep scale fix - TAUMAX = 1D0 + if (dsqrt_shatmax.ne.-1d0)then + TAUMAX=dsqrt_shatmax**2/S + else + TAUMAX = 1D0 + endif TAU = (TAUMAX-TAUMIN)*X(1)+TAUMIN SJACOBI= sjacobi*(TAUMAX-TAUMIN) @@ -1915,7 +1930,7 @@ double precision function get_channel_cut(p, config) if(sde_strat.eq.2)then t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) - get_channel_cut = get_channel_cut / ((t-Mass)*(t+Mass)+stot*1d-10)**2 + get_channel_cut = get_channel_cut / (t-Mass**2+stot*1d-10)**2 endif c write(*,*) i, "t, Mass, fact", t, Mass, ((t-Mass)*(t+Mass))**2,get_channel_cut t = t/stot @@ -1930,9 +1945,9 @@ double precision function get_channel_cut(p, config) t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) Width = prwidth(-i, config) - tmp = (t-Mass)*(t+Mass) + tmp = (t-Mass**2) tmp2 = Mass*Width - get_channel_cut = get_channel_cut* (tmp**2 - tmp2**2)/(tmp**2 + tmp2**2)**2 + get_channel_cut = get_channel_cut/(tmp**2 + tmp2**2) endif c write(*,*) i, "s, Mass, Width, fact", t, Mass, Width, (((t-Mass)*(t+Mass) )**2 + Width**2*Mass**2), get_channel_cut endif diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/myamp.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/myamp.f index 9e5f8d44dd..5360566ef4 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/myamp.f @@ -231,6 +231,7 @@ subroutine set_peaks double precision x1,x2,xk(nexternal) double precision dr,mtot,etot,xqfact double precision spmass + double precision stot ! technically the min of dsqrt_shatmax**2 and the physical one integer i, iconfig, l1, l2, j, nt, nbw, iproc, k integer iden_part(-nexternal+1:nexternal) @@ -285,8 +286,8 @@ subroutine set_peaks integer lbw(0:nexternal) !Use of B.W. common /to_BW/ lbw - double precision stot,m1,m2 - common/to_stot/stot,m1,m2 + double precision real_stot,m1,m2 + common/to_stot/real_stot,m1,m2 include 'coupl.inc' !
needs VECSIZE_MEMMAX (defined in vector.inc) include 'cuts.inc' @@ -309,6 +310,12 @@ subroutine set_peaks c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1)then + stot = min(real_stot, dsqrt_shatmax**2) + else + stot = real_stot + endif + iconfig = this_config c needs to be initialise to avoid segfault do i = -nexternal,-1 diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/reweight.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/reweight.f index 0a0bafa7c1..189ba41449 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/reweight.f @@ -1186,7 +1186,7 @@ logical function setclscales(p, keepq2bck, ivec) if(nexternal.gt.3) pt2ijcl(nexternal-3)=q2fact(2) else if(.not.fixed_fac_scale1) q2fact(1)=scalefact**2*pt2ijcl(nexternal-2) - if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*q2fact(1) + if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*pt2ijcl(nexternal-2) endif elseif(jcentral(1).eq.0)then if(.not.fixed_fac_scale1) q2fact(1) = scalefact**2*pt2ijcl(jfirst(1)) diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc index 4eec5db13c..216a90a302 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/symmetry.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/symmetry.f index 309540a0a2..2afd9e9f75 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/symmetry.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/symmetry.f @@ -232,7 +232,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, c write(*,*) 'mapping',ic,mapconfig(i),icode if (icode .eq. 
0) then c Create format string based on number of digits - write(formstr,'(a,i1,a)') '(I',nconf,'$)' + write(formstr,'(a,i1,a)') '(I',nconf,',$)' write(*,formstr) mapconfig(i) c Write symmetry factors write(formstr2,'(a,i2,a)') '(2i',nsym,')' @@ -242,10 +242,10 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode if(nconf+ncode+1.lt.10) then write(formstr,'(a,i1,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' else write(formstr,'(a,i2,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' endif write(*,formstr) dconfig c Write symmetry factors @@ -260,7 +260,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode write(27,formstr2) dconfig,use_config(i) endif - write(*,'(a$)') ' ' + write(*,'(a,$)') ' ' 100 call bw_increment_array(iarray,imax,ibase,done) enddo else diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/unwgt.f b/epochX/cudacpp/gg_tt.mad/SubProcesses/unwgt.f index f602511c94..d1247f1849 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/unwgt.f +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/unwgt.f @@ -497,6 +497,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer ip, np, ic, nc integer ida(2),ito(-nexternal+3:nexternal),ns,nres,ires,icloop integer iseed + double precision beam_mass double precision pboost(0:3) double precision beta, get_betaz double precision ebi(0:3), ebo(0:3) @@ -506,7 +507,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer idup(nexternal,maxproc,maxsproc) integer mothup(2,nexternal) integer icolup(2,nexternal,maxflow,maxsproc) - + double precision eta integer nsym integer ievent @@ -638,21 +639,20 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) if (nincoming.eq.2) then if (xbk(1) .gt. 0d0 .and. xbk(1) .le. 1d0 .and. $ xbk(2) .gt. 0d0 .and. xbk(2) .le. 1d0) then - if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0).and.xbk(2).ne.1d0) then - ! construct the beam momenta in each frame and compute the related (z)boost - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4).and.ebeam(1).gt.10d0*m1)then - local_mass = 0d0 - else - local_mass = m1 - endif + if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0)) then + if((abs(lpp(1)).gt.2.and.abs(lpp(1)).ne.9).or.xbk(1).eq.1d0)then + beam_mass = pmass(1) + else + beam_mass = m1 + endif ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(1) ebo(1) = 0 ebo(2) = 0 - ebo(3) = DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(1).eq.1d0) then pb(0,isym(1,jsym)) = ebo(0) @@ -668,20 +668,19 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo else - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4.and.ebeam(2).gt.10d0*m2))then - local_mass = 0d0 - else - local_mass = m2 - endif - ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam + if((abs(lpp(2)).gt.2.and.abs(lpp(2)).ne.9).or.xbk(2).eq.1d0)then + beam_mass = pmass(2) + else + beam_mass = m2 + endif ebi(0) = p(0,2)/xbk(2) ! 
this assumes that particle 2 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = -1d0*DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = -1d0*DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(2) ebo(1) = 0 ebo(2) = 0 - ebo(3) = -1d0*DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = -1d0*DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(2).eq.1d0) then pb(0,isym(2,jsym)) = ebo(0) @@ -701,6 +700,21 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) write(*,*) 'Warning bad x1 or x2 in write_leshouche', $ xbk(1),xbk(2) endif + do j=1,nexternal + call zboost_with_beta(p(0,j),beta,pb(0,isym(j,jsym))) + pb(4,isym(j,jsym))=pmass(j) + enddo + + ! check for numerical_accuracy + if (pb(0,1).gt.ebeam(1).or.pb(0,2).gt.ebeam(2))then + ! go back to old method --more accurate when boosting with xbk close to one-- + eta = sqrt(xbk(1)*ebeam(1)/(xbk(2)*ebeam(2))) + pboost(0)=p(0,1)*(eta + 1d0/eta) + pboost(3)=p(0,1)*(eta - 1d0/eta) + do j=1,nexternal + call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) + enddo + endif else do j=1,nexternal call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) @@ -709,6 +723,8 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo endif + + if (IMIRROR.eq.2.and.pmass(1).ne.pmass(2)) then c Note that in this context isym(1,jsym) should never be "2" since the mass differ pb(4,isym(1,jsym))=pmass(2) diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/Gridpack/gridrun b/epochX/cudacpp/gg_tt.mad/bin/internal/Gridpack/gridrun index 8c8f7d3940..01d4ab53f5 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/Gridpack/gridrun +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/Gridpack/gridrun @@ -91,7 +91,7 @@ import internal.madevent_interface as cmd_interface try: - cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2]) + cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2], nprocs=args[3], maxevts=args[4]) except KeyboardInterrupt: print('Quit on KeyboardInterrupt') diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/Gridpack/run.sh b/epochX/cudacpp/gg_tt.mad/bin/internal/Gridpack/run.sh index 20adf572c2..2d149f96be 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/Gridpack/run.sh +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/Gridpack/run.sh @@ -14,6 +14,18 @@ # USAGE : run [num_events] [iseed] ## ############################################################################# +function usage() { + local retcode="${1:-1}" # default return code is 1 + echo "Usage:" + echo " run.sh [options] [num events] [seed]" + echo " run.sh [options] [num events] [seed] [granularity]" + echo "Options:" + echo " -h, --help print this message and exit" + echo " -p, --parallel [num procs] number of processes to run in parallel" + echo " -m, --maxevts [num events] maximum number of unweighted events per job" + exit $retcode +} + if [[ -d ./madevent ]]; then DIR='./madevent' else @@ -32,23 +44,46 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib # For Mac OS X export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib +pos_args=() +nprocs=1 +maxevts=2500 -if [[ ($1 != "") && ("$2" != "") && ("$3" == "") ]]; then - num_events=$1 - seed=$2 - gran=1 -elif [[ ($1 != "") && ("$2" != "") && ("$3" != "") ]]; then - num_events=$1 - seed=$2 - gran=$3 -else - echo "Warning: input is not correct. 
script requires two arguments: NB_EVENT SEED" -fi +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage 0 ;; + -p|--parallel) + nprocs="$2" && shift && shift ;; + -m|--maxevts) + maxevts="$2" && shift && shift ;; + -*) + echo "Error: Unknown option $1" && usage ;; + *) + pos_args+=("$1") && shift ;; + esac +done + +case `echo "${pos_args[@]}" | wc -w | tr -d " "` in + "2") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=1 + ;; + "3") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=${pos_args[2]} + ;; + *) + echo "Error: number of arguments is not correct" + usage + ;; +esac -echo "Now generating $num_events events with random seed $seed and granularity $gran" +echo "Now generating $num_events events with random seed $seed and granularity $gran using $nprocs processes" ############ RUN THE PYTHON CODE ##################### -${DIR}/bin/gridrun $num_events $seed $gran +${DIR}/bin/gridrun $num_events $seed $gran $nprocs $maxevts ######################################################## ########### POSTPROCESSING ##################### diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py b/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py index 42d82818d0..2efb5954a6 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py @@ -353,7 +353,7 @@ def modify_init_cross(self, cross, allow_zero=False): assert "init" in self cross = dict(cross) - for key in cross.keys(): + for key in list(cross.keys()): if isinstance(key, str) and key.isdigit() and int(key) not in cross: cross[int(key)] = cross[key] @@ -1991,6 +1991,11 @@ def default_setup(self): self.add_param("PartonLevel:FSRinResonances", True, hidden=True, always_write_to_card=False, comment="Do not allow shower to run from decay product of unstable particle") self.add_param("ProcessLevel:resonanceDecays", True, hidden=True, always_write_to_card=False, comment="Do not allow unstable particle to decay.") + # Parameters only needed for main164 type of run (not pythia8/MG5 interface) + self.add_param("Main:HepMC", True, hidden=True, always_write_to_card=False, + comment="""Specify the type of output to be used by the main164 run. """) + self.add_param("HepMC:output", 'hepmc.gz', hidden=True, always_write_to_card=False, + comment="Specify the HepMC output file to be used by the main164 run.") # Add parameters controlling the subruns execution flow. # These parameters should not be part of PY8SubRun daughter. self.add_default_subruns('parameters') @@ -2087,8 +2092,10 @@ def MadGraphSet(self, name, value, **opts): force = False if name.lower() not in self or (force or name.lower() not in self.user_set): self.__setitem__(name, value, change_userdefine=False, **opts) - self.system_set.add(name.lower()) - + self.system_set.add(name.lower()) + else: + raise Exception("The parameter %s is already set to %s. You can not change it." 
% (name, self[name])) + def defaultSet(self, name, value, **opts): self.__setitem__(name, value, change_userdefine=False, **opts) @@ -2144,9 +2151,19 @@ def pythia8_formatting(value, formatv=None): else: return ','.join([PY8Card.pythia8_formatting(arg) for arg in value]) + # change of name convention between the old MG5 interface and Pythia8's main164 + interface_to_164 = {'HEPMCoutput:file': 'HepMC:output', + 'SysCalc:fullCutVariation': '!SysCalc:fullCutVariation (not supported with 164)', + 'SysCalc:qCutList': '!SysCalc:qCutList (not supported with 164)', + 'SysCalc:qWeed': '!SysCalc:qWeed (not supported with 164)', + 'SysCalc:tmsList': '!SysCalc:tmsList (not supported with 164)', + 'HEPMCoutput:scaling' : '!HEPMCoutput:scaling (not supported with 164)', + 'LHEFInputs:nSubruns' : 'Main:numberOfSubruns'} + def write(self, output_file, template, read_subrun=False, - print_only_visible=False, direct_pythia_input=False, add_missing=True): + print_only_visible=False, direct_pythia_input=False, add_missing=True, + use_mg5amc_py8_interface=False): """ Write the card to output_file using a specific template. > 'print_only_visible' specifies whether or not the hidden parameters should be written out if they are in the hidden_params_to_always_write @@ -2155,7 +2172,12 @@ def write(self, output_file, template, read_subrun=False, in the self.visible_params_to_always_write list and are not user_set or system_set are commented. > If 'add_missing' is False then parameters that should be written_out but are absent - from the template will not be written out.""" + from the template will not be written out. + > 'use_mg5amc_py8_interface' indicates whether the MG5aMC-PY8 interface is used or not; + if it is not used, some parameters need to be translated from the old convention to the new one + """ + + self.use_mg5amc_py8_interface = use_mg5amc_py8_interface # First list the visible parameters visible_param = [p for p in self if p.lower() not in self.hidden_param @@ -2297,7 +2319,16 @@ def group_params(params): else: # Just copy parameters which don't need to be specified if param.lower() not in self.params_to_never_write: - output.write(line) + + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param.strip()] + # special case for HepMC: it needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + output.write('%s=%s\n'%(param_entry,new_value)) + else: + output.write(line) else: output.write('! The following parameter was forced to be commented out by MG5aMC.\n') output.write('!
%s'%line) @@ -2313,6 +2344,7 @@ def group_params(params): if ((not direct_pythia_input) or (param.lower() in self.visible_params_to_always_write) or (param.lower() in self.user_set) or + (param.lower() in self.hidden_params_to_always_write) or (param.lower() in self.system_set)): template = '%s=%s' else: @@ -2321,6 +2353,19 @@ def group_params(params): # then they shouldn't be passed to Pythia template = '!%s=%s' + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + if 'Main:InternalAnalysis'.lower() in self.user_set and \ + self['Main:InternalAnalysis'].lower() == 'on': + output.write('InternalAnalysis:output = ./djrs.dat\n') + + #elif param in self.interface_to_164.values() and not direct_pythia_input: + # misc.sprint(use_mg5amc_py8_interface, direct_pythia_input,param) + # raise Exception('The parameter %s is not supported in the MG5aMC-PY8 interface. Please use the new interface.'%param_entry output.write(template%(param_entry, value_entry.replace(value,new_value))) @@ -2365,6 +2410,8 @@ def group_params(params): comment = '\n'.join('! %s'%c for c in self.comments[param.lower()].split('\n')) output.write(comment+'\n') + if not use_mg5amc_py8_interface and param in self.interface_to_164: + continue output.write('%s=%s\n'%(param,PY8Card.pythia8_formatting(self[param]))) # Don't close the file if we were reading a subrun, but simply write @@ -3318,7 +3365,6 @@ def retro_compatible_custom_fct(lines, mode=None): for i,line in enumerate(lines[:]): if search and re.search(include_pat, line): name = re.findall(include_pat, line)[0] - misc.sprint('DETECTED INCLUDE', name) if 'vector.inc' in name: search = False if 'run.inc' in name: @@ -3326,7 +3372,6 @@ def retro_compatible_custom_fct(lines, mode=None): search = False sol.append(line) if re.search(function_pat, line): - misc.sprint("DETECTED FCT") search = True return sol @@ -4050,8 +4095,8 @@ def post_set_fixed_fac_scale(card, value, change_userdefine, raiseerror, **opt): if 'fixed_fac_scale2' in card.user_set: card.user_set.remove('fixed_fac_scale2') - # #card['pdlabel1'] = value - # #card['pdlabel2'] = value + dict.__setitem__(card, 'fixed_fac_scale1', card['fixed_fac_scale']) + dict.__setitem__(card, 'fixed_fac_scale2', card['fixed_fac_scale']) @staticmethod def post_set(card, value, change_userdefine, raiseerror, name='unknown', **opt): @@ -4201,6 +4246,7 @@ def default_setup(self): self.add_param("bwcutoff", 15.0) self.add_param("cut_decays", False, cut='d') self.add_param('dsqrt_shat',0., cut=True) + self.add_param('dsqrt_shatmax', -1, cut=True) self.add_param("nhel", 0, include=False) self.add_param("limhel", 1e-8, hidden=True, comment="threshold to determine if an helicity contributes when not MC over helicity.") #pt cut @@ -4451,11 +4497,11 @@ def check_validity(self): time.sleep(5) if self['drjj'] != 0: if 'drjj' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjj\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjj\' to 0') self['drjj'] = 0 if self['drjl'] != 0: if 'drjl' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjl\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjl\' to 0') self['drjl'] = 0 if not self['auto_ptj_mjj']: if self['mmjj'] > self['xqcut']: @@ -4753,7 +4799,6 @@ def create_default_for_process(self, 
proc_characteristic, history, proc_def): self['fixed_fac_scale1'] = True self['nhel'] = 1 for i in beam_id_split[1]: - exit if abs(i) == 11: self['lpp1'] = -math.copysign(3,i) self['lpp2'] = math.copysign(3,i) @@ -5577,6 +5622,9 @@ def default_setup(self): #technical self.add_param('folding', [1,1,1], include=False) + + #bias + self.add_param('flavour_bias',[5,1], hidden=True, comment="Example: '5,100' means that the probability to generate an event with a bottom (or anti-bottom) quark is increased by a factor 100, but the weight of those events is reduced by a factor 100. Requires that the 'event_norm' is set to 'bias'.") #merging self.add_param('ickkw', 0, allowed=[-1,0,3,4], comment=" - 0: No merging\n - 3: FxFx Merging : http://amcatnlo.cern.ch/FxFx_merging.htm\n - 4: UNLOPS merging (No interface within MG5aMC)\n - -1: NNLL+NLO jet-veto computation. See arxiv:1412.8408 [hep-ph]") @@ -5790,6 +5838,17 @@ def check_validity(self): if self['mcatnlo_delta'] and not self['parton_shower'].lower() == 'pythia8': raise InvalidRunCard("MC@NLO-DELTA only possible with matching to Pythia8") + # check that the flavour_bias is consistent + if len(self['flavour_bias']) != 2: + raise InvalidRunCard("'flavour_bias' should contain exactly two numbers: the abs(PDG) of the flavour to enhance, and the enhancement multiplication factor.") + for i in self['flavour_bias']: + if i < 0: + raise InvalidRunCard("flavour and multiplication factor should be positive in the flavour_bias parameter") + if self['flavour_bias'][1] != 1 and self['event_norm'] != 'bias': + logger.warning('Non-trivial flavour enhancement factor: setting event normalisation to "bias"') + self['event_norm']='bias' + + # check that ebeam is bigger than the proton mass. for i in [1,2]: # do not for proton mass if not proton PDF (or when scan initialization) diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/check_param_card.py b/epochX/cudacpp/gg_tt.mad/bin/internal/check_param_card.py index bc785b5de6..a34705f6bc 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/check_param_card.py @@ -1092,11 +1092,11 @@ def write_summary(self, path, order=None, lastline=False, nbcol=20): to_print = self.cross[-1:] for info in to_print: name = info['run_name'] - bench = info['bench'] + bench = [float(x) for x in info['bench']] data = [] for k in keys: if k in info: - data.append(info[k]) + data.append(float(info[k])) else: data.append(0.) 
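                    # (illustrative note on the float() casts added above: scan results may be
                    #  read back as strings, which old-style '%e' formatting rejects, e.g.
                    #    '%e' % '3.14'         -> TypeError
                    #    '%e' % float('3.14')  -> '3.140000e+00'
                    #  hence both 'bench' and 'data' are coerced to float before ff.write below)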
ff.write(formatting % tuple([name] + bench + data)) diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/cluster.py b/epochX/cudacpp/gg_tt.mad/bin/internal/cluster.py index 95ef45b5f3..66069293d2 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/cluster.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/cluster.py @@ -602,6 +602,7 @@ def __init__(self, *args, **opt): self.submitted = six.moves.queue.Queue() # one entry by job submitted self.stoprequest = threading.Event() #flag to ensure everything to close self.demons = [] + self.gpus_list = [] self.nb_done =0 if 'nb_core' in opt: self.nb_core = opt['nb_core'] @@ -623,23 +624,46 @@ def __init__(self, *args, **opt): self.done_pid_queue = six.moves.queue.Queue() self.fail_msg = None + mg5_gpu_env_str = 'MG5_GPU_VISIBLE_DEVICES' + gpu_variables = [['NVIDIA_VISIBLE_DEVICES', 'CUDA_VISIBLE_DEVICES'], + ['ROCR_VISIBLE_DEVICES', 'HIP_VISIBLE_DEVICES'],] + if mg5_gpu_env_str in os.environ: + new_var = os.environ[mg5_gpu_env_str].split(',') + if len(new_var) == 2: + gpu_variables.insert(0, new_var) + else: + logger.error('Invalid format for %s=%s, it should be a comma-separated list of two elements' % (mg5_gpu_env_str, os.environ[mg5_gpu_env_str])) + for get_var,set_var in gpu_variables: + if get_var in os.environ: + self.gpus_list = os.environ.get(get_var).split(',') + self.gpu_set_var = set_var + self.gpus_count = len(self.gpus_list) + logger.info('Found %s GPUs: %s' % (self.gpus_count, self.gpus_list)) def start_demon(self): import threading - t = threading.Thread(target=self.worker) + env2 = None + if len(self.gpus_list): + env2 = os.environ.copy() + this_gpu_idx = len(self.demons) % self.gpus_count + env2[self.gpu_set_var] = self.gpus_list[this_gpu_idx] + t = threading.Thread(target=self.worker, kwargs={'env2': env2}) + else: + t = threading.Thread(target=self.worker) t.daemon = True t.start() self.demons.append(t) - def worker(self): + def worker(self, env2=None): import six.moves.queue import six.moves._thread while not self.stoprequest.isSet(): try: args = self.queue.get(timeout=10) tag, exe, arg, opt = args + opt['env'] = env2 try: # check for executable case if isinstance(exe,str): diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gg_tt.mad/bin/internal/common_run_interface.py index 9ff7390cf5..69291df0d4 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/common_run_interface.py @@ -750,8 +750,8 @@ def __init__(self, me_dir, options, *args, **opts): else: self.ninitial = self.proc_characteristics['ninitial'] - def make_make_all_html_results(self, folder_names = [], jobs=[]): - return sum_html.make_all_html_results(self, folder_names, jobs) + def make_make_all_html_results(self, folder_names = [], jobs=[], get_attr=None): + return sum_html.make_all_html_results(self, folder_names, jobs, get_attr) def write_RunWeb(self, me_dir): @@ -1463,11 +1463,15 @@ def create_plot(self, mode='parton', event_path=None, output=None, tag=None): self.run_name, '%s_pts.dat' % tag) for observable_name, data_path in [('djr',djr_path), ('pt',pt_path)]: - if not self.generate_Pythia8_HwU_plots( + try: + if not self.generate_Pythia8_HwU_plots( PY8_plots_root_path, merging_scale_name, observable_name,data_path): - return False - + return False + except Exception as error: + if os.path.exists(data_path): + logger.info('plot information present in %s' % data_path) + return True if mode == 'Pythia8': plot_files = glob.glob(pjoin(PY8_plots_root_path,'*.gnuplot')) 
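# (usage sketch for the GPU round-robin added to MultiCore in cluster.py above, with
#  illustrative values: MG5_GPU_VISIBLE_DEVICES names a get/set pair of environment
#  variables, e.g.
#    export MG5_GPU_VISIBLE_DEVICES='ROCR_VISIBLE_DEVICES,HIP_VISIBLE_DEVICES'
#    export ROCR_VISIBLE_DEVICES=0,1
#  so worker daemon i then runs with HIP_VISIBLE_DEVICES set to GPU i % 2)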
if not misc.which('gnuplot'): @@ -1964,12 +1968,16 @@ def do_systematics(self, line): self.cluster.wait(os.path.dirname(output), update_status, update_first=update_status) except Exception: self.cluster.remove() + for i in range(nb_submit): + os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) old_run_mode = self.options['run_mode'] self.options['run_mode'] =0 + out =False try: out = self.do_systematics(line) finally: self.options['run_mode'] = old_run_mode + return out #collect the data all_cross = [] for i in range(nb_submit): @@ -1995,18 +2003,21 @@ def do_systematics(self, line): self.run_card['event_norm'] in ['unity']: all_cross= [cross/nb_event for cross in all_cross] - sys_obj = systematics.call_systematics([input, None] + opts, - log=lambda x: logger.info(str(x)), - result=result_file, - running=False - ) + + sys_obj = systematics.call_systematics([input, None] + opts, + log=lambda x: logger.info(str(x)), + result=result_file, + running=False + ) + sys_obj.print_cross_sections(all_cross, nb_event, result_file) - + #concatenate the output file subprocess.call(['cat']+\ ['./tmp_%s_%s' % (i, os.path.basename(output)) for i in range(nb_submit)], stdout=open(output,'w'), cwd=os.path.dirname(output)) + for i in range(nb_submit): os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) # os.remove('%s/log_sys_%s.txt' % (os.path.dirname(output),i)) @@ -3831,7 +3842,7 @@ def store_scan_result(self): """return the information that need to be kept for the scan summary. Auto-width are automatically added.""" - return {'cross': self.results.current['cross']} + return {'cross': self.results.current['cross'], 'error': self.results.current['error']} def add_error_log_in_html(self, errortype=None): @@ -5135,10 +5146,10 @@ def init_run(self, cards): self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), - 'lhc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), - 'lcc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), @@ -6740,7 +6751,15 @@ def postcmd(self, stop, line): return ending_question - + def help_update(self): + logger.info(""" syntax: update dependent: Change the mass/width of particles which are not free parameter for the model. + update missing: add to the current param_card missing blocks/parameters. + update to_slha1: pass SLHA2 card to SLHA1 convention. (beta) + update to_slha2: pass SLHA1 card to SLHA2 convention. 
(beta) + update to_full [run_card] + update XXX [where XXX correspond to a hidden block of the run_card]: + supported block are %s + """, ', '.join(self.update_block)) def do_update(self, line, timer=0): @@ -6756,6 +6775,8 @@ def do_update(self, line, timer=0): logger.warning('miss an argument (dependent or missing). Please retry') return + args[0] = args[0].lower() + if args[0] == 'dependent': if not self.mother_interface: logger.warning('Failed to update dependent parameter. This might create trouble for external program (like MadSpin/shower/...)') @@ -6805,10 +6826,11 @@ def do_update(self, line, timer=0): self.modified_card.add('run') # delay writting of the run_card logger.info('add optional block %s to the run_card', args[0]) else: - self.help_update() + self.do_help('update') logger.warning('unvalid options for update command. Please retry') + def update_to_full(self, line): """ trigger via update to_full LINE""" diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/gg_tt.mad/bin/internal/extended_cmd.py index 789976beee..c321fd88e5 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/extended_cmd.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/extended_cmd.py @@ -1317,6 +1317,8 @@ def nice_error_handling(self, error, line): debug_file = open(self.debug_output, 'a') traceback.print_exc(file=debug_file) + if __debug__: + traceback.print_exc() if hasattr(error, 'filename'): debug_file.write("Related File: %s\n" % error.filename) # Create a nice error output @@ -1928,7 +1930,8 @@ def do_display(self, line, output=sys.stdout): for i, name in enumerate(split): try: __import__('.'.join(split[:i+1])) - exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1]))) + tmp = {} + exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1])), globals(),tmp) except ImportError: try: var = eval(args[1]) @@ -1939,7 +1942,7 @@ def do_display(self, line, output=sys.stdout): outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) else: - var = eval(args[1]) + var = eval(args[1], globals(), tmp) outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/file_writers.py b/epochX/cudacpp/gg_tt.mad/bin/internal/file_writers.py index 526756129f..74ba0d195c 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/file_writers.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/file_writers.py @@ -140,10 +140,6 @@ def preprocess_template(self, input_lines, context={}): else: raise self.FileWriterError("%s not string" % repr(input_lines)) - # Setup the contextual environment - for contextual_variable, value in context.items(): - exec('%s=%s'%(str(contextual_variable),repr(value))) - res = [] # The variable below tracks the conditional statements structure if_stack = [] @@ -166,7 +162,7 @@ def preprocess_template(self, input_lines, context={}): # Treat an if statement elif preproc_command.group('command')=='if': try: - if_stack.append(eval(preproc_command.group('body'))==True) + if_stack.append(eval(preproc_command.group('body'), globals(), context)==True) except Exception as e: raise self.FilePreProcessingError('Could not evaluate'+\ "python expression '%s' given the context %s provided."%\ diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/files.py b/epochX/cudacpp/gg_tt.mad/bin/internal/files.py index 551b71ddb6..3061b007e7 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/files.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/files.py @@ -147,9 +147,14 @@ def cp(path1, path2, log=True, 
error=False): path2 = format_path(path2) try: shutil.copy(path1, path2) + except shutil.Error as why: + logger.debug('no cp since identical: %s', why) + return except IOError as why: import madgraph.various.misc as misc try: + if 'same file' in str(why): + return if os.path.exists(path2): path2 = os.path.join(path2, os.path.split(path1)[1]) misc.copytree(path1, path2) @@ -157,12 +162,10 @@ def cp(path1, path2, log=True, error=False): if error: raise if log: - logger.warning(why) + logger.warning("fail to cp %s %s: %s", path1, path2, why) else: - misc.sprint("fail to cp", why) - except shutil.Error: - # idetical file - pass + misc.sprint("fail to cp", path1, path2, why) + def rm(path, log=True): """removes path, that can be a single element or a list""" diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/gen_cardhtml-pl b/epochX/cudacpp/gg_tt.mad/bin/internal/gen_cardhtml-pl index 1810c6c082..6e0e06533d 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/gen_cardhtml-pl +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/gen_cardhtml-pl @@ -137,7 +137,7 @@ until($listpos>$#incard){ print PAGE " Model: $model \n"; print PAGE " \n \n
\n"; print PAGE " \n"; - print PAGE "\"\" \n"; + print PAGE "\"\" \n"; print PAGE "
\n"; print PAGE " \n \n \n"; print PAGE " \n"; diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/gen_crossxhtml.py b/epochX/cudacpp/gg_tt.mad/bin/internal/gen_crossxhtml.py index 681bf9d09b..3114a4350c 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/gen_crossxhtml.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/gen_crossxhtml.py @@ -133,7 +133,7 @@ class AllResults(dict): web = False - _run_entries = ['cross', 'error','nb_event_pythia','run_mode','run_statistics', + _run_entries = ['cross', 'error','axsec','nb_event_pythia','run_mode','run_statistics', 'nb_event','cross_pythia','error_pythia', 'nb_event_pythia8','cross_pythia8','error_pythia8', 'shower_dir'] diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/gen_jpeg-pl b/epochX/cudacpp/gg_tt.mad/bin/internal/gen_jpeg-pl index 87d03da394..31b7e9fe55 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/gen_jpeg-pl +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/gen_jpeg-pl @@ -1,16 +1,16 @@ #!/usr/bin/perl -w #--------------------------------------------------------------------- -# Run GS to create jpeg files defined as $gs +# Run GS to create PNG files defined as $gs #--------------------------------------------------------------------- -system("/bin/bash -c \"rm -f matrix*.jpg\" "); +system("/bin/bash -c \"rm -f matrix*.png\" "); $imatrix = ""; if (! -e "matrix.ps") {$imatrix = 1;} -$max_jpg = 2; -if ($imatrix eq "") {$max_jpg = 5;} -# add 1 to max_jpg, to get max_jpg pages -$max_jpg += 1; +$max_png = 2; +if ($imatrix eq "") {$max_png = 5;} +# add 1 to max_png, to get max_png pages +$max_png += 1; open(PAGE,"> diagrams.html") || die "Error creating diagrams.html"; print PAGE "\ \n"; print PAGE "\ \n"; @@ -21,22 +21,22 @@ while ( -e "matrix$imatrix.ps"){ open(IN, "< matrix$imatrix.ps") || die "No file matrix$imatrix.ps"; open(OUT, "> matrix-1.ps") || die "Could not open file matrix-1.ps"; while () { - if ($_ =~ m/^%%Page: $max_jpg $max_jpg/) {last;} + if ($_ =~ m/^%%Page: $max_png $max_png/) {last;} else {print OUT $_, "\n";} } close(OUT); close(IN); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=matrix$imatrix\%00d.jpg \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-r150 \-sOutputFile\=matrix$imatrix\%00d.png \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; system "rm -f matrix-1.ps"; -# Determine how many jpg files we have +# Determine how many png files we have $pages=1; - while(-e "matrix$imatrix$pages.jpg"){ + while(-e "matrix$imatrix$pages.png"){ $pages++; }#end of while #reduce it by one - if ($pages > $max_jpg){ + if ($pages > $max_png){ $pages -= 1; } # Find name of process @@ -45,24 +45,24 @@ while ( -e "matrix$imatrix.ps"){ if ($proc =~ /Process: (.+?)(\s\w+=\d+)*$/) { $proc = $1; } print PAGE "

To save bandwidth not all diagrams were converted to jpeg."; + if (-e "matrix$imatrix$max_png.png" ) { + print PAGE "

To save bandwidth not all diagrams were converted to PNG."; print PAGE "

To view all diagrams click on "; print PAGE "\ postscript. \<\/A\> \ \n"; # # Delete files which aren't included in diagrams.html # - system ("/bin/bash -c \"rm -f matrix$max_jpg.jpg\" "); + system ("/bin/bash -c \"rm -f matrix$max_png.png\" "); } # -# Now create jpeg file for card +# Now create PNG file for card # - if (! -e "../../HTML/card.jpg") { + if (! -e "../../HTML/card.png") { system ("/bin/bash -c \"head -352 matrix$imatrix.ps >& junk.ps\" "); open(JUNK,">> junk.ps") || die "Error opening junk.ps"; @@ -72,7 +72,7 @@ while ( -e "matrix$imatrix.ps"){ system ("/bin/bash -c \"cat matrix$imatrix.ps | sed 1,352d >> junk.ps\" "); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=card.jpg \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.jpg ../../HTML/card.jpg > /dev/null\" "; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-sOutputFile\=card.png \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.png ../../HTML/card.png > /dev/null\" "; } if ($imatrix eq "") {$imatrix = 0;} $imatrix = $imatrix + 1; @@ -82,3 +82,4 @@ print PAGE "\n"; print PAGE "\<\/BODY\> \n"; print PAGE "\<\/HTML\> \n"; close(PAGE); + diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/gg_tt.mad/bin/internal/gen_ximprove.py index 415ecc9de0..d5d7fc8faf 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/gen_ximprove.py @@ -30,6 +30,7 @@ import stat import sys import six +import time from six.moves import range from six.moves import zip @@ -304,6 +305,7 @@ def get_helicity(self, to_submit=True, clean=True): logger.debug('(%s) nb_hel: %s zero amp: %s bad_amps_hel: %s/%s', split_file[-1], len(good_hels),len(bad_amps),len(bad_amps_perhel), len(good_hels)*nb_amp ) if len(good_hels) == 1: files.cp(matrix_file, matrix_file.replace('orig','optim')) + files.cp(matrix_file.replace('.f','.o'), matrix_file.replace('orig','optim').replace('.f','.o')) continue # avoid optimization if onlye one helicity gauge = self.cmd.proc_characteristics['gauge'] @@ -1059,6 +1061,7 @@ def __init__(self, cmd, opt=None): # parameter for the gridpack run self.nreq = 2000 self.iseed = 4321 + self.maxevts = 2500 # placeholder for information self.results = 0 #updated in launch/update_html @@ -1200,6 +1203,10 @@ def reset_multijob(self): def write_multijob(self, Channel, nb_split): """ """ if nb_split <=1: + try: + os.remove(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat')) + except OSError: + pass return f = open(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat'), 'w') f.write('%i\n' % nb_split) @@ -1828,17 +1835,17 @@ class gen_ximprove_gridpack(gen_ximprove_v4): max_request_event = 1e12 # split jobs if a channel if it needs more than that max_event_in_iter = 4000 min_event_in_iter = 500 - combining_job = sys.maxsize gen_events_security = 1.00 - def __new__(cls, *args, **opts): + def __new__(cls, cmd, opts): cls.force_class = 'gridpack' - return super(gen_ximprove_gridpack, cls).__new__(cls, *args, **opts) + return super(gen_ximprove_gridpack, cls).__new__(cls, cmd, opts) - def __init__(self, *args, **opts): + def __init__(self, cmd, opts): self.ngran = -1 + self.nprocs = 1 self.gscalefact = {} self.readonly = False if 'ngran' in opts: @@ -1846,9 +1853,18 @@ def __init__(self, *args, **opts): # del opts['ngran'] if 'readonly' in opts: self.readonly = opts['readonly'] - super(gen_ximprove_gridpack,self).__init__(*args, **opts) + if 'nprocs' in opts: + 
self.nprocs = int(opts['nprocs']) + if 'maxevts' in opts and self.nprocs > 1: + self.max_request_event = int(opts['maxevts']) + super(gen_ximprove_gridpack,self).__init__(cmd, opts) if self.ngran == -1: self.ngran = 1 + + if self.nprocs > 1: + self.combining_job = 0 + else: + self.combining_job = sys.maxsize def find_job_for_event(self): """return the list of channel that need to be improved""" @@ -1876,8 +1892,8 @@ def find_job_for_event(self): continue # no event to generate events self.gscalefact[tag] = max(1, 1/(goal_lum * C.get('axsec')/ self.ngran)) #need to generate events - logger.debug('request events for ', C.get('name'), 'cross=', - C.get('axsec'), 'needed events = ', goal_lum * C.get('axsec')) + logger.debug('request events for %s cross=%d needed events = %d', + C.get('name'), C.get('axsec'), goal_lum * C.get('axsec')) to_refine.append(C) logger.info('need to improve %s channels' % len(to_refine)) @@ -1897,8 +1913,13 @@ def get_job_for_event(self): for C in to_refine: #1. Compute the number of points are needed to reach target needed_event = max(goal_lum*C.get('axsec'), self.ngran) - nb_split = 1 - + nb_split = int(max(1,((needed_event-1)// self.max_request_event) +1)) + if not self.split_channels: + nb_split = 1 + if nb_split > self.max_splitting: + nb_split = self.max_splitting + nb_split=max(1, nb_split) + #2. estimate how many points we need in each iteration if C.get('nunwgt') > 0: nevents = needed_event / nb_split * (C.get('nevents') / C.get('nunwgt')) @@ -1908,13 +1929,16 @@ def get_job_for_event(self): nevents = self.max_event_in_iter if nevents < self.min_event_in_iter: + nb_split = int(nb_split * nevents / self.min_event_in_iter) + 1 # sr dangerous? nevents = self.min_event_in_iter # # forbid too low/too large value nevents = max(self.min_event_in_iter, min(self.max_event_in_iter, nevents)) logger.debug("%s : need %s event. Need %s split job of %s points", C.name, needed_event, nb_split, nevents) - + # write the multi-job information + self.write_multijob(C, nb_split) + #create the info dict assume no splitting for the default info = {'name': self.cmd.results.current['run_name'], 'script_name': 'unknown', @@ -1925,7 +1949,7 @@ def get_job_for_event(self): 'nevents': nevents, #int(nevents*self.gen_events_security)+1, 'maxiter': self.max_iter, 'miniter': self.min_iter, - 'precision': -1*int(needed_event)/C.get('axsec'), + 'precision': -goal_lum/nb_split, # -1*int(needed_event)/C.get('axsec'), 'requested_event': needed_event, 'nhel': self.run_card['nhel'], 'channel': C.name.replace('G',''), @@ -1938,27 +1962,59 @@ def get_job_for_event(self): basedir = pjoin(os.path.dirname(__file__), '..','..','SubProcesses', info['P_dir'], info['directory']) info['base_directory'] = basedir - jobs.append(info) - + if nb_split == 1: + jobs.append(info) + else: + for i in range(nb_split): + new_info = dict(info) + new_info['offset'] = i+1 + new_info['directory'] += self.alphabet[i % 26] + str((i+1)//26) + new_info['base_directory'] = info['directory'] + jobs.append(new_info) write_dir = '.' 
if self.readonly else None self.create_ajob(pjoin(self.me_dir, 'SubProcesses', 'refine.sh'), jobs, write_dir) + if self.nprocs > 1: + nprocs_cluster = cluster.MultiCore(nb_core=self.nprocs) + gridpack_start = time.time() + def gridpack_wait_monitoring(Idle, Running, Done): + if Idle+Running+Done == 0: + return + logger.info("Gridpack event generation: %s Idle, %s Running, %s Done [%s]" + % (Idle, Running, Done, misc.format_time(time.time()-gridpack_start))) + done = [] for j in jobs: - if j['P_dir'] in done: - continue - done.append(j['P_dir']) + if self.nprocs == 1: + if j['P_dir'] in done: + continue + done.append(j['P_dir']) + # Give a little status. Sometimes these jobs run very long, and having hours without any + # console output can be a bit frightening and make users think we are looping. + if len(done)%5==0: + logger.info(f"Working on job {len(done)} of {len(jobs)}") + # set the working directory path. pwd = pjoin(os.getcwd(),j['P_dir']) if self.readonly else pjoin(self.me_dir, 'SubProcesses', j['P_dir']) - exe = pjoin(pwd, 'ajob1') + exe = pjoin(pwd, j['script_name']) st = os.stat(exe) os.chmod(exe, st.st_mode | stat.S_IEXEC) # run the code\ - cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + if self.nprocs == 1: + cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + else: + nprocs_cluster.cluster_submit(exe, cwd=pwd, packet_member=j['packet']) write_dir = '.' if self.readonly else pjoin(self.me_dir, 'SubProcesses') + if self.nprocs > 1: + nprocs_cluster.wait(self.me_dir, gridpack_wait_monitoring) + + if self.readonly: + combine_runs.CombineRuns(write_dir) + else: + combine_runs.CombineRuns(self.me_dir) self.check_events(goal_lum, to_refine, jobs, write_dir) def check_events(self, goal_lum, to_refine, jobs, Sdir): diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/hel_recycle.py b/epochX/cudacpp/gg_tt.mad/bin/internal/hel_recycle.py index 1471de4bcb..978ba6575e 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/hel_recycle.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/hel_recycle.py @@ -550,7 +550,7 @@ def get_jamp_lines(self, line): def get_amp2_lines(self, line): if line.startswith(' DO I = 1, NCOLOR'): self.in_amp2 = False - elif not line.isspace(): + elif not line.isspace() and 'DENOM' not in line: self.template_dict['amp2_lines'] += f'{line[0:6]} {self.add_indices(line[6:])}' def prepare_bools(self): diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/histograms.py b/epochX/cudacpp/gg_tt.mad/bin/internal/histograms.py index 51ae2914fc..6fbdd98100 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/histograms.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/histograms.py @@ -1149,11 +1149,8 @@ def parse_one_histo_from_stream(self, stream, all_weight_header, boundaries = [0.0,0.0] for j, weight in \ enumerate(HwU.histo_bin_weight_re.finditer(line_bin)): - if (j == len(weight_header)): - continue - if j == len(all_weight_header): - raise HwU.ParseError("There is more bin weights"+\ - " specified than expected (%i)"%len(weight_header)) + #if (j == len(weight_header)): + # continue if selected_central_weight == all_weight_header[j]: bin_weights['central'] = float(weight.group('weight')) if all_weight_header[j] == 'boundary_xmin': @@ -1858,6 +1855,8 @@ def parse_histos_from_PY8_XML_stream(self, stream, run_id=None, # If merging cut is negative, then pick only the one of the central scale # If not specified, then take them all but use the PDF and scale weight # of the central merging_scale for the variation. 
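            # (rationale for the guard added below: an empty weight list would otherwise
            #  surface as an opaque IndexError when all_weights[2]['MERGING'] is read a
            #  few lines further down)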
+ if not all_weights: + raise MadGraph5Error('No weights were found in the HwU XML source.') if merging_scale is None or merging_scale < 0.0: merging_scale_chosen = all_weights[2]['MERGING'] else: @@ -2480,14 +2479,14 @@ def get_main_central_plot_lines(HwU_name, block_position, color_index, # return [template_no_stat%rep_dic]+\ # ([template%rep_dic] if show_mc_uncertainties else []) - # The use of sqrt(-1) is just a trick to prevent the line to display + # The use of 1/0 is just a trick to prevent the line to display res = [] - rep_dic['data'] = '($3 < 0 ? sqrt(-1) : $3)' + rep_dic['data'] = '($3 < 0 ? 1/0 : $3)' res.append(template_no_stat%rep_dic) rep_dic['title'] = " title ''" if show_mc_uncertainties: res.append(template%rep_dic) - rep_dic['data'] = '($3 >= 0 ? sqrt(-1) : abs($3))' + rep_dic['data'] = '($3 >= 0 ? 1/0 : abs($3))' rep_dic['ls'] = ' ls %d'%(100+color_index) res.append(template_no_stat%rep_dic) if show_mc_uncertainties: @@ -2739,13 +2738,13 @@ def ratio_no_correlations(wgtsA, wgtsB): """#-- rendering subhistograms '%(subhistogram_type)s' %(unset label)s %(set_format_y)s +%(set_yscale)s set yrange [%(ymin).4e:%(ymax).4e] set origin %(origin_x).4e, %(origin_y).4e set size %(size_x).4e, %(size_y).4e set mytics %(mytics)d %(set_ytics)s %(set_format_x)s -%(set_yscale)s %(set_ylabel)s %(set_histo_label)s plot \\""" @@ -2878,7 +2877,7 @@ def ratio_no_correlations(wgtsA, wgtsB): # We decide to show uncertainties in the main plot only if they # are part of a monocolor band. Otherwise, they will only be - # shown in the first subplot. Notice that plotting 'sqrt(-1)' + # shown in the first subplot. Notice that plotting '1/0' # is just a trick so as to have only the key printed with no # line @@ -2890,7 +2889,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, scale variation'%title, band='scale' in use_band) else: uncertainty_plot_lines[-1]['scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] # And now PDF_variation if available if not PDF_var_pos is None and len(PDF_var_pos)>0: if 'pdf' in use_band: @@ -2899,7 +2898,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, PDF variation'%title, band='pdf' in use_band) else: uncertainty_plot_lines[-1]['pdf'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] # And now merging variation if available if not merging_var_pos is None and len(merging_var_pos)>0: if 'merging_scale' in use_band: @@ -2908,7 +2907,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, merging scale variation'%title, band='merging_scale' in use_band) else: uncertainty_plot_lines[-1]['merging_scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] # And now alpsfact variation if available if not alpsfact_var_pos is None and len(alpsfact_var_pos)>0: if 'alpsfact' in use_band: @@ -2917,7 +2916,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, alpsfact variation'%title, band='alpsfact' in use_band) else: uncertainty_plot_lines[-1]['alpsfact'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] # plot_lines.append( # "'%s' index %d using (($1+$2)/2):3 ls %d title '%s'"\ diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py 
b/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py index 0924927785..262d39a736 100644 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Aug 2023) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. import logging import os @@ -33,7 +33,7 @@ def compile(self, *args, **opts): if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') common_run_interface.CommonRunCmd.update_make_opts_full(path, - {'FPTYPE': self.run_card['floating_type'] }) + {'override FPTYPE': self.run_card['floating_type'] }) misc.sprint('FPTYPE checked') cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): @@ -76,7 +76,7 @@ def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + common_run_interface.CommonRunCmd.update_make_opts_full({'override FPTYPE': new_value}) else: raise Exception Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') @@ -133,7 +133,8 @@ def default_setup(self): super().default_setup() # change default value: self['cudacpp_backend'] = 'cuda' - self['vector_size'] = 16384 # already setup in default class (just change value) + self['vector_size'] = 32 # ZW: default to 32, might want to change to 64 to utilise AMD GPUs better as well # 16384 # already setup in default class (just change value) + self['nb_warp'] = 512 # number of warps per kernel call, for now setting to 16 384 / vector_size MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/gg_tt.mad/bin/internal/lhe_parser.py index f6e47956cd..81d17f7cb1 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/lhe_parser.py @@ -1035,12 +1035,12 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): from_init = True if not from_init: - if group in grouped_cross: - grouped_cross[group] += self.allcross[i] - grouped_error[group] += self.error[i]**2 + if int(group) in grouped_cross: + grouped_cross[int(group)] += self.allcross[i] + grouped_error[int(group)] += self.error[i]**2 else: - grouped_cross[group] = self.allcross[i] - grouped_error[group] = self.error[i]**2 + grouped_cross[int(group)] = self.allcross[i] + grouped_error[int(group)] = self.error[i]**2 else: ban = banner_mod.Banner(ff.banner) for line in ban['init'].split('\n'): @@ -1048,11 +1048,11 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): if len(splitline)==4: cross, error, _, group = splitline if int(group) in grouped_cross: - grouped_cross[group] += float(cross) - grouped_error[group] += float(error)**2 + grouped_cross[int(group)] += float(cross) + grouped_error[int(group)] += float(error)**2 else: - grouped_cross[group] = float(cross) - grouped_error[group] = float(error)**2 + grouped_cross[int(group)] = 
float(cross) + grouped_error[int(group)] = float(error)**2 nb_group = len(grouped_cross) # compute the information for the first line @@ -1086,6 +1086,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): self.seek(0) if init_information["idbmup2"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup2"] = event[1].pdg self.seek(0) @@ -2166,10 +2168,13 @@ def check(self): abspz += abs(particle.pz) # check mass fourmass = FourMomentum(particle).mass - - if particle.mass and (abs(particle.mass) - fourmass)/ abs(particle.mass) > threshold: - raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) - + if particle.mass: + expected = (particle.E - math.sqrt(particle.E**2 -particle.mass**2))/particle.E + if expected > 1e-8: + mass_threshold = particle.E**2 - (particle.E-threshold)**2 + if (abs(particle.mass) - fourmass)/ mass_threshold > 5: + raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) + if E/absE > threshold: logger.critical(self) @@ -2953,8 +2958,8 @@ def pt(self): @property def pseudorapidity(self): - norm = math.sqrt(self.px**2 + self.py**2+self.pz**2) - return 0.5* math.log((norm - self.pz) / (norm + self.pz)) + norm = math.sqrt(self.px**2 + self.py**2 + self.pz**2) + return 0.5* math.log((norm + self.pz) / (norm - self.pz)) @property def rapidity(self): diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_tt.mad/bin/internal/madevent_interface.py index 85e5bcf5e3..4d5597c722 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/madevent_interface.py @@ -1171,10 +1171,10 @@ def check_survey(self, args, cmd='survey'): for opt,value in self._survey_options.items(): if arg.startswith('--%s=' % opt): exec('self.opts[\'%s\'] = %s(arg.split(\'=\')[-1])' % \ - (opt, value[0])) + (opt, value[0]), globals(), {'self':self, 'arg':arg}) arg = "" if arg != "": raise Exception - except Exception: + except Exception as error: self.help_survey() raise self.InvalidCmd('invalid %s argument'% arg) @@ -2827,10 +2827,10 @@ def print_results_in_shell(self, data): logger.info(" Nb of events after matching/merging : %d" % int(data['nb_event_pythia'])) if self.run_card['use_syst'] in self.true and \ (int(self.run_card['ickkw'])==1 or self.run_card['ktdurham']>0.0 - or self.run_card['ptlund']>0.0): + or self.run_card['ptlund']>0.0) and data['cross_pythia'] == -1: logger.info(" Notice that because Systematics computation is turned on, the merging did not veto events but modified their weights instead.\n"+\ " The resulting hepmc/stdhep file should therefore be use with those weights.") - else: + elif data['cross_pythia'] == -1: logger.info(" Nb of events after merging : %s" % data['nb_event_pythia']) logger.info(" " ) @@ -3055,6 +3055,7 @@ def do_multi_run(self, line): crossoversig = 0 inv_sq_err = 0 nb_event = 0 + madspin = False for i in range(nb_run): self.nb_refine = 0 self.exec_cmd('generate_events %s_%s -f' % (main_name, i), postcmd=False) @@ -3067,6 +3068,8 @@ def do_multi_run(self, line): inv_sq_err+=1.0/error**2 self.results[main_name][-1]['cross'] = crossoversig/inv_sq_err self.results[main_name][-1]['error'] = math.sqrt(1.0/inv_sq_err) + if 'decayed' in self.run_name: + madspin = True self.results.def_current(main_name) 
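The multi-run bookkeeping above (crossoversig, inv_sq_err) is an inverse-variance weighted average of the per-run cross sections, which is the minimum-variance combination of independent estimates. A standalone sketch of the same arithmetic (the function name is illustrative):

```python
import math

def combine_cross_sections(runs):
    """Combine (cross, error) pairs with inverse-variance weights."""
    sum_w = sum(1.0 / err**2 for _, err in runs)   # total weight
    sum_wx = sum(x / err**2 for x, err in runs)    # weighted sum of cross sections
    return sum_wx / sum_w, math.sqrt(1.0 / sum_w)  # combined cross and error

cross, error = combine_cross_sections([(10.0, 0.5), (10.4, 0.25)])
# The more precise run dominates: cross == 10.32, error == sqrt(1/20) ~ 0.224
```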
self.run_name = main_name self.update_status("Merging LHE files", level='parton') @@ -3074,9 +3077,12 @@ def do_multi_run(self, line): os.mkdir(pjoin(self.me_dir,'Events', self.run_name)) except Exception: pass - os.system('%(bin)s/merge.pl %(event)s/%(name)s_*/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' + + os.system('%(bin)s/merge.pl %(event)s/%(name)s_*%(madspin)s/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' % {'bin': self.dirbin, 'event': pjoin(self.me_dir,'Events'), - 'name': self.run_name}) + 'name': self.run_name, + 'madspin': '_decayed_*' if madspin else '' + }) eradir = self.options['exrootanalysis_path'] if eradir and misc.is_executable(pjoin(eradir,'ExRootLHEFConverter')): @@ -3656,9 +3662,11 @@ def do_refine(self, line): else: self.refine_mode = "new" - cross, error = self.make_make_all_html_results() + cross, error, across = self.make_make_all_html_results(get_attr=('xsec','xerru','axsec')) + self.results.add_detail('cross', cross) self.results.add_detail('error', error) + self.results.add_detail('axsec', across) self.results.add_detail('run_statistics', dict(self.results.get_detail('run_statistics'))) @@ -3757,6 +3765,8 @@ def split(a, n): k, m = divmod(len(a), n) return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + Gdirs = self.remove_empty_events(Gdirs) + partials_info = [] if len(Gdirs) >= max_G: start_unweight= time.perf_counter() @@ -3786,7 +3796,7 @@ def split(a, n): for i, local_G in enumerate(split(Gdirs, nb_chunk)): line = [pjoin(self.me_dir, "Events", self.run_name, "partials%d.lhe.gz" % i)] line.append(pjoin(self.me_dir, 'Events', self.run_name, '%s_%s_banner.txt' % (self.run_name, tag))) - line.append(str(self.results.current['cross'])) + line.append(str(self.results.current.get('axsec'))) line += local_G partials_info.append(self.do_combine_events_partial(' '.join(line), preprocess_only=True)) mycluster.submit(sys.executable, @@ -4223,7 +4233,7 @@ def mg5amc_py8_interface_consistency_warning(options): return None - def setup_Pythia8RunAndCard(self, PY8_Card, run_type): + def setup_Pythia8RunAndCard(self, PY8_Card, run_type, use_mg5amc_py8_interface): """ Setup the Pythia8 Run environment and card. In particular all the process and run specific parameters of the card are automatically set here. This function returns the path where HEPMC events will be output, if any.""" @@ -4338,10 +4348,10 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.systemSet('Beams:setProductionScalesFromLHEF',True) # Automatically set qWeed to xqcut if not defined by the user. - if PY8_Card['SysCalc:qWeed']==-1.0: + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qWeed']==-1.0: PY8_Card.MadGraphSet('SysCalc:qWeed',self.run_card['xqcut'], force=True) - if PY8_Card['SysCalc:qCutList']=='auto': + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qCutList']=='auto': if self.run_card['use_syst']: if self.run_card['sys_matchscale']=='auto': qcut = PY8_Card['JetMatching:qCut'] @@ -4368,7 +4378,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): # Specific MLM settings # PY8 should not implement the MLM veto since the driver should do it # if merging scale variation is turned on - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. 
PY8_Card.MadGraphSet('JetMatching:doVeto',False) @@ -4444,7 +4454,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.MadGraphSet('SpaceShower:pTmaxMatch',1) PY8_Card.MadGraphSet('SpaceShower:rapidityOrder',False) # PY8 should not implement the CKKW veto since the driver should do it. - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('Merging:applyVeto',False) @@ -4516,6 +4526,12 @@ def do_pythia8(self, line): else: no_default = False + if '--old_interface' in args: + use_mg5amc_py8_interface = True + args.remove('--old_interface') + else: + use_mg5amc_py8_interface = False + if not self.run_name: self.check_pythia8(args) self.configure_directory(html_opening =False) @@ -4545,20 +4561,27 @@ def do_pythia8(self, line): #"Please use 'event_norm = average' in the run_card to avoid this problem.") - - if not self.options['mg5amc_py8_interface_path'] or not \ - os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface')): - raise self.InvalidCmd( -"""The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. -Please install this tool with the following MG5_aMC command: - MG5_aMC> install mg5amc_py8_interface_path""") + if use_mg5amc_py8_interface: + if not self.options['mg5amc_py8_interface_path'] or not \ + os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface')): + raise self.InvalidCmd( + """The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. + Please install this tool with the following MG5_aMC command: + MG5_aMC> install mg5amc_py8_interface_path""") + else: + pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface') + warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) + if warnings: + logger.warning(warnings) else: - pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface') - warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) - if warnings: - logger.warning(warnings) + pythia_main = pjoin(self.options['pythia8_path'], 'share', 'Pythia8', 'examples', 'main164') + if not os.path.exists(pythia_main): + pythia_main = pjoin(self.options['pythia8_path'], 'examples', 'main164') + if not os.path.exists(pythia_main): + logger.warning('main164 not found (or not compiled). Will try the old interface instead.') + return self.do_pythia8(line + ' --old_interface') self.results.add_detail('run_mode', 'madevent') @@ -4583,14 +4606,19 @@ def do_pythia8(self, line): run_type = 'CKKW' # Edit the card and run environment according to the run specification - HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type) + HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type, use_mg5amc_py8_interface=use_mg5amc_py8_interface) + + if not use_mg5amc_py8_interface and self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + PY8_Card['Main:numberOfEvents']= self.run_card['nevents'] + # Now write the card. 
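For the new direct-Pythia8 path above, the driver executable is resolved by probing two conventional locations of Pythia8's main164 example before giving up and recursing with --old_interface. A condensed sketch of that lookup (the helper name is hypothetical; the two candidate paths are the ones probed in the patch):

```python
import os
pjoin = os.path.join

def find_pythia8_main164(pythia8_path):
    """Return the main164 driver path, or None to signal the old-interface fallback."""
    candidates = [
        pjoin(pythia8_path, 'share', 'Pythia8', 'examples', 'main164'),  # installed layout
        pjoin(pythia8_path, 'examples', 'main164'),                      # in-source layout
    ]
    for exe in candidates:
        if os.path.exists(exe):
            return exe
    return None  # caller logs a warning and retries with --old_interface
```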
pythia_cmd_card = pjoin(self.me_dir, 'Events', self.run_name , '%s_pythia8.cmd' % tag) cmd_card = StringIO.StringIO() PY8_Card.write(cmd_card,pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Now setup the preamble to make sure that everything will use the locally # installed tools (if present) even if the user did not add it to its @@ -4632,7 +4660,7 @@ def do_pythia8(self, line): " command '/usr/bin/env %s' exists and returns a valid path."%shell) exe_cmd = "#!%s\n%s"%(shell_exe,' '.join( - [preamble+pythia_main, + [preamble+pythia_main, '' if use_mg5amc_py8_interface else '-c', os.path.basename(pythia_cmd_card)])) wrapper.write(exe_cmd) @@ -4699,6 +4727,7 @@ def do_pythia8(self, line): n_cores = max(min(min_n_core,n_cores),1) if self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + # No need for parallelization anymore self.cluster = None logger.info('Follow Pythia8 shower by running the '+ @@ -4744,20 +4773,22 @@ def do_pythia8(self, line): ParallelPY8Card.subruns[0].systemSet('Beams:LHEF','events.lhe.gz') ParallelPY8Card.write(pjoin(parallelization_dir,'PY8Card.dat'), pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Write the wrapper wrapper_path = pjoin(parallelization_dir,'run_PY8.sh') wrapper = open(wrapper_path,'w') if self.options['cluster_temp_path'] is None: exe_cmd = \ -"""#!%s -./%s PY8Card.dat >& PY8_log.txt -""" +"""#!%%s +./%%s %s PY8Card.dat >& PY8_log.txt +""" % ('' if use_mg5amc_py8_interface else '-c') + else: exe_cmd = \ -"""#!%s +"""#!%%s ln -s ./events_$1.lhe.gz ./events.lhe.gz -./%s PY8Card_$1.dat >& PY8_log.txt +./%%s %s PY8Card_$1.dat >& PY8_log.txt mkdir split_$1 if [ -f ./events.hepmc ]; then @@ -4776,7 +4807,7 @@ def do_pythia8(self, line): mv ./PY8_log.txt ./split_$1/ fi tar -czf split_$1.tar.gz split_$1 -""" +""" % ('' if use_mg5amc_py8_interface else '-c') exe_cmd = exe_cmd%(shell_exe,os.path.basename(pythia_main)) wrapper.write(exe_cmd) wrapper.close() @@ -4812,19 +4843,27 @@ def do_pythia8(self, line): pjoin(parallelization_dir,split_files[-1])) logger.info('Submitting Pythia8 jobs...') + for i, split_file in enumerate(split_files): # We must write a PY8Card tailored for each split so as to correct the normalization # HEPMCoutput:scaling of each weight since the lhe showered will not longer contain the # same original number of events - split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat')) + split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat'), setter='user') + assert split_PY8_Card['JetMatching:nJetMax'] == PY8_Card['JetMatching:nJetMax'] + + + # Make sure to sure the number of split_events determined during the splitting. - split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i]) + split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i], force=True) + assert split_PY8_Card['Main:numberOfEvents'] == partition_for_PY8[i] split_PY8_Card.systemSet('HEPMCoutput:scaling',split_PY8_Card['HEPMCoutput:scaling']* - (float(partition_for_PY8[i]))) + (float(partition_for_PY8[i])), force=True) # Add_missing set to False so as to be sure not to add any additional parameter w.r.t # the ones in the original PY8 param_card copied. 
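The per-split card rewriting above keeps the combined HepMC sample correctly normalized: each split shower job only sees partition_for_PY8[i] events, so its card receives that event count and an HEPMCoutput:scaling multiplied by it. A minimal sketch of the bookkeeping under those assumptions (the helper name is illustrative):

```python
def per_split_card_settings(base_scaling, partition):
    """One settings dict per split shower job (illustrative)."""
    return [
        {
            'Main:numberOfEvents': n,                       # events this split actually sees
            'HEPMCoutput:scaling': base_scaling * float(n), # rescale weights accordingly
        }
        for n in partition
    ]

print(per_split_card_settings(1.0, [250, 250, 123]))
```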
split_PY8_Card.write(pjoin(parallelization_dir,'PY8Card_%d.dat'%i), - pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False) + pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False, + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) in_files = [pjoin(parallelization_dir,os.path.basename(pythia_main)), pjoin(parallelization_dir,'PY8Card_%d.dat'%i), pjoin(parallelization_dir,split_file)] @@ -5073,7 +5112,7 @@ def wait_monitoring(Idle, Running, Done): # works both for fixed number of generated events and fixed accepted events self.results.add_detail('error_pythia', error_m) - if self.run_card['use_syst']: + if self.run_card['use_syst'] and use_mg5amc_py8_interface: self.results.add_detail('cross_pythia', -1) self.results.add_detail('error_pythia', 0) @@ -6132,7 +6171,102 @@ def get_Gdir(self, Pdir=None, symfact=None): mfactors[pjoin(P, "G%s" % tag)] = mfactor self.Gdirs = (Gdirs, mfactors) return self.get_Gdir(Pdir, symfact=symfact) + + ############################################################################ + def remove_empty_events(self, Gdir): + """return Gdir strip from the one providing empty events.lhe files.""" + + reasons = collections.defaultdict(list) + Gdirs = Gdir[:] + for G in Gdirs[:]: + try: + size = os.path.getsize(pjoin(G, 'events.lhe')) + except Exception as error: + size = 0 + if size <10: + Gdirs.remove(G) + try: + log = misc.BackRead(pjoin(G, 'log.txt')) + except Exception as error: + log = misc.BackRead(pjoin(G, 'run1_app.log')) + found = -1 + for line in log: + if 'Deleting file events.lhe' in line: + found = 0 + elif "Impossible BW configuration" in line: + reasons['bwconfig'].append(G) + break + elif found < -150: + reasons['not found'].append(G) + Gdirs.append(G) + break + elif found < 0: + found -= 1 + elif 'Loosen cuts or increase max_events' in line: + reasons['cuts'].append(G) + break + elif 'all returned zero' in line: + reasons['zero'].append(G) + break + elif found > 5: + reasons['unknown'].append(G) + break + else: + found += 1 + + if len(reasons): + logger.debug('Reasons for empty events.lhe:') + if len(reasons['unknown']): + logger.debug(' - unknown: %s' % len(reasons['unknown'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['unknown'][:10]])) + if len(reasons['not found']): + logger.debug(' - not found in log: %s' % len(reasons['not found'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['not found'][:10]])) + if len(reasons['zero']): + logger.debug(' - zero amplitudes: %s' % len(reasons['zero'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit( os.sep)[-2:]) for G in reasons['zero'][:10]])) + if len(reasons['bwconfig']): + critical_bwconfig = set() + for G in reasons['bwconfig']: + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_bwconfig.add(os.sep.join(base.rsplit(os.sep)[-2:])) + for G in critical_bwconfig: + logger.warning('Gdirectory %s has no events.lhe file. (no BW config found %s times)' % G) + + logger.debug(' - impossible BW configuration: %s' % len(reasons['bwconfig'])) + logger.debug(' - channel with no possible BW configuration: %s' % len(critical_bwconfig)) + + if len(reasons['cuts']): + critical_nb_cuts = collections.defaultdict(int) + for G in reasons['cuts']: + if '.' 
in os.path.basename(G): + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_nb_cuts[os.sep.join(base.rsplit(os.sep)[-2:])] += 1 + else: + critical_nb_cuts[''] += 1 + logger.warning('Gdirectory %s has no events.lhe file. (no points passed cuts found)' % G) + for G, nb in critical_nb_cuts.items(): + if not G: + continue + else: + logger.warning('%s channel %s.XXX has no events.lhe file. (no points passed cuts). No %s with events detected' % (nb, G, G)) + logger.debug(' - no points passed cuts: %s' % len(reasons['cuts'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['cuts'][:10]])) + logger.debug(' - without any BW handling (critical): %s' % critical_nb_cuts['']) + logger.debug(' - with BW but all zero (critical): %s' % sum([nb for v, nb in critical_nb_cuts.items() if v!=''], 0)) + #logger.debug(' - cuts (with BW conflict where other channel contributes): %s' % (len(reasons['cuts'])- critical_nb_cuts)) + + + return Gdirs + + ############################################################################ def set_run_name(self, name, tag=None, level='parton', reload_card=False, allow_new_tag=True): @@ -6749,7 +6883,7 @@ def get_subP_ids(path): class GridPackCmd(MadEventCmd): """The command for the gridpack --Those are not suppose to be use interactively--""" - def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **stdin): + def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, nprocs=1, maxevts=2500, *completekey, **stdin): """Initialize the command and directly run""" # Initialize properly @@ -6759,6 +6893,8 @@ def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **s self.random = seed self.random_orig = self.random self.granularity = gran + self.nprocs = nprocs + self.maxevts = maxevts self.options['automatic_html_opening'] = False #write the grid_card.dat on disk @@ -6874,7 +7010,7 @@ def launch(self, nb_event, seed): #misc.call([pjoin(self.me_dir,'bin','refine4grid'), # str(nb_event), '0', 'Madevent','1','GridRun_%s' % seed], # cwd=self.me_dir) - self.refine4grid(nb_event) + self.gridpack_cross = self.refine4grid(nb_event) # 3) Combine the events/pythia/... self.exec_cmd('combine_events') @@ -6902,6 +7038,8 @@ def refine4grid(self, nb_event): precision = nb_event + across= self.make_make_all_html_results(get_attr='axsec') + self.opts = dict([(key,value[1]) for (key,value) in \ self._survey_options.items()]) @@ -6915,8 +7053,9 @@ def refine4grid(self, nb_event): self.update_status('Refine results to %s' % precision, level=None) logger.info("Using random number seed offset = %s" % self.random) - refine_opt = {'err_goal': nb_event, 'split_channels': False, - 'ngran':self.granularity, 'readonly': self.readonly} + refine_opt = {'err_goal': nb_event, 'split_channels': True, + 'ngran':self.granularity, 'readonly': self.readonly, + 'nprocs': self.nprocs, 'maxevts': self.maxevts} x_improve = gen_ximprove.gen_ximprove_gridpack(self, refine_opt) x_improve.launch() # create the ajob for the refinment and run those! self.gscalefact = x_improve.gscalefact #store jacobian associate to the gridpack @@ -6926,7 +7065,7 @@ def refine4grid(self, nb_event): #print 'run combine!!!' 
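The remove_empty_events helper added above prunes G directories whose events.lhe is empty and classifies each failure by scanning the channel log from the end (via misc.BackRead). A condensed sketch of the classification loop, using a plain reversed read instead of BackRead and only the headline patterns (the real helper also re-queues channels whose log carries no verdict and handles BW-conflict bookkeeping):

```python
import collections, os

def classify_empty_gdirs(gdirs):
    """Keep G directories with a usable events.lhe; bucket the empty ones by cause."""
    reasons = collections.defaultdict(list)
    kept = []
    for gdir in gdirs:
        try:
            size = os.path.getsize(os.path.join(gdir, 'events.lhe'))
        except OSError:
            size = 0
        if size >= 10:  # same "more than a few bytes" test as the patch
            kept.append(gdir)
            continue
        try:
            with open(os.path.join(gdir, 'log.txt')) as log:
                lines = list(log)[::-1]  # read from the end, like misc.BackRead
        except OSError:
            reasons['not found'].append(gdir)
            continue
        for line in lines:
            if 'Impossible BW configuration' in line:
                reasons['bwconfig'].append(gdir)
                break
            if 'Loosen cuts or increase max_events' in line:
                reasons['cuts'].append(gdir)
                break
            if 'all returned zero' in line:
                reasons['zero'].append(gdir)
                break
        else:
            reasons['unknown'].append(gdir)
    return kept, reasons
```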
#combine_runs.CombineRuns(self.me_dir) - return + return across #update html output Presults = sum_html.collect_result(self) cross, error = Presults.xsec, Presults.xerru @@ -7051,10 +7190,13 @@ def do_combine_events(self, line): sum_axsec += result.get('axsec')*gscalefact[Gdir] if len(AllEvent) >= 80: #perform a partial unweighting - if self.results.current['cross'] == 0 and self.run_card['gridpack']: - nb_event= self.nb_event + if not self.results.current.get('axsec'): + if self.run_card['gridpack'] and self.gridpack_cross: + nb_event = min(abs(1.05*self.nb_event*sum_axsec/self.gridpack_cross),self.nb_event) + else: + nb_event= self.nb_event else: - nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current['cross']),self.run_card['nevents']) + nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current.get('axsec')),self.run_card['nevents'], self.nb_event, self.gridpack_cross, sum_axsec) AllEvent.unweight(pjoin(outdir, self.run_name, "partials%s.lhe.gz" % partials), get_wgt, log_level=5, trunc_error=1e-2, event_target=nb_event) AllEvent = lhe_parser.MultiEventFile() @@ -7068,6 +7210,7 @@ def do_combine_events(self, line): for data in partials_info: AllEvent.add(*data) + sum_xsec += data[1] if not hasattr(self,'proc_characteristic'): self.proc_characteristic = self.get_characteristics() diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/restore_data b/epochX/cudacpp/gg_tt.mad/bin/internal/restore_data index 6205bb9567..407ed7aa91 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/restore_data +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/restore_data @@ -48,8 +48,17 @@ for i in `cat subproc.mg` ; do cd ../ done +# check if we are on a Mac, otherwise assume Linux +if [[ "$OSTYPE" == "darwin"* ]]; then + # no nproc on Mac, so use sysctl instead + # use -S1024 because there is a limit on the length of the command + xargs_opts="-P $(sysctl -n hw.ncpu) -S1024" +else + xargs_opts="-P $(nproc --all)" +fi + find . -mindepth 2 -maxdepth 2 -type d -name 'G*' -print0 \ - | xargs --null -P "$(nproc --all)" -I{} bash -c " + | xargs --null ${xargs_opts} -I{} bash -c " cd {} for j in $1_results.dat ; do if [[ -e \$j ]] ; then diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/sum_html.py b/epochX/cudacpp/gg_tt.mad/bin/internal/sum_html.py index 9dd5826f71..fb8dd3a74a 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/sum_html.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/sum_html.py @@ -770,7 +770,7 @@ def collect_result(cmd, folder_names=[], jobs=None, main_dir=None): return all -def make_all_html_results(cmd, folder_names = [], jobs=[]): +def make_all_html_results(cmd, folder_names = [], jobs=[], get_attr=None): """ folder_names and jobs have been added for the amcatnlo runs """ run = cmd.results.current['run_name'] if not os.path.exists(pjoin(cmd.me_dir, 'HTML', run)): @@ -794,7 +794,12 @@ def make_all_html_results(cmd, folder_names = [], jobs=[]): fsock.write('%s
<br><br>' % Presults.get_html(run, unit, cmd.me_dir)) fsock.write('%s <br>
' % P_text) - return Presults.xsec, Presults.xerru + if not get_attr: + return Presults.xsec, Presults.xerru + else: + if isinstance(get_attr, tuple): + return [getattr(Presults, _) for _ in get_attr] + return getattr(Presults, get_attr) diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/ufomodel/write_param_card.py b/epochX/cudacpp/gg_tt.mad/bin/internal/ufomodel/write_param_card.py index 57a85b0614..33a89259f8 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/ufomodel/write_param_card.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/ufomodel/write_param_card.py @@ -116,9 +116,10 @@ def write_param(self, param, lhablock): def write_dep_param_block(self, lhablock): import cmath from parameters import all_parameters + param_values = {'cmath':cmath} for parameter in all_parameters: try: - exec("%s = %s" % (parameter.name, parameter.value)) + exec("%s = %s" % (parameter.name, parameter.value), globals(), param_values) except Exception: pass text = "## Not dependent paramater.\n" @@ -134,7 +135,7 @@ def write_dep_param_block(self, lhablock): prefix = "DECAY " for part, param in data: if isinstance(param.value, str): - value = complex(eval(param.value)).real + value = complex(eval(param.value, globals(), param_values)).real else: value = param.value diff --git a/epochX/cudacpp/gg_tt.mad/bin/madevent b/epochX/cudacpp/gg_tt.mad/bin/madevent index dff9711b73..9c5363e682 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/madevent +++ b/epochX/cudacpp/gg_tt.mad/bin/madevent @@ -178,6 +178,17 @@ force_run = False if (args and args[0] == 'treatcards'): force_run=True + +# check that madgraph is not in PYTHONPATH +try: + import madgraph +except ImportError: + pass +else: + logger.getLogger('madgraph').error('Looks like you do have madgraph in your PYTHONPATH (or you run this executable from the main MG5aMC directory). This executable will likely not work in such case.') + + + # Call the cmd interface main loop try: if '-h' in args or '--help' in args: diff --git a/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h index febf1dcf42..5ca71ae17f 100644 --- a/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h @@ -2,13 +2,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Sep 2010) for the MG5aMC backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -860,7 +860,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ INLINE void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -873,7 +873,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ INLINE void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -885,7 +885,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ INLINE void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -898,7 +898,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ INLINE void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -911,7 +911,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -924,7 +924,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P2[4] = { +cxreal( V2[0] ), +cxreal( V2[1] ), +cximag( V2[1] ), +cximag( V2[0] ) }; @@ -949,7 +949,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -962,7 +962,7 @@ namespace mg5amcCpu const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. 
); const cxtype_sv TMP5 = ( F1[2] * ( F2[4] * ( V3[2] + V3[5] ) + F2[5] * ( V3[3] + cI * V3[4] ) ) + ( F1[3] * ( F2[4] * ( V3[3] - cI * V3[4] ) + F2[5] * ( V3[2] - V3[5] ) ) + ( F1[4] * ( F2[2] * ( V3[2] - V3[5] ) - F2[3] * ( V3[3] + cI * V3[4] ) ) + F1[5] * ( F2[2] * ( -V3[3] + cI * V3[4] ) + F2[3] * ( V3[2] + V3[5] ) ) ) ) ); @@ -974,7 +974,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -987,7 +987,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F1 = W_ACCESS::kernelAccess( allF1 ); const cxtype cI = cxmake( 0., 1. ); F1[0] = +F2[0] + V3[0]; @@ -1006,7 +1006,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -1019,7 +1019,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F2 = W_ACCESS::kernelAccess( allF2 ); const cxtype cI = cxmake( 0., 1. ); F2[0] = +F1[0] + V3[0]; diff --git a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc index d09f387480..e6b63799bd 100644 --- a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h index ba434e7b98..a2c8f92751 100644 --- a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. 
//========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -300,7 +300,7 @@ namespace mg5amcCpu #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #endif // Compute the output couplings (e.g. gc10 and gc11) from the input gs - template + template __device__ inline void G2COUP( const fptype gs[], fptype couplings[], @@ -310,10 +310,10 @@ namespace mg5amcCpu using namespace Parameters_sm_dependentCouplings; const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr ); - fptype* GC_10s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); - fptype* GC_11s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); - cxtype_sv_ref GC_10s_sv = C_ACCESS::kernelAccess( GC_10s ); - cxtype_sv_ref GC_11s_sv = C_ACCESS::kernelAccess( GC_11s ); + fptype* GC_10s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); + fptype* GC_11s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); + cxtype_sv_ref GC_10s_sv = CD_ACCESS::kernelAccess( GC_10s ); + cxtype_sv_ref GC_11s_sv = CD_ACCESS::kernelAccess( GC_11s ); GC_10s_sv = couplings_sv.GC_10; GC_11s_sv = couplings_sv.GC_11; mgDebug( 1, __FUNCTION__ ); diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h index 7c6a082392..ca859a602e 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for generating random numbers +// For both CUDA and HIP, by default, do not inline, but allow this macro to be set from outside with e.g. 
-DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h index 92d74fd6db..e98e925f2a 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. 
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -717,12 +717,24 @@ namespace mg5amcCpu : m_preal( &r ), m_pimag( &i ) {} // copy (create from) const refs cxtype_ref& operator=( const cxtype_ref& ) = delete; //__host__ __device__ cxtype_ref& operator=( cxtype_ref&& c ) {...} // REMOVED! Should copy refs or copy values? No longer needed in cxternary - __host__ __device__ cxtype_ref& operator=( const cxtype& c ) + __host__ __device__ cxtype_ref& operator=( const cxtype& c ) // copy (assign) const values { *m_preal = cxreal( c ); *m_pimag = cximag( c ); return *this; - } // copy (assign) non-const values + } + __host__ __device__ cxtype_ref& operator+=( const cxtype& c ) + { + *m_preal += cxreal( c ); + *m_pimag += cximag( c ); + return *this; + } + __host__ __device__ cxtype_ref& operator-=( const cxtype& c ) + { + *m_preal -= cxreal( c ); + *m_pimag -= cximag( c ); + return *this; + } __host__ __device__ operator cxtype() const { return cxmake( *m_preal, *m_pimag ); } private: fptype* const m_preal; // const pointer to non-const fptype R diff --git a/epochX/cudacpp/gg_tt.mad/test/cudacpp_test.mk b/epochX/cudacpp/gg_tt.mad/test/cudacpp_test.mk index f703a1ae7c..1f9f8bbc46 100644 --- a/epochX/cudacpp/gg_tt.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_tt.mad/test/cudacpp_test.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) @@ -19,7 +19,7 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt index 816b17272d..6bb0010b24 100644 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.3 2025-06-12 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -49,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -58,7 +58,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006091594696044922  +DEBUG: model prefixing takes 0.005433559417724609  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -151,45 +151,45 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.008 s +1 processes with 3 diagrams generated in 0.009 s Total: 1 processes with 3 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_tt Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 218]  -DEBUG: type(subproc_group)= [output.py at line 219]  -DEBUG: type(fortran_model)= [output.py at line 220]  -DEBUG: type(me)= me=0 [output.py at line 221]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 222]  -INFO: Creating files in directory 
/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/. +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  +DEBUG: type(subproc_group)= [output.py at line 223]  +DEBUG: type(fortran_model)= [output.py at line 224]  +DEBUG: type(me)= me=0 [output.py at line 225]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'diagram_boilerplate.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  +INFO: Creating files in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/. Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.144 s +ALOHA: aloha creates 2 routines in 0.141 s VVV1 FFV1 FFV1 FFV1 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. quit -real 0m0.544s -user 0m0.472s -sys 0m0.060s +real 0m0.547s +user 0m0.487s +sys 0m0.055s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h index 87aa648dd2..a45024704a 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -255,11 +255,15 @@ namespace mg5amcCpu throw std::logic_error( "Bridge constructor: FIXME! cannot choose gputhreads" ); // this should never happen! m_gpublocks = m_nevt / m_gputhreads; } +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters @@ -290,8 +294,10 @@ namespace mg5amcCpu throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! 
Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl;
+#endif
     m_pmek->setGrid( m_gpublocks, m_gputhreads );
   }
 #endif
@@ -347,7 +353,9 @@ namespace mg5amcCpu
     if( goodHelOnly ) return;
     m_pmek->computeMatrixElements( useChannelIds );
     copyHostFromDevice( m_hstMEs, m_devMEs );
+#ifdef MGONGPUCPP_VERBOSE
     flagAbnormalMEs( m_hstMEs.data(), m_nevt );
+#endif
     copyHostFromDevice( m_hstSelHel, m_devSelHel );
     copyHostFromDevice( m_hstSelCol, m_devSelCol );
     if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> )
@@ -400,7 +408,9 @@ namespace mg5amcCpu
     }
     if( goodHelOnly ) return;
     m_pmek->computeMatrixElements( useChannelIds );
+#ifdef MGONGPUCPP_VERBOSE
     flagAbnormalMEs( m_hstMEs.data(), m_nevt );
+#endif
     if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> )
     {
       memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() );
diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h
index 1afb14d668..8a37d1f947 100644
--- a/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h
+++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h
@@ -1,17 +1,23 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.
 
 #ifndef MG5AMC_GPUABSTRACTION_H
 #define MG5AMC_GPUABSTRACTION_H 1
 
+#include "mgOnGpuConfig.h"
+
 #include <cassert>
 
 //--------------------------------------------------------------------------
 
 #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
 
+#ifndef MGONGPU_HAS_NO_BLAS
+#include "cublas_v2.h"
+#endif
+
 #define gpuError_t cudaError_t
 #define gpuPeekAtLastError cudaPeekAtLastError
 #define gpuGetErrorString cudaGetErrorString
@@ -21,24 +27,61 @@
 #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) )
 #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) )
 #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
 #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
+#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice
 #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) )
 #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
 #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
+#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) )
+
 #define gpuSetDevice cudaSetDevice
 #define gpuDeviceSynchronize cudaDeviceSynchronize
 #define gpuDeviceReset cudaDeviceReset
 
 #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
-#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ )
+
+#define gpuStream_t cudaStream_t
+#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) )
+#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) )
+
+#define gpuBlasStatus_t cublasStatus_t
+#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS
+#ifndef MGONGPU_HAS_NO_BLAS
+#define gpuBlasHandle_t cublasHandle_t
+#else
+#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds
+#endif
+#define gpuBlasCreate cublasCreate
+#define gpuBlasDestroy cublasDestroy
+#define gpuBlasSetStream cublasSetStream
+
+#define gpuBlasSaxpy cublasSaxpy
+#define gpuBlasSdot cublasSdot
+#define gpuBlasSgemv cublasSgemv
+#define gpuBlasSgemm cublasSgemm
+#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched
+#define gpuBlasDaxpy cublasDaxpy
+#define gpuBlasDdot cublasDdot
+#define gpuBlasDgemv cublasDgemv
+#define gpuBlasDgemm cublasDgemm
+#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched
+#define GPUBLAS_OP_N CUBLAS_OP_N
+#define GPUBLAS_OP_T CUBLAS_OP_T
 
 //--------------------------------------------------------------------------
 
 #elif defined __HIPCC__
 
+#ifndef MGONGPU_HAS_NO_BLAS
+#include "hipblas/hipblas.h"
+#endif
+
 #define gpuError_t hipError_t
 #define gpuPeekAtLastError hipPeekAtLastError
 #define gpuGetErrorString hipGetErrorString
@@ -48,22 +91,69 @@
 #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )
 #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) )
 #define gpuMemcpyHostToDevice hipMemcpyHostToDevice
 #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice
 #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) )
 #define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
 #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
+#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) )
+
 #define gpuSetDevice hipSetDevice
 #define gpuDeviceSynchronize hipDeviceSynchronize
 #define gpuDeviceReset hipDeviceReset
 
 #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
-#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ )
+
+#define gpuStream_t hipStream_t
+#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) )
+#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) )
+
+#define gpuBlasStatus_t hipblasStatus_t
+#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
+#ifndef MGONGPU_HAS_NO_BLAS
+#define gpuBlasHandle_t hipblasHandle_t
+#else
+#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds
+#endif
+#define gpuBlasCreate hipblasCreate
+#define gpuBlasDestroy hipblasDestroy
+#define gpuBlasSetStream hipblasSetStream
+
+#define gpuBlasSaxpy hipblasSaxpy
+#define gpuBlasSdot hipblasSdot
+#define gpuBlasSgemv hipblasSgemv
+#define gpuBlasSgemm hipblasSgemm
+#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched
+#define gpuBlasDaxpy hipblasDaxpy
+#define gpuBlasDdot hipblasDdot
+#define gpuBlasDgemv hipblasDgemv
+#define gpuBlasDgemm hipblasDgemm
+#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched
+#define GPUBLAS_OP_N HIPBLAS_OP_N
+#define GPUBLAS_OP_T HIPBLAS_OP_T
+
+#endif
 
 //--------------------------------------------------------------------------
 
+#ifdef MGONGPU_FPTYPE2_FLOAT
+#define gpuBlasTaxpy gpuBlasSaxpy
+#define gpuBlasTdot gpuBlasSdot
+#define gpuBlasTgemv gpuBlasSgemv
+#define gpuBlasTgemm gpuBlasSgemm
+#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched
+#else
+#define gpuBlasTaxpy gpuBlasDaxpy
+#define gpuBlasTdot gpuBlasDdot
+#define gpuBlasTgemv gpuBlasDgemv
+#define gpuBlasTgemm gpuBlasDgemm
+#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched
 #endif
 
 #endif // MG5AMC_GPUABSTRACTION_H
diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuRuntime.h
index 860c7fde16..086aa6a616 100644
--- a/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuRuntime.h
+++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuRuntime.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin.
-// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin.
 
 #ifndef MG5AMC_GPURUNTIME_H
 #define MG5AMC_GPURUNTIME_H 1
@@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort =
 
 //--------------------------------------------------------------------------
 
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#ifndef MGONGPU_HAS_NO_BLAS
+#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); }
+inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true )
+{
+  if ( code != GPUBLAS_STATUS_SUCCESS )
+  {
+    printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc index f463977c1a..a68ae314eb 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
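
The GpuAbstraction.h and GpuRuntime.h hunks above do three things: they add stream and BLAS aliases (gpuStream_t, gpuBlasHandle_t and the gpuBlas*/GPUBLAS_* macros) that map onto either CUDA/cuBLAS or HIP/hipBLAS, they add a checkGpuBlas/assertGpuBlas error-checking helper mirroring the existing checkGpu/assertGpu pair, and they make the runtime quieter by default (the setUp/tearDown debug printouts, like the Bridge and kernel WARNING printouts earlier in this patch, are now emitted only in MGONGPUCPP_VERBOSE builds). The fragment below is a minimal sketch, not part of the patch, of how these macros are meant to compose in a CUDA build with BLAS enabled; it assumes only the names defined in the hunks above, and fillOnes is a placeholder kernel.

  #include "GpuAbstraction.h"
  #include "GpuRuntime.h"

  __global__ void fillOnes( double* out ) // placeholder kernel, not part of the plugin
  {
    const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
    out[ievt] = 1.;
  }

  void sketchStreamAndBlasSetup( const int gpublocks, const int gputhreads )
  {
    double* devBuf = nullptr;
    gpuMalloc( &devBuf, gpublocks * gputhreads * sizeof( double ) ); // expands to checkGpu( cudaMalloc( ... ) )
    gpuStream_t stream;
    gpuStreamCreate( &stream ); // expands to checkGpu( cudaStreamCreate( &stream ) )
    gpuBlasHandle_t handle;
    checkGpuBlas( gpuBlasCreate( &handle ) );           // cublasCreate, asserting on any non-SUCCESS status
    checkGpuBlas( gpuBlasSetStream( handle, stream ) ); // BLAS calls on this handle now run on 'stream'
    gpuLaunchKernelStream( fillOnes, gpublocks, gputhreads, stream, devBuf ); // fillOnes<<<gpublocks, gputhreads, 0, stream>>>( devBuf )
    checkGpu( gpuDeviceSynchronize() );
    checkGpuBlas( gpuBlasDestroy( handle ) );
    gpuStreamDestroy( stream );
    gpuFree( devBuf );
  }
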
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,28 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() + , m_pHelWfs() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_helBlasHandles() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +353,82 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); + // Create the "one-helicity" wavefunction buffer that will be used for helicity filtering + m_pHelWfs.reset( new DeviceBufferSimple( CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? 
+ std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { +#ifndef MGONGPU_HAS_NO_BLAS + if( m_helBlasHandles[ihel] ) gpuBlasDestroy( m_helBlasHandles[ihel] ); +#endif + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +445,61 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. 
Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity + // Attach a different stream to each cuBLAS/hipBLAS handle + if( m_blasColorSum ) + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + checkGpuBlas( gpuBlasCreate( &m_helBlasHandles[ighel] ) ); + checkGpuBlas( gpuBlasSetStream( m_helBlasHandles[ighel], m_helStreams[ighel] ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_helBlasHandles[ighel], CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelWfs.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +507,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* ghelBlasHandles = ( m_blasColorSum ? m_helBlasHandles : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* ghelBlasHandles = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +527,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? 
m_hstChannelIds.data() : nullptr );
     MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() );
 #endif
-    checkGpu( gpuPeekAtLastError() );
-    checkGpu( gpuDeviceSynchronize() );
+    checkGpu( gpuPeekAtLastError() ); // is this needed?
+    checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places...
   }
 
   //--------------------------------------------------------------------------
diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h
index 7acff4b308..c901874333 100644
--- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h
@@ -1,16 +1,19 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin.
 
 #ifndef MATRIXELEMENTKERNELS_H
 #define MATRIXELEMENTKERNELS_H 1
 
 #include "mgOnGpuConfig.h"
 
+#include "CPPProcess.h"
+#include "GpuAbstraction.h"
 #include "MemoryBuffers.h"
 
 #include <cassert>
+#include <memory>
 
 #ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
@@ -134,7 +137,7 @@ namespace mg5amcCpu
     // Does this host system support the SIMD used in the matrix element calculation?
     // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!]
-    static bool hostSupportsSIMD( const bool verbose = true );
+    static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false
 
   private:
 
@@ -191,12 +194,24 @@ namespace mg5amcCpu
     // The buffer for the event-by-event couplings that depends on alphas QCD
     DeviceBufferCouplings m_couplings;
 
+    // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime)
+    std::unique_ptr<DeviceBufferSimple> m_pHelMEs;
+
+    // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime)
+    std::unique_ptr<DeviceBufferSimple> m_pHelJamps;
+
+    // The super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime)
+    std::unique_ptr<DeviceBufferSimple> m_pHelWfs;
+
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // The buffer for the event-by-event numerators of multichannel factors
-    DeviceBufferNumerators m_numerators;
+    // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime)
+    std::unique_ptr<DeviceBufferSimple> m_pHelNumerators;
 
-    // The buffer for the event-by-event denominators of multichannel factors
-    DeviceBufferDenominators m_denominators;
+    // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime)
+    std::unique_ptr<DeviceBufferSimple> m_pHelDenominators;
+
+    // The super-buffer of ncolor jamp2 buffers
+    DeviceBufferSimple m_colJamp2s;
 #endif
 
 #ifdef MGONGPU_CHANNELID_DEBUG
@@ -205,6 +220,23 @@ namespace mg5amcCpu
     PinnedHostBufferChannelIds m_hstChannelIds;
 #endif
 
+#ifndef MGONGPU_HAS_NO_BLAS
+    // Decide at runtime whether to use BLAS for color sums
+    bool m_blasColorSum;
+
+    // Decide at runtime whether TF32TENSOR math should be used in cuBLAS
+    bool m_blasTf32Tensor;
+
+    // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers
+    std::unique_ptr<DeviceBufferSimple2> m_pHelBlasTmp;
+
+    // The array of cuBLAS/hipBLAS handles (one for each good helicity)
+    gpuBlasHandle_t m_helBlasHandles[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used)
+#endif
+
+    // The array of GPU streams (one for each good helicity)
+    gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used)
+
     // The number of blocks in the GPU grid
     size_t m_gpublocks;
diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessAmplitudes.h
index 0d92f69c43..a49f041e05 100644
--- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessAmplitudes.h
+++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessAmplitudes.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin.
 
 #ifndef MemoryAccessAmplitudes_H
 #define MemoryAccessAmplitudes_H 1
@@ -10,10 +10,6 @@
 
 #include "mgOnGpuCxtypes.h"
 
-#include "MemoryAccessHelpers.h"
-
-#define MGONGPU_TRIVIAL_AMPLITUDES 1
-
 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725)
 #ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
@@ -23,120 +19,11 @@ namespace mg5amcCpu
 {
   //----------------------------------------------------------------------------
 
-#ifndef MGONGPU_TRIVIAL_AMPLITUDES
-
-  // A class describing the internal layout of memory buffers for amplitudes
-  // This implementation uses an AOSOA[npagA][nx2][neppA] where nevt=npagA*neppA
-  // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name]
-  class MemoryAccessAmplitudesBase //_AOSOAv1
-  {
-  public:
-
-    // Number of Events Per Page in the amplitude AOSOA memory buffer layout
-    static constexpr int neppA = 1; // AOS (just a test...)
-
-  private:
-
-    friend class MemoryAccessHelper<MemoryAccessAmplitudesBase>;
-    friend class KernelAccessHelper<MemoryAccessAmplitudesBase, true>;
-    friend class KernelAccessHelper<MemoryAccessAmplitudesBase, false>;
-
-    // The number of floating point components of a complex number
-    static constexpr int nx2 = mgOnGpu::nx2;
-
-    //--------------------------------------------------------------------------
-    // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )"
-    // (in other words: first locate the event record for a given event, then locate an element in that record)
-    //--------------------------------------------------------------------------
-
-    // Locate an event record (output) in a memory buffer (input) from the given event number (input)
-    // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===]
-    static __host__ __device__ inline fptype*
-    ieventAccessRecord( fptype* buffer,
-                        const int ievt )
-    {
-      const int ipagA = ievt / neppA; // #event "A-page"
-      const int ieppA = ievt % neppA; // #event in the current event A-page
-      constexpr int ix2 = 0;
-      return &( buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA] ); // AOSOA[ipagA][ix2][ieppA]
-    }
-
-    //--------------------------------------------------------------------------
-
-    // Locate a field (output) of an event record (input) from the given field indexes (input)
-    // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===]
-    // [NB: expand variadic template "Ts... args" to "const int ix2" and rename "Field" as "Ix2"]
-    static __host__ __device__ inline fptype&
-    decodeRecord( fptype* buffer,
-                  const int ix2 )
-    {
-      constexpr int ipagA = 0;
-      constexpr int ieppA = 0;
-      return buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA]; // AOSOA[ipagA][ix2][ieppA]
-    }
-  };
-
-  //----------------------------------------------------------------------------
-
-  // A class providing access to memory buffers for a given event, based on explicit event numbers
-  // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations
-  class MemoryAccessAmplitudes : public MemoryAccessAmplitudesBase
-  {
-  public:
-
-    // Locate an event record (output) in a memory buffer (input) from the given event number (input)
-    // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===]
-    static constexpr auto ieventAccessRecord = MemoryAccessHelper<MemoryAccessAmplitudesBase>::ieventAccessRecord;
-
-    // Locate an event record (output) in a memory buffer (input) from the given event number (input)
-    // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===]
-    static constexpr auto ieventAccessRecordConst = MemoryAccessHelper<MemoryAccessAmplitudesBase>::ieventAccessRecordConst;
-
-    // Locate a field (output) of an event record (input) from the given field indexes (input)
-    // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===]
-    static constexpr auto decodeRecordIx2 = MemoryAccessHelper<MemoryAccessAmplitudesBase>::decodeRecord;
-
-    // Locate a field (output) of an event record (input) from the given field indexes (input)
-    // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===]
-    static constexpr auto decodeRecordIx2Const =
-      MemoryAccessHelper<MemoryAccessAmplitudesBase>::template decodeRecordConst<int>;
-
-    // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input)
-    // [Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt, const int ix2 ) <===]
-    static constexpr auto ieventAccessIx2 =
-      MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessField<int>;
-
-    // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input)
-    // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===]
-    static constexpr auto ieventAccessIx2Const =
-      MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessFieldConst<int>;
-  };
-
-#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES
-
-  //----------------------------------------------------------------------------
-
-  // A class providing access to memory buffers for a given event, based on implicit kernel rules
-  // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations
+  // A class providing trivial access to amplitude memory buffers
   template<bool onDevice>
   class KernelAccessAmplitudes
   {
   public:
-
-#ifndef MGONGPU_TRIVIAL_AMPLITUDES
-
-    // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
-    // [Signature (non-const) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===]
-    static constexpr auto kernelAccessIx2 =
-      KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessField<int>;
-
-    // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
-    // [Signature (const) ===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===]
-    static constexpr auto kernelAccessIx2Const =
-      KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessFieldConst<int>;
-
-#else
-
     static __host__ __device__ inline cxtype_sv*
     kernelAccess( fptype* buffer )
     {
@@ -148,8 +35,6 @@ namespace mg5amcCpu
     {
       return reinterpret_cast<const cxtype_sv*>( buffer );
     }
-
-#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES
   };
 
   //----------------------------------------------------------------------------
diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessCouplings.h
index 55504a2b90..caee99a7fd 100644
--- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessCouplings.h
+++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessCouplings.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
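
The MemoryAccessAmplitudes.h hunk above deletes the dormant AOSOA access machinery (the MGONGPU_TRIVIAL_AMPLITUDES branch had been hardcoded on since its introduction) and keeps only the trivial accessors. Trivial here means that the amplitude buffer handed to the accessor already points at the record for the current event or SIMD event page, so no page/offset arithmetic is needed and access is a plain reinterpret_cast. A condensed restatement of the surviving pattern, assuming the usual fptype and cxtype_sv definitions from mgOnGpuCxtypes.h (this is an illustration, not a new API):

  // The buffer holds exactly one amplitude record at the kernel's current
  // position, so it can be reinterpreted in place as a cxtype_sv array.
  template<bool onDevice>
  class KernelAccessAmplitudesSketch // illustrative copy of the pattern kept above
  {
  public:
    static __host__ __device__ inline cxtype_sv*
    kernelAccess( fptype* buffer )
    {
      return reinterpret_cast<cxtype_sv*>( buffer );
    }
    static __host__ __device__ inline const cxtype_sv*
    kernelAccessConst( const fptype* buffer )
    {
      return reinterpret_cast<const cxtype_sv*>( buffer );
    }
  };
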
#ifndef MemoryAccessCouplings_H #define MemoryAccessCouplings_H 1 @@ -235,7 +235,7 @@ namespace mg5amcCpu /* fptype_sv& real = kernelAccessIx2( buffer, 0 ); fptype_sv& imag = kernelAccessIx2( buffer, 1 ); - printf( "C_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv_ref( real, imag ); */ return cxtype_sv_ref( kernelAccessIx2( buffer, 0 ), @@ -250,7 +250,7 @@ namespace mg5amcCpu /* const fptype_sv& real = kernelAccessIx2Const( buffer, 0 ); const fptype_sv& imag = kernelAccessIx2Const( buffer, 1 ); - printf( "C_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv( real, imag ); */ return cxtype_sv( kernelAccessIx2Const( buffer, 0 ), diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessWavefunctions.h index 9f4c620bc7..bbffc1fb36 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessWavefunctions.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessWavefunctions_H #define MemoryAccessWavefunctions_H 1 @@ -10,9 +10,7 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 +#include "CPPProcess.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL @@ -23,147 +21,44 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // A class describing the internal layout of memory buffers for wavefunctions - // This implementation uses an AOSOA[npagW][nw6][nx2][neppW] where nevt=npagW*neppW - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessWavefunctionsBase //_AOSOAv1 +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessWavefunctions { public: - - // Number of Events Per Page in the wavefunction AOSOA memory buffer layout - static constexpr int neppW = 1; // AOS (just a test...) 
-
-  private:
-
-    friend class MemoryAccessHelper<MemoryAccessWavefunctionsBase>;
-    friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, true>;
-    friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, false>;
-
-    // The number of components of a (fermion or vector) wavefunction
-    static constexpr int nw6 = mgOnGpu::nw6;
-
-    // The number of floating point components of a complex number
-    static constexpr int nx2 = mgOnGpu::nx2;
-
-    //--------------------------------------------------------------------------
-    // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )"
-    // (in other words: first locate the event record for a given event, then locate an element in that record)
-    //--------------------------------------------------------------------------
-
-    // Locate an event record (output) in a memory buffer (input) from the given event number (input)
-    // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===]
-    static __host__ __device__ inline fptype*
-    ieventAccessRecord( fptype* buffer,
-                        const int ievt )
+    static __host__ __device__ inline cxtype_sv*
+    kernelAccess( fptype* buffer )
     {
-      const int ipagW = ievt / neppW; // #event "W-page"
-      const int ieppW = ievt % neppW; // #event in the current event W-page
-      constexpr int iw6 = 0;
-      constexpr int ix2 = 0;
-      return &( buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW] ); // AOSOA[ipagW][iw6][ix2][ieppW]
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      return reinterpret_cast<cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 );
     }
-
-    //--------------------------------------------------------------------------
-
-    // Locate a field (output) of an event record (input) from the given field indexes (input)
-    // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===]
-    // [NB: expand variadic template "Ts... args" to "const int iw6, const int ix2" and rename "Field" as "Iw6Ix2"]
-    static __host__ __device__ inline fptype&
-    decodeRecord( fptype* buffer,
-                  const int iw6,
-                  const int ix2 )
+    static __host__ __device__ inline const cxtype_sv*
+    kernelAccessConst( const fptype* buffer )
     {
-      constexpr int ipagW = 0;
-      constexpr int ieppW = 0;
-      return buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW]; // AOSOA[ipagW][iw6][ix2][ieppW]
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      return reinterpret_cast<const cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 );
     }
   };
+#endif
 
   //----------------------------------------------------------------------------
 
-  // A class providing access to memory buffers for a given event, based on explicit event numbers
-  // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations
-  class MemoryAccessWavefunctions : public MemoryAccessWavefunctionsBase
-  {
-  public:
-
-    // Locate an event record (output) in a memory buffer (input) from the given event number (input)
-    // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===]
-    static constexpr auto ieventAccessRecord = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecord;
-
-    // Locate an event record (output) in a memory buffer (input) from the given event number (input)
-    // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===]
-    static constexpr auto ieventAccessRecordConst = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecordConst;
-
-    // Locate a field (output) of an event record (input) from the given field indexes (input)
-    // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int iw6, const int ix2 ) <===]
-    static constexpr auto decodeRecordIw6Ix2 = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::decodeRecord;
-
-    // Locate a field (output) of an event record (input) from the given field indexes (input)
-    // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int iw6, const int ix2 ) <===]
-    static constexpr auto decodeRecordIw6Ix2Const =
-      MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template decodeRecordConst<int, int>;
-
-    // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input)
-    // [Signature (non-const) ===> fptype& ieventAccessIw6Ix2( fptype* buffer, const ievt, const int iw6, const int ix2 ) <===]
-    static constexpr auto ieventAccessIw6Ix2 =
-      MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessField<int, int>;
-
-    // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input)
-    // [Signature (const) ===> const fptype& ieventAccessIw6Ix2Const( const fptype* buffer, const ievt, const int iw6, const int ix2 ) <===]
-    static constexpr auto ieventAccessIw6Ix2Const =
-      MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessFieldConst<int, int>;
-  };
-
-#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS
-
-  //----------------------------------------------------------------------------
-
-  // A class providing access to memory buffers for a given event, based on implicit kernel rules
-  // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations
-  template<bool onDevice>
-  class KernelAccessWavefunctions
+  class HostAccessWavefunctions
   {
   public:
-
-#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS
-
-    // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
-    // [Signature (non-const) ===> fptype& kernelAccessIw6Ix2( fptype* buffer, const int iw6, const int ix2 ) <===]
-    static constexpr auto kernelAccessIw6Ix2 =
-      KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessField<int, int>;
-
-    // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
-    // [Signature (const) ===> const fptype& kernelAccessIw6Ix2Const( const fptype* buffer, const int iw6, const int ix2 ) <===]
-    static constexpr auto kernelAccessIw6Ix2Const =
-      KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessFieldConst<int, int>;
-
-#else
-
     static __host__ __device__ inline cxtype_sv*
     kernelAccess( fptype* buffer )
     {
       return reinterpret_cast<cxtype_sv*>( buffer );
     }
-
     static __host__ __device__ inline const cxtype_sv*
     kernelAccessConst( const fptype* buffer )
     {
       return reinterpret_cast<const cxtype_sv*>( buffer );
     }
-
-#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS
   };
 
   //----------------------------------------------------------------------------
 
-  typedef KernelAccessWavefunctions<false> HostAccessWavefunctions;
-  typedef KernelAccessWavefunctions<true> DeviceAccessWavefunctions;
-
-  //----------------------------------------------------------------------------
-
 } // end namespace mg5amcGpu/mg5amcCpu
 
 #endif // MemoryAccessWavefunctions_H
diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h
index 65a101888d..c8db607db6 100644
--- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h
+++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
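
In the MemoryAccessWavefunctions.h hunk above, the old AOSOA machinery likewise disappears, but the device and host sides now genuinely differ: on the host the buffer still holds a single wavefunction record and access stays trivial, while on the device each thread indexes its own record inside a large nevt-sized buffer (the per-helicity wavefunction super-buffers allocated in MatrixElementKernels.cc). The index arithmetic behind 'buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2' is worth spelling out; a small sketch with illustrative constants (nw6=6 components per wavefunction, nx2=2 for real and imaginary parts):

  #include <cassert>

  // Device-side layout: one AOS record per event, wfs[ievt][iw6][ix2] flattened
  // as wfs[ ievt * nw6 * nx2 + iw6 * nx2 + ix2 ].
  constexpr int nw6sketch = 6; // illustrative: components per HELAS wavefunction
  constexpr int nx2sketch = 2; // illustrative: real and imaginary parts
  inline int wfIndex( const int ievt, const int iw6, const int ix2 )
  {
    assert( iw6 >= 0 && iw6 < nw6sketch );
    assert( ix2 >= 0 && ix2 < nx2sketch );
    return ievt * nw6sketch * nx2sketch + iw6 * nx2sketch + ix2;
  }
  // Example: the imaginary part of component 3 for event 10 sits at
  // wfIndex( 10, 3, 1 ) = 10*12 + 3*2 + 1 = 127.
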
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer 
PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc index 5c7a133eed..ede8c7f653 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,20 +98,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 2; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,57 +166,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the 
current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - 
using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -226,294 +279,139 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 5; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
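// For illustration only: the access-class comments above distinguish "non-trivial access"
// (one buffer holds a given quantity for ALL events, so kernels index it by event) from
// "TRIVIAL ACCESS" (the buffer already points at a single event or SIMD page). A minimal
// standalone C++ sketch of the two indexing patterns; the names soaAccess/trivialAccess
// are hypothetical, not the plugin's MemoryAccess classes:
#include <cassert>
typedef double fptype; // stand-in for the plugin's fptype
// Non-trivial (SoA) access: index by field AND event within a buffer covering all nevt events
inline fptype& soaAccess( fptype* buffer, int ifield, int ievt, int nevt )
{
  assert( ievt < nevt );
  return buffer[ifield * nevt + ievt]; // field-major across events (coalesced reads on GPU)
}
// Trivial access: the buffer holds one event record, index by field only
inline fptype& trivialAccess( fptype* buffer, int ifield )
{
  return buffer[ifield];
}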
-    //MemoryBufferWavefunctions w_buffer[nwf]{ neppV };
+    // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code
     cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles)
-    cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram
-
-    // Proof of concept for using fptype* in the interface
-    fptype* w_fp[nwf];
-    for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] );
-    fptype* amp_fp;
-    amp_fp = reinterpret_cast<fptype*>( amp_sv );
-
-    // Local variables for the given CUDA event (ievt) or C++ event page (ipagV)
-    // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination]
-    cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!)
+    fptype* wfs = reinterpret_cast<fptype*>( w_sv );
+#else
+    // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt)
+    // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting
+    fptype* wfs = allWfs;
+#endif

     // === Calculate wavefunctions and amplitudes for all diagrams in all processes ===
     // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ ===
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-    // Mixed fptypes #537: float for color algebra and double elsewhere
-    // Delay color algebra and ME updates (only on even pages)
-    cxtype_sv jamp_sv_previous[ncolor] = {};
-    fptype* MEs_previous = 0;
-#endif
+
+    // *****************************
+    // *** START LOOP ON IPARITY ***
+    // *****************************
     for( int iParity = 0; iParity < nParity; ++iParity )
-    { // START LOOP ON IPARITY
+    {
 #ifndef MGONGPUCPP_GPUIMPL
       const int ievt0 = ievt00 + iParity * neppV;
 #endif
-      //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823)
-      constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823)
-      const fptype* allCOUPs[nxcoup];
-#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
-#pragma nv_diagnostic push
-#pragma nv_diag_suppress 186 // e.g. <<warning #186-D: pointless comparison of unsigned integer with zero>>
-#endif
-      for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
-        allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event
-      //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823
-      for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823
-        allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events
+
+      // -----------------
+      // --- COUPLINGS ---
+      // -----------------
 #ifdef MGONGPUCPP_GPUIMPL
-#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
-#pragma nv_diagnostic pop
-#endif
-      // CUDA kernels take input/output buffers with momenta/MEs for all events
-      const fptype* momenta = allmomenta;
-      const fptype* COUPs[nxcoup];
-      for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
-      fptype* MEs = allMEs;
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* numerators = allNumerators;
-      fptype* denominators = allDenominators;
-#endif
+      // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events
+      const fptype* couplings = allcouplings;
 #else
-      // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page)
-      const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 );
+      // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector
+      constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup)
+      const fptype* allCOUPs[nxcoup];
       const fptype* COUPs[nxcoup];
+      // Dependent couplings, vary event-by-event
       for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
-        COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event
-      //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823
-      for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823
-        COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events
-      fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
-      fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-#endif
-#endif
-
-      // Reset color flows (reset jamp_sv) at the beginning of a new event or event page
-      for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); }
-
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
-      fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
-      fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
+        allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup );
+      // Independent couplings, fixed for all events
+      for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup)
+        allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup );
+      // Dependent couplings, vary event-by-event
+      for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
+        COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 );
+      // Independent couplings, fixed for all events
+      for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup)
+        COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup];
 #endif

-      // *** DIAGRAM 1 OF 3 ***
-
-      // Wavefunction(s) for diagram number 1
-      vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 );
-
-      vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 );
-
-      oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 );
-
-      ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 );
-
-      VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[4] );
-
-      // Amplitude(s) for diagram number 1
-      FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+      // ---------------
+      // --- MOMENTA ---
+      // ---------------
+#ifdef MGONGPUCPP_GPUIMPL
+      // CUDA diagram kernels take input/output buffers with momenta for all events
+      const fptype* momenta = allmomenta;
+#else
+      // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector
+      const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 );
 #endif
-      jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 2 OF 3 ***
-
-      // Wavefunction(s) for diagram number 2
-      FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[4] );
-
-      // Amplitude(s) for diagram number 2
-      FFV1_0( w_fp[3], w_fp[4], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+      // -------------
+      // --- JAMPS ---
+      // -------------
+      // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel)
+#ifdef MGONGPUCPP_GPUIMPL
+      // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument
+      // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol])
+      fptype* jamps = allJamps;
+#else
+      // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument
+      // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol])
+      fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) );
 #endif
-      jamp_sv[0] -= amp_sv[0];
-      // *** DIAGRAM 3 OF 3 ***
-
-      // Wavefunction(s) for diagram number 3
-      FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[4] );
-
-      // Amplitude(s) for diagram number 3
-      FFV1_0( w_fp[4], w_fp[2], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+      // ------------------
+      // --- CHANNELIDS ---
+      // ------------------
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[1] -= amp_sv[0];
-
-      // *** COLOR CHOICE BELOW ***
-      // Store the leading color flows for choice of color
-      if( jamp2_sv ) // disable color choice if nullptr
-        for( int icol = 0; icol < ncolor; icol++ )
-          jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831
-
-      // *** COLOR MATRIX BELOW ***
-      // (This method used to be called CPPProcess::matrix_1_gg_ttx()?)
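// For illustration only: the deleted diagram code above adds each Feynman diagram's
// amplitude into the two leading-color flows (jamps) with fixed coefficients, before the
// color matrix below is applied. A standalone sketch using std::complex in place of the
// plugin's cxtype_sv (amp1..amp3 stand for the three FFV1_0 outputs):
#include <complex>
void addDiagramsToJamps( std::complex<double> jamp[2],
                         const std::complex<double>& amp1,  // diagram 1 (s-channel gluon)
                         const std::complex<double>& amp2,  // diagram 2
                         const std::complex<double>& amp3 ) // diagram 3
{
  const std::complex<double> I( 0, 1 );
  jamp[0] += I * amp1; // diagram 1 feeds both color flows, with opposite signs
  jamp[1] -= I * amp1;
  jamp[0] -= amp2; // diagram 2 feeds color flow 0 only
  jamp[1] -= amp3; // diagram 3 feeds color flow 1 only
}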
- - // The color denominators (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3 }; // 1-D array[2] - - // The color matrix (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 16, -2 }, - { -2, 16 } }; // 2-D array[2][2] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... 
icol=%d\n", icol );
-#ifndef MGONGPUCPP_GPUIMPL
-      // === C++ START ===
-      // Diagonal terms
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-      fptype2_sv& jampRi_sv = jampR_sv[icol];
-      fptype2_sv& jampIi_sv = jampI_sv[icol];
+#ifdef MGONGPUCPP_GPUIMPL
+      // CUDA diagram kernels take input/output buffers with channelIDs for all events
+      const unsigned int* channelIds = allChannelIds;
 #else
-      fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) );
-      fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) );
+      // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector
+      const unsigned int* channelIds = &channelId;
 #endif
-      fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv;
-      fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv;
-      // Off-diagonal terms
-      for( int jcol = icol + 1; jcol < ncolor; jcol++ )
-      {
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-        fptype2_sv& jampRj_sv = jampR_sv[jcol];
-        fptype2_sv& jampIj_sv = jampI_sv[jcol];
 #else
-        fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) );
-        fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) );
+      // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+      // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+      const unsigned int* channelIds = nullptr;
 #endif
-        ztempR_sv += cf2.value[icol][jcol] * jampRj_sv;
-        ztempI_sv += cf2.value[icol][jcol] * jampIj_sv;
-      }
-      fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-      deltaMEs_previous += fpvsplit0( deltaMEs2 );
-      deltaMEs += fpvsplit1( deltaMEs2 );
+
+      // -------------------------------
+      // --- NUMERATORS/DENOMINATORS ---
+      // -------------------------------
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+#ifdef MGONGPUCPP_GPUIMPL
+      // CUDA diagram kernels take input/output buffers with numerators/denominators for all events
+      fptype* numerators = allNumerators;
+      fptype* denominators = allDenominators;
 #else
-      deltaMEs += deltaMEs2;
+      // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector
+      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+      fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
 #endif
-      // === C++ END ===
 #else
-      // === CUDA START ===
-      fptype2_sv ztempR_sv = { 0 };
-      fptype2_sv ztempI_sv = { 0 };
-      for( int jcol = 0; jcol < ncolor; jcol++ )
-      {
-        fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] );
-        fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] );
-        ztempR_sv += cf[icol][jcol] * jampRj_sv;
-        ztempI_sv += cf[icol][jcol] * jampIj_sv;
-      }
-      deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol];
-      // === CUDA END ===
+      // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+      // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+      fptype* numerators = nullptr;
+      fptype* denominators = nullptr;
 #endif
-    }
-    // *** STORE THE RESULTS ***
+      // ------------------------
+      // --- FEYNMAN DIAGRAMS ---
+      // ------------------------
-
// NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 3 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -552,7 +450,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -585,6 +487,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -625,6 +531,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -727,26 +637,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const 
int ievt0 = ipagV * neppV;
       const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 );
       fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 );
-      G2COUP( gs, couplings, bsmIndepParam );
+      G2COUP( gs, couplings, bsmIndepParam );
     }
 #endif
   }

   //--------------------------------------------------------------------------

-#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
-  __global__ void
+#ifdef MGONGPUCPP_GPUIMPL
+  void /* clang-format off */
   sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4]
                        const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
                        fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
@@ -754,25 +664,40 @@ namespace mg5amcCpu
                        fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities
                        fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
 #endif
-                       bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation)
-  { /* clang-format on */
-    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
+                       fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop)
+                       fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt]
+                       bool* isGoodHel, // output: isGoodHel[ncomb] - host array
+                       const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  { /* clang-format on */
+    const int maxtry0 = 16;
+    fptype hstMEs[maxtry0];
+    assert( nevt >= neppV );
+    const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt < maxtry0)
...
+  }
+#endif

   //--------------------------------------------------------------------------

+#ifdef MGONGPUCPP_GPUIMPL
+  __global__ void
+  add_and_select_hel( int* allselhel, const fptype* allrndhel, fptype* ghelAllMEs, fptype* allMEs, const int nevt )
+  {
+    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
+    // Running sum of the MEs over all good helicities for this event
+    for( int ighel = 0; ighel < dcNGoodHel; ighel++ )
+    {
+      allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt];
+      ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection
+    }
+    // Event-by-event random choice of helicity #403
+    //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] );
+    for( int ighel = 0; ighel < dcNGoodHel; ighel++ )
+    {
+      if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) )
+      {
+        const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1]
+        allselhel[ievt] = ihelF;
+        //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF );
+        break;
+      }
+    }
+    return;
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  __global__ void
+  update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity
+                 fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable)
+  {
+    using J_ACCESS = DeviceAccessJamp;
+    using J2_ACCESS = DeviceAccessJamp2;
+    for( int icol = 0; icol < ncolor; icol++ )
+      // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream!
+ atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection 
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -917,13 +1031,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -935,17 +1043,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -971,93 +1082,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for 
the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // (1b) Then, In multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } - } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? 
&( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1099,7 +1180,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1122,7 +1203,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1131,25 +1212,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1159,8 +1246,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1176,11 +1265,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1282,14 +1372,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h index feff1cc6e1..a55660afd2 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO 
development team and contributors.
 // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend.
 //==========================================================================
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
@@ -19,6 +19,7 @@
 #include "mgOnGpuVectors.h"
+#include "GpuAbstraction.h"
 #include "Parameters_sm.h"
 #include <vector>
@@ -75,17 +76,17 @@ namespace mg5amcCpu
     static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu-
     static constexpr int ncomb = 16; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar)
     static constexpr int ndiagrams = 3; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu-
+    static constexpr int ncolor = 2; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu-
     // Hardcoded parameters for this process (constant class variables)
     // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)]
     // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)]
-    // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)]
-    //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z)
+    // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)]
+    static const int nwf = 5; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z)

     // Other variables of this instance (???)
     //static const int ninitial = CPPProcess::npari;
     //static const int nexternal = 4; // CPPProcess::npar (nexternal was nioparticles)
     //static const int nwavefuncs = 6; // (?!?!
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 3; //static const int ncomb = 16; // CPPProcess::ncomb @@ -122,23 +123,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -152,34 +156,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators 
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/color_sum.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/color_sum.cc new file mode 100644 index 0000000000..b96d73fb5c --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/color_sum.cc @@ -0,0 +1,383 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
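For orientation before the implementation below: everything in this new file evaluates, for one helicity, the color-matrix quadratic form |M|^2 += sum_ij jamp_i^* ( colorMatrix[i][j] / colorDenom[i] ) * jamp_j, either in hand-written kernels or via two BLAS gemm calls. A minimal reference sketch of that formula, using plain std::complex and none of the SIMD/GPU buffer layouts (colorSumReference is a name invented for this sketch, not code from this PR):

#include <array>
#include <complex>
inline double colorSumReference( const std::array<std::complex<double>, 2>& jamp )
{
  constexpr double denom[2] = { 3, 3 };                   // gg_tt color denominators (as below)
  constexpr double cf[2][2] = { { 16, -2 }, { -2, 16 } }; // gg_tt color matrix (as below)
  double me2 = 0;
  for( int icol = 0; icol < 2; icol++ )
    for( int jcol = 0; jcol < 2; jcol++ )
      me2 += ( std::conj( jamp[icol] ) * cf[icol][jcol] * jamp[jcol] ).real() / denom[icol];
  return me2; // real because the color matrix is real and symmetric
}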
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3 }; // 1-D array[2] + + // The color matrix (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 16, -2 }, + { -2, 16 } }; // 2-D array[2][2] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 
s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = 
allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e.
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/color_sum.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/diagram_boilerplate.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/diagrams.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/diagrams.h new file mode 100644 index 0000000000..173f24d4cf --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/diagrams.h @@ -0,0 +1,106 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
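Each of the three kernels in the new diagrams.h below computes one Feynman diagram and folds its amplitude into the two color jamps. Stripped of the accessor and SIMD machinery, the accumulation pattern they implement reduces to the following reference-only sketch with plain complex numbers (accumulateJamps is an invented name, not code from this PR):

#include <complex>
// amp[0..2]: amplitudes of diagrams 1..3; jamp[0..1]: the two color flows
inline void accumulateJamps( const std::complex<double> amp[3], std::complex<double> jamp[2] )
{
  const std::complex<double> cI( 0, 1 );
  jamp[0] += cI * amp[0]; // diagram 1
  jamp[1] -= cI * amp[0]; // diagram 1
  jamp[0] -= amp[1];      // diagram 2
  jamp[1] -= amp[2];      // diagram 3
}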
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 3 *** + // Wavefunction(s) for diagram number 1 + vxxxxx<M_ACCESS, W_ACCESS>( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); + vxxxxx<M_ACCESS, W_ACCESS>( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); + oxxxxx<M_ACCESS, W_ACCESS>( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx<M_ACCESS, W_ACCESS>( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + VVV1P0_1<W_ACCESS, CD_ACCESS>( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[4] ); + // Amplitude(s) for diagram number 1 + FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[3], w_fp[2], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 3 *** + // Wavefunction(s) for diagram number 2 + FFV1_1<W_ACCESS, CD_ACCESS>( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[4] ); + // Amplitude(s) for diagram number 2 + FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[3], w_fp[4], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef
MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 3 *** + // Wavefunction(s) for diagram number 3 + FFV1_2<W_ACCESS, CD_ACCESS>( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[4] ); + // Amplitude(s) for diagram number 3 + FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[4], w_fp[2], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/color_sum.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/color_sum.h new file mode 100644 index 0000000000..71b5b1eaad --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/color_sum.h @@ -0,0 +1,105 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin.
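Before the header itself, a note on selection: color_sum_gpu (defined in color_sum.cc above and declared below) dispatches on its pBlasHandle argument, with a null handle meaning "use the hand-written kernel". A hedged sketch of the kind of runtime opt-in implied by the CUDACPP_RUNTIME_BLASCOLORSUM comments (helper name and exact selection logic are assumptions, not code from this PR):

#include <cstdlib>
inline bool useBlasColorSum()
{
#ifdef MGONGPU_HAS_NO_BLAS
  return false; // build without cuBLAS/hipBLAS: only the kernel path exists
#else
  return std::getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ) != nullptr; // runtime opt-in (assumed semantics)
#endif
}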
+ +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype_ref( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + static __device__ inline const cxtype + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + }; +#else + class HostAccessJamp + { + public: + static inline cxtype_sv& + kernelAccessIcol( cxtype_sv* buffer, const int icol ) + { + return buffer[icol]; + } + static inline cxtype_sv& + kernelAccessIcol( fptype* buffer, const int icol ) + { + return reinterpret_cast<cxtype_sv*>( buffer )[icol]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + 
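Spelled out, the "new1" striding used by DeviceAccessJamp above stores all real parts first and all imaginary parts second, each as an ncolor x nevt matrix with the event index running fastest; this is what lets cuBLAS consume the same buffer without repacking. A minimal index sketch of that layout (jampIndex is an invented helper name for this sketch):

// real part of jamp(icol,ievt) lives at jampIndex( 0, icol, ievt, ... ), imag part at jampIndex( 1, ... )
inline int jampIndex( int ix2, int icol, int ievt, int ncolor, int nevt )
{
  return ix2 * ncolor * nevt + icol * nevt + ievt; // "new1" striding
}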
//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI300X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist??
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/diagram_boilerplate.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/diagram_boilerplate.h new file mode 100644 index 0000000000..96a34fb1bf --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/diagram_boilerplate.h @@ -0,0 +1,103 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin. 
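The boilerplate header that follows documents the wavefunction super-buffer layout in its comments (nwf sub-buffers of nevt*nw6*nx2 fptypes each). A minimal sketch of the slot arithmetic it describes, with double standing in for the plugin's fptype and an invented helper name:

// wfs holds nwf wavefunctions; each is a contiguous [nevt * nw6 * nx2] sub-buffer (nx2 = 2 for re/im)
inline double* wavefunctionSlot( double* wfs, int iwf, int nevt, int nw6, int nx2 = 2 )
{
  return wfs + iwf * nevt * nw6 * nx2; // same stride as w_fp[iwf] in the GPU branch of the boilerplate
}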
+ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-local-typedefs" // for CI_ACCESS and CD_ACCESS + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + + //------------- + // GPU only + //------------- + + //using namespace mg5amcGpu; + using W_ACCESS = DeviceAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = DeviceAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( channelIds ); +#endif + + // Wavefunctions + // Buffer wfs for one helicity and nevt events is a DeviceBufferSimple with ( nwf * nevt * nw6 * nx2 ) fptypes + // The striding between the nwf wavefunction buffers is ( nevt * nw6 * nx2 ) fptypes + // Internally diagramXXX methods pass a w_fp[iwf] to ixx/FFV methods (as argument 'fptype wavefunctions[]') + // Internally ixx/FFV methods call 'cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions )' and then use fi[iw6] + // This means that the fi pointer must point to a [RIRIRIRIRIRI] contiguous buffer of size nw6*nx2=12 + // The striding between events is nw6*nx2=12 and this is what W_ACCESS::kernelAccess must respect + // (En passant, note that this means that events cannot be contiguous in the present code, memory is not coalesced) + const int nevt = gridDim.x * blockDim.x; + fptype* w_fp[nwf]; + for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = wfs + iwf * nevt * nw6 * mgOnGpu::nx2; + + // Couplings + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic push +#pragma nv_diag_suppress 186 // e.g. 
<> +#endif + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic pop +#endif + const fptype* COUPs[nxcoup]; + for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; + +#else + + //------------- + // C++ only + //------------- + + //using namespace mg5amcCpu; + using W_ACCESS = HostAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = HostAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current SIMD event page (C++) + unsigned int channelId = *channelIds; +#endif + + // Wavefunctions + // Reinterpret wfs as "cxtype_sv w_sv[nwf][nw6]" and build "fptype* w_fp[nwf]" where "w_fp[iwf] = (fptype*)( w_sv[iwf] )" + fptype (*w_fp)[nw6 * neppV * mgOnGpu::nx2] = (fptype (*)[nw6 * neppV * mgOnGpu::nx2])(wfs); + +#endif + + //------------- + // GPU or C++ + //------------- + + // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) + cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram + fptype* amp_fp; // proof of concept for using fptype* in the interface + amp_fp = reinterpret_cast<fptype*>( amp_sv ); + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) + fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); +#else + assert( channelIds == nullptr ); + assert( numerators == nullptr ); + assert( denominators == nullptr ); +#endif /* clang-format on */ + +#pragma GCC diagnostic pop diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc index 4eec5db13c..216a90a302 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h index febf1dcf42..5ca71ae17f 100644 --- a/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h @@ -2,13 +2,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Sep 2010) for the MG5aMC backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -860,7 +860,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template<class W_ACCESS, class C_ACCESS> + template<class W_ACCESS, class CD_ACCESS> __device__ INLINE void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -873,7 +873,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template<class W_ACCESS, class A_ACCESS, class C_ACCESS> + template<class W_ACCESS, class A_ACCESS, class CD_ACCESS> __device__ INLINE void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -885,7 +885,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template<class W_ACCESS, class C_ACCESS> + template<class W_ACCESS, class CD_ACCESS> __device__ INLINE void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -898,7 +898,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template<class W_ACCESS, class C_ACCESS> + template<class W_ACCESS, class CD_ACCESS> __device__ INLINE void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -911,7 +911,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template<class W_ACCESS, class C_ACCESS> + template<class W_ACCESS, class CD_ACCESS> __device__ void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -924,7 +924,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1.
); const fptype_sv P2[4] = { +cxreal( V2[0] ), +cxreal( V2[1] ), +cximag( V2[1] ), +cximag( V2[0] ) }; @@ -949,7 +949,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template<class W_ACCESS, class A_ACCESS, class C_ACCESS> + template<class W_ACCESS, class A_ACCESS, class CD_ACCESS> __device__ void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -962,7 +962,7 @@ namespace mg5amcCpu const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const cxtype_sv TMP5 = ( F1[2] * ( F2[4] * ( V3[2] + V3[5] ) + F2[5] * ( V3[3] + cI * V3[4] ) ) + ( F1[3] * ( F2[4] * ( V3[3] - cI * V3[4] ) + F2[5] * ( V3[2] - V3[5] ) ) + ( F1[4] * ( F2[2] * ( V3[2] - V3[5] ) - F2[3] * ( V3[3] + cI * V3[4] ) ) + F1[5] * ( F2[2] * ( -V3[3] + cI * V3[4] ) + F2[3] * ( V3[2] + V3[5] ) ) ) ) ); @@ -974,7 +974,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template<class W_ACCESS, class C_ACCESS> + template<class W_ACCESS, class CD_ACCESS> __device__ void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -987,7 +987,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F1 = W_ACCESS::kernelAccess( allF1 ); const cxtype cI = cxmake( 0., 1. ); F1[0] = +F2[0] + V3[0]; @@ -1006,7 +1006,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template<class W_ACCESS, class C_ACCESS> + template<class W_ACCESS, class CD_ACCESS> __device__ void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -1019,7 +1019,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F2 = W_ACCESS::kernelAccess( allF2 ); const cxtype cI = cxmake( 0., 1. ); F2[0] = +F1[0] + V3[0]; diff --git a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc index d09f387480..e6b63799bd 100644 --- a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
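The HelAmps_sm.h changes above are a pure rename of the coupling accessor template parameter (C_ACCESS to CD_ACCESS, marking that the dependent-couplings accessor is expected); no behavior changes. A self-contained toy of the accessor-template pattern, with invented stand-in names:

#include <complex>
struct ToyAccessCouplings // stand-in for HostAccessCouplings / DeviceAccessCouplings
{
  static std::complex<double> kernelAccessConst( const double* buf ) { return { buf[0], buf[1] }; }
};
template<class CD_ACCESS> // renamed from C_ACCESS in this PR
std::complex<double> readCoupling( const double* allCOUP )
{
  return CD_ACCESS::kernelAccessConst( allCOUP );
}
// usage: double g[2] = { 0.1, 0.0 }; std::complex<double> c = readCoupling<ToyAccessCouplings>( g );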
//========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h index ba434e7b98..a2c8f92751 100644 --- a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -300,7 +300,7 @@ namespace mg5amcCpu #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #endif // Compute the output couplings (e.g. gc10 and gc11) from the input gs - template<class G_ACCESS, class C_ACCESS> + template<class G_ACCESS, class CD_ACCESS> __device__ inline void G2COUP( const fptype gs[], fptype couplings[], @@ -310,10 +310,10 @@ namespace mg5amcCpu using namespace Parameters_sm_dependentCouplings; const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr ); - fptype* GC_10s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); - fptype* GC_11s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); - cxtype_sv_ref GC_10s_sv = C_ACCESS::kernelAccess( GC_10s ); - cxtype_sv_ref GC_11s_sv = C_ACCESS::kernelAccess( GC_11s ); + fptype* GC_10s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); + fptype* GC_11s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); + cxtype_sv_ref GC_10s_sv = CD_ACCESS::kernelAccess( GC_10s ); + cxtype_sv_ref GC_11s_sv = CD_ACCESS::kernelAccess( GC_11s ); GC_10s_sv = couplings_sv.GC_10; GC_11s_sv = couplings_sv.GC_11; mgDebug( 1, __FUNCTION__ ); diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h index d3c4ca5695..8a2804aa04 100644 --- a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O.
Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for computing color sums +// For both CUDA and HIP, by default, enable BLAS support, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!]
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h index 92d74fd6db..e98e925f2a 100644 --- a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -717,12 +717,24 @@ namespace mg5amcCpu : m_preal( &r ), m_pimag( &i ) {} // copy (create from) const refs cxtype_ref& operator=( const cxtype_ref& ) = delete; //__host__ __device__ cxtype_ref& operator=( cxtype_ref&& c ) {...} // REMOVED! Should copy refs or copy values? No longer needed in cxternary - __host__ __device__ cxtype_ref& operator=( const cxtype& c ) + __host__ __device__ cxtype_ref& operator=( const cxtype& c ) // copy (assign) const values { *m_preal = cxreal( c ); *m_pimag = cximag( c ); return *this; - } // copy (assign) non-const values + } + __host__ __device__ cxtype_ref& operator+=( const cxtype& c ) + { + *m_preal += cxreal( c ); + *m_pimag += cximag( c ); + return *this; + } + __host__ __device__ cxtype_ref& operator-=( const cxtype& c ) + { + *m_preal -= cxreal( c ); + *m_pimag -= cximag( c ); + return *this; + } __host__ __device__ operator cxtype() const { return cxmake( *m_preal, *m_pimag ); } private: fptype* const m_preal; // const pointer to non-const fptype R diff --git a/epochX/cudacpp/gg_tt.sa/test/cudacpp_test.mk b/epochX/cudacpp/gg_tt.sa/test/cudacpp_test.mk index f703a1ae7c..1f9f8bbc46 100644 --- a/epochX/cudacpp/gg_tt.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_tt.sa/test/cudacpp_test.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. 
+# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) @@ -19,7 +19,7 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt index 6466d14e6d..fad65df6b1 100644 --- a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt +++ b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.3 2025-06-12 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -49,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -58,7 +58,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00611424446105957  +DEBUG: model prefixing takes 0.005459308624267578  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -159,21 +159,21 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @2 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.019 s +1 processes with 16 diagrams generated in 0.020 s Total: 2 processes with 19 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_tt01g --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4334]  +DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4166]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  INFO: initialize a new directory: CODEGEN_mad_gg_tt01g INFO: remove old information in CODEGEN_mad_gg_tt01g -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @2 INFO: Processing color information for process: g g > t t~ g @2 @@ -187,9 +187,9 @@ FileWriter t t~ g WEIGHTED<=3 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxg -DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1665]  INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -198,25 +198,25 @@ FileWriter t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1552]  -Generated helas calls for 2 subprocesses (19 diagrams) in 0.043 s -Wrote files for 46 helas calls in 0.184 s +DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1665]  +Generated helas calls for 2 subprocesses (19 diagrams) in 0.042 s +Wrote files for 46 helas calls in 0.191 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.322 s +ALOHA: aloha creates 5 routines in 0.319 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.311 s +ALOHA: aloha creates 10 routines in 0.303 s VVV1 VVV1 FFV1 @@ -226,41 +226,41 @@ ALOHA: aloha creates 10 routines in 0.311 s VVVV1 VVVV3 VVVV4 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g; patch -p4 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file SubProcesses/makefile -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/SubProcesses/P1_gg_ttx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/SubProcesses/P1_gg_ttx; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/SubProcesses/P2_gg_ttxg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/SubProcesses/P2_gg_ttxg; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #2 succeeded at 243 (offset 16 lines). -DEBUG: p.returncode =  0 [output.py at line 263]  -Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g done. +DEBUG: p.returncode =  0 [output.py at line 268]  +Output to directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g done. Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/README +/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/README Run "open index.html" to see more information about this process. quit -real 0m2.583s -user 0m2.278s -sys 0m0.302s -Code generation completed in 3 seconds +real 0m2.626s +user 0m2.304s +sys 0m0.321s +Code generation completed in 2 seconds ************************************************************ * * * W E L C O M E to * @@ -273,7 +273,7 @@ Code generation completed in 3 seconds * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.3 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -281,9 +281,9 @@ Code generation completed in 3 seconds * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt @@ -303,7 +303,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.3 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -311,9 +311,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt diff --git a/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt index 68b4c46295..07d8d59d1b 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat index e50becb2d9..4b103d9e55 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.6.0 2024-09-30 * +#* VERSION 3.6.3 2025-06-12 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. 
* #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/gg_tt01g.mad/Cards/run_card.dat b/epochX/cudacpp/gg_tt01g.mad/Cards/run_card.dat index 1711d30371..d4c7c73e61 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Cards/run_card.dat +++ b/epochX/cudacpp/gg_tt01g.mad/Cards/run_card.dat @@ -125,6 +125,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/gg_tt01g.mad/Cards/run_card_default.dat b/epochX/cudacpp/gg_tt01g.mad/Cards/run_card_default.dat index 364dbd21b0..730a05e322 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/gg_tt01g.mad/Cards/run_card_default.dat @@ -125,6 +125,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/gg_tt01g.mad/MGMEVersion.txt b/epochX/cudacpp/gg_tt01g.mad/MGMEVersion.txt index 084e244cea..1ac53bb4bd 100644 --- a/epochX/cudacpp/gg_tt01g.mad/MGMEVersion.txt +++ b/epochX/cudacpp/gg_tt01g.mad/MGMEVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.3 \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/Source/.make_opts b/epochX/cudacpp/gg_tt01g.mad/Source/.make_opts index de3864242b..56ba259c56 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Source/.make_opts +++ b/epochX/cudacpp/gg_tt01g.mad/Source/.make_opts @@ -102,6 +102,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf + alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -113,10 +114,11 @@ ifneq ($(lhapdf),) endif else alfas_functions=alfas_functions + alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif # Helper function to check MG5 version define CHECK_MG5AMC_VERSION python -c 'import re; from distutils.version import StrictVersion; print StrictVersion("$(MG5AMC_VERSION)") >= StrictVersion("$(1)") if re.match("^[\d\.]+$$","$(MG5AMC_VERSION)") else True;' -endef \ No newline at end of file +endef diff --git a/epochX/cudacpp/gg_tt01g.mad/Source/alfas_functions.f b/epochX/cudacpp/gg_tt01g.mad/Source/alfas_functions.f index bb69a6384e..84aeff369c 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Source/alfas_functions.f +++ b/epochX/cudacpp/gg_tt01g.mad/Source/alfas_functions.f @@ -188,6 +188,10 @@ SUBROUTINE NEWTON1(T,A_IN,A_OUT,NLOOP,NF) A_OUT=A_IN/(1D0+A_IN*B0(NF)*T) IF (NLOOP .EQ. 1) RETURN + if (1D0+A_IN*B0(NF)*T.le.0d0)THEN + A_OUT = 9d98 + RETURN + ENDIF A_OUT=A_IN/(1D0+B0(NF)*A_IN*T+C1(NF)*A_IN*LOG(1D0+A_IN*B0(NF)*T)) IF (A_OUT .LT. 
0D0) AS=0.3D0 30 AS=A_OUT diff --git a/epochX/cudacpp/gg_tt01g.mad/Source/cuts.inc b/epochX/cudacpp/gg_tt01g.mad/Source/cuts.inc index 23d099e5f7..a8ccc7420d 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Source/cuts.inc +++ b/epochX/cudacpp/gg_tt01g.mad/Source/cuts.inc @@ -37,7 +37,7 @@ C REAL*8 misset,missetmax,ptheavy REAL*8 ptllmin,ptllmax integer maxjetflavor - REAl*8 dsqrt_shat + REAl*8 dsqrt_shat,dsqrt_shatmax COMMON /to_min_max_cuts/ & PTJmax,PTBmax,PTAmax,PTLmax, @@ -60,7 +60,7 @@ C & ht2max,ht3max,ht4max, & htjmin,htjmax,ihtmin,ihtmax, & misset,missetmax,ptheavy, - & ptllmin,ptllmax,dsqrt_shat, + & ptllmin,ptllmax,dsqrt_shat,dsqrt_shatmax, & maxjetflavor C diff --git a/epochX/cudacpp/gg_tt01g.mad/Source/make_opts b/epochX/cudacpp/gg_tt01g.mad/Source/make_opts index e4b87ee6ad..f10336e42e 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Source/make_opts +++ b/epochX/cudacpp/gg_tt01g.mad/Source/make_opts @@ -103,6 +103,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf +alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -114,6 +115,7 @@ endif endif else alfas_functions=alfas_functions +alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif diff --git a/epochX/cudacpp/gg_tt01g.mad/Source/makefile b/epochX/cudacpp/gg_tt01g.mad/Source/makefile index 291ca907ee..87a9e61723 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Source/makefile +++ b/epochX/cudacpp/gg_tt01g.mad/Source/makefile @@ -37,10 +37,12 @@ all: $(LIBRARIES) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDI $(LIBDIR)libdsample.$(libext): $(DSAMPLE) $(call CREATELIB, $@, $^) $(LIBDIR)libgeneric.$(libext): $(GENERIC) + rm -f $@ 2>/dev/null $(call CREATELIB, $@, $^) + rm -f $(alfas_to_clean) 2>/dev/null $(LIBDIR)libdhelas.$(libext): DHELAS cd DHELAS; make; cd .. -$(LIBDIR)libpdf.$(libext): PDF make_opts +$(LIBDIR)libpdf.$(libext): PDF $(alfas_functions).o cd PDF; make; cd .. ifneq (,$(filter edff chff, $(pdlabel1) $(pdlabel2))) $(LIBDIR)libgammaUPC.$(libext): PDF/gammaUPC @@ -73,6 +75,7 @@ $(BINDIR)gensudgrid: $(GENSUDGRID) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUP # Dependencies dsample.o: DiscreteSampler.o dsample.f genps.inc StringCast.o vector.inc +pawgraph.o: vector.inc DiscreteSampler.o: StringCast.o invarients.o: invarients.f genps.inc gen_ximprove.o: gen_ximprove.f run_config.inc run_card.inc diff --git a/epochX/cudacpp/gg_tt01g.mad/Source/run_card.inc b/epochX/cudacpp/gg_tt01g.mad/Source/run_card.inc index 2588190439..e169c1f193 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Source/run_card.inc +++ b/epochX/cudacpp/gg_tt01g.mad/Source/run_card.inc @@ -88,6 +88,8 @@ DSQRT_SHAT = 0.000000000000000D+00 + DSQRT_SHATMAX = -1 + LIMHEL = 0.000000000000000D+00 PTJ = 2.000000000000000D+01 diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h index 87aa648dd2..a45024704a 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. 
Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -255,11 +255,15 @@ namespace mg5amcCpu throw std::logic_error( "Bridge constructor: FIXME! cannot choose gputhreads" ); // this should never happen! m_gpublocks = m_nevt / m_gputhreads; } +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters @@ -290,8 +294,10 @@ throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -347,7 +353,9 @@ if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> ) @@ -400,7 +408,9 @@ } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
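Note: the GpuAbstraction.h hunk below extends the existing single-source GPU abstraction (each gpu* alias resolves to a cuda* symbol under __CUDACC__ and to a hip* symbol under __HIPCC__) with stream handling and cuBLAS/hipBLAS aliases. A minimal usage sketch of the alias layer follows; it is not part of the diff, and hstBuf, myKernel, nevt, gpublocks and gputhreads are illustrative names only:

// Sketch: one code path serves both CUDA and HIP builds via the gpu* aliases.
fptype* devBuf = nullptr;
gpuMalloc( (void**)&devBuf, nevt * sizeof( fptype ) );      // cudaMalloc or hipMalloc, wrapped in checkGpu
gpuMemcpy( devBuf, hstBuf, nevt * sizeof( fptype ), gpuMemcpyHostToDevice );
gpuStream_t stream;
gpuStreamCreate( &stream );                                 // cudaStreamCreate or hipStreamCreate
gpuLaunchKernelStream( myKernel, gpublocks, gputhreads, stream, devBuf ); // kernel<<<blocks, threads, 0, stream>>>
gpuStreamDestroy( stream );
gpuFree( devBuf );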
#ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, 
type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. 
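Note: with the gpuBlasT* aliases at the end of GpuAbstraction.h above, the floating-point precision of the BLAS color sum is fixed at build time: MGONGPU_FPTYPE2_FLOAT selects the Sgemm family, otherwise the Dgemm family is used. A minimal sketch of a strided-batched GEMM through the precision-agnostic alias (not part of the diff; handle, dA/dB/dC, the dimensions and the strides are illustrative names):

// Resolves to cublasSgemmStridedBatched / hipblasSgemmStridedBatched (FPTYPE2=float)
// or to the Dgemm variants (FPTYPE2=double); checkGpuBlas is defined in GpuRuntime.h.
const fptype2 alpha = 1, beta = 0;
checkGpuBlas( gpuBlasTgemmStridedBatched( handle, GPUBLAS_OP_N, GPUBLAS_OP_N,
                                          m, n, k,
                                          &alpha, dA, m, strideA, dB, k, strideB,
                                          &beta, dC, m, strideC, batchCount ) );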
#ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MGVersion.txt index 084e244cea..1ac53bb4bd 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.3 \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc index f463977c1a..a68ae314eb 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
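Note: the checkGpuBlas/assertGpuBlas pair added to GpuRuntime.h above mirrors the existing checkGpu/assertGpu wrapper: any cuBLAS/hipBLAS call returning a gpuBlasStatus_t can be wrapped so that a code other than GPUBLAS_STATUS_SUCCESS triggers an assertion with file and line information. A minimal sketch (not part of the diff; the handle name is illustrative):

gpuBlasHandle_t handle;
checkGpuBlas( gpuBlasCreate( &handle ) );  // aborts with file:line info if creation fails
checkGpuBlas( gpuBlasDestroy( handle ) );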
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,28 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() + , m_pHelWfs() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_helBlasHandles() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +353,82 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); + // Create the "one-helicity" wavefunction buffer that will be used for helicity filtering + m_pHelWfs.reset( new DeviceBufferSimple( CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? 
+ std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { +#ifndef MGONGPU_HAS_NO_BLAS + if( m_helBlasHandles[ihel] ) gpuBlasDestroy( m_helBlasHandles[ihel] ); +#endif + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +445,61 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. 
Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity + // Attach a different stream to each cuBLAS/hipBLAS handle + if( m_blasColorSum ) + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + checkGpuBlas( gpuBlasCreate( &m_helBlasHandles[ighel] ) ); + checkGpuBlas( gpuBlasSetStream( m_helBlasHandles[ighel], m_helStreams[ighel] ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_helBlasHandles[ighel], CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelWfs.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +507,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* ghelBlasHandles = ( m_blasColorSum ? m_helBlasHandles : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* ghelBlasHandles = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +527,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? 
m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h index 7acff4b308..c901874333 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] - static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,24 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelJamps; + + // The super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelWfs; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +220,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS 
temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The array of cuBLAS/hipBLAS handles (one for each good helicity) + gpuBlasHandle_t m_helBlasHandles[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessAmplitudes.h index 0d92f69c43..a49f041e05 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessAmplitudes_H #define MemoryAccessAmplitudes_H 1 @@ -10,10 +10,6 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_AMPLITUDES 1 - // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -23,120 +19,11 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // A class describing the internal layout of memory buffers for amplitudes - // This implementation uses an AOSOA[npagA][nx2][neppA] where nevt=npagA*neppA - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessAmplitudesBase //_AOSOAv1 - { - public: - - // Number of Events Per Page in the amplitude AOSOA memory buffer layout - static constexpr int neppA = 1; // AOS (just a test...) 
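[Editor's note] A note on the per-helicity "super-buffers" added to MatrixElementKernels.h above: each member is a single contiguous device allocation that packs nGoodHel per-helicity slices back to back (nGoodHel is only known at runtime, hence the std::unique_ptr members). A minimal host-side sketch of the slicing convention, using hypothetical names (ghelBuffer, sliceSize) rather than the plugin's actual device buffers:

// Minimal sketch of the "super-buffer" slicing convention (hypothetical names;
// the plugin allocates the equivalent buffers in device memory instead).
#include <cassert>
#include <vector>

int main()
{
  const int nGoodHel = 4;     // number of good helicities (runtime value)
  const int nevt = 16;        // number of events
  const int sliceSize = nevt; // one element per event in this sketch
  // One contiguous "super-buffer" instead of nGoodHel separate buffers:
  std::vector<double> ghelBuffer( nGoodHel * sliceSize, 0. );
  // The slice for good helicity ighel starts at offset ighel * sliceSize:
  const int ighel = 2, ievt = 5;
  ghelBuffer[ighel * sliceSize + ievt] = 1.;
  assert( ghelBuffer[2 * 16 + 5] == 1. );
  return 0;
}

This matches the ighel * nevt + ievt indexing visible later in the select_hel and sigmaKin changes.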
- - private: - - friend class MemoryAccessHelper; - friend class KernelAccessHelper; - friend class KernelAccessHelper; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) - { - const int ipagA = ievt / neppA; // #event "A-page" - const int ieppA = ievt % neppA; // #event in the current event A-page - constexpr int ix2 = 0; - return &( buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA] ); // AOSOA[ipagA][ix2][ieppA] - } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... args" to "const int ix2" and rename "Field" as "Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int ix2 ) - { - constexpr int ipagA = 0; - constexpr int ieppA = 0; - return buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA]; // AOSOA[ipagA][ix2][ieppA] - } - }; - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessAmplitudes : public MemoryAccessAmplitudesBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2 = MemoryAccessHelper::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2Const = - MemoryAccessHelper::template decodeRecordConst; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt, 
const int ix2 ) <===] - static constexpr auto ieventAccessIx2 = - MemoryAccessHelper::template ieventAccessField; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===] - static constexpr auto ieventAccessIx2Const = - MemoryAccessHelper::template ieventAccessFieldConst; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations + // A class providing trivial access to amplitude memory buffers template<bool onDevice> class KernelAccessAmplitudes { public: - -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2 = - KernelAccessHelper::template kernelAccessField; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2Const = - KernelAccessHelper::template kernelAccessFieldConst; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { @@ -148,8 +35,6 @@ namespace mg5amcCpu { return reinterpret_cast<const cxtype_sv*>( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES }; //---------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessCouplings.h index 55504a2b90..caee99a7fd 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessCouplings.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
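[Editor's note] On the "trivial access" pattern kept above for amplitudes: the buffer stores real and imaginary parts contiguously, so kernelAccess can simply return the buffer pointer reinterpreted as a pointer to complex values, with no index arithmetic at all. A minimal sketch, with std::complex standing in for cxtype_sv (an assumption for illustration only; the plugin uses its own complex types, which rely on the same interleaved layout):

// Sketch of the trivial pointer reinterpretation: an fptype buffer laid out
// as [re0, im0, re1, im1, ...] viewed as an array of complex values.
#include <cassert>
#include <complex>

using fptype = double;

int main()
{
  fptype buffer[4] = { 1., 2., 3., 4. }; // two complex amplitudes
  auto* amps = reinterpret_cast<std::complex<fptype>*>( buffer );
  assert( amps[0] == std::complex<fptype>( 1., 2. ) );
  assert( amps[1] == std::complex<fptype>( 3., 4. ) );
  return 0;
}

(std::complex explicitly guarantees this array-compatible layout, which is why the cast is safe in this sketch.)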
#ifndef MemoryAccessCouplings_H #define MemoryAccessCouplings_H 1 @@ -235,7 +235,7 @@ namespace mg5amcCpu /* fptype_sv& real = kernelAccessIx2( buffer, 0 ); fptype_sv& imag = kernelAccessIx2( buffer, 1 ); - printf( "C_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv_ref( real, imag ); */ return cxtype_sv_ref( kernelAccessIx2( buffer, 0 ), @@ -250,7 +250,7 @@ namespace mg5amcCpu /* const fptype_sv& real = kernelAccessIx2Const( buffer, 0 ); const fptype_sv& imag = kernelAccessIx2Const( buffer, 1 ); - printf( "C_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv( real, imag ); */ return cxtype_sv( kernelAccessIx2Const( buffer, 0 ), diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessWavefunctions.h index 9f4c620bc7..bbffc1fb36 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessWavefunctions_H #define MemoryAccessWavefunctions_H 1 @@ -10,9 +10,7 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 +#include "CPPProcess.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL @@ -23,147 +21,44 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // A class describing the internal layout of memory buffers for wavefunctions - // This implementation uses an AOSOA[npagW][nw6][nx2][neppW] where nevt=npagW*neppW - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessWavefunctionsBase //_AOSOAv1 +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessWavefunctions { public: - - // Number of Events Per Page in the wavefunction AOSOA memory buffer layout - static constexpr int neppW = 1; // AOS (just a test...) 
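[Editor's note] The DeviceAccessWavefunctions class that this hunk starts introducing (its kernelAccess body follows just below) replaces the old AOSOA layout with a flat per-event AOS: each event owns nw6 * nx2 consecutive fptype values, so the accessor only needs the offset ievt * nw6 * nx2. A standalone sketch of that offset arithmetic, with hypothetical sizes (in the plugin, ievt comes from the CUDA thread index blockDim.x * blockIdx.x + threadIdx.x):

// Standalone sketch of the per-event AOS offset used by the new accessor.
#include <cassert>

int main()
{
  constexpr int nw6 = 6;  // components per (fermion or vector) wavefunction
  constexpr int nx2 = 2;  // real and imaginary parts
  constexpr int nevt = 4; // number of events (hypothetical)
  double buffer[nevt * nw6 * nx2] = {};
  const int ievt = 3;
  double* w = buffer + ievt * nw6 * nx2; // first fptype of event ievt
  w[0] = 1.;                             // real part of component 0
  assert( buffer[3 * 6 * 2] == 1. );
  return 0;
}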
- - private: - - friend class MemoryAccessHelper; - friend class KernelAccessHelper; - friend class KernelAccessHelper; - - // The number of components of a (fermion or vector) wavefunction - static constexpr int nw6 = mgOnGpu::nw6; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) + static __host__ __device__ inline cxtype_sv* + kernelAccess( fptype* buffer ) { - const int ipagW = ievt / neppW; // #event "W-page" - const int ieppW = ievt % neppW; // #event in the current event W-page - constexpr int iw6 = 0; - constexpr int ix2 = 0; - return &( buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW] ); // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast<cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... args" to "const int iw6, const int ix2" and rename "Field" as "Iw6Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int iw6, - const int ix2 ) + static __host__ __device__ inline const cxtype_sv* + kernelAccessConst( const fptype* buffer ) { - constexpr int ipagW = 0; - constexpr int ieppW = 0; - return buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW]; // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast<const cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } }; +#endif //---------------------------------------------------------------------------- - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessWavefunctions : public MemoryAccessWavefunctionsBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2 = MemoryAccessHelper::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2Const = - MemoryAccessHelper::template decodeRecordConst; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIw6Ix2( fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2 = - MemoryAccessHelper::template ieventAccessField; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIw6Ix2Const( const fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2Const = - MemoryAccessHelper::template ieventAccessFieldConst; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations - template - class KernelAccessWavefunctions + class HostAccessWavefunctions { public: - -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const) ===> fptype& kernelAccessIw6Ix2( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2 = - KernelAccessHelper::template kernelAccessField; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIw6Ix2Const( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2Const = - KernelAccessHelper::template kernelAccessFieldConst; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { return reinterpret_cast<cxtype_sv*>( buffer ); } static __host__ __device__ inline const cxtype_sv* kernelAccessConst( const fptype* buffer ) { return reinterpret_cast<const cxtype_sv*>( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS }; //---------------------------------------------------------------------------- - typedef KernelAccessWavefunctions HostAccessWavefunctions; - typedef KernelAccessWavefunctions DeviceAccessWavefunctions; - - //---------------------------------------------------------------------------- - } // end namespace mg5amcGpu/mg5amcCpu #endif // MemoryAccessWavefunctions_H diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h index 65a101888d..c8db607db6 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
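[Editor's note] One detail worth flagging in the MemoryBuffers.h hunks that follow: virtual ~BufferBase() moves from protected to public. That is what allows the new std::unique_ptr members in MatrixElementKernels.h to destroy their buffers when reset or when the kernel object goes out of scope. A minimal sketch of the constraint (hypothetical Buffer class, not the plugin's):

// Why the destructor must be public: std::unique_ptr<T> invokes T's
// destructor directly, so the destructor must be accessible at that point.
#include <cstddef>
#include <memory>

class Buffer
{
public:
  Buffer( std::size_t size ) : m_size( size ) {}
  virtual ~Buffer() {} // must be public for std::unique_ptr<Buffer> to compile
private:
  std::size_t m_size;
};

int main()
{
  std::unique_ptr<Buffer> pBuf;     // like m_pHelMEs etc. above
  pBuf.reset( new Buffer( 1024 ) ); // like m_pHelMEs.reset( new DeviceBufferSimple( ... ) )
  return 0;                         // pBuf destroys the Buffer here
}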
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase<T>( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase<T>( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template<typename T, size_t sizePerEvent> - class DeviceBuffer : public DeviceBufferBase<T>, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase<T>, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase<T>( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase<T>( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer<fptype, 1> DeviceBufferSimple; + typedef DeviceBuffer<fptype2, 1> DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase<fptype> BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer<fptype, sizePerEventNumerators> HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer<fptype, sizePerEventNumerators> PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer<fptype, sizePerEventNumerators> DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer<fptype, sizePerEventDenominators> HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer<fptype, sizePerEventDenominators>
PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer<fptype, sizePerEventDenominators> DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer<fptype, sizePerEventCouplings> HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer<fptype, sizePerEventCouplings> PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer<fptype, sizePerEventCouplings> DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for jamps + typedef DeviceBuffer<fptype, sizePerEventJamps> DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template<class Tdst, class Tsrc> void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index b32f4b931e..ede8c7f653 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,20 +98,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 2; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,57 +166,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the 
current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - 
using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -226,297 +279,139 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 5; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
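[Editor's note] Before the rewritten buffer setup that follows: in mixed-precision builds (double fptype for amplitudes, float fptype2 for the color algebra) the C++ path of calculate_jamps processes two SIMD event pages per call (nParity = 2, as defined earlier in this file), so the first event of each page is ievt00 + iParity * neppV. A minimal sketch of that page indexing, assuming neppV = 4 (hypothetical; the real value depends on the SIMD width):

// Sketch of the event-page indexing used by the iParity loop below.
#include <cstdio>

constexpr int neppV = 4;   // events per SIMD vector (hypothetical value)
constexpr int nParity = 2; // 2 only in mixed double/float mode, otherwise 1

int main()
{
  const int ievt00 = 8; // first event of the current page pair
  for( int iParity = 0; iParity < nParity; ++iParity )
  {
    const int ievt0 = ievt00 + iParity * neppV; // first event of this page
    std::printf( "page %d covers events [%d,%d)\n", iParity, ievt0, ievt0 + neppV );
  }
  return 0;
}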
- - //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g. <<warning #186-D: pointless comparison of unsigned integer with zero>> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - // *** DIAGRAM 1 OF
3 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[4] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 2 OF 3 *** - - // Wavefunction(s) for diagram number 2 - FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[4] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[3], w_fp[4], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[0] -= amp_sv[0]; - // *** DIAGRAM 3 OF 3 *** - - // Wavefunction(s) for diagram number 3 - FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[4] ); - - // Amplitude(s) for diagram number 3 - FFV1_0( w_fp[4], w_fp[2], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_ttx()?)
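[Editor's note] For reference, the color algebra that the removed block below implemented (and that the new color_sum code takes over): |M|^2 accumulates the quadratic form jamp-dagger times (cf/denom) times jamp; since cf is real and symmetric, only the AMA + BMB terms survive (see #475). A standalone scalar sketch using the ncolor=2 matrices visible in the removed code (this is an illustration, not the plugin's vectorized implementation):

// Scalar sketch of the color sum for this process (ncolor=2,
// cf={{16,-2},{-2,16}}, denom={3,3}, values from the removed code below).
#include <complex>
#include <cstdio>

constexpr int ncolor = 2;
constexpr double cf[ncolor][ncolor] = { { 16, -2 }, { -2, 16 } };
constexpr double denom[ncolor] = { 3, 3 };

double colorSum( const std::complex<double> jamp[ncolor] )
{
  double me2 = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    std::complex<double> ztemp = 0;
    for( int jcol = 0; jcol < ncolor; jcol++ )
      ztemp += cf[icol][jcol] * jamp[jcol];
    // cf is real and symmetric: the quadratic form reduces to AMA + BMB (#475)
    me2 += ( ztemp.real() * jamp[icol].real() + ztemp.imag() * jamp[icol].imag() ) / denom[icol];
  }
  return me2;
}

int main()
{
  const std::complex<double> jamp[ncolor] = { { 0., 1. }, { -1., 0. } };
  std::printf( "|M|^2 = %f\n", colorSum( jamp ) ); // prints 32/3 for this input
  return 0;
}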
- - // The color denominators (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3 }; // 1-D array[2] - - // The color matrix (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 16, -2 }, - { -2, 16 } }; // 2-D array[2][2] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... 
icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ -
// NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 3 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** return; } @@ -555,7 +450,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -588,6 +487,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -628,6 +531,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -730,26 +637,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -757,25 +664,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0)
+ atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -913,20 +1024,14 @@ namespace mg5amcCpu // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -938,17 +1043,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype )
); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -974,93 +1082,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamps for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // (1b) Then, in multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } - } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0
(fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) 
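The per-helicity "super-buffer" slicing used throughout this loop (ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2, ghelAllNumerators + ighel * nevt, and so on) is the invariant that keeps the helicity streams independent. A minimal standalone C++ sketch of the same offset arithmetic, with made-up sizes and hypothetical names (not the plugin's real buffer classes):

    #include <cassert>
    #include <vector>

    int main()
    {
      // Illustrative sizes only; the plugin derives these from the process and the GPU grid
      const int nGoodHel = 4, nevt = 8, ncolor = 2, nx2 = 2;
      const int jampSize = ncolor * nx2 * nevt; // elements per helicity in the jamp super-buffer
      std::vector<double> ghelAllJamps( nGoodHel * jampSize, 0. ); // one contiguous allocation
      for( int ighel = 0; ighel < nGoodHel; ighel++ )
      {
        double* hAllJamps = ghelAllJamps.data() + ighel * jampSize; // slice owned by stream ighel
        hAllJamps[0] = ighel; // each per-helicity kernel writes only inside its own slice
      }
      assert( ghelAllJamps[2 * jampSize] == 2. ); // slice 2 starts at offset 2 * jampSize
      return 0;
    }

Because every stream writes to a disjoint slice, no synchronisation is needed between the per-helicity kernels; only the shared colAllJamp2s accumulator needs the atomicAdd seen in update_jamp2s above.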
+#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1102,7 +1180,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1125,7 +1203,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1134,25 +1212,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1162,8 +1246,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1179,11 +1265,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1285,14 +1372,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h index feff1cc6e1..a55660afd2 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and 
contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include <vector> @@ -75,17 +76,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 16; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 3; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 2; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 5; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 4; // CPPProcess::npar (nexternal was nioparticles) //static const int nwavefuncs = 6; // (?!?!
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 3; //static const int ncomb = 16; // CPPProcess::ncomb @@ -122,23 +123,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array (GPU implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -152,34 +156,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig.f index bc9bcfeb9b..ce175a75a8 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f index db3c284caa..bd3d520785 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -137,14 +137,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -219,7 +219,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -290,6 +290,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -373,12 +377,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -484,6 +488,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/color_sum.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/color_sum.cc new file mode 100644 index 0000000000..b96d73fb5c --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/color_sum.cc @@ -0,0 +1,383 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
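For orientation before reading the kernels in this new file: all three implementations below (C++/SIMD, plain GPU kernel, and BLAS) evaluate the same color sum for one helicity, deltaME = sum_ij conj(jamp[i]) * colorMatrix[i][j] / colorDenom[i] * jamp[j]. A minimal standalone sketch with the ncolor=2 matrix of this P1_gg_ttx process (the jamp test values are made up):

    #include <complex>
    #include <cstdio>

    int main()
    {
      constexpr int ncolor = 2;
      const double colorDenom[ncolor] = { 3, 3 };
      const double colorMatrix[ncolor][ncolor] = { { 16, -2 }, { -2, 16 } };
      const std::complex<double> jamp[ncolor] = { { 1., 2. }, { 0.5, -1. } }; // made-up test amplitudes
      double deltaME = 0;
      for( int icol = 0; icol < ncolor; icol++ )
      {
        std::complex<double> ztemp = 0;
        for( int jcol = 0; jcol < ncolor; jcol++ )
          ztemp += colorMatrix[icol][jcol] / colorDenom[icol] * jamp[jcol];
        deltaME += ( std::conj( jamp[icol] ) * ztemp ).real(); // imaginary part vanishes for a real symmetric matrix
      }
      std::printf( "deltaME = %f\n", deltaME );
      return 0;
    }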
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3 }; // 1-D array[2] + + // The color matrix (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 16, -2 }, + { -2, 16 } }; // 2-D array[2][2] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 
s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = 
allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e.
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/color_sum.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/diagram_boilerplate.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/diagrams.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/diagrams.h new file mode 100644 index 0000000000..962978409f --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/diagrams.h @@ -0,0 +1,109 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
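Each diagramN kernel in this new file repeats the same multichannel (SDE) bookkeeping: the numerator picks up |amp|^2 only from the diagram matching the event's channelId, the denominator picks up |amp|^2 from every diagram, and sigmaKin later rescales the ME by numerator/denominator (see normalise_output in CPPProcess.cc). A minimal sketch of that logic with made-up amplitudes:

    #include <complex>
    #include <cstdio>

    int main()
    {
      const unsigned int channelId = 2; // this event samples diagram 2 (0 disables the SDE enhancement)
      const std::complex<double> amp[3] = { { 1, 1 }, { 2, 0 }, { 0, 3 } }; // made-up per-diagram amplitudes
      double numerator = 0, denominator = 0;
      for( unsigned int idiag = 1; idiag <= 3; idiag++ )
      {
        const double a2 = std::norm( amp[idiag - 1] ); // |amp|^2, the role played by cxabs2 below
        if( channelId == idiag ) numerator += a2;      // only the event's own diagram
        if( channelId != 0 ) denominator += a2;        // every diagram
      }
      std::printf( "SDE reweighting factor: %f\n", numerator / denominator );
      return 0;
    }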
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb-1) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 3 *** + // Wavefunction(s) for diagram number 1 + vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); + vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[4] ); + // Amplitude(s) for diagram number 1 + FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 3 *** + // Wavefunction(s) for diagram number 2 + FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[4] ); + // Amplitude(s) for diagram number 2 + FFV1_0( w_fp[3], w_fp[4], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef
+  __global__ void
+  diagram3( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 3 OF 3 ***
+    // Wavefunction(s) for diagram number 3
+    FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[4] );
+    // Amplitude(s) for diagram number 3
+    FFV1_0( w_fp[4], w_fp[2], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+/* clang-format on */
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/driver.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/driver.f
index ec5722702a..c08048ad0e 100644
--- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/driver.f
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/driver.f
@@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened)
       fopened=.false.
       tempname=filename
       fine=index(tempname,' ')
-c     fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)"
       if(fine.eq.0) fine=len(tempname)
       open(unit=lun,file=tempname,status='old',ERR=20)
       fopened=.true.
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/matrix1.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/matrix1.f
index 707ea40323..97656450ad 100644
--- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/matrix1.f
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/matrix1.f
@@ -1,7 +1,7 @@
       SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
     $ ICOL)
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+C     Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
@@ -307,7 +307,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
       REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
 C
-C     Generated by MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -350,7 +350,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(0) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -393,21 +394,24 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 2) /5.333333333333333D+00, - $ -6.666666666666666D-01/ + DATA DENOM/3/ + DATA (CF(I),I= 1, 2) /16,-4/ C 1 T(1,2,3,4) - DATA (CF(I, 2),I= 1, 2) /-6.666666666666666D-01 - $ ,5.333333333333333D+00/ + DATA (CF(I),I= 3, 3) /16/ C 1 T(2,1,3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -446,10 +450,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -458,6 +464,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc index c8b3dbf03c..0fa180cdf8 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,20 +98,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 6; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,57 +166,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the 
current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - 
using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -226,500 +279,165 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 12; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
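The comments around this point contrast the two buffer strategies: on the C++ side, wavefunctions keep "trivial access", living on the stack for one SIMD event page and being reinterpreted as a flat fptype* (the new w_sv / wfs lines just below), while on the GPU side kernel splitting forces them into the global-memory allWfs buffer. A minimal self-contained sketch of the C++ pattern, with stand-in definitions for the plugin's fptype, neppV and cxtype_sv (the reinterpret_cast target type <fptype*> is assumed, as the template argument is not visible in this extract):

```cpp
using fptype = double;
constexpr int neppV = 4;                               // events per SIMD vector (a build-time choice)
struct cxtype_v { fptype r[neppV]; fptype i[neppV]; }; // stand-in for the plugin's complex SIMD type

int main()
{
  constexpr int nwf = 12, nw6 = 6;                  // #wavefunctions and components per wavefunction
  cxtype_v w_sv[nwf][nw6] = {};                     // local wavefunctions for one SIMD event page
  fptype* wfs = reinterpret_cast<fptype*>( w_sv );  // flat view passed to the diagram kernels
  // The flat buffer holds nwf*nw6*2*neppV fptypes, matching wavefunctions[nwf*2*nw6*nevtORneppV]
  static_assert( sizeof( w_sv ) == nwf * nw6 * 2 * neppV * sizeof( fptype ), "unexpected layout" );
  return wfs != nullptr ? 0 : 1;
}
```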
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g. 
<> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 16 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - vxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); - - VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[5] ); - FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[6] ); - - // Amplitude(s) for diagram number 1 - VVV1_0( w_fp[5], w_fp[6], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( 
amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - - // *** DIAGRAM 2 OF 16 *** - - // Wavefunction(s) for diagram number 2 - FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[7] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[3], w_fp[7], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 3 OF 16 *** - - // Wavefunction(s) for diagram number 3 - FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); - - // Amplitude(s) for diagram number 3 - FFV1_0( w_fp[8], w_fp[2], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 4 OF 16 *** - - // Wavefunction(s) for diagram number 4 - FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] ); - FFV1_2( w_fp[3], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); - - // Amplitude(s) for diagram number 4 - FFV1_0( w_fp[9], w_fp[5], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= amp_sv[0]; - - // *** DIAGRAM 5 OF 16 *** - - // Wavefunction(s) for diagram number 5 - VVV1P0_1( w_fp[1], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 5 - FFV1_0( w_fp[3], w_fp[5], w_fp[10], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 6 OF 16 *** - - // Wavefunction(s) for diagram number 6 - // (none) - - // Amplitude(s) for diagram number 6 - FFV1_0( w_fp[8], w_fp[5], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= amp_sv[0]; - - // *** DIAGRAM 7 OF 16 *** - - // Wavefunction(s) for diagram number 7 - FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] ); - FFV1_1( w_fp[2], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); - - // Amplitude(s) for diagram number 7 - FFV1_0( w_fp[5], w_fp[11], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] -= amp_sv[0]; - - // *** DIAGRAM 8 OF 16 *** - - // Wavefunction(s) for diagram number 8 - // (none) - - // Amplitude(s) for diagram number 8 - FFV1_0( w_fp[5], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; - 
jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 9 OF 16 *** - - // Wavefunction(s) for diagram number 9 - // (none) - - // Amplitude(s) for diagram number 9 - FFV1_0( w_fp[5], w_fp[7], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[5] -= amp_sv[0]; - - // *** DIAGRAM 10 OF 16 *** - - // Wavefunction(s) for diagram number 10 - VVV1P0_1( w_fp[0], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[5] ); - - // Amplitude(s) for diagram number 10 - FFV1_0( w_fp[3], w_fp[11], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 11 OF 16 *** - - // Wavefunction(s) for diagram number 11 - // (none) - - // Amplitude(s) for diagram number 11 - FFV1_0( w_fp[9], w_fp[2], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 12 OF 16 *** - - // Wavefunction(s) for diagram number 12 - // (none) - - // Amplitude(s) for diagram number 12 - VVV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[2] -= amp_sv[0]; - jamp_sv[3] += amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - - // *** DIAGRAM 13 OF 16 *** - // Wavefunction(s) for diagram number 13 - // (none) - - // Amplitude(s) for diagram number 13 - FFV1_0( w_fp[8], w_fp[11], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - jamp_sv[2] -= amp_sv[0]; - - // *** DIAGRAM 14 OF 16 *** - - // Wavefunction(s) for diagram number 14 - // (none) - - // Amplitude(s) for diagram number 14 - FFV1_0( w_fp[9], w_fp[7], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL 
- if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[4] -= amp_sv[0]; - - // *** DIAGRAM 15 OF 16 *** - // Wavefunction(s) for diagram number 15 - // (none) - - // Amplitude(s) for diagram number 15 - VVV1_0( w_fp[0], w_fp[10], w_fp[6], COUPs[0], 1.0, &_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - - // *** DIAGRAM 16 OF 16 *** - - // Wavefunction(s) for diagram number 16 - VVVV1P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[10] ); - VVVV3P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[6] ); - VVVV4P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[9] ); - - // Amplitude(s) for diagram number 16 - FFV1_0( w_fp[3], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[2], w_fp[6], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[1] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[0] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_2_gg_ttxg()?) 
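Before the removed lines below, it is worth spelling out the quantity that both the old inline code and the new color_sum_gpu kernel / BLAS path evaluate. With J the vector of ncolor color flows (jamps) for one event and one helicity, cf the real symmetric color matrix, and denom its normalization (all entries equal to 9 for this process), the per-helicity contribution to |M|^2 is the quadratic form

\[
\Delta |M|^2 = \sum_{i,j=1}^{n_{\mathrm{color}}} J_i^{*} \, \frac{cf_{ij}}{\mathrm{denom}_i} \, J_j
= \sum_{i} \frac{cf_{ii}}{\mathrm{denom}_i} \, \big| J_i \big|^2
+ \sum_{i<j} \frac{2 \, cf_{ij}}{\mathrm{denom}_i} \, \big( \Re J_i \, \Re J_j + \Im J_i \, \Im J_j \big) .
\]

Writing J = A + iB, the first equality uses the property that cf is real and symmetric, so the imaginary cross terms cancel and J^\dagger cf J = A^T cf A + B^T cf B (see #475); the second folds the off-diagonal terms into twice the upper triangle, which is exactly what the constexpr TriangularNormalizedColorMatrix further down precomputes once and for all at compile time.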
- - // The color denominators (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 9, 9, 9, 9, 9, 9 }; // 1-D array[6] - - // The color matrix (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 64, -8, -8, 1, 1, 10 }, - { -8, 64, 1, 10, -8, 1 }, - { -8, 1, 64, -8, 10, 1 }, - { 1, 10, -8, 64, 1, -8 }, - { 1, -8, 10, 1, 64, -8 }, - { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... 
icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - 
// NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 16 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram8, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram9, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram10, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram11, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram12, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram13, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram14, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram15, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram16, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators ); + 
diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram8( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram9( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram10( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram11( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram12( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram13( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram14( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram15( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram16( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -774,7 +492,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -808,6 +530,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -849,6 +575,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -951,26 +681,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings, bsmIndepParam ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -978,25 +708,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel 
denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: 
multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -1134,20 +1068,14 @@ namespace mg5amcCpu // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = 
DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1159,17 +1087,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1195,93 +1126,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page
-    unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
-    if( allChannelIds != nullptr )
-    {
-      const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds)
-      const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
-      // NB: channelIds_sv is a scalar in CUDA
-      channelId = channelIds_sv;
-      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
-    }
-#endif
-    // Running sum of partial amplitudes squared for event by event color selection (#402)
-    // (for the single event processed in calculate_wavefunctions)
-    fptype_sv jamp2_sv[nParity * ncolor] = { 0 };
-    fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event)
+    // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream)
+    // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamps for each helicity
+    // In multichannel mode, also compute the running sums over helicities of numerators and denominators
    for( int ighel = 0; ighel < cNGoodHel; ighel++ )
    {
      const int ihel = cGoodHel[ighel];
+      fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2;
+      fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2;
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv );
+      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
+      fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
+      calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads );
#else
-      calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv );
+      calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads );
#endif
-      MEs_ighel[ighel] = allMEs[ievt];
-    }
-    // Event-by-event random choice of helicity #403
-    //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] );
-    for( int ighel = 0; ighel < cNGoodHel; ighel++ )
-    {
-      if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) )
-      {
-        const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1]
-        allselhel[ievt] = ihelF;
-        //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF );
-        break;
-      }
    }
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Event-by-event random choice of color #402
-    if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
+    // (1b) Then, in multichannel mode, also compute the running sums over helicities of the squared jamp2s within each helicity stream
+    for( int ighel = 0; ighel < cNGoodHel; ighel++ )
    {
-      if( channelId > mgOnGpu::nchannels )
-      {
-        printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
-        assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
-      }
-      // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
-      // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? 
&( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1323,7 +1224,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1346,7 +1247,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1355,25 +1256,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1383,8 +1290,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1400,11 +1309,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1506,14 +1416,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h index b583fc85fe..a3fb48fbb8 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team 
and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -75,17 +76,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 32; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 16; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 6; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 12; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 5; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here)
    //static const int namplitudes = 18;
    //static const int ncomb = 32; // CPPProcess::ncomb
@@ -122,23 +123,26 @@ namespace mg5amcCpu
  //--------------------------------------------------------------------------
#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
-  __global__ void
+  void
  sigmaKin_getGoodHel( const fptype* allmomenta,   // input: momenta[nevt*npar*4]
                       const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
                       fptype* allMEs,             // output: allMEs[nevt], |M|^2 final_avg_over_helicities
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                       fptype* allNumerators,      // output: multichannel numerators[nevt], running_sum_over_helicities
-                       fptype* allDenominators,    // output: multichannel denominators[nevt], running_sum_over_helicities
+                       fptype* allNumerators,      // output: numerators[nevt], running_sum_over_helicities
+                       fptype* allDenominators,    // output: denominators[nevt], running_sum_over_helicities
#endif
-                       bool* isGoodHel );          // output: isGoodHel[ncomb] - device array (CUDA implementation)
+                       fptype* allJamps,           // output: jamp[ncolor*2*nevt]
+                       fptype* allWfs,             // output: wf[nwf*nw6*2*nevt]
+                       bool* isGoodHel,            // output: isGoodHel[ncomb] - device array (GPU device implementation)
+                       const int nevt );           // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
#else
-  __global__ void
+  void
  sigmaKin_getGoodHel( const fptype* allmomenta,   // input: momenta[nevt*npar*4]
                       const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
                       fptype* allMEs,             // output: allMEs[nevt], |M|^2 final_avg_over_helicities
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                       fptype* allNumerators,      // output: multichannel numerators[nevt], running_sum_over_helicities
-                       fptype* allDenominators,    // output: multichannel denominators[nevt], running_sum_over_helicities
+                       fptype* allNumerators,      // output: numerators[nevt], running_sum_over_helicities
+                       fptype* allDenominators,    // output: denominators[nevt], running_sum_over_helicities
#endif
                       bool* isGoodHel,            // output: isGoodHel[ncomb] - host array (C++ implementation)
                       const int nevt );           // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
@@ -152,34 +156,46 @@ namespace mg5amcCpu
  //--------------------------------------------------------------------------
#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
-  __global__ void
+  void
  sigmaKin( const fptype* allmomenta,          // input: momenta[nevt*npar*4]
            const fptype* allcouplings,       // input: couplings[nevt*ndcoup*2]
            const fptype* allrndhel,          // input: random numbers[nevt] for helicity selection
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
            const fptype* allrndcol,          // input: random numbers[nevt] for color selection
+            const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+#endif
            fptype* allMEs,                   // output: allMEs[nevt], |M|^2 final_avg_over_helicities
+            int* allselhel,                   // output: helicity selection[nevt]
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-            const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
-            fptype* allNumerators,            // output: multichannel numerators[nevt], running_sum_over_helicities
-            fptype* allDenominators,          // output: multichannel denominators[nevt], running_sum_over_helicities
+            int* allselcol,                   // output: color selection[nevt]
+            fptype* colAllJamp2s,             // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
+            fptype* ghelAllNumerators,        // tmp: allNumerators
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig.f index 8843b88a23..81d6a09df0 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig1.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig1.f index b22dde0f92..8668bbcb4d 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig1.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -137,14 +137,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -219,7 +219,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -290,6 +290,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -373,12 +377,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -484,6 +488,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/color_sum.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/color_sum.cc new file mode 100644 index 0000000000..b76aa16029 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/color_sum.cc @@ -0,0 +1,387 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
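The new color_sum.cc below gathers three interchangeable implementations of one reduction: color_sum_cpu (SIMD C++), color_sum_kernel (a plain GPU kernel) and color_sum_blas (cuBLAS/hipBLAS GEMMs). All of them add to the running ME the quadratic form J† (M/d) J, where J is the vector of colour amplitudes (jamps) and M/d is the real symmetric colour matrix divided by its denominators; because M is real, the form reduces to Re(J)·N·Re(J) + Im(J)·N·Im(J) with N the normalised matrix. A minimal scalar sketch of that reduction (illustrative names, not the plugin API):

```cpp
// Sketch only: the colour sum computed by all three implementations in this file.
#include <complex>
#include <vector>
double colorSumSketch( const std::vector<std::complex<double>>& jamp,
                       const std::vector<std::vector<double>>& normColorMatrix ) // M[icol][jcol]/denom
{
  const int ncolor = (int)jamp.size();
  double me2 = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    double ztempR = 0, ztempI = 0;
    for( int jcol = 0; jcol < ncolor; jcol++ )
    {
      ztempR += normColorMatrix[icol][jcol] * jamp[jcol].real();
      ztempI += normColorMatrix[icol][jcol] * jamp[jcol].imag();
    }
    // M is real, so (A-iB)M(A+iB) collapses to A.M.A + B.M.B (see the #475 comment below)
    me2 += ztempR * jamp[icol].real() + ztempI * jamp[icol].imag();
  }
  return me2; // callers ADD this to the running sum of |M|^2 over helicities
}
```

The BLAS variant maps the same two steps onto one GEMM per component (Ztemp = N·J, separately for real and imaginary parts) followed by batched 1x1 GEMMs that form the per-event dot products.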
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 9, 9, 9, 9, 9, 9 }; // 1-D array[6] + + // The color matrix (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 64, -8, -8, 1, 1, 10 }, + { -8, 64, 1, 10, -8, 1 }, + { -8, 1, 64, -8, 10, 1 }, + { 1, 10, -8, 64, 1, -8 }, + { 1, -8, 10, 1, 64, -8 }, + { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. 
+ // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = 
jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one 
fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/color_sum.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/diagram_boilerplate.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/diagrams.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/diagrams.h new file mode 100644 index 0000000000..8ea15aedfa --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/diagrams.h @@ -0,0 +1,515 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
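The new diagrams.h below splits the former monolithic calculate_wavefunctions into one kernel per Feynman diagram, all sharing a uniform signature so they can be launched back-to-back on a helicity stream. As a hedged sketch of how a caller would chain them (hypothetical wrapper name; the generated launcher lives in calculate_jamps, which this diff does not show in full):

```cpp
// Sketch only: chaining the per-diagram kernels for one helicity on its stream.
// diagram1 also computes the external wavefunctions, so it alone takes momenta and ihel;
// the later diagrams reuse the wavefunctions cached in the wfs buffer (w_fp[...]).
void launchDiagramsSketch( fptype* wfs, fptype* jamps, const unsigned int* channelIds,
                           const fptype* couplings, fptype* numerators, fptype* denominators,
                           const fptype* momenta, const int ihel,
                           gpuStream_t stream, const int gpublocks, const int gputhreads )
{
  gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, stream,
                         wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel );
  gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, stream,
                         wfs, jamps, channelIds, couplings, numerators, denominators );
  // ... and so on for diagram3..diagram16
}
```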
+
+/* clang-format off */
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators,           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+            const fptype* momenta,          // input: momenta[npar*4*nevtORneppV]
+            const int ihel )                // input: helicity (0 to ncomb-1)
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+#ifdef MGONGPUCPP_GPUIMPL
+    using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events
+#else
+    using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events
+#endif
+    // *** DIAGRAM 1 OF 16 ***
+    // Wavefunction(s) for diagram number 1
+    vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 );
+    vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 );
+    oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 );
+    ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 );
+    vxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 );
+    VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[5] );
+    FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[6] );
+    // Amplitude(s) for diagram number 1
+    VVV1_0( w_fp[5], w_fp[6], w_fp[4], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
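Each kernel evaluates a single amplitude and scatters it into the six colour-flow accumulators with fixed colour coefficients (plus or minus 1 or i), as the J_ACCESS lines above and below show; in multichannel mode it also adds |amp|^2 to the numerator when channelId matches the diagram, and to the denominator whenever channelId is non-zero. A standalone illustration of the scatter pattern, with coefficients copied from diagrams 1-3 (hypothetical helper, not the plugin API):

```cpp
// Sketch only: per-diagram colour-flow accumulation for this ncolor=6 process.
#include <complex>
using cplx = std::complex<double>;
void scatterAmpSketch( cplx jamp[6], const cplx& amp, int idiag )
{
  const cplx I( 0, 1 );
  switch( idiag )
  {
    case 1: jamp[0] -= amp; jamp[2] += amp; jamp[4] += amp; jamp[5] -= amp; break; // VVV1_0 amplitude
    case 2: jamp[4] += I * amp; jamp[5] -= I * amp; break;                         // FFV1_0 amplitude
    case 3: jamp[0] += I * amp; jamp[2] -= I * amp; break;                         // FFV1_0 amplitude
    // ... one case per diagram, generated from the colour algebra of the process
  }
}
```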
+  __global__ void
+  diagram2( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 2 OF 16 ***
+    // Wavefunction(s) for diagram number 2
+    FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[7] );
+    // Amplitude(s) for diagram number 2
+    FFV1_0( w_fp[3], w_fp[7], w_fp[5], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram3( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 3 OF 16 ***
+    // Wavefunction(s) for diagram number 3
+    FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] );
+    // Amplitude(s) for diagram number 3
+    FFV1_0( w_fp[8], w_fp[2], w_fp[5], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram4( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 4 OF 16 ***
+    // Wavefunction(s) for diagram number 4
+    FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] );
+    FFV1_2( w_fp[3], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] );
+    // Amplitude(s) for diagram number 4
+    FFV1_0( w_fp[9], w_fp[5], w_fp[4], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram5( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 5 OF 16 ***
+    // Wavefunction(s) for diagram number 5
+    VVV1P0_1( w_fp[1], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[10] );
+    // Amplitude(s) for diagram number 5
+    FFV1_0( w_fp[3], w_fp[5], w_fp[10], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram6( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 6 OF 16 ***
+    // Wavefunction(s) for diagram number 6
+    // (none)
+    // Amplitude(s) for diagram number 6
+    FFV1_0( w_fp[8], w_fp[5], w_fp[1], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram7( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 7 OF 16 ***
+ // Wavefunction(s) for diagram number 7
+ FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] );
+ FFV1_1( w_fp[2], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] );
+ // Amplitude(s) for diagram number 7
+ FFV1_0( w_fp[5], w_fp[11], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram8( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 8 OF 16 ***
+ // Wavefunction(s) for diagram number 8
+ // (none)
+ // Amplitude(s) for diagram number 8
+ FFV1_0( w_fp[5], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
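The wfs argument declared as wavefunctions[nwf*2*nw6*nevtORneppV] above is sliced in diagram_boilerplate.h into nwf per-wavefunction blocks, w_fp[iwf] = wfs + iwf * nevt * nw6 * nx2, each holding nw6 complex components per event. A hedged index helper for the GPU-side layout (names illustrative, nx2 = 2 for the real and imaginary parts):

#include <cstddef>
// Flat index into wfs for wavefunction iwf, event ievt, component iw6 and
// real/imaginary part reim, under the striding described above.
inline std::size_t wfsIndex( std::size_t iwf, std::size_t ievt, std::size_t iw6,
                             std::size_t reim, std::size_t nevt, std::size_t nw6 )
{
  constexpr std::size_t nx2 = 2; // real and imaginary parts
  return ( ( iwf * nevt + ievt ) * nw6 + iw6 ) * nx2 + reim;
}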
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram9( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 9 OF 16 ***
+ // Wavefunction(s) for diagram number 9
+ // (none)
+ // Amplitude(s) for diagram number 9
+ FFV1_0( w_fp[5], w_fp[7], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram10( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 10 OF 16 ***
+ // Wavefunction(s) for diagram number 10
+ VVV1P0_1( w_fp[0], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[5] );
+ // Amplitude(s) for diagram number 10
+ FFV1_0( w_fp[3], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
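cxabs2, used in every multichannel block above, is the squared modulus computed without the square root (and without the overflow protections) of std::abs. A scalar sketch; the plugin's actual version presumably also covers the vectorized cxtype_sv:

#include <complex>
// |c|^2 = re^2 + im^2, avoiding the sqrt of std::abs(c)
inline double cxabs2( const std::complex<double>& c )
{
  return c.real() * c.real() + c.imag() * c.imag();
}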
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram11( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 11 OF 16 ***
+ // Wavefunction(s) for diagram number 11
+ // (none)
+ // Amplitude(s) for diagram number 11
+ FFV1_0( w_fp[9], w_fp[2], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram12( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 12 OF 16 ***
+ // Wavefunction(s) for diagram number 12
+ // (none)
+ // Amplitude(s) for diagram number 12
+ VVV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+ }
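On the C++ backend these kernels process one SIMD event page at a time: the _sv suffix marks vector types that hold neppV events in lockstep, built on the GCC/Clang vector_size extension (as in the plugin's mgOnGpuVectors.h, included via color_sum.h below). A minimal sketch with an illustrative page size:

// One fptype_v holds neppV doubles processed in lockstep (illustrative sizes).
typedef double fptype;
constexpr int neppV = 4; // e.g. four doubles in a 256-bit AVX2 register
typedef fptype fptype_v __attribute__( ( vector_size( 32 ) ) ); // 32 = neppV * sizeof( fptype )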
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram13( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 13 OF 16 ***
+ // Wavefunction(s) for diagram number 13
+ // (none)
+ // Amplitude(s) for diagram number 13
+ FFV1_0( w_fp[8], w_fp[11], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram14( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 14 OF 16 ***
+ // Wavefunction(s) for diagram number 14
+ // (none)
+ // Amplitude(s) for diagram number 14
+ FFV1_0( w_fp[9], w_fp[7], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram15( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 15 OF 16 ***
+ // Wavefunction(s) for diagram number 15
+ // (none)
+ // Amplitude(s) for diagram number 15
+ VVV1_0( w_fp[0], w_fp[10], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+ }
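diagram16 below is the only kernel without a multichannel numerator/denominator block, presumably because the four-gluon contact vertex carries no propagator that could define an SDE channel. Its three Lorentz structures (VVVV1, VVVV3, VVVV4) each produce an internal wavefunction and an amplitude whose color-flow coefficients, transcribed from the generated code that follows, are all 0 or +-1:

// Coefficients of the three VVVV Lorentz structures on the six color flows
// (rows: VVVV1, VVVV3, VVVV4; columns: jamps[0..5]), read off diagram16 below.
const int vvvvCoeff[3][6] = { { +1, -1, 0, -1, 0, +1 },
                              { 0, -1, +1, -1, +1, 0 },
                              { -1, 0, +1, 0, +1, -1 } };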
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram16( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 16 OF 16 ***
+ // Wavefunction(s) for diagram number 16
+ VVVV1P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[10] );
+ VVVV3P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[6] );
+ VVVV4P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[9] );
+ // Amplitude(s) for diagram number 16
+ FFV1_0( w_fp[3], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+ FFV1_0( w_fp[3], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+ FFV1_0( w_fp[3], w_fp[2], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+/* clang-format on */
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/driver.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/driver.f
index c2eadb2c31..10332b6238 100644
--- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/driver.f
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/driver.f
@@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened)
 fopened=.false.
 tempname=filename
 fine=index(tempname,' ')
-c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)"
 if(fine.eq.0) fine=len(tempname)
 open(unit=lun,file=tempname,status='old',ERR=20)
 fopened=.true.
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/matrix1.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/matrix1.f
index 7d44ae130e..76a34107bc 100644
--- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/matrix1.f
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/matrix1.f
@@ -1,7 +1,7 @@
 SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
 $ ICOL)
 C
-C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+C Generated by MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -323,7 +323,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -366,7 +366,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(9) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -409,43 +410,32 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /7.111111111111111D+00, - $ -8.888888888888888D-01,-8.888888888888888D-01 - $ ,1.111111111111111D-01,1.111111111111111D-01,1.111111111111111D - $ +00/ + DATA DENOM/9/ + DATA (CF(I),I= 1, 6) /64,-16,-16,2,2,20/ C 1 T(1,2,5,3,4) - DATA (CF(I, 2),I= 1, 6) /-8.888888888888888D-01 - $ ,7.111111111111111D+00,1.111111111111111D-01,1.111111111111111D - $ +00,-8.888888888888888D-01,1.111111111111111D-01/ + DATA (CF(I),I= 7, 11) /64,2,20,-16,2/ C 1 T(1,5,2,3,4) - DATA (CF(I, 3),I= 1, 6) /-8.888888888888888D-01 - $ ,1.111111111111111D-01,7.111111111111111D+00, - $ -8.888888888888888D-01,1.111111111111111D+00,1.111111111111111D - $ -01/ + DATA (CF(I),I= 12, 15) /64,-16,20,2/ C 1 T(2,1,5,3,4) - DATA (CF(I, 4),I= 1, 6) /1.111111111111111D-01 - $ ,1.111111111111111D+00,-8.888888888888888D-01 - $ ,7.111111111111111D+00,1.111111111111111D-01, - $ -8.888888888888888D-01/ + DATA (CF(I),I= 16, 18) /64,2,-16/ C 1 T(2,5,1,3,4) - DATA (CF(I, 5),I= 1, 6) /1.111111111111111D-01, - $ -8.888888888888888D-01,1.111111111111111D+00,1.111111111111111D - $ -01,7.111111111111111D+00,-8.888888888888888D-01/ + DATA (CF(I),I= 19, 20) /64,-16/ C 1 T(5,1,2,3,4) - DATA (CF(I, 6),I= 1, 6) /1.111111111111111D+00 - $ ,1.111111111111111D-01,1.111111111111111D-01, - $ -8.888888888888888D-01,-8.888888888888888D-01 - $ ,7.111111111111111D+00/ + DATA (CF(I),I= 21, 21) /64/ C 1 T(5,2,1,3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. 
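The hunk above replaces the REAL*8 NCOLOR x NCOLOR color matrix by its integer upper triangle CF plus a common denominator DENOM=9; comparing with the removed DATA statements shows that the off-diagonal entries are stored doubled (e.g. -8.888...D-01 = -8/9 becomes -16), so a single sweep over J >= I reproduces the full symmetric bilinear once the real part is taken, as the rewritten loop in the next hunk does before dividing by DENOM. A C++ sketch of the same packed sum (names illustrative):

#include <complex>
// |M|^2 from color flows with a packed upper-triangular integer color matrix
// cf (off-diagonal entries pre-doubled) and a common denominator, as in MATRIX1.
double colorSumPacked( const std::complex<double>* jamp, const int* cf,
                       int ncolor, int denom )
{
  double me2 = 0.;
  int idx = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    std::complex<double> ztemp = 0.;
    for( int j = i; j < ncolor; j++ ) ztemp += double( cf[idx++] ) * jamp[j];
    me2 += ( ztemp * std::conj( jamp[i] ) ).real(); // the Fortran REAL*8 assignment takes the real part implicitly
  }
  return me2 / denom;
}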
@@ -549,10 +539,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -561,6 +553,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/addmothers.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/addmothers.f index 9a31ed201d..6f32477b9e 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/addmothers.f @@ -21,7 +21,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, integer icol ! color selected integer isym(nexternal,99), jsym - integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,nc,ic + integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,ic integer mo_color,da_color(2),itmp integer ito(-nexternal+3:nexternal),iseed,maxcolor,maxorg integer icolalt(2,-nexternal+2:2*nexternal-3) @@ -113,14 +113,15 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif lconfig = vec_igraph1(ivec) endif - + is_LC=.true. + maxcolor=0 c c Choose a color flow which is certain to work with the propagator c structure of the chosen diagram and use that as an alternative c if (icol.eq.0) then do i=1,nexternal - icolalt(1,i)=0 + icolalt(1,i)=0 icolalt(2,i)=0 enddo else diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cluster.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cluster.f index b8995283ed..907894ea89 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cluster.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cluster.f @@ -556,6 +556,8 @@ logical function cluster(p, ivec) jwin = 0 cluster=.false. clustered=.false. + iwin =0 + jwin =0 do i=0,3 pcmsp(i)=0 enddo @@ -665,8 +667,11 @@ logical function cluster(p, ivec) c initialize graph storage igraphs(0)=0 nleft=nexternal -c cluster - if (iwin.eq.0.or.jwin.eq.0) stop 21 + if(iwin.eq.0.or.jwin.eq.0)then + cluster=.false. + return + endif +c cluster do n=1,nexternal-2 c combine winner imocl(n)=imap(iwin,2)+imap(jwin,2) diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/color_sum.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/color_sum.h new file mode 100644 index 0000000000..71b5b1eaad --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/color_sum.h @@ -0,0 +1,105 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+
+#ifndef COLOR_SUM_H
+#define COLOR_SUM_H 1
+
+#include "mgOnGpuConfig.h"
+
+#include "mgOnGpuVectors.h"
+
+#include "CPPProcess.h"
+#include "GpuAbstraction.h"
+
+#ifdef MGONGPUCPP_GPUIMPL
+namespace mg5amcGpu
+#else
+namespace mg5amcCpu
+#endif
+{
+ //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+ class DeviceAccessJamp
+ {
+ public:
+ static __device__ inline cxtype_ref
+ kernelAccessIcol( fptype* buffer, const int icol )
+ {
+ const int ncolor = CPPProcess::ncolor; // the number of leading colors
+ const int nevt = gridDim.x * blockDim.x;
+ const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+ // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last)
+ //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old"
+ // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last)
+ // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS
+ return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1"
+ // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last)
+ //return cxtype_ref( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2"
+ }
+ static __device__ inline const cxtype
+ kernelAccessIcolConst( const fptype* buffer, const int icol )
+ {
+ const int ncolor = CPPProcess::ncolor; // the number of leading colors
+ const int nevt = gridDim.x * blockDim.x;
+ const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+ // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last)
+ //return cxtype( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old"
+ // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last)
+ // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS
+ return cxtype( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1"
+ // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last)
+ //return cxtype( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2"
+ }
+ };
+#else
+ class HostAccessJamp
+ {
+ public:
+ static inline cxtype_sv&
+ kernelAccessIcol( cxtype_sv* buffer, const int icol )
+ {
+ return buffer[icol];
+ }
+ static inline cxtype_sv&
+ kernelAccessIcol( fptype* buffer, const int icol )
+ {
+ return reinterpret_cast<cxtype_sv*>( buffer )[icol];
+ }
+ };
+#endif
+
+ //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+ void createNormalizedColorMatrix();
+#endif
+
+ //--------------------------------------------------------------------------
+
+#ifndef MGONGPUCPP_GPUIMPL
+ void
+ color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity
+ const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity
+ const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid)
+#endif
+
+
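With the "new1" striding above (two separate ncolor x nevt real matrices for the real and imaginary parts of jamp), the color sum over all events reduces to real matrix products, which is what lets color_sum_gpu (declared next) offload the work to cuBLAS/hipBLAS: since the color matrix CF is real and symmetric, ME[e] = sum_ij jamp_i* CF_ij jamp_j = sum_ij CF_ij ( R_i R_j + I_i I_j ) for each event column e. A hedged CPU sketch of this GEMM formulation, with plain loops standing in for the BLAS calls and a dense CF presumably pre-normalized by createNormalizedColorMatrix above:

// T = CF*R and U = CF*I would each be one GEMM; plain loops illustrate the
// math here. cf is ncolor x ncolor row-major; jampR/jampI are ncolor x nevt.
void colorSumGemmSketch( const double* cf, const double* jampR, const double* jampI,
                         double* me2, int ncolor, int nevt )
{
  for( int e = 0; e < nevt; e++ ) me2[e] = 0.;
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ )
      for( int e = 0; e < nevt; e++ )
        me2[e] += cf[i * ncolor + j] * ( jampR[i * nevt + e] * jampR[j * nevt + e]
                                       + jampI[i * nevt + e] * jampI[j * nevt + e] );
}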
//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas
+ else
+ override HASBLAS = hasBlas
+ endif
+ else
+ override HASBLAS = hasNoBlas
+ endif
+endif
+
+#-------------------------------------------------------------------------------
+
#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD
# Set the build flags appropriate to OMPFLAGS
@@ -597,6 +627,30 @@ endif
#$(info RNDCXXFLAGS=$(RNDCXXFLAGS))
#$(info RNDLIBFLAGS=$(RNDLIBFLAGS))
+#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS
+
+$(info HASBLAS=$(HASBLAS))
+override BLASCXXFLAGS=
+override BLASLIBFLAGS=
+
+# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas")
+ifeq ($(HASBLAS),hasNoBlas)
+ override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS
+else ifeq ($(HASBLAS),hasBlas)
+ ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+ override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas
+ else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build
+ override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas
+ endif
+else
+ $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported)
+endif
+CXXFLAGS += $(BLASCXXFLAGS)
+GPUFLAGS += $(BLASCXXFLAGS)
+
+#$(info BLASCXXFLAGS=$(BLASCXXFLAGS))
+#$(info BLASLIBFLAGS=$(BLASLIBFLAGS))
+
#-------------------------------------------------------------------------------
#=== Configure Position-Independent Code
@@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
###$(info processid_short=$(processid_short))
MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
ifneq ($(GPUCC),)
MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX)
-gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o
+gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o
gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o
endif
@@ -799,7 +853,7 @@ ifneq ($(GPUCC),)
$(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o
$(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o
$(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib)
- $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB)
+ $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS)
# Bypass std::filesystem completely to ease portability on LUMI #803
#ifneq ($(findstring hipcc,$(GPUCC)),)
# $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs
@@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cuts.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cuts.f index 7898714201..bd50ab1357 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cuts.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cuts.f @@ -307,12 +307,18 @@ LOGICAL FUNCTION PASSCUTS(P, VECSIZE_USED) c c Limit S_hat c - if (dsqrt_shat.ne.0d0)then - if (nincoming.eq.2.and.sumdot(p(0,1),p(0,2),1d0) .lt. dsqrt_shat**2) then - passcuts=.false. - return - endif - endif + if(nincoming.eq.2) then + if (dsqrt_shat.ne.0d0.or.dsqrt_shatmax.ne.-1d0)then + xvar = sumdot(p(0,1),p(0,2),1d0) + if (xvar .lt. dsqrt_shat**2)then + passcuts=.false. 
+ return + else if (dsqrt_shatmax.ne.-1d0 .and. xvar .gt. dsqrt_shatmax**2)then + passcuts = .false. + return + endif + endif + endif C $B$ DESACTIVATE_CUT $E$ !This is a tag for MadWeight if(debug) write (*,*) '=============================' diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/diagram_boilerplate.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/diagram_boilerplate.h new file mode 100644 index 0000000000..96a34fb1bf --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/diagram_boilerplate.h @@ -0,0 +1,103 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin. + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-local-typedefs" // for CI_ACCESS and CD_ACCESS + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + + //------------- + // GPU only + //------------- + + //using namespace mg5amcGpu; + using W_ACCESS = DeviceAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = DeviceAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( channelIds ); +#endif + + // Wavefunctions + // Buffer wfs for one helicity and nevt events is a DeviceBufferSimple with ( nwf * nevt * nw6 * nx2 ) fptypes + // The striding between the nwf wavefunction buffers is ( nevt * nw6 * nx2 ) fptypes + // Internally diagramXXX methods pass a w_fp[iwf] to ixx/FFV methods (as argument 'fptype wavefunctions[]') + // Internally ixx/FFV methods call 'cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions )' and then use fi[iw6] + // This means that the fi pointer must point to a [RIRIRIRIRIRI] contiguous buffer of size nw6*nx2=12 + // The striding between events is nw6*nx2=12 and this is what W_ACCESS::kernelAccess must respect + // (En passant, note that this means that events cannot be contiguous in the present code, memory is not coalesced) + const int nevt = gridDim.x * blockDim.x; + fptype* w_fp[nwf]; + for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = wfs + iwf * nevt * nw6 * mgOnGpu::nx2; + + // Couplings + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic push +#pragma nv_diag_suppress 186 // e.g. 
<>
+#endif
+ // Dependent couplings, vary event-by-event
+ for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
+ allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup );
+ // Independent couplings, fixed for all events
+ for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup)
+ allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup );
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
+#pragma nv_diagnostic pop
+#endif
+ const fptype* COUPs[nxcoup];
+ for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
+
+#else
+
+ //-------------
+ // C++ only
+ //-------------
+
+ //using namespace mg5amcCpu;
+ using W_ACCESS = HostAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events
+ using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event
+ using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events
+ using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event
+ using J_ACCESS = HostAccessJamp;
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events
+ using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events
+ // SCALAR channelId for the current SIMD event page (C++)
+ unsigned int channelId = *channelIds;
+#endif
+
+ // Wavefunctions
+ // Reinterpret wfs as "cxtype_sv w_sv[nwf][nw6]" and build "fptype* w_fp[nwf]" where "w_fp[iwf] = (fptype*)( w_sv[iwf] )"
+ fptype (*w_fp)[nw6 * neppV * mgOnGpu::nx2] = (fptype (*)[nw6 * neppV * mgOnGpu::nx2])(wfs);
+
+#endif
+
+ //-------------
+ // GPU or C++
+ //-------------
+
+ // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV)
+ cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram
+ fptype* amp_fp; // proof of concept for using fptype* in the interface
+ amp_fp = reinterpret_cast<fptype*>( amp_sv );
+
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
+ fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+ fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
+#else
+ assert( channelIds == nullptr );
+ assert( numerators == nullptr );
+ assert( denominators == nullptr );
+#endif /* clang-format on */
+
+#pragma GCC diagnostic pop
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/genps.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/genps.f
index 1c32e93f5d..8503bdbec8 100644
--- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/genps.f
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/genps.f
@@ -1373,6 +1373,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass)
 double precision smin,smax,spole,swidth,s,jac
 double precision x
 logical pass
+ include 'maxparticles.inc'
+ include '../../Source/vector.inc'
+ include 'run.inc'
+ include 'cuts.inc'
c
c Local
c
@@ -1384,6 +1388,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass)
c-----
c Begin Code
c-----
+ if (dsqrt_shatmax.ne.-1d0)then
+ smax = min(smax, dsqrt_shatmax**2)
+ endif
+
 pass=.true.
 if (jac .eq. 0 .and. .not.
warned0) then print*,'Input jacobian 0 in genps' @@ -1628,7 +1636,10 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) DOUBLE PRECISION ETA,ETAMIN,ETAMAX logical warned data warned/.false./ - + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' C------------ C BEGIN CODE C------------ @@ -1645,7 +1656,11 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) C IF THERE IS NO S CHANNEL POLE USE BELOW: TAUMIN = 0d0 !SMIN/S !keep scale fix - TAUMAX = 1D0 + if (dsqrt_shatmax.ne.-1d0)then + TAUMAX=dsqrt_shatmax**2/S + else + TAUMAX = 1D0 + endif TAU = (TAUMAX-TAUMIN)*X(1)+TAUMIN SJACOBI= sjacobi*(TAUMAX-TAUMIN) @@ -1915,7 +1930,7 @@ double precision function get_channel_cut(p, config) if(sde_strat.eq.2)then t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) - get_channel_cut = get_channel_cut / ((t-Mass)*(t+Mass)+stot*1d-10)**2 + get_channel_cut = get_channel_cut / (t-Mass**2+stot*1d-10)**2 endif c write(*,*) i, "t, Mass, fact", t, Mass, ((t-Mass)*(t+Mass))**2,get_channel_cut t = t/stot @@ -1930,9 +1945,9 @@ double precision function get_channel_cut(p, config) t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) Width = prwidth(-i, config) - tmp = (t-Mass)*(t+Mass) + tmp = (t-Mass**2) tmp2 = Mass*Width - get_channel_cut = get_channel_cut* (tmp**2 - tmp2**2)/(tmp**2 + tmp2**2)**2 + get_channel_cut = get_channel_cut/(tmp**2 + tmp2**2) endif c write(*,*) i, "s, Mass, Width, fact", t, Mass, Width, (((t-Mass)*(t+Mass) )**2 + Width**2*Mass**2), get_channel_cut endif diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/myamp.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/myamp.f index 9e5f8d44dd..5360566ef4 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/myamp.f @@ -231,6 +231,7 @@ subroutine set_peaks double precision x1,x2,xk(nexternal) double precision dr,mtot,etot,xqfact double precision spmass + double precision stot ! technically the min with dsqrt_shatmax**2 with the physical one integer i, iconfig, l1, l2, j, nt, nbw, iproc, k integer iden_part(-nexternal+1:nexternal) @@ -285,8 +286,8 @@ subroutine set_peaks integer lbw(0:nexternal) !Use of B.W. common /to_BW/ lbw - double precision stot,m1,m2 - common/to_stot/stot,m1,m2 + double precision real_stot,m1,m2 + common/to_stot/real_stot,m1,m2 include 'coupl.inc' ! 
needs VECSIZE_MEMMAX (defined in vector.inc) include 'cuts.inc' @@ -309,6 +310,12 @@ subroutine set_peaks c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1)then + stot = min(real_stot, dsqrt_shatmax**2) + else + stot = real_stot + endif + iconfig = this_config c needs to be initialise to avoid segfault do i = -nexternal,-1 diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/reweight.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/reweight.f index 0a0bafa7c1..189ba41449 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/reweight.f @@ -1186,7 +1186,7 @@ logical function setclscales(p, keepq2bck, ivec) if(nexternal.gt.3) pt2ijcl(nexternal-3)=q2fact(2) else if(.not.fixed_fac_scale1) q2fact(1)=scalefact**2*pt2ijcl(nexternal-2) - if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*q2fact(1) + if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*pt2ijcl(nexternal-2) endif elseif(jcentral(1).eq.0)then if(.not.fixed_fac_scale1) q2fact(1) = scalefact**2*pt2ijcl(jfirst(1)) diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/runTest.cc index 4eec5db13c..216a90a302 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/symmetry.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/symmetry.f index 309540a0a2..2afd9e9f75 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/symmetry.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/symmetry.f @@ -232,7 +232,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, c write(*,*) 'mapping',ic,mapconfig(i),icode if (icode .eq. 
0) then c Create format string based on number of digits - write(formstr,'(a,i1,a)') '(I',nconf,'$)' + write(formstr,'(a,i1,a)') '(I',nconf,',$)' write(*,formstr) mapconfig(i) c Write symmetry factors write(formstr2,'(a,i2,a)') '(2i',nsym,')' @@ -242,10 +242,10 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode if(nconf+ncode+1.lt.10) then write(formstr,'(a,i1,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' else write(formstr,'(a,i2,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' endif write(*,formstr) dconfig c Write symmetry factors @@ -260,7 +260,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode write(27,formstr2) dconfig,use_config(i) endif - write(*,'(a$)') ' ' + write(*,'(a,$)') ' ' 100 call bw_increment_array(iarray,imax,ibase,done) enddo else diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/unwgt.f b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/unwgt.f index f602511c94..d1247f1849 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/unwgt.f +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/unwgt.f @@ -497,6 +497,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer ip, np, ic, nc integer ida(2),ito(-nexternal+3:nexternal),ns,nres,ires,icloop integer iseed + double precision beam_mass double precision pboost(0:3) double precision beta, get_betaz double precision ebi(0:3), ebo(0:3) @@ -506,7 +507,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer idup(nexternal,maxproc,maxsproc) integer mothup(2,nexternal) integer icolup(2,nexternal,maxflow,maxsproc) - + double precision eta integer nsym integer ievent @@ -638,21 +639,20 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) if (nincoming.eq.2) then if (xbk(1) .gt. 0d0 .and. xbk(1) .le. 1d0 .and. $ xbk(2) .gt. 0d0 .and. xbk(2) .le. 1d0) then - if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0).and.xbk(2).ne.1d0) then - ! construct the beam momenta in each frame and compute the related (z)boost - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4).and.ebeam(1).gt.10d0*m1)then - local_mass = 0d0 - else - local_mass = m1 - endif + if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0)) then + if((abs(lpp(1)).gt.2.and.abs(lpp(1)).ne.9).or.xbk(1).eq.1d0)then + beam_mass = pmass(1) + else + beam_mass = m1 + endif ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(1) ebo(1) = 0 ebo(2) = 0 - ebo(3) = DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(1).eq.1d0) then pb(0,isym(1,jsym)) = ebo(0) @@ -668,20 +668,19 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo else - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4.and.ebeam(2).gt.10d0*m2))then - local_mass = 0d0 - else - local_mass = m2 - endif - ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam + if((abs(lpp(2)).gt.2.and.abs(lpp(2)).ne.9).or.xbk(2).eq.1d0)then + beam_mass = pmass(2) + else + beam_mass = m2 + endif ebi(0) = p(0,2)/xbk(2) ! 
this assumes that particle 2 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = -1d0*DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = -1d0*DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(2) ebo(1) = 0 ebo(2) = 0 - ebo(3) = -1d0*DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = -1d0*DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(2).eq.1d0) then pb(0,isym(2,jsym)) = ebo(0) @@ -701,6 +700,21 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) write(*,*) 'Warning bad x1 or x2 in write_leshouche', $ xbk(1),xbk(2) endif + do j=1,nexternal + call zboost_with_beta(p(0,j),beta,pb(0,isym(j,jsym))) + pb(4,isym(j,jsym))=pmass(j) + enddo + + ! check for numerical_accuracy + if (pb(0,1).gt.ebeam(1).or.pb(0,2).gt.ebeam(2))then + ! go back to old method --more accurate when boosting with xbk close to one-- + eta = sqrt(xbk(1)*ebeam(1)/(xbk(2)*ebeam(2))) + pboost(0)=p(0,1)*(eta + 1d0/eta) + pboost(3)=p(0,1)*(eta - 1d0/eta) + do j=1,nexternal + call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) + enddo + endif else do j=1,nexternal call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) @@ -709,6 +723,8 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo endif + + if (IMIRROR.eq.2.and.pmass(1).ne.pmass(2)) then c Note that in this context isym(1,jsym) should never be "2" since the mass differ pb(4,isym(1,jsym))=pmass(2) diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/Gridpack/gridrun b/epochX/cudacpp/gg_tt01g.mad/bin/internal/Gridpack/gridrun index 8c8f7d3940..01d4ab53f5 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/Gridpack/gridrun +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/Gridpack/gridrun @@ -91,7 +91,7 @@ import internal.madevent_interface as cmd_interface try: - cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2]) + cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2], nprocs=args[3], maxevts=args[4]) except KeyboardInterrupt: print('Quit on KeyboardInterrupt') diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/Gridpack/run.sh b/epochX/cudacpp/gg_tt01g.mad/bin/internal/Gridpack/run.sh index 20adf572c2..2d149f96be 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/Gridpack/run.sh +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/Gridpack/run.sh @@ -14,6 +14,18 @@ # USAGE : run [num_events] [iseed] ## ############################################################################# +function usage() { + local retcode="${1:-1}" # default return code is 1 + echo "Usage:" + echo " run.sh [options] [num events] [seed]" + echo " run.sh [options] [num events] [seed] [granularity]" + echo "Options:" + echo " -h, --help print this message and exit" + echo " -p, --parallel [num procs] number of processes to run in parallel" + echo " -m, --maxevts [num events] maximum number of unweighted events per job" + exit $retcode +} + if [[ -d ./madevent ]]; then DIR='./madevent' else @@ -32,23 +44,46 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib # For Mac OS X export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib +pos_args=() +nprocs=1 +maxevts=2500 -if [[ ($1 != "") && ("$2" != "") && ("$3" == "") ]]; then - num_events=$1 - seed=$2 - gran=1 -elif [[ ($1 != "") && ("$2" != "") && ("$3" != "") ]]; then - num_events=$1 - seed=$2 - gran=$3 -else - echo "Warning: input is not correct. 
script requires two arguments: NB_EVENT SEED" -fi +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage 0 ;; + -p|--parallel) + nprocs="$2" && shift && shift ;; + -m|--maxevts) + maxevts="$2" && shift && shift ;; + -*) + echo "Error: Unknown option $1" && usage ;; + *) + pos_args+=("$1") && shift ;; + esac +done + +case `echo "${pos_args[@]}" | wc -w | tr -d " "` in + "2") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=1 + ;; + "3") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=${pos_args[2]} + ;; + *) + echo "Error: number of arguments is not correct" + usage + ;; +esac -echo "Now generating $num_events events with random seed $seed and granularity $gran" +echo "Now generating $num_events events with random seed $seed and granularity $gran using $nprocs processes" ############ RUN THE PYTHON CODE ##################### -${DIR}/bin/gridrun $num_events $seed $gran +${DIR}/bin/gridrun $num_events $seed $gran $nprocs $maxevts ######################################################## ########### POSTPROCESSING ##################### diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py index 42d82818d0..2efb5954a6 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py @@ -353,7 +353,7 @@ def modify_init_cross(self, cross, allow_zero=False): assert "init" in self cross = dict(cross) - for key in cross.keys(): + for key in list(cross.keys()): if isinstance(key, str) and key.isdigit() and int(key) not in cross: cross[int(key)] = cross[key] @@ -1991,6 +1991,11 @@ def default_setup(self): self.add_param("PartonLevel:FSRinResonances", True, hidden=True, always_write_to_card=False, comment="Do not allow shower to run from decay product of unstable particle") self.add_param("ProcessLevel:resonanceDecays", True, hidden=True, always_write_to_card=False, comment="Do not allow unstable particle to decay.") + # Parameters only needed for main164 type of run (not pythia8/MG5 interface) + self.add_param("Main:HepMC", True, hidden=True, always_write_to_card=False, + comment="""Specify the type of output to be used by the main164 run. """) + self.add_param("HepMC:output", 'hepmc.gz', hidden=True, always_write_to_card=False, + comment="Specify the HepMC output file to be used by the main164 run.") # Add parameters controlling the subruns execution flow. # These parameters should not be part of PY8SubRun daughter. self.add_default_subruns('parameters') @@ -2087,8 +2092,10 @@ def MadGraphSet(self, name, value, **opts): force = False if name.lower() not in self or (force or name.lower() not in self.user_set): self.__setitem__(name, value, change_userdefine=False, **opts) - self.system_set.add(name.lower()) - + self.system_set.add(name.lower()) + else: + raise Exception("The parameter %s is already set to %s. You can not change it." 
% (name, self[name])) + def defaultSet(self, name, value, **opts): self.__setitem__(name, value, change_userdefine=False, **opts) @@ -2144,9 +2151,19 @@ def pythia8_formatting(value, formatv=None): else: return ','.join([PY8Card.pythia8_formatting(arg) for arg in value]) + # change of naming convention between the old MG5 interface and Pythia8's main164 + interface_to_164 = {'HEPMCoutput:file': 'HepMC:output', + 'SysCalc:fullCutVariation': '!SysCalc:fullCutVariation (not supported with 164)', + 'SysCalc:qCutList': '!SysCalc:qCutList (not supported with 164)', + 'SysCalc:qWeed': '!SysCalc:qWeed (not supported with 164)', + 'SysCalc:tmsList': '!SysCalc:tmsList (not supported with 164)', + 'HEPMCoutput:scaling' : '!HEPMCoutput:scaling (not supported with 164)', + 'LHEFInputs:nSubruns' : 'Main:numberOfSubruns'} + def write(self, output_file, template, read_subrun=False, - print_only_visible=False, direct_pythia_input=False, add_missing=True): + print_only_visible=False, direct_pythia_input=False, add_missing=True, + use_mg5amc_py8_interface=False): """ Write the card to output_file using a specific template. > 'print_only_visible' specifies whether or not the hidden parameters should be written out if they are in the hidden_params_to_always_write @@ -2155,7 +2172,12 @@ def write(self, output_file, template, read_subrun=False, in the self.visible_params_to_always_write list and are not user_set or system_set are commented. > If 'add_missing' is False then parameters that should be written_out but are absent - from the template will not be written out.""" + from the template will not be written out. + > 'use_mg5amc_py8_interface' indicates whether the MG5aMC-PY8 interface is used; + if it is not, some parameters need to be translated from the old convention to the new one + """ + + self.use_mg5amc_py8_interface = use_mg5amc_py8_interface # First list the visible parameters visible_param = [p for p in self if p.lower() not in self.hidden_param @@ -2297,7 +2319,16 @@ def group_params(params): else: # Just copy parameters which don't need to be specified if param.lower() not in self.params_to_never_write: + + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param.strip()] + # special case: HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + output.write('%s=%s\n'%(param_entry,new_value)) + else: + output.write(line) - output.write(line) else: output.write('! The following parameter was forced to be commented out by MG5aMC.\n') output.write('!
%s'%line) @@ -2313,6 +2344,7 @@ def group_params(params): if ((not direct_pythia_input) or (param.lower() in self.visible_params_to_always_write) or (param.lower() in self.user_set) or + (param.lower() in self.hidden_params_to_always_write) or (param.lower() in self.system_set)): template = '%s=%s' else: @@ -2321,6 +2353,19 @@ def group_params(params): # then they shouldn't be passed to Pythia template = '!%s=%s' + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + if 'Main:InternalAnalysis'.lower() in self.user_set and \ + self['Main:InternalAnalysis'].lower() == 'on': + output.write('InternalAnalysis:output = ./djrs.dat\n') + + #elif param in self.interface_to_164.values() and not direct_pythia_input: + # misc.sprint(use_mg5amc_py8_interface, direct_pythia_input,param) + # raise Exception('The parameter %s is not supported in the MG5aMC-PY8 interface. Please use the new interface.'%param_entry output.write(template%(param_entry, value_entry.replace(value,new_value))) @@ -2365,6 +2410,8 @@ def group_params(params): comment = '\n'.join('! %s'%c for c in self.comments[param.lower()].split('\n')) output.write(comment+'\n') + if not use_mg5amc_py8_interface and param in self.interface_to_164: + continue output.write('%s=%s\n'%(param,PY8Card.pythia8_formatting(self[param]))) # Don't close the file if we were reading a subrun, but simply write @@ -3318,7 +3365,6 @@ def retro_compatible_custom_fct(lines, mode=None): for i,line in enumerate(lines[:]): if search and re.search(include_pat, line): name = re.findall(include_pat, line)[0] - misc.sprint('DETECTED INCLUDE', name) if 'vector.inc' in name: search = False if 'run.inc' in name: @@ -3326,7 +3372,6 @@ def retro_compatible_custom_fct(lines, mode=None): search = False sol.append(line) if re.search(function_pat, line): - misc.sprint("DETECTED FCT") search = True return sol @@ -4050,8 +4095,8 @@ def post_set_fixed_fac_scale(card, value, change_userdefine, raiseerror, **opt): if 'fixed_fac_scale2' in card.user_set: card.user_set.remove('fixed_fac_scale2') - # #card['pdlabel1'] = value - # #card['pdlabel2'] = value + dict.__setitem__(card, 'fixed_fac_scale1', card['fixed_fac_scale']) + dict.__setitem__(card, 'fixed_fac_scale2', card['fixed_fac_scale']) @staticmethod def post_set(card, value, change_userdefine, raiseerror, name='unknown', **opt): @@ -4201,6 +4246,7 @@ def default_setup(self): self.add_param("bwcutoff", 15.0) self.add_param("cut_decays", False, cut='d') self.add_param('dsqrt_shat',0., cut=True) + self.add_param('dsqrt_shatmax', -1, cut=True) self.add_param("nhel", 0, include=False) self.add_param("limhel", 1e-8, hidden=True, comment="threshold to determine if an helicity contributes when not MC over helicity.") #pt cut @@ -4451,11 +4497,11 @@ def check_validity(self): time.sleep(5) if self['drjj'] != 0: if 'drjj' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjj\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjj\' to 0') self['drjj'] = 0 if self['drjl'] != 0: if 'drjl' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjl\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjl\' to 0') self['drjl'] = 0 if not self['auto_ptj_mjj']: if self['mmjj'] > self['xqcut']: @@ -4753,7 +4799,6 @@ def create_default_for_process(self, 
proc_characteristic, history, proc_def): self['fixed_fac_scale1'] = True self['nhel'] = 1 for i in beam_id_split[1]: - exit if abs(i) == 11: self['lpp1'] = -math.copysign(3,i) self['lpp2'] = math.copysign(3,i) @@ -5577,6 +5622,9 @@ def default_setup(self): #technical self.add_param('folding', [1,1,1], include=False) + + #bias + self.add_param('flavour_bias',[5,1], hidden=True, comment="Example: '5,100' means that the probability to generate an event with a bottom (or anti-bottom) quark is increased by a factor 100, but the weight of those events is reduced by a factor 100. Requires that the 'event_norm' is set to 'bias'.") #merging self.add_param('ickkw', 0, allowed=[-1,0,3,4], comment=" - 0: No merging\n - 3: FxFx Merging : http://amcatnlo.cern.ch/FxFx_merging.htm\n - 4: UNLOPS merging (No interface within MG5aMC)\n - -1: NNLL+NLO jet-veto computation. See arxiv:1412.8408 [hep-ph]") @@ -5790,6 +5838,17 @@ def check_validity(self): if self['mcatnlo_delta'] and not self['parton_shower'].lower() == 'pythia8': raise InvalidRunCard("MC@NLO-DELTA only possible with matching to Pythia8") + # check that the flavour_bias is consistent + if len(self['flavour_bias']) != 2: + raise InvalidRunCard("'flavour_bias' should contain exactly two numbers: the abs(PDG) of the flavour to enhance, and the enhancement multiplication factor.") + for i in self['flavour_bias']: + if i < 0: + raise InvalidRunCard("flavour and multiplication factor should be positive in the flavour_bias parameter") + if self['flavour_bias'][1] != 1 and self['event_norm'] != 'bias': + logger.warning('Non-trivial flavour enhancement factor: setting event normalisation to "bias"') + self['event_norm']='bias' + + # check that ebeam is bigger than the proton mass. for i in [1,2]: # do not for proton mass if not proton PDF (or when scan initialization) diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/check_param_card.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/check_param_card.py index bc785b5de6..a34705f6bc 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/check_param_card.py @@ -1092,11 +1092,11 @@ def write_summary(self, path, order=None, lastline=False, nbcol=20): to_print = self.cross[-1:] for info in to_print: name = info['run_name'] - bench = info['bench'] + bench = [float(x) for x in info['bench']] data = [] for k in keys: if k in info: - data.append(info[k]) + data.append(float(info[k])) else: data.append(0.) 
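A note on the 'flavour_bias' run_card option validated above: the pair [abs(PDG), factor] multiplies the generation probability of events containing that flavour while dividing their weight by the same factor, so the weighted cross section is unchanged; this bookkeeping only works when event_norm is 'bias'. Below is a minimal standalone sketch of that consistency rule; the dict-based card and the exception class are stand-ins for the real madgraph objects, not the actual API.

```python
# Minimal sketch of the 'flavour_bias' consistency check, assuming a
# dict-like run card; InvalidRunCard here is a local stand-in.

class InvalidRunCard(Exception):
    pass

def check_flavour_bias(run_card):
    """Validate run_card['flavour_bias'] = [abs_pdg, factor] and force
    the 'bias' event normalisation when the factor is non-trivial."""
    bias = run_card['flavour_bias']
    if len(bias) != 2:
        raise InvalidRunCard("'flavour_bias' should contain exactly two numbers: "
                             "the abs(PDG) of the flavour to enhance, and the "
                             "enhancement multiplication factor.")
    if any(i < 0 for i in bias):
        raise InvalidRunCard("flavour and multiplication factor should be positive")
    if bias[1] != 1 and run_card['event_norm'] != 'bias':
        # enhanced events carry reduced weights, so the sum of weights only
        # reproduces the cross section with 'bias' normalisation
        run_card['event_norm'] = 'bias'
    return run_card

card = {'flavour_bias': [5, 100], 'event_norm': 'average'}
assert check_flavour_bias(card)['event_norm'] == 'bias'
```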
ff.write(formatting % tuple([name] + bench + data)) diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/cluster.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/cluster.py index 95ef45b5f3..66069293d2 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/cluster.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/cluster.py @@ -602,6 +602,7 @@ def __init__(self, *args, **opt): self.submitted = six.moves.queue.Queue() # one entry by job submitted self.stoprequest = threading.Event() #flag to ensure everything to close self.demons = [] + self.gpus_list = [] self.nb_done =0 if 'nb_core' in opt: self.nb_core = opt['nb_core'] @@ -623,23 +624,46 @@ def __init__(self, *args, **opt): self.done_pid_queue = six.moves.queue.Queue() self.fail_msg = None + mg5_gpu_env_str = 'MG5_GPU_VISIBLE_DEVICES' + gpu_variables = [['NVIDIA_VISIBLE_DEVICES', 'CUDA_VISIBLE_DEVICES'], + ['ROCR_VISIBLE_DEVICES', 'HIP_VISIBLE_DEVICES'],] + if mg5_gpu_env_str in os.environ: + new_var = os.environ[mg5_gpu_env_str].split(',') + if len(new_var) == 2: + gpu_variables.insert(0, new_var) + else: + logger.error('Invalid format for %s=%s, it should be a comma-separated list of two elements' % (mg5_gpu_env_str, os.environ[mg5_gpu_env_str])) + for get_var,set_var in gpu_variables: + if get_var in os.environ: + self.gpus_list = os.environ.get(get_var).split(',') + self.gpu_set_var = set_var + self.gpus_count = len(self.gpus_list) + logger.info('Found %s GPUs: %s' % (self.gpus_count, self.gpus_list)) def start_demon(self): import threading - t = threading.Thread(target=self.worker) + env2 = None + if len(self.gpus_list): + env2 = os.environ.copy() + this_gpu_idx = len(self.demons) % self.gpus_count + env2[self.gpu_set_var] = self.gpus_list[this_gpu_idx] + t = threading.Thread(target=self.worker, kwargs={'env2': env2}) + else: + t = threading.Thread(target=self.worker) t.daemon = True t.start() self.demons.append(t) - def worker(self): + def worker(self, env2=None): import six.moves.queue import six.moves._thread while not self.stoprequest.isSet(): try: args = self.queue.get(timeout=10) tag, exe, arg, opt = args + opt['env'] = env2 try: # check for executable case if isinstance(exe,str): diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/common_run_interface.py index 9ff7390cf5..69291df0d4 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/common_run_interface.py @@ -750,8 +750,8 @@ def __init__(self, me_dir, options, *args, **opts): else: self.ninitial = self.proc_characteristics['ninitial'] - def make_make_all_html_results(self, folder_names = [], jobs=[]): - return sum_html.make_all_html_results(self, folder_names, jobs) + def make_make_all_html_results(self, folder_names = [], jobs=[], get_attr=None): + return sum_html.make_all_html_results(self, folder_names, jobs, get_attr) def write_RunWeb(self, me_dir): @@ -1463,11 +1463,15 @@ def create_plot(self, mode='parton', event_path=None, output=None, tag=None): self.run_name, '%s_pts.dat' % tag) for observable_name, data_path in [('djr',djr_path), ('pt',pt_path)]: - if not self.generate_Pythia8_HwU_plots( + try: + if not self.generate_Pythia8_HwU_plots( PY8_plots_root_path, merging_scale_name, observable_name,data_path): - return False - + return False + except Exception as error: + if os.path.exists(data_path): + logger.info('plot information present in %s' % data_path) + return True if mode == 'Pythia8': plot_files = 
glob.glob(pjoin(PY8_plots_root_path,'*.gnuplot')) if not misc.which('gnuplot'): @@ -1964,12 +1968,16 @@ def do_systematics(self, line): self.cluster.wait(os.path.dirname(output), update_status, update_first=update_status) except Exception: self.cluster.remove() + for i in range(nb_submit): + os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) old_run_mode = self.options['run_mode'] self.options['run_mode'] =0 + out =False try: out = self.do_systematics(line) finally: self.options['run_mode'] = old_run_mode + return out #collect the data all_cross = [] for i in range(nb_submit): @@ -1995,18 +2003,21 @@ def do_systematics(self, line): self.run_card['event_norm'] in ['unity']: all_cross= [cross/nb_event for cross in all_cross] - sys_obj = systematics.call_systematics([input, None] + opts, - log=lambda x: logger.info(str(x)), - result=result_file, - running=False - ) + + sys_obj = systematics.call_systematics([input, None] + opts, + log=lambda x: logger.info(str(x)), + result=result_file, + running=False + ) + sys_obj.print_cross_sections(all_cross, nb_event, result_file) - + #concatenate the output file subprocess.call(['cat']+\ ['./tmp_%s_%s' % (i, os.path.basename(output)) for i in range(nb_submit)], stdout=open(output,'w'), cwd=os.path.dirname(output)) + for i in range(nb_submit): os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) # os.remove('%s/log_sys_%s.txt' % (os.path.dirname(output),i)) @@ -3831,7 +3842,7 @@ def store_scan_result(self): """return the information that need to be kept for the scan summary. Auto-width are automatically added.""" - return {'cross': self.results.current['cross']} + return {'cross': self.results.current['cross'], 'error': self.results.current['error']} def add_error_log_in_html(self, errortype=None): @@ -5135,10 +5146,10 @@ def init_run(self, cards): self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), - 'lhc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), - 'lcc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), @@ -6740,7 +6751,15 @@ def postcmd(self, stop, line): return ending_question - + def help_update(self): + logger.info(""" syntax: update dependent: Change the mass/width of particles which are not free parameter for the model. + update missing: add to the current param_card missing blocks/parameters. + update to_slha1: pass SLHA2 card to SLHA1 convention. (beta) + update to_slha2: pass SLHA1 card to SLHA2 convention. 
(beta) + update to_full [run_card] + update XXX [where XXX correspond to a hidden block of the run_card]: + supported block are %s + """, ', '.join(self.update_block)) def do_update(self, line, timer=0): @@ -6756,6 +6775,8 @@ def do_update(self, line, timer=0): logger.warning('miss an argument (dependent or missing). Please retry') return + args[0] = args[0].lower() + if args[0] == 'dependent': if not self.mother_interface: logger.warning('Failed to update dependent parameter. This might create trouble for external program (like MadSpin/shower/...)') @@ -6805,10 +6826,11 @@ def do_update(self, line, timer=0): self.modified_card.add('run') # delay writting of the run_card logger.info('add optional block %s to the run_card', args[0]) else: - self.help_update() + self.do_help('update') logger.warning('unvalid options for update command. Please retry') + def update_to_full(self, line): """ trigger via update to_full LINE""" diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/extended_cmd.py index 789976beee..c321fd88e5 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/extended_cmd.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/extended_cmd.py @@ -1317,6 +1317,8 @@ def nice_error_handling(self, error, line): debug_file = open(self.debug_output, 'a') traceback.print_exc(file=debug_file) + if __debug__: + traceback.print_exc() if hasattr(error, 'filename'): debug_file.write("Related File: %s\n" % error.filename) # Create a nice error output @@ -1928,7 +1930,8 @@ def do_display(self, line, output=sys.stdout): for i, name in enumerate(split): try: __import__('.'.join(split[:i+1])) - exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1]))) + tmp = {} + exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1])), globals(),tmp) except ImportError: try: var = eval(args[1]) @@ -1939,7 +1942,7 @@ def do_display(self, line, output=sys.stdout): outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) else: - var = eval(args[1]) + var = eval(args[1], globals(), tmp) outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/file_writers.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/file_writers.py index 526756129f..74ba0d195c 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/file_writers.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/file_writers.py @@ -140,10 +140,6 @@ def preprocess_template(self, input_lines, context={}): else: raise self.FileWriterError("%s not string" % repr(input_lines)) - # Setup the contextual environment - for contextual_variable, value in context.items(): - exec('%s=%s'%(str(contextual_variable),repr(value))) - res = [] # The variable below tracks the conditional statements structure if_stack = [] @@ -166,7 +162,7 @@ def preprocess_template(self, input_lines, context={}): # Treat an if statement elif preproc_command.group('command')=='if': try: - if_stack.append(eval(preproc_command.group('body'))==True) + if_stack.append(eval(preproc_command.group('body'), globals(), context)==True) except Exception as e: raise self.FilePreProcessingError('Could not evaluate'+\ "python expression '%s' given the context %s provided."%\ diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/files.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/files.py index 551b71ddb6..3061b007e7 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/files.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/files.py @@ -147,9 +147,14 @@ def 
cp(path1, path2, log=True, error=False): path2 = format_path(path2) try: shutil.copy(path1, path2) + except shutil.Error as why: + logger.debug('no cp since identical: %s', why) + return except IOError as why: import madgraph.various.misc as misc try: + if 'same file' in str(why): + return if os.path.exists(path2): path2 = os.path.join(path2, os.path.split(path1)[1]) misc.copytree(path1, path2) @@ -157,12 +162,10 @@ def cp(path1, path2, log=True, error=False): if error: raise if log: - logger.warning(why) + logger.warning("fail to cp %s %s: %s", path1, path2, why) else: - misc.sprint("fail to cp", why) - except shutil.Error: - # idetical file - pass + misc.sprint("fail to cp", path1, path2, why) + def rm(path, log=True): """removes path, that can be a single element or a list""" diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_cardhtml-pl b/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_cardhtml-pl index 1810c6c082..6e0e06533d 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_cardhtml-pl +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_cardhtml-pl @@ -137,7 +137,7 @@ until($listpos>$#incard){ print PAGE " Model: $model \n"; print PAGE " \n \n
\n"; print PAGE " \n"; - print PAGE "\"\" \n"; + print PAGE "\"\" \n"; print PAGE "
\n"; print PAGE " \n \n \n"; print PAGE " \n"; diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_crossxhtml.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_crossxhtml.py index 681bf9d09b..3114a4350c 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_crossxhtml.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_crossxhtml.py @@ -133,7 +133,7 @@ class AllResults(dict): web = False - _run_entries = ['cross', 'error','nb_event_pythia','run_mode','run_statistics', + _run_entries = ['cross', 'error','axsec','nb_event_pythia','run_mode','run_statistics', 'nb_event','cross_pythia','error_pythia', 'nb_event_pythia8','cross_pythia8','error_pythia8', 'shower_dir'] diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_jpeg-pl b/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_jpeg-pl index 87d03da394..31b7e9fe55 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_jpeg-pl +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_jpeg-pl @@ -1,16 +1,16 @@ #!/usr/bin/perl -w #--------------------------------------------------------------------- -# Run GS to create jpeg files defined as $gs +# Run GS to create PNG files defined as $gs #--------------------------------------------------------------------- -system("/bin/bash -c \"rm -f matrix*.jpg\" "); +system("/bin/bash -c \"rm -f matrix*.png\" "); $imatrix = ""; if (! -e "matrix.ps") {$imatrix = 1;} -$max_jpg = 2; -if ($imatrix eq "") {$max_jpg = 5;} -# add 1 to max_jpg, to get max_jpg pages -$max_jpg += 1; +$max_png = 2; +if ($imatrix eq "") {$max_png = 5;} +# add 1 to max_png, to get max_png pages +$max_png += 1; open(PAGE,"> diagrams.html") || die "Error creating diagrams.html"; print PAGE "\ \n"; print PAGE "\ \n"; @@ -21,22 +21,22 @@ while ( -e "matrix$imatrix.ps"){ open(IN, "< matrix$imatrix.ps") || die "No file matrix$imatrix.ps"; open(OUT, "> matrix-1.ps") || die "Could not open file matrix-1.ps"; while () { - if ($_ =~ m/^%%Page: $max_jpg $max_jpg/) {last;} + if ($_ =~ m/^%%Page: $max_png $max_png/) {last;} else {print OUT $_, "\n";} } close(OUT); close(IN); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=matrix$imatrix\%00d.jpg \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-r150 \-sOutputFile\=matrix$imatrix\%00d.png \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; system "rm -f matrix-1.ps"; -# Determine how many jpg files we have +# Determine how many png files we have $pages=1; - while(-e "matrix$imatrix$pages.jpg"){ + while(-e "matrix$imatrix$pages.png"){ $pages++; }#end of while #reduce it by one - if ($pages > $max_jpg){ + if ($pages > $max_png){ $pages -= 1; } # Find name of process @@ -45,24 +45,24 @@ while ( -e "matrix$imatrix.ps"){ if ($proc =~ /Process: (.+?)(\s\w+=\d+)*$/) { $proc = $1; } print PAGE "

To save bandwidth not all diagrams were converted to jpeg."; + if (-e "matrix$imatrix$max_png.png" ) { + print PAGE "

To save bandwidth not all diagrams were converted to PNG."; print PAGE "

To view all diagrams click on "; print PAGE "\ postscript. \<\/A\> \ \n"; # # Delete files which aren't included in diagrams.html # - system ("/bin/bash -c \"rm -f matrix$max_jpg.jpg\" "); + system ("/bin/bash -c \"rm -f matrix$max_png.png\" "); } # -# Now create jpeg file for card +# Now create PNG file for card # - if (! -e "../../HTML/card.jpg") { + if (! -e "../../HTML/card.png") { system ("/bin/bash -c \"head -352 matrix$imatrix.ps >& junk.ps\" "); open(JUNK,">> junk.ps") || die "Error opening junk.ps"; @@ -72,7 +72,7 @@ while ( -e "matrix$imatrix.ps"){ system ("/bin/bash -c \"cat matrix$imatrix.ps | sed 1,352d >> junk.ps\" "); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=card.jpg \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.jpg ../../HTML/card.jpg > /dev/null\" "; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-sOutputFile\=card.png \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.png ../../HTML/card.png > /dev/null\" "; } if ($imatrix eq "") {$imatrix = 0;} $imatrix = $imatrix + 1; @@ -82,3 +82,4 @@ print PAGE "\n"; print PAGE "\<\/BODY\> \n"; print PAGE "\<\/HTML\> \n"; close(PAGE); + diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_ximprove.py index 415ecc9de0..d5d7fc8faf 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/gen_ximprove.py @@ -30,6 +30,7 @@ import stat import sys import six +import time from six.moves import range from six.moves import zip @@ -304,6 +305,7 @@ def get_helicity(self, to_submit=True, clean=True): logger.debug('(%s) nb_hel: %s zero amp: %s bad_amps_hel: %s/%s', split_file[-1], len(good_hels),len(bad_amps),len(bad_amps_perhel), len(good_hels)*nb_amp ) if len(good_hels) == 1: files.cp(matrix_file, matrix_file.replace('orig','optim')) + files.cp(matrix_file.replace('.f','.o'), matrix_file.replace('orig','optim').replace('.f','.o')) continue # avoid optimization if onlye one helicity gauge = self.cmd.proc_characteristics['gauge'] @@ -1059,6 +1061,7 @@ def __init__(self, cmd, opt=None): # parameter for the gridpack run self.nreq = 2000 self.iseed = 4321 + self.maxevts = 2500 # placeholder for information self.results = 0 #updated in launch/update_html @@ -1200,6 +1203,10 @@ def reset_multijob(self): def write_multijob(self, Channel, nb_split): """ """ if nb_split <=1: + try: + os.remove(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat')) + except OSError: + pass return f = open(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat'), 'w') f.write('%i\n' % nb_split) @@ -1828,17 +1835,17 @@ class gen_ximprove_gridpack(gen_ximprove_v4): max_request_event = 1e12 # split jobs if a channel if it needs more than that max_event_in_iter = 4000 min_event_in_iter = 500 - combining_job = sys.maxsize gen_events_security = 1.00 - def __new__(cls, *args, **opts): + def __new__(cls, cmd, opts): cls.force_class = 'gridpack' - return super(gen_ximprove_gridpack, cls).__new__(cls, *args, **opts) + return super(gen_ximprove_gridpack, cls).__new__(cls, cmd, opts) - def __init__(self, *args, **opts): + def __init__(self, cmd, opts): self.ngran = -1 + self.nprocs = 1 self.gscalefact = {} self.readonly = False if 'ngran' in opts: @@ -1846,9 +1853,18 @@ def __init__(self, *args, **opts): # del opts['ngran'] if 'readonly' in opts: self.readonly = opts['readonly'] - super(gen_ximprove_gridpack,self).__init__(*args, **opts) + if 'nprocs' in 
opts: + self.nprocs = int(opts['nprocs']) + if 'maxevts' in opts and self.nprocs > 1: + self.max_request_event = int(opts['maxevts']) + super(gen_ximprove_gridpack,self).__init__(cmd, opts) if self.ngran == -1: self.ngran = 1 + + if self.nprocs > 1: + self.combining_job = 0 + else: + self.combining_job = sys.maxsize def find_job_for_event(self): """return the list of channel that need to be improved""" @@ -1876,8 +1892,8 @@ def find_job_for_event(self): continue # no event to generate events self.gscalefact[tag] = max(1, 1/(goal_lum * C.get('axsec')/ self.ngran)) #need to generate events - logger.debug('request events for ', C.get('name'), 'cross=', - C.get('axsec'), 'needed events = ', goal_lum * C.get('axsec')) + logger.debug('request events for %s cross=%d needed events = %d', + C.get('name'), C.get('axsec'), goal_lum * C.get('axsec')) to_refine.append(C) logger.info('need to improve %s channels' % len(to_refine)) @@ -1897,8 +1913,13 @@ def get_job_for_event(self): for C in to_refine: #1. Compute the number of points are needed to reach target needed_event = max(goal_lum*C.get('axsec'), self.ngran) - nb_split = 1 - + nb_split = int(max(1,((needed_event-1)// self.max_request_event) +1)) + if not self.split_channels: + nb_split = 1 + if nb_split > self.max_splitting: + nb_split = self.max_splitting + nb_split=max(1, nb_split) + #2. estimate how many points we need in each iteration if C.get('nunwgt') > 0: nevents = needed_event / nb_split * (C.get('nevents') / C.get('nunwgt')) @@ -1908,13 +1929,16 @@ def get_job_for_event(self): nevents = self.max_event_in_iter if nevents < self.min_event_in_iter: + nb_split = int(nb_split * nevents / self.min_event_in_iter) + 1 # sr dangerous? nevents = self.min_event_in_iter # # forbid too low/too large value nevents = max(self.min_event_in_iter, min(self.max_event_in_iter, nevents)) logger.debug("%s : need %s event. Need %s split job of %s points", C.name, needed_event, nb_split, nevents) - + # write the multi-job information + self.write_multijob(C, nb_split) + #create the info dict assume no splitting for the default info = {'name': self.cmd.results.current['run_name'], 'script_name': 'unknown', @@ -1925,7 +1949,7 @@ def get_job_for_event(self): 'nevents': nevents, #int(nevents*self.gen_events_security)+1, 'maxiter': self.max_iter, 'miniter': self.min_iter, - 'precision': -1*int(needed_event)/C.get('axsec'), + 'precision': -goal_lum/nb_split, # -1*int(needed_event)/C.get('axsec'), 'requested_event': needed_event, 'nhel': self.run_card['nhel'], 'channel': C.name.replace('G',''), @@ -1938,27 +1962,59 @@ def get_job_for_event(self): basedir = pjoin(os.path.dirname(__file__), '..','..','SubProcesses', info['P_dir'], info['directory']) info['base_directory'] = basedir - jobs.append(info) - + if nb_split == 1: + jobs.append(info) + else: + for i in range(nb_split): + new_info = dict(info) + new_info['offset'] = i+1 + new_info['directory'] += self.alphabet[i % 26] + str((i+1)//26) + new_info['base_directory'] = info['directory'] + jobs.append(new_info) write_dir = '.' 
if self.readonly else None self.create_ajob(pjoin(self.me_dir, 'SubProcesses', 'refine.sh'), jobs, write_dir) + if self.nprocs > 1: + nprocs_cluster = cluster.MultiCore(nb_core=self.nprocs) + gridpack_start = time.time() + def gridpack_wait_monitoring(Idle, Running, Done): + if Idle+Running+Done == 0: + return + logger.info("Gridpack event generation: %s Idle, %s Running, %s Done [%s]" + % (Idle, Running, Done, misc.format_time(time.time()-gridpack_start))) + done = [] for j in jobs: - if j['P_dir'] in done: - continue - done.append(j['P_dir']) + if self.nprocs == 1: + if j['P_dir'] in done: + continue + done.append(j['P_dir']) + # Give a little status. Sometimes these jobs run very long, and having hours without any + # console output can be a bit frightening and make users think we are looping. + if len(done)%5==0: + logger.info(f"Working on job {len(done)} of {len(jobs)}") + # set the working directory path. pwd = pjoin(os.getcwd(),j['P_dir']) if self.readonly else pjoin(self.me_dir, 'SubProcesses', j['P_dir']) - exe = pjoin(pwd, 'ajob1') + exe = pjoin(pwd, j['script_name']) st = os.stat(exe) os.chmod(exe, st.st_mode | stat.S_IEXEC) # run the code\ - cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + if self.nprocs == 1: + cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + else: + nprocs_cluster.cluster_submit(exe, cwd=pwd, packet_member=j['packet']) write_dir = '.' if self.readonly else pjoin(self.me_dir, 'SubProcesses') + if self.nprocs > 1: + nprocs_cluster.wait(self.me_dir, gridpack_wait_monitoring) + + if self.readonly: + combine_runs.CombineRuns(write_dir) + else: + combine_runs.CombineRuns(self.me_dir) self.check_events(goal_lum, to_refine, jobs, write_dir) def check_events(self, goal_lum, to_refine, jobs, Sdir): diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/hel_recycle.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/hel_recycle.py index 1471de4bcb..978ba6575e 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/hel_recycle.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/hel_recycle.py @@ -550,7 +550,7 @@ def get_jamp_lines(self, line): def get_amp2_lines(self, line): if line.startswith(' DO I = 1, NCOLOR'): self.in_amp2 = False - elif not line.isspace(): + elif not line.isspace() and 'DENOM' not in line: self.template_dict['amp2_lines'] += f'{line[0:6]} {self.add_indices(line[6:])}' def prepare_bools(self): diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/histograms.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/histograms.py index 51ae2914fc..6fbdd98100 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/histograms.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/histograms.py @@ -1149,11 +1149,8 @@ def parse_one_histo_from_stream(self, stream, all_weight_header, boundaries = [0.0,0.0] for j, weight in \ enumerate(HwU.histo_bin_weight_re.finditer(line_bin)): - if (j == len(weight_header)): - continue - if j == len(all_weight_header): - raise HwU.ParseError("There is more bin weights"+\ - " specified than expected (%i)"%len(weight_header)) + #if (j == len(weight_header)): + # continue if selected_central_weight == all_weight_header[j]: bin_weights['central'] = float(weight.group('weight')) if all_weight_header[j] == 'boundary_xmin': @@ -1858,6 +1855,8 @@ def parse_histos_from_PY8_XML_stream(self, stream, run_id=None, # If merging cut is negative, then pick only the one of the central scale # If not specified, then take them all but use the PDF and scale weight # of the central merging_scale for the variation. 
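The gridpack refinement above now splits a channel into several jobs of at most 'maxevts' events each and names the split directories with a letter-plus-counter suffix before submitting them to a MultiCore pool. A small standalone sketch of that bookkeeping follows; `alphabet` is assumed to be the lowercase alphabet as in gen_ximprove, and the names are illustrative.

```python
import string

alphabet = string.ascii_lowercase  # assumption: matches gen_ximprove's alphabet

def split_jobs(needed_event, max_request_event, max_splitting=100):
    """Ceil-divide the requested events into jobs, as in get_job_for_event above."""
    nb_split = int(max(1, (needed_event - 1) // max_request_event + 1))
    return min(nb_split, max_splitting)

def split_dir_names(directory, nb_split):
    """Suffix each split with alphabet[i % 26] + str((i + 1) // 26), as above."""
    return [directory + alphabet[i % 26] + str((i + 1) // 26)
            for i in range(nb_split)]

# e.g. 9000 requested events with at most 2500 per job -> 4 split jobs
assert split_jobs(9000, 2500) == 4
print(split_dir_names('G100', 4))  # ['G100a0', 'G100b0', 'G100c0', 'G100d0']
```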
+ if not all_weights: + raise MadGraph5Error('No weights were found in the HwU XML source.') if merging_scale is None or merging_scale < 0.0: merging_scale_chosen = all_weights[2]['MERGING'] else: @@ -2480,14 +2479,14 @@ def get_main_central_plot_lines(HwU_name, block_position, color_index, # return [template_no_stat%rep_dic]+\ # ([template%rep_dic] if show_mc_uncertainties else []) - # The use of sqrt(-1) is just a trick to prevent the line to display + # The use of 1/0 is just a trick to prevent the line to display res = [] - rep_dic['data'] = '($3 < 0 ? sqrt(-1) : $3)' + rep_dic['data'] = '($3 < 0 ? 1/0 : $3)' res.append(template_no_stat%rep_dic) rep_dic['title'] = " title ''" if show_mc_uncertainties: res.append(template%rep_dic) - rep_dic['data'] = '($3 >= 0 ? sqrt(-1) : abs($3))' + rep_dic['data'] = '($3 >= 0 ? 1/0 : abs($3))' rep_dic['ls'] = ' ls %d'%(100+color_index) res.append(template_no_stat%rep_dic) if show_mc_uncertainties: @@ -2739,13 +2738,13 @@ def ratio_no_correlations(wgtsA, wgtsB): """#-- rendering subhistograms '%(subhistogram_type)s' %(unset label)s %(set_format_y)s +%(set_yscale)s set yrange [%(ymin).4e:%(ymax).4e] set origin %(origin_x).4e, %(origin_y).4e set size %(size_x).4e, %(size_y).4e set mytics %(mytics)d %(set_ytics)s %(set_format_x)s -%(set_yscale)s %(set_ylabel)s %(set_histo_label)s plot \\""" @@ -2878,7 +2877,7 @@ def ratio_no_correlations(wgtsA, wgtsB): # We decide to show uncertainties in the main plot only if they # are part of a monocolor band. Otherwise, they will only be - # shown in the first subplot. Notice that plotting 'sqrt(-1)' + # shown in the first subplot. Notice that plotting '1/0' # is just a trick so as to have only the key printed with no # line @@ -2890,7 +2889,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, scale variation'%title, band='scale' in use_band) else: uncertainty_plot_lines[-1]['scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] # And now PDF_variation if available if not PDF_var_pos is None and len(PDF_var_pos)>0: if 'pdf' in use_band: @@ -2899,7 +2898,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, PDF variation'%title, band='pdf' in use_band) else: uncertainty_plot_lines[-1]['pdf'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] # And now merging variation if available if not merging_var_pos is None and len(merging_var_pos)>0: if 'merging_scale' in use_band: @@ -2908,7 +2907,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, merging scale variation'%title, band='merging_scale' in use_band) else: uncertainty_plot_lines[-1]['merging_scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] # And now alpsfact variation if available if not alpsfact_var_pos is None and len(alpsfact_var_pos)>0: if 'alpsfact' in use_band: @@ -2917,7 +2916,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, alpsfact variation'%title, band='alpsfact' in use_band) else: uncertainty_plot_lines[-1]['alpsfact'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] # plot_lines.append( # "'%s' index %d using (($1+$2)/2):3 ls %d title '%s'"\ diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py 
b/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py index 0924927785..262d39a736 100644 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Aug 2023) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. import logging import os @@ -33,7 +33,7 @@ def compile(self, *args, **opts): if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') common_run_interface.CommonRunCmd.update_make_opts_full(path, - {'FPTYPE': self.run_card['floating_type'] }) + {'override FPTYPE': self.run_card['floating_type'] }) misc.sprint('FPTYPE checked') cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): @@ -76,7 +76,7 @@ def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + common_run_interface.CommonRunCmd.update_make_opts_full({'override FPTYPE': new_value}) else: raise Exception Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') @@ -133,7 +133,8 @@ def default_setup(self): super().default_setup() # change default value: self['cudacpp_backend'] = 'cuda' - self['vector_size'] = 16384 # already setup in default class (just change value) + self['vector_size'] = 32 # ZW: default to 32, might want to change to 64 to utilise AMD GPUs better as well # 16384 # already setup in default class (just change value) + self['nb_warp'] = 512 # number of warps per kernel call, for now setting to 16 384 / vector_size MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/lhe_parser.py index f6e47956cd..81d17f7cb1 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/lhe_parser.py @@ -1035,12 +1035,12 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): from_init = True if not from_init: - if group in grouped_cross: - grouped_cross[group] += self.allcross[i] - grouped_error[group] += self.error[i]**2 + if int(group) in grouped_cross: + grouped_cross[int(group)] += self.allcross[i] + grouped_error[int(group)] += self.error[i]**2 else: - grouped_cross[group] = self.allcross[i] - grouped_error[group] = self.error[i]**2 + grouped_cross[int(group)] = self.allcross[i] + grouped_error[int(group)] = self.error[i]**2 else: ban = banner_mod.Banner(ff.banner) for line in ban['init'].split('\n'): @@ -1048,11 +1048,11 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): if len(splitline)==4: cross, error, _, group = splitline if int(group) in grouped_cross: - grouped_cross[group] += float(cross) - grouped_error[group] += float(error)**2 + grouped_cross[int(group)] += float(cross) + grouped_error[int(group)] += float(error)**2 else: - grouped_cross[group] = float(cross) - grouped_error[group] = float(error)**2 + 
grouped_cross[int(group)] = float(cross) + grouped_error[int(group)] = float(error)**2 nb_group = len(grouped_cross) # compute the information for the first line @@ -1086,6 +1086,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): self.seek(0) if init_information["idbmup2"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup2"] = event[1].pdg self.seek(0) @@ -2166,10 +2168,13 @@ def check(self): abspz += abs(particle.pz) # check mass fourmass = FourMomentum(particle).mass - - if particle.mass and (abs(particle.mass) - fourmass)/ abs(particle.mass) > threshold: - raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) - + if particle.mass: + expected = (particle.E - math.sqrt(particle.E**2 -particle.mass**2))/particle.E + if expected > 1e-8: + mass_threshold = particle.E**2 - (particle.E-threshold)**2 + if (abs(particle.mass) - fourmass)/ mass_threshold > 5: + raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) + if E/absE > threshold: logger.critical(self) @@ -2953,8 +2958,8 @@ def pt(self): @property def pseudorapidity(self): - norm = math.sqrt(self.px**2 + self.py**2+self.pz**2) - return 0.5* math.log((norm - self.pz) / (norm + self.pz)) + norm = math.sqrt(self.px**2 + self.py**2 + self.pz**2) + return 0.5* math.log((norm + self.pz) / (norm - self.pz)) @property def rapidity(self): diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/madevent_interface.py index 85e5bcf5e3..4d5597c722 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/madevent_interface.py @@ -1171,10 +1171,10 @@ def check_survey(self, args, cmd='survey'): for opt,value in self._survey_options.items(): if arg.startswith('--%s=' % opt): exec('self.opts[\'%s\'] = %s(arg.split(\'=\')[-1])' % \ - (opt, value[0])) + (opt, value[0]), globals(), {'self':self, 'arg':arg}) arg = "" if arg != "": raise Exception - except Exception: + except Exception as error: self.help_survey() raise self.InvalidCmd('invalid %s argument'% arg) @@ -2827,10 +2827,10 @@ def print_results_in_shell(self, data): logger.info(" Nb of events after matching/merging : %d" % int(data['nb_event_pythia'])) if self.run_card['use_syst'] in self.true and \ (int(self.run_card['ickkw'])==1 or self.run_card['ktdurham']>0.0 - or self.run_card['ptlund']>0.0): + or self.run_card['ptlund']>0.0) and data['cross_pythia'] == -1: logger.info(" Notice that because Systematics computation is turned on, the merging did not veto events but modified their weights instead.\n"+\ " The resulting hepmc/stdhep file should therefore be use with those weights.") - else: + elif data['cross_pythia'] == -1: logger.info(" Nb of events after merging : %s" % data['nb_event_pythia']) logger.info(" " ) @@ -3055,6 +3055,7 @@ def do_multi_run(self, line): crossoversig = 0 inv_sq_err = 0 nb_event = 0 + madspin = False for i in range(nb_run): self.nb_refine = 0 self.exec_cmd('generate_events %s_%s -f' % (main_name, i), postcmd=False) @@ -3067,6 +3068,8 @@ def do_multi_run(self, line): inv_sq_err+=1.0/error**2 self.results[main_name][-1]['cross'] = crossoversig/inv_sq_err self.results[main_name][-1]['error'] = math.sqrt(1.0/inv_sq_err) + if 'decayed' in self.run_name: + madspin = True 
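The pseudorapidity fix above flips a sign: the correct definition is eta = 0.5*ln((|p|+pz)/(|p|-pz)), which coincides with -ln tan(theta/2), while the old ordering returned -eta. A quick standalone check of the corrected formula (plain Python, independent of lhe_parser):

```python
import math

def pseudorapidity(px, py, pz):
    """eta = 0.5*log((|p|+pz)/(|p|-pz)), as in the corrected property above."""
    norm = math.sqrt(px**2 + py**2 + pz**2)
    return 0.5 * math.log((norm + pz) / (norm - pz))

def eta_from_theta(px, py, pz):
    """Equivalent polar-angle form: eta = -log(tan(theta/2))."""
    theta = math.atan2(math.hypot(px, py), pz)
    return -math.log(math.tan(theta / 2))

p = (1.0, 2.0, 3.0)
assert abs(pseudorapidity(*p) - eta_from_theta(*p)) < 1e-9
# a particle going forward (pz > 0) must have positive eta, which the
# old (norm - pz)/(norm + pz) ordering violated
assert pseudorapidity(*p) > 0
```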
self.results.def_current(main_name) self.run_name = main_name self.update_status("Merging LHE files", level='parton') @@ -3074,9 +3077,12 @@ def do_multi_run(self, line): os.mkdir(pjoin(self.me_dir,'Events', self.run_name)) except Exception: pass - os.system('%(bin)s/merge.pl %(event)s/%(name)s_*/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' + + os.system('%(bin)s/merge.pl %(event)s/%(name)s_*%(madspin)s/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' % {'bin': self.dirbin, 'event': pjoin(self.me_dir,'Events'), - 'name': self.run_name}) + 'name': self.run_name, + 'madspin': '_decayed_*' if madspin else '' + }) eradir = self.options['exrootanalysis_path'] if eradir and misc.is_executable(pjoin(eradir,'ExRootLHEFConverter')): @@ -3656,9 +3662,11 @@ def do_refine(self, line): else: self.refine_mode = "new" - cross, error = self.make_make_all_html_results() + cross, error, across = self.make_make_all_html_results(get_attr=('xsec','xerru','axsec')) + self.results.add_detail('cross', cross) self.results.add_detail('error', error) + self.results.add_detail('axsec', across) self.results.add_detail('run_statistics', dict(self.results.get_detail('run_statistics'))) @@ -3757,6 +3765,8 @@ def split(a, n): k, m = divmod(len(a), n) return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + Gdirs = self.remove_empty_events(Gdirs) + partials_info = [] if len(Gdirs) >= max_G: start_unweight= time.perf_counter() @@ -3786,7 +3796,7 @@ def split(a, n): for i, local_G in enumerate(split(Gdirs, nb_chunk)): line = [pjoin(self.me_dir, "Events", self.run_name, "partials%d.lhe.gz" % i)] line.append(pjoin(self.me_dir, 'Events', self.run_name, '%s_%s_banner.txt' % (self.run_name, tag))) - line.append(str(self.results.current['cross'])) + line.append(str(self.results.current.get('axsec'))) line += local_G partials_info.append(self.do_combine_events_partial(' '.join(line), preprocess_only=True)) mycluster.submit(sys.executable, @@ -4223,7 +4233,7 @@ def mg5amc_py8_interface_consistency_warning(options): return None - def setup_Pythia8RunAndCard(self, PY8_Card, run_type): + def setup_Pythia8RunAndCard(self, PY8_Card, run_type, use_mg5amc_py8_interface): """ Setup the Pythia8 Run environment and card. In particular all the process and run specific parameters of the card are automatically set here. This function returns the path where HEPMC events will be output, if any.""" @@ -4338,10 +4348,10 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.systemSet('Beams:setProductionScalesFromLHEF',True) # Automatically set qWeed to xqcut if not defined by the user. - if PY8_Card['SysCalc:qWeed']==-1.0: + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qWeed']==-1.0: PY8_Card.MadGraphSet('SysCalc:qWeed',self.run_card['xqcut'], force=True) - if PY8_Card['SysCalc:qCutList']=='auto': + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qCutList']=='auto': if self.run_card['use_syst']: if self.run_card['sys_matchscale']=='auto': qcut = PY8_Card['JetMatching:qCut'] @@ -4368,7 +4378,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): # Specific MLM settings # PY8 should not implement the MLM veto since the driver should do it # if merging scale variation is turned on - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. 
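The partial-combination logic above chunks the list of G directories with the split(a, n) helper visible in the hunk, which distributes len(a) items into n contiguous, nearly equal slices. A standalone illustration of how it balances the chunks:

```python
def split(a, n):
    """Yield n contiguous chunks of a whose lengths differ by at most one."""
    k, m = divmod(len(a), n)
    return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))

gdirs = ['G%d' % i for i in range(10)]
chunks = [list(c) for c in split(gdirs, 3)]
# 10 = 4 + 3 + 3: the first len(a) % n chunks get one extra element
assert [len(c) for c in chunks] == [4, 3, 3]
```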
PY8_Card.MadGraphSet('JetMatching:doVeto',False) @@ -4444,7 +4454,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.MadGraphSet('SpaceShower:pTmaxMatch',1) PY8_Card.MadGraphSet('SpaceShower:rapidityOrder',False) # PY8 should not implement the CKKW veto since the driver should do it. - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('Merging:applyVeto',False) @@ -4516,6 +4526,12 @@ def do_pythia8(self, line): else: no_default = False + if '--old_interface' in args: + use_mg5amc_py8_interface = True + args.remove('--old_interface') + else: + use_mg5amc_py8_interface = False + if not self.run_name: self.check_pythia8(args) self.configure_directory(html_opening =False) @@ -4545,20 +4561,27 @@ def do_pythia8(self, line): #"Please use 'event_norm = average' in the run_card to avoid this problem.") - - if not self.options['mg5amc_py8_interface_path'] or not \ - os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface')): - raise self.InvalidCmd( -"""The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. -Please install this tool with the following MG5_aMC command: - MG5_aMC> install mg5amc_py8_interface_path""") + if use_mg5amc_py8_interface: + if not self.options['mg5amc_py8_interface_path'] or not \ + os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface')): + raise self.InvalidCmd( + """The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. + Please install this tool with the following MG5_aMC command: + MG5_aMC> install mg5amc_py8_interface_path""") + else: + pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface') + warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) + if warnings: + logger.warning(warnings) else: - pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface') - warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) - if warnings: - logger.warning(warnings) + pythia_main = pjoin(self.options['pythia8_path'], 'share', 'Pythia8', 'examples', 'main164') + if not os.path.exists(pythia_main): + pythia_main = pjoin(self.options['pythia8_path'], 'examples', 'main164') + if not os.path.exists(pythia_main): + logger.warning('main164 not found (or not compiled). Will try the old interface instead.') + return self.do_pythia8(line + ' --old_interface') self.results.add_detail('run_mode', 'madevent') @@ -4583,14 +4606,19 @@ def do_pythia8(self, line): run_type = 'CKKW' # Edit the card and run environment according to the run specification - HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type) + HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type, use_mg5amc_py8_interface=use_mg5amc_py8_interface) + + if not use_mg5amc_py8_interface and (self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1)): + PY8_Card['Main:numberOfEvents']= self.run_card['nevents'] + # Now write the card.
pythia_cmd_card = pjoin(self.me_dir, 'Events', self.run_name , '%s_pythia8.cmd' % tag) cmd_card = StringIO.StringIO() PY8_Card.write(cmd_card,pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Now setup the preamble to make sure that everything will use the locally # installed tools (if present) even if the user did not add it to its @@ -4632,7 +4660,7 @@ def do_pythia8(self, line): " command '/usr/bin/env %s' exists and returns a valid path."%shell) exe_cmd = "#!%s\n%s"%(shell_exe,' '.join( - [preamble+pythia_main, + [preamble+pythia_main, '' if use_mg5amc_py8_interface else '-c', os.path.basename(pythia_cmd_card)])) wrapper.write(exe_cmd) @@ -4699,6 +4727,7 @@ def do_pythia8(self, line): n_cores = max(min(min_n_core,n_cores),1) if self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + # No need for parallelization anymore self.cluster = None logger.info('Follow Pythia8 shower by running the '+ @@ -4744,20 +4773,22 @@ def do_pythia8(self, line): ParallelPY8Card.subruns[0].systemSet('Beams:LHEF','events.lhe.gz') ParallelPY8Card.write(pjoin(parallelization_dir,'PY8Card.dat'), pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Write the wrapper wrapper_path = pjoin(parallelization_dir,'run_PY8.sh') wrapper = open(wrapper_path,'w') if self.options['cluster_temp_path'] is None: exe_cmd = \ -"""#!%s -./%s PY8Card.dat >& PY8_log.txt -""" +"""#!%%s +./%%s %s PY8Card.dat >& PY8_log.txt +""" % ('' if use_mg5amc_py8_interface else '-c') + else: exe_cmd = \ -"""#!%s +"""#!%%s ln -s ./events_$1.lhe.gz ./events.lhe.gz -./%s PY8Card_$1.dat >& PY8_log.txt +./%%s %s PY8Card_$1.dat >& PY8_log.txt mkdir split_$1 if [ -f ./events.hepmc ]; then @@ -4776,7 +4807,7 @@ def do_pythia8(self, line): mv ./PY8_log.txt ./split_$1/ fi tar -czf split_$1.tar.gz split_$1 -""" +""" % ('' if use_mg5amc_py8_interface else '-c') exe_cmd = exe_cmd%(shell_exe,os.path.basename(pythia_main)) wrapper.write(exe_cmd) wrapper.close() @@ -4812,19 +4843,27 @@ def do_pythia8(self, line): pjoin(parallelization_dir,split_files[-1])) logger.info('Submitting Pythia8 jobs...') + for i, split_file in enumerate(split_files): # We must write a PY8Card tailored for each split so as to correct the normalization # HEPMCoutput:scaling of each weight since the lhe showered will not longer contain the # same original number of events - split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat')) + split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat'), setter='user') + assert split_PY8_Card['JetMatching:nJetMax'] == PY8_Card['JetMatching:nJetMax'] + + + # Make sure to sure the number of split_events determined during the splitting. - split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i]) + split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i], force=True) + assert split_PY8_Card['Main:numberOfEvents'] == partition_for_PY8[i] split_PY8_Card.systemSet('HEPMCoutput:scaling',split_PY8_Card['HEPMCoutput:scaling']* - (float(partition_for_PY8[i]))) + (float(partition_for_PY8[i])), force=True) # Add_missing set to False so as to be sure not to add any additional parameter w.r.t # the ones in the original PY8 param_card copied. 
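Each Pythia8 split job above gets its own card: its Main:numberOfEvents matches its share of the events, and its HEPMCoutput:scaling is multiplied by that share so that the recombined HepMC weights stay normalised. A schematic of that renormalisation; the partition sizes and the base scaling below are illustrative values, not read from a real card:

```python
def per_split_scaling(base_scaling, partition):
    """Return the (numberOfEvents, HEPMCoutput:scaling) pair per split,
    mirroring the systemSet calls above: scaling is multiplied by the
    number of events handed to that split."""
    return [(n_i, base_scaling * float(n_i)) for n_i in partition]

partition_for_PY8 = [2500, 2500, 2000]   # events handed to each split job
base = 1.0 / sum(partition_for_PY8)      # assumption: original per-event scaling
for i, (nev, scale) in enumerate(per_split_scaling(base, partition_for_PY8)):
    print('PY8Card_%d.dat: Main:numberOfEvents=%d  HEPMCoutput:scaling=%g'
          % (i, nev, scale))
```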
split_PY8_Card.write(pjoin(parallelization_dir,'PY8Card_%d.dat'%i), - pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False) + pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False, + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) in_files = [pjoin(parallelization_dir,os.path.basename(pythia_main)), pjoin(parallelization_dir,'PY8Card_%d.dat'%i), pjoin(parallelization_dir,split_file)] @@ -5073,7 +5112,7 @@ def wait_monitoring(Idle, Running, Done): # works both for fixed number of generated events and fixed accepted events self.results.add_detail('error_pythia', error_m) - if self.run_card['use_syst']: + if self.run_card['use_syst'] and use_mg5amc_py8_interface: self.results.add_detail('cross_pythia', -1) self.results.add_detail('error_pythia', 0) @@ -6132,7 +6171,102 @@ def get_Gdir(self, Pdir=None, symfact=None): mfactors[pjoin(P, "G%s" % tag)] = mfactor self.Gdirs = (Gdirs, mfactors) return self.get_Gdir(Pdir, symfact=symfact) + + ############################################################################ + def remove_empty_events(self, Gdir): + """return Gdir stripped of the entries providing empty events.lhe files.""" + + reasons = collections.defaultdict(list) + Gdirs = Gdir[:] + for G in Gdirs[:]: + try: + size = os.path.getsize(pjoin(G, 'events.lhe')) + except Exception as error: + size = 0 + if size <10: + Gdirs.remove(G) + try: + log = misc.BackRead(pjoin(G, 'log.txt')) + except Exception as error: + log = misc.BackRead(pjoin(G, 'run1_app.log')) + found = -1 + for line in log: + if 'Deleting file events.lhe' in line: + found = 0 + elif "Impossible BW configuration" in line: + reasons['bwconfig'].append(G) + break + elif found < -150: + reasons['not found'].append(G) + Gdirs.append(G) + break + elif found < 0: + found -= 1 + elif 'Loosen cuts or increase max_events' in line: + reasons['cuts'].append(G) + break + elif 'all returned zero' in line: + reasons['zero'].append(G) + break + elif found > 5: + reasons['unknown'].append(G) + break + else: + found += 1 + + if len(reasons): + logger.debug('Reasons for empty events.lhe:') + if len(reasons['unknown']): + logger.debug(' - unknown: %s' % len(reasons['unknown'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['unknown'][:10]])) + if len(reasons['not found']): + logger.debug(' - not found in log: %s' % len(reasons['not found'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['not found'][:10]])) + if len(reasons['zero']): + logger.debug(' - zero amplitudes: %s' % len(reasons['zero'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['zero'][:10]])) + if len(reasons['bwconfig']): + critical_bwconfig = set() + for G in reasons['bwconfig']: + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_bwconfig.add(os.sep.join(base.rsplit(os.sep)[-2:])) + for G in critical_bwconfig: + logger.warning('Gdirectory %s has no events.lhe file. (no possible BW configuration found)' % G) + + logger.debug(' - impossible BW configuration: %s' % len(reasons['bwconfig'])) + logger.debug(' - channel with no possible BW configuration: %s' % len(critical_bwconfig)) + + if len(reasons['cuts']): + critical_nb_cuts = collections.defaultdict(int) + for G in reasons['cuts']: + if '.'
in os.path.basename(G): + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_nb_cuts[os.sep.join(base.rsplit(os.sep)[-2:])] += 1 + else: + critical_nb_cuts[''] += 1 + logger.warning('Gdirectory %s has no events.lhe file. (no points passed cuts found)' % G) + for G, nb in critical_nb_cuts.items(): + if not G: + continue + else: + logger.warning('%s channel %s.XXX has no events.lhe file. (no points passed cuts). No %s with events detected' % (nb, G, G)) + logger.debug(' - no points passed cuts: %s' % len(reasons['cuts'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['cuts'][:10]])) + logger.debug(' - without any BW handling (critical): %s' % critical_nb_cuts['']) + logger.debug(' - with BW but all zero (critical): %s' % sum([nb for v, nb in critical_nb_cuts.items() if v!=''], 0)) + #logger.debug(' - cuts (with BW conflict where other channel contributes): %s' % (len(reasons['cuts'])- critical_nb_cuts)) + + + return Gdirs + + ############################################################################ def set_run_name(self, name, tag=None, level='parton', reload_card=False, allow_new_tag=True): @@ -6749,7 +6883,7 @@ def get_subP_ids(path): class GridPackCmd(MadEventCmd): """The command for the gridpack --Those are not supposed to be used interactively--""" - def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **stdin): + def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, nprocs=1, maxevts=2500, *completekey, **stdin): """Initialize the command and directly run""" # Initialize properly @@ -6759,6 +6893,8 @@ def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **s self.random = seed self.random_orig = self.random self.granularity = gran + self.nprocs = nprocs + self.maxevts = maxevts self.options['automatic_html_opening'] = False #write the grid_card.dat on disk @@ -6874,7 +7010,7 @@ def launch(self, nb_event, seed): #misc.call([pjoin(self.me_dir,'bin','refine4grid'), # str(nb_event), '0', 'Madevent','1','GridRun_%s' % seed], # cwd=self.me_dir) - self.refine4grid(nb_event) + self.gridpack_cross = self.refine4grid(nb_event) # 3) Combine the events/pythia/... self.exec_cmd('combine_events') @@ -6902,6 +7038,8 @@ def refine4grid(self, nb_event): precision = nb_event + across= self.make_make_all_html_results(get_attr='axsec') + self.opts = dict([(key,value[1]) for (key,value) in \ self._survey_options.items()]) @@ -6915,8 +7053,9 @@ def refine4grid(self, nb_event): self.update_status('Refine results to %s' % precision, level=None) logger.info("Using random number seed offset = %s" % self.random) - refine_opt = {'err_goal': nb_event, 'split_channels': False, - 'ngran':self.granularity, 'readonly': self.readonly} + refine_opt = {'err_goal': nb_event, 'split_channels': True, + 'ngran':self.granularity, 'readonly': self.readonly, + 'nprocs': self.nprocs, 'maxevts': self.maxevts} x_improve = gen_ximprove.gen_ximprove_gridpack(self, refine_opt) x_improve.launch() # create the ajob for the refinement and run those! self.gscalefact = x_improve.gscalefact #store jacobian associated to the gridpack @@ -6926,7 +7065,7 @@ def refine4grid(self, nb_event): #print 'run combine!!!'
#combine_runs.CombineRuns(self.me_dir) - return + return across #update html output Presults = sum_html.collect_result(self) cross, error = Presults.xsec, Presults.xerru @@ -7051,10 +7190,13 @@ def do_combine_events(self, line): sum_axsec += result.get('axsec')*gscalefact[Gdir] if len(AllEvent) >= 80: #perform a partial unweighting - if self.results.current['cross'] == 0 and self.run_card['gridpack']: - nb_event= self.nb_event + if not self.results.current.get('axsec'): + if self.run_card['gridpack'] and self.gridpack_cross: + nb_event = min(abs(1.05*self.nb_event*sum_axsec/self.gridpack_cross),self.nb_event) + else: + nb_event= self.nb_event else: - nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current['cross']),self.run_card['nevents']) + nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current.get('axsec')),self.run_card['nevents'], self.nb_event, self.gridpack_cross, sum_axsec) AllEvent.unweight(pjoin(outdir, self.run_name, "partials%s.lhe.gz" % partials), get_wgt, log_level=5, trunc_error=1e-2, event_target=nb_event) AllEvent = lhe_parser.MultiEventFile() @@ -7068,6 +7210,7 @@ def do_combine_events(self, line): for data in partials_info: AllEvent.add(*data) + sum_xsec += data[1] if not hasattr(self,'proc_characteristic'): self.proc_characteristic = self.get_characteristics() diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/restore_data b/epochX/cudacpp/gg_tt01g.mad/bin/internal/restore_data index 6205bb9567..407ed7aa91 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/restore_data +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/restore_data @@ -48,8 +48,17 @@ for i in `cat subproc.mg` ; do cd ../ done +# check if we are on a Mac, otherwise assume Linux +if [[ "$OSTYPE" == "darwin"* ]]; then + # no nproc on Mac, so use sysctl instead + # use -S1024 because there is a limit on the length of the command + xargs_opts="-P $(sysctl -n hw.ncpu) -S1024" +else + xargs_opts="-P $(nproc --all)" +fi + find . -mindepth 2 -maxdepth 2 -type d -name 'G*' -print0 \ - | xargs --null -P "$(nproc --all)" -I{} bash -c " + | xargs --null ${xargs_opts} -I{} bash -c " cd {} for j in $1_results.dat ; do if [[ -e \$j ]] ; then diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/sum_html.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/sum_html.py index 9dd5826f71..fb8dd3a74a 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/sum_html.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/sum_html.py @@ -770,7 +770,7 @@ def collect_result(cmd, folder_names=[], jobs=None, main_dir=None): return all -def make_all_html_results(cmd, folder_names = [], jobs=[]): +def make_all_html_results(cmd, folder_names = [], jobs=[], get_attr=None): """ folder_names and jobs have been added for the amcatnlo runs """ run = cmd.results.current['run_name'] if not os.path.exists(pjoin(cmd.me_dir, 'HTML', run)): @@ -794,7 +794,12 @@ def make_all_html_results(cmd, folder_names = [], jobs=[]): fsock.write('%s

' % Presults.get_html(run, unit, cmd.me_dir)) fsock.write('%s
' % P_text) - return Presults.xsec, Presults.xerru + if not get_attr: + return Presults.xsec, Presults.xerru + else: + if isinstance(get_attr, tuple): + return [getattr(Presults, _) for _ in get_attr] + return getattr(Presults, get_attr) diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/ufomodel/write_param_card.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/ufomodel/write_param_card.py index 57a85b0614..33a89259f8 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/ufomodel/write_param_card.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/ufomodel/write_param_card.py @@ -116,9 +116,10 @@ def write_param(self, param, lhablock): def write_dep_param_block(self, lhablock): import cmath from parameters import all_parameters + param_values = {'cmath':cmath} for parameter in all_parameters: try: - exec("%s = %s" % (parameter.name, parameter.value)) + exec("%s = %s" % (parameter.name, parameter.value), globals(), param_values) except Exception: pass text = "## Not dependent paramater.\n" @@ -134,7 +135,7 @@ def write_dep_param_block(self, lhablock): prefix = "DECAY " for part, param in data: if isinstance(param.value, str): - value = complex(eval(param.value)).real + value = complex(eval(param.value, globals(), param_values)).real else: value = param.value diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/madevent b/epochX/cudacpp/gg_tt01g.mad/bin/madevent index dff9711b73..9c5363e682 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/madevent +++ b/epochX/cudacpp/gg_tt01g.mad/bin/madevent @@ -178,6 +178,17 @@ force_run = False if (args and args[0] == 'treatcards'): force_run=True + +# check that madgraph is not in PYTHONPATH +try: + import madgraph +except ImportError: + pass +else: + logger.getLogger('madgraph').error('Looks like you do have madgraph in your PYTHONPATH (or you run this executable from the main MG5aMC directory). This executable will likely not work in such case.') + + + # Call the cmd interface main loop try: if '-h' in args or '--help' in args: diff --git a/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h index ff9f0d7f00..cb66251689 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h @@ -2,13 +2,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Sep 2010) for the MG5aMC backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -860,7 +860,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6] - template + template __device__ INLINE void VVV1_0( const fptype allV1[], const fptype allV2[], @@ -872,7 +872,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ INLINE void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -885,7 +885,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ INLINE void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -897,7 +897,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ INLINE void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -910,7 +910,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ INLINE void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -923,7 +923,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ INLINE void FFV1P0_3( const fptype allF1[], const fptype allF2[], @@ -936,7 +936,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -950,7 +950,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV3P0_1( const fptype allV2[], const fptype allV3[], @@ -964,7 +964,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV4P0_1( const fptype allV2[], const fptype allV3[], @@ -978,7 +978,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6] - template + template __device__ void VVV1_0( const fptype allV1[], const fptype allV2[], @@ -991,7 +991,7 @@ namespace mg5amcCpu const cxtype_sv* V1 = W_ACCESS::kernelAccessConst( allV1 ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = 
CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P1[4] = { +cxreal( V1[0] ), +cxreal( V1[1] ), +cximag( V1[1] ), +cximag( V1[0] ) }; @@ -1014,7 +1014,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -1027,7 +1027,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P2[4] = { +cxreal( V2[0] ), +cxreal( V2[1] ), +cximag( V2[1] ), +cximag( V2[0] ) }; @@ -1052,7 +1052,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -1065,7 +1065,7 @@ namespace mg5amcCpu const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const cxtype_sv TMP9 = ( F1[2] * ( F2[4] * ( V3[2] + V3[5] ) + F2[5] * ( V3[3] + cI * V3[4] ) ) + ( F1[3] * ( F2[4] * ( V3[3] - cI * V3[4] ) + F2[5] * ( V3[2] - V3[5] ) ) + ( F1[4] * ( F2[2] * ( V3[2] - V3[5] ) - F2[3] * ( V3[3] + cI * V3[4] ) ) + F1[5] * ( F2[2] * ( -V3[3] + cI * V3[4] ) + F2[3] * ( V3[2] + V3[5] ) ) ) ) ); @@ -1077,7 +1077,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -1090,7 +1090,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F1 = W_ACCESS::kernelAccess( allF1 ); const cxtype cI = cxmake( 0., 1. ); F1[0] = +F2[0] + V3[0]; @@ -1109,7 +1109,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -1122,7 +1122,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F2 = W_ACCESS::kernelAccess( allF2 ); const cxtype cI = cxmake( 0., 1. 
); F2[0] = +F1[0] + V3[0]; @@ -1141,7 +1141,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ void FFV1P0_3( const fptype allF1[], const fptype allF2[], @@ -1154,7 +1154,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V3 = W_ACCESS::kernelAccess( allV3 ); const cxtype cI = cxmake( 0., 1. ); V3[0] = +F1[0] + F2[0]; @@ -1172,7 +1172,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ void VVVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -1187,7 +1187,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); V1[0] = +V2[0] + V3[0] + V4[0]; @@ -1207,7 +1207,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ void VVVV3P0_1( const fptype allV2[], const fptype allV3[], @@ -1222,7 +1222,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); V1[0] = +V2[0] + V3[0] + V4[0]; @@ -1242,7 +1242,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ void VVVV4P0_1( const fptype allV2[], const fptype allV3[], @@ -1257,7 +1257,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); V1[0] = +V2[0] + V3[0] + V4[0]; diff --git a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc index 47a3a011b8..fd5642f3e3 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. 
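The repeated C_ACCESS to CD_ACCESS renames in the HelAmps hunks above distinguish the accessor for dependent couplings from the generic coupling accessor. A simplified, self-contained sketch of this access-class template pattern follows; it is not the plugin's real machinery (the actual buffers are SIMD vectors of complex couplings), only an illustration of why the helas-like kernels are templated on an accessor type:

```cpp
// Simplified stand-in for the ACCESS-class pattern: the kernel body is
// templated on an accessor type so the same algebra works for any layout.
#include <iostream>

struct ScalarCouplingAccess // hypothetical accessor: one coupling per buffer
{
  static const double& kernelAccessConst( const double* buf ) { return buf[0]; }
};

template<class CD_ACCESS> // 'CD' = dependent-coupling accessor, as in the rename above
double vertexSketch( const double* allCOUP )
{
  const double& COUP = CD_ACCESS::kernelAccessConst( allCOUP ); // mirrors kernelAccessConst( allCOUP )
  return 2. * COUP; // placeholder for the real amplitude algebra
}

int main()
{
  const double coup[1] = { 1.218 }; // e.g. a QCD coupling value
  std::cout << vertexSketch<ScalarCouplingAccess>( coup ) << std::endl;
  return 0;
}
```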
//========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h index 76066c7bb1..f4b086fc96 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -310,7 +310,7 @@ namespace mg5amcCpu #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #endif // Compute the output couplings (e.g. 
gc10 and gc11) from the input gs - template + template __device__ inline void G2COUP( const fptype gs[], fptype couplings[], @@ -320,12 +320,12 @@ namespace mg5amcCpu using namespace Parameters_sm_dependentCouplings; const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr ); - fptype* GC_10s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); - fptype* GC_11s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); - fptype* GC_12s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_12 ); - cxtype_sv_ref GC_10s_sv = C_ACCESS::kernelAccess( GC_10s ); - cxtype_sv_ref GC_11s_sv = C_ACCESS::kernelAccess( GC_11s ); - cxtype_sv_ref GC_12s_sv = C_ACCESS::kernelAccess( GC_12s ); + fptype* GC_10s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); + fptype* GC_11s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); + fptype* GC_12s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_12 ); + cxtype_sv_ref GC_10s_sv = CD_ACCESS::kernelAccess( GC_10s ); + cxtype_sv_ref GC_11s_sv = CD_ACCESS::kernelAccess( GC_11s ); + cxtype_sv_ref GC_12s_sv = CD_ACCESS::kernelAccess( GC_12s ); GC_10s_sv = couplings_sv.GC_10; GC_11s_sv = couplings_sv.GC_11; GC_12s_sv = couplings_sv.GC_12; diff --git a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h index 7c6a082392..ca859a602e 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for generating random numbers +// For both CUDA and HIP, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! 
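The new MGONGPU_HAS_NO_BLAS switch above defaults to BLAS-enabled for CUDA and HIP builds and BLAS-disabled for C++-only builds, while remaining overridable from the compile line. A stripped-down sketch of the same three-way guard (compilable as plain C++, in which case it always takes the no-BLAS branch; the printout is mine, not the plugin's):

```cpp
// Minimal sketch of the MGONGPU_HAS_NO_BLAS selection logic shown above.
// Compile with -DMGONGPU_HAS_NO_BLAS to force the no-BLAS path everywhere.
#include <iostream>

#ifdef __CUDACC__ // CUDA build: cuBLAS assumed available unless overridden
// (default: MGONGPU_HAS_NO_BLAS left undefined)
#elif defined __HIPCC__ // HIP build: hipBLAS assumed available unless overridden
// (default: MGONGPU_HAS_NO_BLAS left undefined)
#else
#define MGONGPU_HAS_NO_BLAS 1 // C++-only build: no GPU BLAS
#endif

int main()
{
#ifdef MGONGPU_HAS_NO_BLAS
  std::cout << "noBLAS build: GPU BLAS calls compiled out" << std::endl;
#else
  std::cout << "BLAS build: cuBLAS/hipBLAS entry points enabled" << std::endl;
#endif
  return 0;
}
```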
#else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuCxtypes.h index 92d74fd6db..e98e925f2a 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -717,12 +717,24 @@ namespace mg5amcCpu : m_preal( &r ), m_pimag( &i ) {} // copy (create from) const refs cxtype_ref& operator=( const cxtype_ref& ) = delete; //__host__ __device__ cxtype_ref& operator=( cxtype_ref&& c ) {...} // REMOVED! Should copy refs or copy values? 
No longer needed in cxternary - __host__ __device__ cxtype_ref& operator=( const cxtype& c ) + __host__ __device__ cxtype_ref& operator=( const cxtype& c ) // copy (assign) const values { *m_preal = cxreal( c ); *m_pimag = cximag( c ); return *this; - } // copy (assign) non-const values + } + __host__ __device__ cxtype_ref& operator+=( const cxtype& c ) + { + *m_preal += cxreal( c ); + *m_pimag += cximag( c ); + return *this; + } + __host__ __device__ cxtype_ref& operator-=( const cxtype& c ) + { + *m_preal -= cxreal( c ); + *m_pimag -= cximag( c ); + return *this; + } __host__ __device__ operator cxtype() const { return cxmake( *m_preal, *m_pimag ); } private: fptype* const m_preal; // const pointer to non-const fptype R diff --git a/epochX/cudacpp/gg_tt01g.mad/test/cudacpp_test.mk b/epochX/cudacpp/gg_tt01g.mad/test/cudacpp_test.mk index f703a1ae7c..1f9f8bbc46 100644 --- a/epochX/cudacpp/gg_tt01g.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_tt01g.mad/test/cudacpp_test.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) @@ -19,7 +19,7 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt index c216de0edd..ed4804611b 100644 --- a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.3 2025-06-12 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -49,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -58,7 +58,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006003379821777344  +DEBUG: model prefixing takes 0.005472660064697266  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -151,21 +151,21 @@ INFO: Please specify coupling orders to bypass this step. 
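The operator+= and operator-= added to cxtype_ref above let split real/imaginary storage be updated in place through the same reference-proxy object used for plain assignment. The following is a self-contained miniature of that proxy pattern, using plain doubles instead of the plugin's fptype/SIMD types:

```cpp
// Miniature of the cxtype_ref proxy: a complex 'reference' over two separate
// real and imaginary slots, now supporting in-place += / -= as added above.
#include <complex>
#include <iostream>

using cxtype = std::complex<double>;

class cxtype_ref
{
public:
  cxtype_ref( double& r, double& i ) : m_preal( &r ), m_pimag( &i ) {}
  cxtype_ref& operator=( const cxtype& c ) { *m_preal = c.real(); *m_pimag = c.imag(); return *this; }
  cxtype_ref& operator+=( const cxtype& c ) { *m_preal += c.real(); *m_pimag += c.imag(); return *this; }
  cxtype_ref& operator-=( const cxtype& c ) { *m_preal -= c.real(); *m_pimag -= c.imag(); return *this; }
  operator cxtype() const { return cxtype( *m_preal, *m_pimag ); }
private:
  double* const m_preal; // const pointer to non-const real part
  double* const m_pimag; // const pointer to non-const imaginary part
};

int main()
{
  double re[2] = { 1., 2. }, im[2] = { 0., 0. }; // split (SOA-like) storage
  cxtype_ref j0( re[0], im[0] );
  j0 += cxtype( 0.5, -1. ); // accumulate in place, no temporary lvalue needed
  std::cout << cxtype( j0 ) << std::endl; // prints (1.5,-1)
  return 0;
}
```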
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.021 s +1 processes with 16 diagrams generated in 0.022 s Total: 1 processes with 16 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttg --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4334]  +DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4166]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  INFO: initialize a new directory: CODEGEN_mad_gg_ttg INFO: remove old information in CODEGEN_mad_gg_ttg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 @@ -177,9 +177,9 @@ FileWriter t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg -DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1665]  Generated helas calls for 1 subprocesses (16 diagrams) in 0.037 s Wrote files for 36 helas calls in 0.123 s ALOHA: aloha starts to compute helicity amplitudes @@ -188,14 +188,14 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with 
options: P0 -ALOHA: aloha creates 5 routines in 0.331 s +ALOHA: aloha creates 5 routines in 0.316 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.310 s +ALOHA: aloha creates 10 routines in 0.298 s VVV1 VVV1 FFV1 @@ -205,37 +205,37 @@ ALOHA: aloha creates 10 routines in 0.310 s VVVV1 VVVV3 VVVV4 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg; patch -p4 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file SubProcesses/makefile -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/SubProcesses/P1_gg_ttxg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/SubProcesses/P1_gg_ttxg; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #2 succeeded at 243 (offset 16 lines). -DEBUG: p.returncode =  0 [output.py at line 263]  -Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg done. +DEBUG: p.returncode =  0 [output.py at line 268]  +Output to directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg done. Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/README +/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/README Run "open index.html" to see more information about this process. quit -real 0m2.439s -user 0m2.135s -sys 0m0.297s +real 0m2.462s +user 0m2.188s +sys 0m0.274s Code generation completed in 2 seconds ************************************************************ * * @@ -249,7 +249,7 @@ Code generation completed in 2 seconds * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.3 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -257,9 +257,9 @@ Code generation completed in 2 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". 
Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt @@ -279,7 +279,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.3 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -287,9 +287,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt diff --git a/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt index 68b4c46295..07d8d59d1b 100644 --- a/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat index 3ace6e558c..36bb202386 100644 --- a/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.6.0 2024-09-30 * +#* VERSION 3.6.3 2025-06-12 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/gg_ttg.mad/Cards/run_card.dat b/epochX/cudacpp/gg_ttg.mad/Cards/run_card.dat index d087670827..a16ea5dee6 100644 --- a/epochX/cudacpp/gg_ttg.mad/Cards/run_card.dat +++ b/epochX/cudacpp/gg_ttg.mad/Cards/run_card.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/gg_ttg.mad/Cards/run_card_default.dat b/epochX/cudacpp/gg_ttg.mad/Cards/run_card_default.dat index 43e93cbf40..cdcd77f36d 100644 --- a/epochX/cudacpp/gg_ttg.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/gg_ttg.mad/Cards/run_card_default.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! 
maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/gg_ttg.mad/MGMEVersion.txt b/epochX/cudacpp/gg_ttg.mad/MGMEVersion.txt index 084e244cea..1ac53bb4bd 100644 --- a/epochX/cudacpp/gg_ttg.mad/MGMEVersion.txt +++ b/epochX/cudacpp/gg_ttg.mad/MGMEVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.3 \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/Source/.make_opts b/epochX/cudacpp/gg_ttg.mad/Source/.make_opts index de3864242b..56ba259c56 100644 --- a/epochX/cudacpp/gg_ttg.mad/Source/.make_opts +++ b/epochX/cudacpp/gg_ttg.mad/Source/.make_opts @@ -102,6 +102,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf + alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -113,10 +114,11 @@ ifneq ($(lhapdf),) endif else alfas_functions=alfas_functions + alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif # Helper function to check MG5 version define CHECK_MG5AMC_VERSION python -c 'import re; from distutils.version import StrictVersion; print StrictVersion("$(MG5AMC_VERSION)") >= StrictVersion("$(1)") if re.match("^[\d\.]+$$","$(MG5AMC_VERSION)") else True;' -endef \ No newline at end of file +endef diff --git a/epochX/cudacpp/gg_ttg.mad/Source/alfas_functions.f b/epochX/cudacpp/gg_ttg.mad/Source/alfas_functions.f index bb69a6384e..84aeff369c 100644 --- a/epochX/cudacpp/gg_ttg.mad/Source/alfas_functions.f +++ b/epochX/cudacpp/gg_ttg.mad/Source/alfas_functions.f @@ -188,6 +188,10 @@ SUBROUTINE NEWTON1(T,A_IN,A_OUT,NLOOP,NF) A_OUT=A_IN/(1D0+A_IN*B0(NF)*T) IF (NLOOP .EQ. 1) RETURN + if (1D0+A_IN*B0(NF)*T.le.0d0)THEN + A_OUT = 9d98 + RETURN + ENDIF A_OUT=A_IN/(1D0+B0(NF)*A_IN*T+C1(NF)*A_IN*LOG(1D0+A_IN*B0(NF)*T)) IF (A_OUT .LT. 
0D0) AS=0.3D0 30 AS=A_OUT diff --git a/epochX/cudacpp/gg_ttg.mad/Source/cuts.inc b/epochX/cudacpp/gg_ttg.mad/Source/cuts.inc index 23d099e5f7..a8ccc7420d 100644 --- a/epochX/cudacpp/gg_ttg.mad/Source/cuts.inc +++ b/epochX/cudacpp/gg_ttg.mad/Source/cuts.inc @@ -37,7 +37,7 @@ C REAL*8 misset,missetmax,ptheavy REAL*8 ptllmin,ptllmax integer maxjetflavor - REAl*8 dsqrt_shat + REAl*8 dsqrt_shat,dsqrt_shatmax COMMON /to_min_max_cuts/ & PTJmax,PTBmax,PTAmax,PTLmax, @@ -60,7 +60,7 @@ C & ht2max,ht3max,ht4max, & htjmin,htjmax,ihtmin,ihtmax, & misset,missetmax,ptheavy, - & ptllmin,ptllmax,dsqrt_shat, + & ptllmin,ptllmax,dsqrt_shat,dsqrt_shatmax, & maxjetflavor C diff --git a/epochX/cudacpp/gg_ttg.mad/Source/make_opts b/epochX/cudacpp/gg_ttg.mad/Source/make_opts index e4b87ee6ad..f10336e42e 100644 --- a/epochX/cudacpp/gg_ttg.mad/Source/make_opts +++ b/epochX/cudacpp/gg_ttg.mad/Source/make_opts @@ -103,6 +103,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf +alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -114,6 +115,7 @@ endif endif else alfas_functions=alfas_functions +alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif diff --git a/epochX/cudacpp/gg_ttg.mad/Source/makefile b/epochX/cudacpp/gg_ttg.mad/Source/makefile index 291ca907ee..87a9e61723 100644 --- a/epochX/cudacpp/gg_ttg.mad/Source/makefile +++ b/epochX/cudacpp/gg_ttg.mad/Source/makefile @@ -37,10 +37,12 @@ all: $(LIBRARIES) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDI $(LIBDIR)libdsample.$(libext): $(DSAMPLE) $(call CREATELIB, $@, $^) $(LIBDIR)libgeneric.$(libext): $(GENERIC) + rm -f $@ 2>/dev/null $(call CREATELIB, $@, $^) + rm -f $(alfas_to_clean) 2>/dev/null $(LIBDIR)libdhelas.$(libext): DHELAS cd DHELAS; make; cd .. -$(LIBDIR)libpdf.$(libext): PDF make_opts +$(LIBDIR)libpdf.$(libext): PDF $(alfas_functions).o cd PDF; make; cd .. ifneq (,$(filter edff chff, $(pdlabel1) $(pdlabel2))) $(LIBDIR)libgammaUPC.$(libext): PDF/gammaUPC @@ -73,6 +75,7 @@ $(BINDIR)gensudgrid: $(GENSUDGRID) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUP # Dependencies dsample.o: DiscreteSampler.o dsample.f genps.inc StringCast.o vector.inc +pawgraph.o: vector.inc DiscreteSampler.o: StringCast.o invarients.o: invarients.f genps.inc gen_ximprove.o: gen_ximprove.f run_config.inc run_card.inc diff --git a/epochX/cudacpp/gg_ttg.mad/Source/run_card.inc b/epochX/cudacpp/gg_ttg.mad/Source/run_card.inc index 1a1bc782bd..8bd5f73840 100644 --- a/epochX/cudacpp/gg_ttg.mad/Source/run_card.inc +++ b/epochX/cudacpp/gg_ttg.mad/Source/run_card.inc @@ -88,6 +88,8 @@ DSQRT_SHAT = 0.000000000000000D+00 + DSQRT_SHATMAX = -1 + LIMHEL = 0.000000000000000D+00 PTJ = 2.000000000000000D+01 diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h index 87aa648dd2..a45024704a 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. 
Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -255,11 +255,15 @@ namespace mg5amcCpu throw std::logic_error( "Bridge constructor: FIXME! cannot choose gputhreads" ); // this should never happen! m_gpublocks = m_nevt / m_gputhreads; } +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters @@ -290,8 +294,10 @@ namespace mg5amcCpu throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -347,7 +353,9 @@ namespace mg5amcCpu if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -400,7 +408,9 @@ namespace mg5amcCpu } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
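The Bridge changes above silence the construction-time banners unless a verbosity macro is set: the couts and the abnormal-ME flagging survive only when MGONGPUCPP_VERBOSE is defined at build time. A tiny sketch of the same compile-time gating (macro name as in the diff; the free function and event count are hypothetical):

```cpp
// Sketch of the MGONGPUCPP_VERBOSE gating used in Bridge.h above.
// Build with -DMGONGPUCPP_VERBOSE to re-enable the informational banner.
#include <iostream>

void instantiateBridge( int nevt )
{
#ifdef MGONGPUCPP_VERBOSE
  std::cout << "WARNING! Instantiate host Bridge (nevt=" << nevt << ")" << std::endl;
#endif
  // ... allocate buffers and matrix-element kernels as usual ...
  (void)nevt; // keep -Wunused-parameter quiet in non-verbose builds
}

int main()
{
  instantiateBridge( 16384 );
  return 0;
}
```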
#ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, 
type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. 
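The gpuBlasT* aliases at the end of GpuAbstraction.h above select the single- or double-precision BLAS entry points from MGONGPU_FPTYPE2_FLOAT, layered on top of the CUDA/HIP name mapping. Below is a host-only sketch of that two-level macro dispatch; the stub gemm-like functions are assumptions of mine standing in for cublas/hipblas calls, and only the selection pattern matches the diff:

```cpp
// Host-only illustration of the gpuBlasTgemm-style dispatch above:
// the 'T' alias resolves to an S (float) or D (double) implementation
// depending on MGONGPU_FPTYPE2_FLOAT, without touching call sites.
#include <cstdio>

void blasSgemmStub( int n, const float* x, float* y ) // stand-in for cublasSgemm/hipblasSgemm
{
  for( int i = 0; i < n; i++ ) y[i] += x[i];
  std::printf( "ran float stub\n" );
}

void blasDgemmStub( int n, const double* x, double* y ) // stand-in for cublasDgemm/hipblasDgemm
{
  for( int i = 0; i < n; i++ ) y[i] += x[i];
  std::printf( "ran double stub\n" );
}

#ifdef MGONGPU_FPTYPE2_FLOAT
#define blasTgemmStub blasSgemmStub
typedef float fptype2;
#else
#define blasTgemmStub blasDgemmStub
typedef double fptype2;
#endif

int main()
{
  fptype2 x[4] = { 1, 2, 3, 4 }, y[4] = { 0, 0, 0, 0 };
  blasTgemmStub( 4, x, y ); // the call site is precision-agnostic
  return 0;
}
```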
#ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MGVersion.txt index 084e244cea..1ac53bb4bd 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.3 \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc index f463977c1a..a68ae314eb 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
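The assertGpuBlas helper above extends the existing checkGpu/assertGpu pattern to BLAS return codes: any cuBLAS/hipBLAS call that returns a gpuBlasStatus_t can be wrapped in checkGpuBlas. A minimal sketch of the handle-plus-stream setup that MatrixElementKernelDevice performs later in this patch (the helper name createHandleOnStream is illustrative, not part of the patch):

#include "GpuAbstraction.h"
#include "GpuRuntime.h" // for checkGpuBlas
#ifndef MGONGPU_HAS_NO_BLAS
gpuBlasHandle_t createHandleOnStream( gpuStream_t stream )
{
  gpuBlasHandle_t handle;
  checkGpuBlas( gpuBlasCreate( &handle ) ); // asserts (and aborts) on any non-success status
  checkGpuBlas( gpuBlasSetStream( handle, stream ) ); // all BLAS work on this handle is queued on 'stream'
  return handle;
}
#endif

Binding one handle to one stream is what later allows the per-helicity BLAS color sums to be queued concurrently.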
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a host array" ); @@ -191,14 +193,14 @@ MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelHost::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,28 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() + , m_pHelWfs() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_helBlasHandles() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +353,82 @@ sstr << "MatrixElementKernelDevice: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); + // Create the "one-helicity" wavefunction buffer that will be used for helicity filtering + m_pHelWfs.reset( new DeviceBufferSimple( CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices?
+ std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { +#ifndef MGONGPU_HAS_NO_BLAS + if( m_helBlasHandles[ihel] ) gpuBlasDestroy( m_helBlasHandles[ihel] ); +#endif + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +445,61 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. 
Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity + // Attach a different stream to each cuBLAS/hipBLAS handle + if( m_blasColorSum ) + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + checkGpuBlas( gpuBlasCreate( &m_helBlasHandles[ighel] ) ); + checkGpuBlas( gpuBlasSetStream( m_helBlasHandles[ighel], m_helStreams[ighel] ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_helBlasHandles[ighel], CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelWfs.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +507,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* ghelBlasHandles = ( m_blasColorSum ? m_helBlasHandles : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* ghelBlasHandles = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +527,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? 
m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h index 7acff4b308..c901874333 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include <memory> #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] - static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,24 @@ // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelJamps; + + // The super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelWfs; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +220,23 @@ PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers
+ std::unique_ptr<DeviceBufferSimple2> m_pHelBlasTmp; + + // The array of cuBLAS/hipBLAS handles (one for each good helicity) + gpuBlasHandle_t m_helBlasHandles[CPPProcess::ncomb]; // reserve ncomb handles (but only nGoodHel <= ncomb will be used) +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessAmplitudes.h index 0d92f69c43..a49f041e05 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessAmplitudes_H #define MemoryAccessAmplitudes_H 1 @@ -10,10 +10,6 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_AMPLITUDES 1 - // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -23,120 +19,11 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // A class describing the internal layout of memory buffers for amplitudes - // This implementation uses an AOSOA[npagA][nx2][neppA] where nevt=npagA*neppA - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessAmplitudesBase //_AOSOAv1 - { - public: - - // Number of Events Per Page in the amplitude AOSOA memory buffer layout - static constexpr int neppA = 1; // AOS (just a test...)
- - private: - - friend class MemoryAccessHelper; - friend class KernelAccessHelper; - friend class KernelAccessHelper; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) - { - const int ipagA = ievt / neppA; // #event "A-page" - const int ieppA = ievt % neppA; // #event in the current event A-page - constexpr int ix2 = 0; - return &( buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA] ); // AOSOA[ipagA][ix2][ieppA] - } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... args" to "const int ix2" and rename "Field" as "Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int ix2 ) - { - constexpr int ipagA = 0; - constexpr int ieppA = 0; - return buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA]; // AOSOA[ipagA][ix2][ieppA] - } - }; - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessAmplitudes : public MemoryAccessAmplitudesBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2 = MemoryAccessHelper::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2Const = - MemoryAccessHelper::template decodeRecordConst; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt, 
const int ix2 ) <===] - static constexpr auto ieventAccessIx2 = - MemoryAccessHelper::template ieventAccessField; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===] - static constexpr auto ieventAccessIx2Const = - MemoryAccessHelper::template ieventAccessFieldConst; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations + // A class providing trivial access to amplitude memory buffers template<bool onDevice> class KernelAccessAmplitudes { public: - -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2 = - KernelAccessHelper::template kernelAccessField; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2Const = - KernelAccessHelper::template kernelAccessFieldConst; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { @@ -148,8 +35,6 @@ { return reinterpret_cast<const cxtype_sv*>( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES }; //---------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessCouplings.h index 55504a2b90..caee99a7fd 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessCouplings.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
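After the deletions above, KernelAccessAmplitudes keeps only the trivial branch: amplitude "access" is a plain reinterpret_cast of a small per-diagram local buffer, with no event indexing at all. A minimal host-side sketch of what this reduces to (the function name is illustrative; this assumes the usual mg5amcCpu namespace and that the template parameter is the onDevice flag):

#include "MemoryAccessAmplitudes.h"
#include "mgOnGpuVectors.h" // for cxtype_sv and cxzero_sv
using namespace mg5amcCpu;
void exampleAmplitudeAccess()
{
  cxtype_sv amp_sv[1]; // one invariant amplitude, as used per diagram in CPPProcess.cc
  fptype* amp_fp = reinterpret_cast<fptype*>( amp_sv ); // fptype* view passed to the HELAS-like calls
  cxtype_sv* amp = KernelAccessAmplitudes<false>::kernelAccess( amp_fp ); // points straight back at amp_sv
  amp[0] = cxzero_sv(); // reset the amplitude in place
}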
#ifndef MemoryAccessCouplings_H #define MemoryAccessCouplings_H 1 @@ -235,7 +235,7 @@ namespace mg5amcCpu /* fptype_sv& real = kernelAccessIx2( buffer, 0 ); fptype_sv& imag = kernelAccessIx2( buffer, 1 ); - printf( "C_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv_ref( real, imag ); */ return cxtype_sv_ref( kernelAccessIx2( buffer, 0 ), @@ -250,7 +250,7 @@ namespace mg5amcCpu /* const fptype_sv& real = kernelAccessIx2Const( buffer, 0 ); const fptype_sv& imag = kernelAccessIx2Const( buffer, 1 ); - printf( "C_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv( real, imag ); */ return cxtype_sv( kernelAccessIx2Const( buffer, 0 ), diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessWavefunctions.h index 9f4c620bc7..bbffc1fb36 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessWavefunctions_H #define MemoryAccessWavefunctions_H 1 @@ -10,9 +10,7 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 +#include "CPPProcess.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL @@ -23,147 +21,44 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // A class describing the internal layout of memory buffers for wavefunctions - // This implementation uses an AOSOA[npagW][nw6][nx2][neppW] where nevt=npagW*neppW - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessWavefunctionsBase //_AOSOAv1 +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessWavefunctions { public: - - // Number of Events Per Page in the wavefunction AOSOA memory buffer layout - static constexpr int neppW = 1; // AOS (just a test...) 
- - private: - - friend class MemoryAccessHelper; - friend class KernelAccessHelper; - friend class KernelAccessHelper; - - // The number of components of a (fermion or vector) wavefunction - static constexpr int nw6 = mgOnGpu::nw6; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) + static __host__ __device__ inline cxtype_sv* + kernelAccess( fptype* buffer ) { - const int ipagW = ievt / neppW; // #event "W-page" - const int ieppW = ievt % neppW; // #event in the current event W-page - constexpr int iw6 = 0; - constexpr int ix2 = 0; - return &( buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW] ); // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast<cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... args" to "const int iw6, const int ix2" and rename "Field" as "Iw6Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int iw6, - const int ix2 ) + static __host__ __device__ inline const cxtype_sv* + kernelAccessConst( const fptype* buffer ) { - constexpr int ipagW = 0; - constexpr int ieppW = 0; - return buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW]; // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast<const cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } }; +#endif //---------------------------------------------------------------------------- - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessWavefunctions : public MemoryAccessWavefunctionsBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2 = MemoryAccessHelper::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2Const = - MemoryAccessHelper::template decodeRecordConst; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIw6Ix2( fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2 = - MemoryAccessHelper::template ieventAccessField; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIw6Ix2Const( const fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2Const = - MemoryAccessHelper::template ieventAccessFieldConst; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations - template<bool onDevice> - class KernelAccessWavefunctions + class HostAccessWavefunctions { public: - -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes
(input) - // [Signature (non-const) ===> fptype& kernelAccessIw6Ix2( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2 = - KernelAccessHelper::template kernelAccessField; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIw6Ix2Const( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2Const = - KernelAccessHelper::template kernelAccessFieldConst; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { return reinterpret_cast<cxtype_sv*>( buffer ); } static __host__ __device__ inline const cxtype_sv* kernelAccessConst( const fptype* buffer ) { return reinterpret_cast<const cxtype_sv*>( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS }; //---------------------------------------------------------------------------- - typedef KernelAccessWavefunctions<false> HostAccessWavefunctions; - typedef KernelAccessWavefunctions<true> DeviceAccessWavefunctions; - - //---------------------------------------------------------------------------- - } // end namespace mg5amcGpu/mg5amcCpu #endif // MemoryAccessWavefunctions_H diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h index 65a101888d..c8db607db6 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
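The new DeviceAccessWavefunctions shown above replaces the trivial device-side type-pun: with kernel splitting, wavefunctions live in one large global-memory buffer and each CUDA thread must first offset to its own per-event slice of nw6 complex components. A minimal sketch of the same indexing (the kernel name is illustrative; this assumes the usual mg5amcGpu namespace):

#include "MemoryAccessWavefunctions.h" // brings in CPPProcess.h and the cxtype_sv types
#include "mgOnGpuVectors.h"
#ifdef MGONGPUCPP_GPUIMPL
using namespace mg5amcGpu;
__global__ void exampleWfAccess( fptype* buffer )
{
  // Same arithmetic as DeviceAccessWavefunctions::kernelAccess( buffer ):
  // event ievt starts nw6*nx2 fptypes (nw6 complex numbers) further into the buffer
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  cxtype_sv* wfs = reinterpret_cast<cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 );
  wfs[0] = cxmake( 0., 0. ); // first of the nw6 components owned by this event
}
#endif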
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase<T>( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase<T>( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template<typename T, size_t sizePerEvent> - class DeviceBuffer : public DeviceBufferBase<T>, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase<T>, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase<T>( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase<T>( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer<fptype, 1> DeviceBufferSimple; + typedef DeviceBuffer<fptype2, 1> DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase<fptype> BufferRndNumMomenta; @@ -277,12 +298,12 @@ constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer<fptype, sizePerEventNumerators> HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer<fptype, sizePerEventNumerators> PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer<fptype, sizePerEventNumerators> DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer<fptype, sizePerEventDenominators> HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer<fptype, sizePerEventDenominators> PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer<fptype, sizePerEventDenominators> DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer<fptype, sizePerEventCouplings> HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer<fptype, sizePerEventCouplings> PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer<fptype, sizePerEventCouplings> DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for jamps + typedef DeviceBuffer<fptype, sizePerEventJamps> DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template<class Tdst, class Tsrc> void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc index 5de1c626c8..1db10f1e09 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,20 +98,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 6; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,57 +166,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the 
current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - 
using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -226,500 +279,165 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 12; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g. <<warning #186-D: pointless comparison of unsigned integer with zero>> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 16 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - vxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); - - VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[5] ); - FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[6] ); - - // Amplitude(s) for diagram number 1 - VVV1_0( w_fp[5], w_fp[6], w_fp[4], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; -
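
For illustration only (an editorial sketch with dummy amplitudes and std::complex standing in for cxtype; it is not the generated kernel code): diagram 1 above, and every removed diagram block that follows, repeats the same multichannel bookkeeping. The numerator accumulates |amp|^2 only for the diagram matching the selected channelId, while the denominator accumulates |amp|^2 for every diagram whenever multichannel is enabled; their ratio later rescales |M|^2 for single-diagram enhancement.

#include <complex>
#include <cstdio>
int main()
{
  constexpr int ndiagrams = 16;
  std::complex<double> amp[ndiagrams]; // one amplitude per diagram (dummy values here)
  for( int idiag = 0; idiag < ndiagrams; idiag++ ) amp[idiag] = { 0.1 * ( idiag + 1 ), -0.05 };
  const unsigned int channelId = 2; // 1 to ndiagrams selects a diagram, 0 disables SDE
  double numerator = 0, denominator = 0;
  for( int idiag = 0; idiag < ndiagrams; idiag++ )
  {
    const double abs2 = std::norm( amp[idiag] ); // |amp|^2, like cxabs2 above
    if( channelId == (unsigned int)( idiag + 1 ) ) numerator += abs2; // selected diagram only
    if( channelId != 0 ) denominator += abs2;                         // all diagrams
  }
  printf( "SDE weight num/den = %f\n", numerator / denominator );
  return 0;
}
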
- // *** DIAGRAM 2 OF 16 *** - - // Wavefunction(s) for diagram number 2 - FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[7] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[3], w_fp[7], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 3 OF 16 *** - - // Wavefunction(s) for diagram number 3 - FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); - - // Amplitude(s) for diagram number 3 - FFV1_0( w_fp[8], w_fp[2], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 4 OF 16 *** - - // Wavefunction(s) for diagram number 4 - FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] ); - FFV1_2( w_fp[3], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); - - // Amplitude(s) for diagram number 4 - FFV1_0( w_fp[9], w_fp[5], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= amp_sv[0]; - - // *** DIAGRAM 5 OF 16 *** - - // Wavefunction(s) for diagram number 5 - VVV1P0_1( w_fp[1], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 5 - FFV1_0( w_fp[3], w_fp[5], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 6 OF 16 *** - - // Wavefunction(s) for diagram number 6 - // (none) - - // Amplitude(s) for diagram number 6 - FFV1_0( w_fp[8], w_fp[5], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= amp_sv[0]; - - // *** DIAGRAM 7 OF 16 *** - - // Wavefunction(s) for diagram number 7 - FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] ); - FFV1_1( w_fp[2], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); - - // Amplitude(s) for diagram number 7 - FFV1_0( w_fp[5], w_fp[11], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] -= amp_sv[0]; - - // *** DIAGRAM 8 OF 16 *** - - // Wavefunction(s) for diagram number 8 - // (none) - - // Amplitude(s) for diagram number 8 - FFV1_0( w_fp[5], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 9 OF 16 *** - - // Wavefunction(s) for diagram number 9 - // (none) - - // Amplitude(s) for diagram number 9 - FFV1_0( w_fp[5], w_fp[7], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[5] -= amp_sv[0]; - - // *** DIAGRAM 10 OF 16 *** - - // Wavefunction(s) for diagram number 10 - VVV1P0_1( w_fp[0], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[5] ); - - // Amplitude(s) for diagram number 10 - FFV1_0( w_fp[3], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 11 OF 16 *** - - // Wavefunction(s) for diagram number 11 - // (none) - - // Amplitude(s) for diagram number 11 - FFV1_0( w_fp[9], w_fp[2], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 12 OF 16 *** - - // Wavefunction(s) for diagram number 12 - // (none) - - // Amplitude(s) for diagram number 12 - VVV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[2] -= amp_sv[0]; - jamp_sv[3] += amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - - // *** DIAGRAM 13 OF 16 *** - // Wavefunction(s) for diagram number 13 - // (none) - - // Amplitude(s) for diagram number 13 - FFV1_0( w_fp[8], w_fp[11], w_fp[0], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - jamp_sv[2] -= amp_sv[0]; - - // *** DIAGRAM 14 OF 16 *** - - // Wavefunction(s) for diagram number 14 - // (none) - - // Amplitude(s) for diagram number 14 - FFV1_0( w_fp[9], w_fp[7], w_fp[0], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL 
- if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[4] -= amp_sv[0]; - - // *** DIAGRAM 15 OF 16 *** - // Wavefunction(s) for diagram number 15 - // (none) - - // Amplitude(s) for diagram number 15 - VVV1_0( w_fp[0], w_fp[10], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - - // *** DIAGRAM 16 OF 16 *** - - // Wavefunction(s) for diagram number 16 - VVVV1P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[10] ); - VVVV3P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[6] ); - VVVV4P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[9] ); - - // Amplitude(s) for diagram number 16 - FFV1_0( w_fp[3], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); - jamp_sv[1] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[2], w_fp[9], COUPs[1], 1.0, &amp_fp[0] ); - jamp_sv[0] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_ttxg()?) 
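
For illustration only (an editorial, scalar, non-SIMD sketch with dummy jamp values; the real code is vectorized and, in the new scheme, lives in the color_sum_gpu/color_sum_cpu kernels): the color algebra in the removed lines below evaluates the quadratic form |M|^2 = sum_{i,j} jamp_i^* (cf[i][j] / denom[i]) jamp_j, and since the color matrix cf is real only the Re*Re and Im*Im terms survive (#475).

#include <complex>
#include <cstdio>
int main()
{
  constexpr int ncolor = 6;
  const double denom[ncolor] = { 9, 9, 9, 9, 9, 9 };
  const double cf[ncolor][ncolor] = { { 64, -8, -8, 1, 1, 10 }, { -8, 64, 1, 10, -8, 1 }, { -8, 1, 64, -8, 10, 1 }, { 1, 10, -8, 64, 1, -8 }, { 1, -8, 10, 1, 64, -8 }, { 10, 1, 1, -8, -8, 64 } };
  const std::complex<double> jamp[ncolor] = { { 1.0, 0.5 }, { -0.2, 0.1 }, { 0.3, -0.4 }, { 0.0, 0.2 }, { -0.1, -0.1 }, { 0.5, 0.0 } }; // dummy values
  double deltaME = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    double ztempR = 0, ztempI = 0;
    for( int jcol = 0; jcol < ncolor; jcol++ )
    {
      ztempR += cf[icol][jcol] * jamp[jcol].real(); // real part of M * jamp
      ztempI += cf[icol][jcol] * jamp[jcol].imag(); // imag part of M * jamp
    }
    deltaME += ( ztempR * jamp[icol].real() + ztempI * jamp[icol].imag() ) / denom[icol];
  }
  printf( "deltaME = %f\n", deltaME );
  return 0;
}
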
- - // The color denominators (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 9, 9, 9, 9, 9, 9 }; // 1-D array[6] - - // The color matrix (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 64, -8, -8, 1, 1, 10 }, - { -8, 64, 1, 10, -8, 1 }, - { -8, 1, 64, -8, 10, 1 }, - { 1, 10, -8, 64, 1, -8 }, - { 1, -8, 10, 1, 64, -8 }, - { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... 
icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - 
// NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 16 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram8, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram9, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram10, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram11, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram12, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram13, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram14, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram15, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram16, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators ); + 
diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram8( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram9( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram10( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram11( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram12( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram13( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram14( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram15( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram16( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -774,7 +492,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -808,6 +530,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -849,6 +575,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -951,26 +681,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings, bsmIndepParam ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -978,25 +708,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel 
denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: 
multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -1134,20 +1068,14 @@ namespace mg5amcCpu // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = 
DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1159,17 +1087,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1195,93 +1126,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) Then, In multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? 
&( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1323,7 +1224,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1346,7 +1247,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1355,25 +1256,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1383,8 +1290,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1400,11 +1309,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1506,14 +1416,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h index 2acfa000a7..8e87baf8e2 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and 
contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -75,17 +76,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 32; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 16; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 6; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 12; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 5; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 18; //static const int ncomb = 32; // CPPProcess::ncomb @@ -122,23 +123,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -152,34 +156,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators 
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f index 10496aa04d..163076da52 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f index 7c8695090c..bc9333bb5d 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -137,14 +137,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -219,7 +219,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -290,6 +290,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -373,12 +377,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -484,6 +488,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/color_sum.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/color_sum.cc new file mode 100644 index 0000000000..b76aa16029 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/color_sum.cc @@ -0,0 +1,387 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 9, 9, 9, 9, 9, 9 }; // 1-D array[6] + + // The color matrix (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 64, -8, -8, 1, 1, 10 }, + { -8, 64, 1, 10, -8, 1 }, + { -8, 1, 64, -8, 10, 1 }, + { 1, 10, -8, 64, 1, -8 }, + { 1, -8, 10, 1, 64, -8 }, + { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. 
+ // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = 
jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one 
fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/color_sum.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/diagram_boilerplate.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/diagrams.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/diagrams.h new file mode 100644 index 0000000000..8ea15aedfa --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/diagrams.h @@ -0,0 +1,515 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
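[Editor's note] For readers not fluent in GEMM conventions, the two-step cuBLAS color sum in color_sum_blas above computes, per event, the same quadratic form as the kernels. A scalar reference of what the two Tgemm calls evaluate under the "new1" layout is sketched below (colorSumReference is a hypothetical helper, not part of the plugin); step 1 is the dense matrix product, step 2 the batched per-event dot product.

#include <vector>
// normCM: [ncolor*ncolor] normalized color matrix, row-major
// jamps: [2*ncolor*nevt] in "new1" layout (Re block then Im block, ievt fastest)
// mes: [nevt], incremented (beta=1 in the second GEMM)
void colorSumReference( int nevt, int ncolor, const double* normCM, const double* jamps, double* mes )
{
  std::vector<double> ztemp( ncolor );
  for( int ievt = 0; ievt < nevt; ievt++ )
    for( int part = 0; part < 2; part++ ) // 0 = real block, 1 = imaginary block
    {
      // Step 1: Ztemp = normCM * J (one GEMM each for Re and Im in the real code)
      for( int i = 0; i < ncolor; i++ )
      {
        ztemp[i] = 0;
        for( int j = 0; j < ncolor; j++ )
          ztemp[i] += normCM[i * ncolor + j] * jamps[part * ncolor * nevt + j * nevt + ievt];
      }
      // Step 2: ME += J dot Ztemp (a strided batch of 1x1 GEMMs in the real code)
      for( int i = 0; i < ncolor; i++ )
        mes[ievt] += jamps[part * ncolor * nevt + i * nevt + ievt] * ztemp[i];
    }
}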
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 16 *** + // Wavefunction(s) for diagram number 1 + vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); + vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + vxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); + VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[5] ); + FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[6] ); + // Amplitude(s) for diagram number 1 + VVV1_0( w_fp[5], w_fp[6], w_fp[4], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 16 *** + // Wavefunction(s) for 
diagram number 2 + FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[7] ); + // Amplitude(s) for diagram number 2 + FFV1_0( w_fp[3], w_fp[7], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 16 *** + // Wavefunction(s) for diagram number 3 + FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); + // Amplitude(s) for diagram number 3 + FFV1_0( w_fp[8], w_fp[2], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 16 *** + // Wavefunction(s) for diagram number 4 + FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] ); + FFV1_2( w_fp[3], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); + // Amplitude(s) for diagram number 4 + FFV1_0( w_fp[9], w_fp[5], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 16 *** + // Wavefunction(s) for diagram number 5 + VVV1P0_1( w_fp[1], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 5 + FFV1_0( w_fp[3], w_fp[5], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 6 OF 16 *** + // Wavefunction(s) for diagram number 6 + // (none) + // Amplitude(s) for diagram number 6 + FFV1_0( w_fp[8], w_fp[5], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram7( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output 
jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 7 OF 16 *** + // Wavefunction(s) for diagram number 7 + FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] ); + FFV1_1( w_fp[2], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); + // Amplitude(s) for diagram number 7 + FFV1_0( w_fp[5], w_fp[11], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram8( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 8 OF 16 *** + // Wavefunction(s) for diagram number 8 + // (none) + // Amplitude(s) for diagram number 8 + FFV1_0( w_fp[5], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram9( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent 
COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 9 OF 16 *** + // Wavefunction(s) for diagram number 9 + // (none) + // Amplitude(s) for diagram number 9 + FFV1_0( w_fp[5], w_fp[7], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram10( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 10 OF 16 *** + // Wavefunction(s) for diagram number 10 + VVV1P0_1( w_fp[0], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[5] ); + // Amplitude(s) for diagram number 10 + FFV1_0( w_fp[3], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram11( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the 
boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 11 OF 16 *** + // Wavefunction(s) for diagram number 11 + // (none) + // Amplitude(s) for diagram number 11 + FFV1_0( w_fp[9], w_fp[2], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram12( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 12 OF 16 *** + // Wavefunction(s) for diagram number 12 + // (none) + // Amplitude(s) for diagram number 12 + VVV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram13( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 13 OF 16 *** + // Wavefunction(s) for diagram number 13 + // (none) + // Amplitude(s) for diagram number 13 + FFV1_0( w_fp[8], w_fp[11], w_fp[0], COUPs[1], 1.0, &amp_fp[0] ); 
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram14( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 14 OF 16 *** + // Wavefunction(s) for diagram number 14 + // (none) + // Amplitude(s) for diagram number 14 + FFV1_0( w_fp[9], w_fp[7], w_fp[0], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram15( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 15 OF 16 *** + // Wavefunction(s) for diagram number 15 + // (none) + // Amplitude(s) for diagram number 15 + VVV1_0( w_fp[0], w_fp[10], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram16( fptype* wfs, // input/output 
wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 16 OF 16 *** + // Wavefunction(s) for diagram number 16 + VVVV1P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[10] ); + VVVV3P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[6] ); + VVVV4P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[9] ); + // Amplitude(s) for diagram number 16 + FFV1_0( w_fp[3], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[2], w_fp[9], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/driver.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/driver.f index c2eadb2c31..10332b6238 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/driver.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/matrix1.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/matrix1.f index 797b19405d..850d121618 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/matrix1.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -323,7 +323,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -366,7 +366,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(9) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -409,43 +410,32 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /7.111111111111111D+00, - $ -8.888888888888888D-01,-8.888888888888888D-01 - $ ,1.111111111111111D-01,1.111111111111111D-01,1.111111111111111D - $ +00/ + DATA DENOM/9/ + DATA (CF(I),I= 1, 6) /64,-16,-16,2,2,20/ C 1 T(1,2,5,3,4) - DATA (CF(I, 2),I= 1, 6) /-8.888888888888888D-01 - $ ,7.111111111111111D+00,1.111111111111111D-01,1.111111111111111D - $ +00,-8.888888888888888D-01,1.111111111111111D-01/ + DATA (CF(I),I= 7, 11) /64,2,20,-16,2/ C 1 T(1,5,2,3,4) - DATA (CF(I, 3),I= 1, 6) /-8.888888888888888D-01 - $ ,1.111111111111111D-01,7.111111111111111D+00, - $ -8.888888888888888D-01,1.111111111111111D+00,1.111111111111111D - $ -01/ + DATA (CF(I),I= 12, 15) /64,-16,20,2/ C 1 T(2,1,5,3,4) - DATA (CF(I, 4),I= 1, 6) /1.111111111111111D-01 - $ ,1.111111111111111D+00,-8.888888888888888D-01 - $ ,7.111111111111111D+00,1.111111111111111D-01, - $ -8.888888888888888D-01/ + DATA (CF(I),I= 16, 18) /64,2,-16/ C 1 T(2,5,1,3,4) - DATA (CF(I, 5),I= 1, 6) /1.111111111111111D-01, - $ -8.888888888888888D-01,1.111111111111111D+00,1.111111111111111D - $ -01,7.111111111111111D+00,-8.888888888888888D-01/ + DATA (CF(I),I= 19, 20) /64,-16/ C 1 T(5,1,2,3,4) - DATA (CF(I, 6),I= 1, 6) /1.111111111111111D+00 - $ ,1.111111111111111D-01,1.111111111111111D-01, - $ -8.888888888888888D-01,-8.888888888888888D-01 - $ ,7.111111111111111D+00/ + DATA (CF(I),I= 21, 21) /64/ C 1 T(5,2,1,3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. 
@@ -549,10 +539,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -561,6 +553,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/addmothers.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/addmothers.f index 9a31ed201d..6f32477b9e 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/addmothers.f @@ -21,7 +21,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, integer icol ! color selected integer isym(nexternal,99), jsym - integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,nc,ic + integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,ic integer mo_color,da_color(2),itmp integer ito(-nexternal+3:nexternal),iseed,maxcolor,maxorg integer icolalt(2,-nexternal+2:2*nexternal-3) @@ -113,14 +113,15 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif lconfig = vec_igraph1(ivec) endif - + is_LC=.true. + maxcolor=0 c c Choose a color flow which is certain to work with the propagator c structure of the chosen diagram and use that as an alternative c if (icol.eq.0) then do i=1,nexternal - icolalt(1,i)=0 + icolalt(1,i)=0 icolalt(2,i)=0 enddo else diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cluster.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cluster.f index b8995283ed..907894ea89 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cluster.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cluster.f @@ -556,6 +556,8 @@ logical function cluster(p, ivec) jwin = 0 cluster=.false. clustered=.false. + iwin =0 + jwin =0 do i=0,3 pcmsp(i)=0 enddo @@ -665,8 +667,11 @@ logical function cluster(p, ivec) c initialize graph storage igraphs(0)=0 nleft=nexternal -c cluster - if (iwin.eq.0.or.jwin.eq.0) stop 21 + if(iwin.eq.0.or.jwin.eq.0)then + cluster=.false. + return + endif +c cluster do n=1,nexternal-2 c combine winner imocl(n)=imap(iwin,2)+imap(jwin,2) diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/color_sum.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/color_sum.h new file mode 100644 index 0000000000..71b5b1eaad --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/color_sum.h @@ -0,0 +1,105 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
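For orientation: this new header factors the color sum out of the matrix-element kernels into its own compilation unit (cudacpp.mk below adds color_sum_cpp.o and color_sum_$(GPUSUFFIX).o, plus an optional cuBLAS/hipBLAS link via HASBLAS). The quantity its kernels accumulate per event and per helicity is, schematically (LaTeX notation),

  |M|^2 = \frac{1}{D} \sum_{i,j=1}^{n_{\mathrm{color}}} J_i^{*} \, CF_{ij} \, J_j

with J_i the color-ordered amplitudes ("jamps"), CF the symmetric color matrix and D its common denominator; this is the same sum evaluated by the packed CF_INDEX loop in the Fortran MATRIX1 above (D = DENOM = 9 for this process).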
+ +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype_ref( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + static __device__ inline const cxtype + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + }; +#else + class HostAccessJamp + { + public: + static inline cxtype_sv& + kernelAccessIcol( cxtype_sv* buffer, const int icol ) + { + return buffer[icol]; + } + static inline cxtype_sv& + kernelAccessIcol( fptype* buffer, const int icol ) + { + return reinterpret_cast<cxtype_sv*>( buffer )[icol]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + 
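The "new1" striding documented in the comments above stores all jamps for one helicity as two back-to-back ncolor*nevt blocks, all real parts first and then all imaginary parts, with the event index running fastest; each block is then a contiguous matrix that cuBLAS/hipBLAS can consume directly, while CUDA kernels reach the same elements through kernelAccessIcol. A standalone sketch of the index arithmetic under that layout (plain C++, no GPU intrinsics; the helper names are hypothetical):

// "new1" layout of buffer[2 * ncolor * nevt]:
//   [ Re(icol=0, all events) ... Re(icol=ncolor-1, all events) |
//     Im(icol=0, all events) ... Im(icol=ncolor-1, all events) ]
inline double& jampRe( double* buffer, int icol, int ievt, int ncolor, int nevt )
{
  return buffer[0 * ncolor * nevt + icol * nevt + ievt]; // real block
}
inline double& jampIm( double* buffer, int icol, int ievt, int ncolor, int nevt )
{
  return buffer[1 * ncolor * nevt + icol * nevt + ievt]; // imaginary block
}

Because each block is a dense ncolor-by-nevt matrix, one GEMM on the real block and one on the imaginary block can evaluate CF*J for all events of a grid at once, which is what the color_sum_gpu declaration below (with its blasTmp buffer and BLAS handle) exploits.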
//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cuts.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cuts.f index 7898714201..bd50ab1357 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cuts.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cuts.f @@ -307,12 +307,18 @@ LOGICAL FUNCTION PASSCUTS(P, VECSIZE_USED) c c Limit S_hat c - if (dsqrt_shat.ne.0d0)then - if (nincoming.eq.2.and.sumdot(p(0,1),p(0,2),1d0) .lt. dsqrt_shat**2) then - passcuts=.false. - return - endif - endif + if(nincoming.eq.2) then + if (dsqrt_shat.ne.0d0.or.dsqrt_shatmax.ne.-1d0)then + xvar = sumdot(p(0,1),p(0,2),1d0) + if (xvar .lt. dsqrt_shat**2)then + passcuts=.false. 
+ return + else if (dsqrt_shatmax.ne.-1d0 .and. xvar .gt. dsqrt_shatmax**2)then + passcuts = .false. + return + endif + endif + endif C $B$ DESACTIVATE_CUT $E$ !This is a tag for MadWeight if(debug) write (*,*) '=============================' diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/diagram_boilerplate.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/diagram_boilerplate.h new file mode 100644 index 0000000000..96a34fb1bf --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/diagram_boilerplate.h @@ -0,0 +1,103 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin. + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-local-typedefs" // for CI_ACCESS and CD_ACCESS + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + + //------------- + // GPU only + //------------- + + //using namespace mg5amcGpu; + using W_ACCESS = DeviceAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = DeviceAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( channelIds ); +#endif + + // Wavefunctions + // Buffer wfs for one helicity and nevt events is a DeviceBufferSimple with ( nwf * nevt * nw6 * nx2 ) fptypes + // The striding between the nwf wavefunction buffers is ( nevt * nw6 * nx2 ) fptypes + // Internally diagramXXX methods pass a w_fp[iwf] to ixx/FFV methods (as argument 'fptype wavefunctions[]') + // Internally ixx/FFV methods call 'cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions )' and then use fi[iw6] + // This means that the fi pointer must point to a [RIRIRIRIRIRI] contiguous buffer of size nw6*nx2=12 + // The striding between events is nw6*nx2=12 and this is what W_ACCESS::kernelAccess must respect + // (En passant, note that this means that events cannot be contiguous in the present code, memory is not coalesced) + const int nevt = gridDim.x * blockDim.x; + fptype* w_fp[nwf]; + for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = wfs + iwf * nevt * nw6 * mgOnGpu::nx2; + + // Couplings + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic push +#pragma nv_diag_suppress 186 // e.g. 
<<warning #186-D: pointless comparison of unsigned integer with zero>> +#endif + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic pop +#endif + const fptype* COUPs[nxcoup]; + for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; + +#else + + //------------- + // C++ only + //------------- + + //using namespace mg5amcCpu; + using W_ACCESS = HostAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = HostAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current SIMD event page (C++) + unsigned int channelId = *channelIds; +#endif + + // Wavefunctions + // Reinterpret wfs as "cxtype_sv w_sv[nwf][nw6]" and build "fptype* w_fp[nwf]" where "w_fp[iwf] = (fptype*)( w_sv[iwf] )" + fptype (*w_fp)[nw6 * neppV * mgOnGpu::nx2] = (fptype (*)[nw6 * neppV * mgOnGpu::nx2])(wfs); + +#endif + + //------------- + // GPU or C++ + //------------- + + // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) + cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram + fptype* amp_fp; // proof of concept for using fptype* in the interface + amp_fp = reinterpret_cast<fptype*>( amp_sv ); + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) + fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); +#else + assert( channelIds == nullptr ); + assert( numerators == nullptr ); + assert( denominators == nullptr ); +#endif /* clang-format on */ + +#pragma GCC diagnostic pop diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/genps.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/genps.f index 1c32e93f5d..8503bdbec8 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/genps.f @@ -1373,6 +1373,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) double precision smin,smax,spole,swidth,s,jac double precision x logical pass + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' c c Local c @@ -1384,6 +1388,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1d0)then + smax = min(smax, dsqrt_shatmax**2) + endif + + pass=.true. if (jac .eq. 0 .and. .not.
warned0) then print*,'Input jacobian 0 in genps' @@ -1628,7 +1636,10 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) DOUBLE PRECISION ETA,ETAMIN,ETAMAX logical warned data warned/.false./ - + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' C------------ C BEGIN CODE C------------ @@ -1645,7 +1656,11 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) C IF THERE IS NO S CHANNEL POLE USE BELOW: TAUMIN = 0d0 !SMIN/S !keep scale fix - TAUMAX = 1D0 + if (dsqrt_shatmax.ne.-1d0)then + TAUMAX=dsqrt_shatmax**2/S + else + TAUMAX = 1D0 + endif TAU = (TAUMAX-TAUMIN)*X(1)+TAUMIN SJACOBI= sjacobi*(TAUMAX-TAUMIN) @@ -1915,7 +1930,7 @@ double precision function get_channel_cut(p, config) if(sde_strat.eq.2)then t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) - get_channel_cut = get_channel_cut / ((t-Mass)*(t+Mass)+stot*1d-10)**2 + get_channel_cut = get_channel_cut / (t-Mass**2+stot*1d-10)**2 endif c write(*,*) i, "t, Mass, fact", t, Mass, ((t-Mass)*(t+Mass))**2,get_channel_cut t = t/stot @@ -1930,9 +1945,9 @@ double precision function get_channel_cut(p, config) t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) Width = prwidth(-i, config) - tmp = (t-Mass)*(t+Mass) + tmp = (t-Mass**2) tmp2 = Mass*Width - get_channel_cut = get_channel_cut* (tmp**2 - tmp2**2)/(tmp**2 + tmp2**2)**2 + get_channel_cut = get_channel_cut/(tmp**2 + tmp2**2) endif c write(*,*) i, "s, Mass, Width, fact", t, Mass, Width, (((t-Mass)*(t+Mass) )**2 + Width**2*Mass**2), get_channel_cut endif diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/myamp.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/myamp.f index 9e5f8d44dd..5360566ef4 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/myamp.f @@ -231,6 +231,7 @@ subroutine set_peaks double precision x1,x2,xk(nexternal) double precision dr,mtot,etot,xqfact double precision spmass + double precision stot ! technically the min of dsqrt_shatmax**2 and the physical one integer i, iconfig, l1, l2, j, nt, nbw, iproc, k integer iden_part(-nexternal+1:nexternal) @@ -285,8 +286,8 @@ subroutine set_peaks integer lbw(0:nexternal) !Use of B.W. common /to_BW/ lbw - double precision stot,m1,m2 - common/to_stot/stot,m1,m2 + double precision real_stot,m1,m2 + common/to_stot/real_stot,m1,m2 include 'coupl.inc' !
needs VECSIZE_MEMMAX (defined in vector.inc) include 'cuts.inc' @@ -309,6 +310,12 @@ subroutine set_peaks c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1)then + stot = min(real_stot, dsqrt_shatmax**2) + else + stot = real_stot + endif + iconfig = this_config c needs to be initialise to avoid segfault do i = -nexternal,-1 diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/reweight.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/reweight.f index 0a0bafa7c1..189ba41449 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/reweight.f @@ -1186,7 +1186,7 @@ logical function setclscales(p, keepq2bck, ivec) if(nexternal.gt.3) pt2ijcl(nexternal-3)=q2fact(2) else if(.not.fixed_fac_scale1) q2fact(1)=scalefact**2*pt2ijcl(nexternal-2) - if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*q2fact(1) + if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*pt2ijcl(nexternal-2) endif elseif(jcentral(1).eq.0)then if(.not.fixed_fac_scale1) q2fact(1) = scalefact**2*pt2ijcl(jfirst(1)) diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc index 4eec5db13c..216a90a302 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/symmetry.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/symmetry.f index 309540a0a2..2afd9e9f75 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/symmetry.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/symmetry.f @@ -232,7 +232,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, c write(*,*) 'mapping',ic,mapconfig(i),icode if (icode .eq. 
0) then c Create format string based on number of digits - write(formstr,'(a,i1,a)') '(I',nconf,'$)' + write(formstr,'(a,i1,a)') '(I',nconf,',$)' write(*,formstr) mapconfig(i) c Write symmetry factors write(formstr2,'(a,i2,a)') '(2i',nsym,')' @@ -242,10 +242,10 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode if(nconf+ncode+1.lt.10) then write(formstr,'(a,i1,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' else write(formstr,'(a,i2,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' endif write(*,formstr) dconfig c Write symmetry factors @@ -260,7 +260,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode write(27,formstr2) dconfig,use_config(i) endif - write(*,'(a$)') ' ' + write(*,'(a,$)') ' ' 100 call bw_increment_array(iarray,imax,ibase,done) enddo else diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/unwgt.f b/epochX/cudacpp/gg_ttg.mad/SubProcesses/unwgt.f index f602511c94..d1247f1849 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/unwgt.f +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/unwgt.f @@ -497,6 +497,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer ip, np, ic, nc integer ida(2),ito(-nexternal+3:nexternal),ns,nres,ires,icloop integer iseed + double precision beam_mass double precision pboost(0:3) double precision beta, get_betaz double precision ebi(0:3), ebo(0:3) @@ -506,7 +507,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer idup(nexternal,maxproc,maxsproc) integer mothup(2,nexternal) integer icolup(2,nexternal,maxflow,maxsproc) - + double precision eta integer nsym integer ievent @@ -638,21 +639,20 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) if (nincoming.eq.2) then if (xbk(1) .gt. 0d0 .and. xbk(1) .le. 1d0 .and. $ xbk(2) .gt. 0d0 .and. xbk(2) .le. 1d0) then - if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0).and.xbk(2).ne.1d0) then - ! construct the beam momenta in each frame and compute the related (z)boost - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4).and.ebeam(1).gt.10d0*m1)then - local_mass = 0d0 - else - local_mass = m1 - endif + if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0)) then + if((abs(lpp(1)).gt.2.and.abs(lpp(1)).ne.9).or.xbk(1).eq.1d0)then + beam_mass = pmass(1) + else + beam_mass = m1 + endif ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(1) ebo(1) = 0 ebo(2) = 0 - ebo(3) = DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(1).eq.1d0) then pb(0,isym(1,jsym)) = ebo(0) @@ -668,20 +668,19 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo else - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4.and.ebeam(2).gt.10d0*m2))then - local_mass = 0d0 - else - local_mass = m2 - endif - ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam + if((abs(lpp(2)).gt.2.and.abs(lpp(2)).ne.9).or.xbk(2).eq.1d0)then + beam_mass = pmass(2) + else + beam_mass = m2 + endif ebi(0) = p(0,2)/xbk(2) ! 
this assumes that particle 2 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = -1d0*DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = -1d0*DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(2) ebo(1) = 0 ebo(2) = 0 - ebo(3) = -1d0*DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = -1d0*DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(2).eq.1d0) then pb(0,isym(2,jsym)) = ebo(0) @@ -701,6 +700,21 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) write(*,*) 'Warning bad x1 or x2 in write_leshouche', $ xbk(1),xbk(2) endif + do j=1,nexternal + call zboost_with_beta(p(0,j),beta,pb(0,isym(j,jsym))) + pb(4,isym(j,jsym))=pmass(j) + enddo + + ! check for numerical_accuracy + if (pb(0,1).gt.ebeam(1).or.pb(0,2).gt.ebeam(2))then + ! go back to old method --more accurate when boosting with xbk close to one-- + eta = sqrt(xbk(1)*ebeam(1)/(xbk(2)*ebeam(2))) + pboost(0)=p(0,1)*(eta + 1d0/eta) + pboost(3)=p(0,1)*(eta - 1d0/eta) + do j=1,nexternal + call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) + enddo + endif else do j=1,nexternal call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) @@ -709,6 +723,8 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo endif + + if (IMIRROR.eq.2.and.pmass(1).ne.pmass(2)) then c Note that in this context isym(1,jsym) should never be "2" since the mass differ pb(4,isym(1,jsym))=pmass(2) diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/Gridpack/gridrun b/epochX/cudacpp/gg_ttg.mad/bin/internal/Gridpack/gridrun index 8c8f7d3940..01d4ab53f5 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/Gridpack/gridrun +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/Gridpack/gridrun @@ -91,7 +91,7 @@ import internal.madevent_interface as cmd_interface try: - cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2]) + cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2], nprocs=args[3], maxevts=args[4]) except KeyboardInterrupt: print('Quit on KeyboardInterrupt') diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/Gridpack/run.sh b/epochX/cudacpp/gg_ttg.mad/bin/internal/Gridpack/run.sh index 20adf572c2..2d149f96be 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/Gridpack/run.sh +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/Gridpack/run.sh @@ -14,6 +14,18 @@ # USAGE : run [num_events] [iseed] ## ############################################################################# +function usage() { + local retcode="${1:-1}" # default return code is 1 + echo "Usage:" + echo " run.sh [options] [num events] [seed]" + echo " run.sh [options] [num events] [seed] [granularity]" + echo "Options:" + echo " -h, --help print this message and exit" + echo " -p, --parallel [num procs] number of processes to run in parallel" + echo " -m, --maxevts [num events] maximum number of unweighted events per job" + exit $retcode +} + if [[ -d ./madevent ]]; then DIR='./madevent' else @@ -32,23 +44,46 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib # For Mac OS X export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib +pos_args=() +nprocs=1 +maxevts=2500 -if [[ ($1 != "") && ("$2" != "") && ("$3" == "") ]]; then - num_events=$1 - seed=$2 - gran=1 -elif [[ ($1 != "") && ("$2" != "") && ("$3" != "") ]]; then - num_events=$1 - seed=$2 - gran=$3 -else - echo "Warning: input is not correct. 
script requires two arguments: NB_EVENT SEED" -fi +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage 0 ;; + -p|--parallel) + nprocs="$2" && shift && shift ;; + -m|--maxevts) + maxevts="$2" && shift && shift ;; + -*) + echo "Error: Unknown option $1" && usage ;; + *) + pos_args+=("$1") && shift ;; + esac +done + +case `echo "${pos_args[@]}" | wc -w | tr -d " "` in + "2") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=1 + ;; + "3") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=${pos_args[2]} + ;; + *) + echo "Error: number of arguments is not correct" + usage + ;; +esac -echo "Now generating $num_events events with random seed $seed and granularity $gran" +echo "Now generating $num_events events with random seed $seed and granularity $gran using $nprocs processes" ############ RUN THE PYTHON CODE ##################### -${DIR}/bin/gridrun $num_events $seed $gran +${DIR}/bin/gridrun $num_events $seed $gran $nprocs $maxevts ######################################################## ########### POSTPROCESSING ##################### diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py index 42d82818d0..2efb5954a6 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py @@ -353,7 +353,7 @@ def modify_init_cross(self, cross, allow_zero=False): assert "init" in self cross = dict(cross) - for key in cross.keys(): + for key in list(cross.keys()): if isinstance(key, str) and key.isdigit() and int(key) not in cross: cross[int(key)] = cross[key] @@ -1991,6 +1991,11 @@ def default_setup(self): self.add_param("PartonLevel:FSRinResonances", True, hidden=True, always_write_to_card=False, comment="Do not allow shower to run from decay product of unstable particle") self.add_param("ProcessLevel:resonanceDecays", True, hidden=True, always_write_to_card=False, comment="Do not allow unstable particle to decay.") + # Parameters only needed for main164 type of run (not pythia8/MG5 interface) + self.add_param("Main:HepMC", True, hidden=True, always_write_to_card=False, + comment="""Specify the type of output to be used by the main164 run. """) + self.add_param("HepMC:output", 'hepmc.gz', hidden=True, always_write_to_card=False, + comment="Specify the HepMC output file to be used by the main164 run.") # Add parameters controlling the subruns execution flow. # These parameters should not be part of PY8SubRun daughter. self.add_default_subruns('parameters') @@ -2087,8 +2092,10 @@ def MadGraphSet(self, name, value, **opts): force = False if name.lower() not in self or (force or name.lower() not in self.user_set): self.__setitem__(name, value, change_userdefine=False, **opts) - self.system_set.add(name.lower()) - + self.system_set.add(name.lower()) + else: + raise Exception("The parameter %s is already set to %s. You can not change it." 
% (name, self[name]) + + def defaultSet(self, name, value, **opts): @@ -2144,9 +2151,19 @@ def pythia8_formatting(value, formatv=None): else: return ','.join([PY8Card.pythia8_formatting(arg) for arg in value]) + # change of name convention between the old MG5 interface and Pythia8's main164 + interface_to_164 = {'HEPMCoutput:file': 'HepMC:output', + 'SysCalc:fullCutVariation': '!SysCalc:fullCutVariation (not supported with 164)', + 'SysCalc:qCutList': '!SysCalc:qCutList (not supported with 164)', + 'SysCalc:qWeed': '!SysCalc:qWeed (not supported with 164)', + 'SysCalc:tmsList': '!SysCalc:tmsList (not supported with 164)', + 'HEPMCoutput:scaling' : '!HEPMCoutput:scaling (not supported with 164)', + 'LHEFInputs:nSubruns' : 'Main:numberOfSubruns'} + def write(self, output_file, template, read_subrun=False, - print_only_visible=False, direct_pythia_input=False, add_missing=True): + print_only_visible=False, direct_pythia_input=False, add_missing=True, + use_mg5amc_py8_interface=False): """ Write the card to output_file using a specific template. > 'print_only_visible' specifies whether or not the hidden parameters should be written out if they are in the hidden_params_to_always_write @@ -2155,7 +2172,12 @@ def write(self, output_file, template, read_subrun=False, in the self.visible_params_to_always_write list and are not user_set or system_set are commented. > If 'add_missing' is False then parameters that should be written_out but are absent - from the template will not be written out.""" + from the template will not be written out. + > 'use_mg5amc_py8_interface' is a flag indicating whether the MG5aMC-PY8 interface is used; + if it is not used, some parameters need to be translated from the old convention to the new one + """ + + self.use_mg5amc_py8_interface = use_mg5amc_py8_interface # First list the visible parameters visible_param = [p for p in self if p.lower() not in self.hidden_param @@ -2297,7 +2319,16 @@ def group_params(params): else: # Just copy parameters which don't need to be specified if param.lower() not in self.params_to_never_write: - output.write(line) + + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param.strip()] + # special case: HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + output.write('%s=%s\n'%(param_entry,new_value)) + else: + output.write(line) else: output.write('! The following parameter was forced to be commented out by MG5aMC.\n') output.write('!
%s'%line) @@ -2313,6 +2344,7 @@ def group_params(params): if ((not direct_pythia_input) or (param.lower() in self.visible_params_to_always_write) or (param.lower() in self.user_set) or + (param.lower() in self.hidden_params_to_always_write) or (param.lower() in self.system_set)): template = '%s=%s' else: @@ -2321,6 +2353,19 @@ def group_params(params): # then they shouldn't be passed to Pythia template = '!%s=%s' + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + if 'Main:InternalAnalysis'.lower() in self.user_set and \ + self['Main:InternalAnalysis'].lower() == 'on': + output.write('InternalAnalysis:output = ./djrs.dat\n') + + #elif param in self.interface_to_164.values() and not direct_pythia_input: + # misc.sprint(use_mg5amc_py8_interface, direct_pythia_input,param) + # raise Exception('The parameter %s is not supported in the MG5aMC-PY8 interface. Please use the new interface.'%param_entry output.write(template%(param_entry, value_entry.replace(value,new_value))) @@ -2365,6 +2410,8 @@ def group_params(params): comment = '\n'.join('! %s'%c for c in self.comments[param.lower()].split('\n')) output.write(comment+'\n') + if not use_mg5amc_py8_interface and param in self.interface_to_164: + continue output.write('%s=%s\n'%(param,PY8Card.pythia8_formatting(self[param]))) # Don't close the file if we were reading a subrun, but simply write @@ -3318,7 +3365,6 @@ def retro_compatible_custom_fct(lines, mode=None): for i,line in enumerate(lines[:]): if search and re.search(include_pat, line): name = re.findall(include_pat, line)[0] - misc.sprint('DETECTED INCLUDE', name) if 'vector.inc' in name: search = False if 'run.inc' in name: @@ -3326,7 +3372,6 @@ def retro_compatible_custom_fct(lines, mode=None): search = False sol.append(line) if re.search(function_pat, line): - misc.sprint("DETECTED FCT") search = True return sol @@ -4050,8 +4095,8 @@ def post_set_fixed_fac_scale(card, value, change_userdefine, raiseerror, **opt): if 'fixed_fac_scale2' in card.user_set: card.user_set.remove('fixed_fac_scale2') - # #card['pdlabel1'] = value - # #card['pdlabel2'] = value + dict.__setitem__(card, 'fixed_fac_scale1', card['fixed_fac_scale']) + dict.__setitem__(card, 'fixed_fac_scale2', card['fixed_fac_scale']) @staticmethod def post_set(card, value, change_userdefine, raiseerror, name='unknown', **opt): @@ -4201,6 +4246,7 @@ def default_setup(self): self.add_param("bwcutoff", 15.0) self.add_param("cut_decays", False, cut='d') self.add_param('dsqrt_shat',0., cut=True) + self.add_param('dsqrt_shatmax', -1, cut=True) self.add_param("nhel", 0, include=False) self.add_param("limhel", 1e-8, hidden=True, comment="threshold to determine if an helicity contributes when not MC over helicity.") #pt cut @@ -4451,11 +4497,11 @@ def check_validity(self): time.sleep(5) if self['drjj'] != 0: if 'drjj' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjj\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjj\' to 0') self['drjj'] = 0 if self['drjl'] != 0: if 'drjl' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjl\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjl\' to 0') self['drjl'] = 0 if not self['auto_ptj_mjj']: if self['mmjj'] > self['xqcut']: @@ -4753,7 +4799,6 @@ def create_default_for_process(self, 
proc_characteristic, history, proc_def): self['fixed_fac_scale1'] = True self['nhel'] = 1 for i in beam_id_split[1]: - exit if abs(i) == 11: self['lpp1'] = -math.copysign(3,i) self['lpp2'] = math.copysign(3,i) @@ -5577,6 +5622,9 @@ def default_setup(self): #technical self.add_param('folding', [1,1,1], include=False) + + #bias + self.add_param('flavour_bias',[5,1], hidden=True, comment="Example: '5,100' means that the probability to generate an event with a bottom (or anti-bottom) quark is increased by a factor 100, but the weight of those events is reduced by a factor 100. Requires that the 'event_norm' is set to 'bias'.") #merging self.add_param('ickkw', 0, allowed=[-1,0,3,4], comment=" - 0: No merging\n - 3: FxFx Merging : http://amcatnlo.cern.ch/FxFx_merging.htm\n - 4: UNLOPS merging (No interface within MG5aMC)\n - -1: NNLL+NLO jet-veto computation. See arxiv:1412.8408 [hep-ph]") @@ -5790,6 +5838,17 @@ def check_validity(self): if self['mcatnlo_delta'] and not self['parton_shower'].lower() == 'pythia8': raise InvalidRunCard("MC@NLO-DELTA only possible with matching to Pythia8") + # check that the flavour_bias is consistent + if len(self['flavour_bias']) != 2: + raise InvalidRunCard("'flavour_bias' should contain exactly two numbers: the abs(PDG) of the flavour to enhance, and the enhancement multiplication factor.") + for i in self['flavour_bias']: + if i < 0: + raise InvalidRunCard("flavour and multiplication factor should be positive in the flavour_bias parameter") + if self['flavour_bias'][1] != 1 and self['event_norm'] != 'bias': + logger.warning('Non-trivial flavour enhancement factor: setting event normalisation to "bias"') + self['event_norm']='bias' + + # check that ebeam is bigger than the proton mass. for i in [1,2]: # do not for proton mass if not proton PDF (or when scan initialization) diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/check_param_card.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/check_param_card.py index bc785b5de6..a34705f6bc 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/check_param_card.py @@ -1092,11 +1092,11 @@ def write_summary(self, path, order=None, lastline=False, nbcol=20): to_print = self.cross[-1:] for info in to_print: name = info['run_name'] - bench = info['bench'] + bench = [float(x) for x in info['bench']] data = [] for k in keys: if k in info: - data.append(info[k]) + data.append(float(info[k])) else: data.append(0.) 
ff.write(formatting % tuple([name] + bench + data)) diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/cluster.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/cluster.py index 95ef45b5f3..66069293d2 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/cluster.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/cluster.py @@ -602,6 +602,7 @@ def __init__(self, *args, **opt): self.submitted = six.moves.queue.Queue() # one entry by job submitted self.stoprequest = threading.Event() #flag to ensure everything to close self.demons = [] + self.gpus_list = [] self.nb_done =0 if 'nb_core' in opt: self.nb_core = opt['nb_core'] @@ -623,23 +624,46 @@ def __init__(self, *args, **opt): self.done_pid_queue = six.moves.queue.Queue() self.fail_msg = None + mg5_gpu_env_str = 'MG5_GPU_VISIBLE_DEVICES' + gpu_variables = [['NVIDIA_VISIBLE_DEVICES', 'CUDA_VISIBLE_DEVICES'], + ['ROCR_VISIBLE_DEVICES', 'HIP_VISIBLE_DEVICES'],] + if mg5_gpu_env_str in os.environ: + new_var = os.environ[mg5_gpu_env_str].split(',') + if len(new_var) == 2: + gpu_variables.insert(0, new_var) + else: + logger.error('Invalid format for %s=%s, it should be a comma-separated list of two elements' % (mg5_gpu_env_str, os.environ[mg5_gpu_env_str])) + for get_var,set_var in gpu_variables: + if get_var in os.environ: + self.gpus_list = os.environ.get(get_var).split(',') + self.gpu_set_var = set_var + self.gpus_count = len(self.gpus_list) + logger.info('Found %s GPUs: %s' % (self.gpus_count, self.gpus_list)) def start_demon(self): import threading - t = threading.Thread(target=self.worker) + env2 = None + if len(self.gpus_list): + env2 = os.environ.copy() + this_gpu_idx = len(self.demons) % self.gpus_count + env2[self.gpu_set_var] = self.gpus_list[this_gpu_idx] + t = threading.Thread(target=self.worker, kwargs={'env2': env2}) + else: + t = threading.Thread(target=self.worker) t.daemon = True t.start() self.demons.append(t) - def worker(self): + def worker(self, env2=None): import six.moves.queue import six.moves._thread while not self.stoprequest.isSet(): try: args = self.queue.get(timeout=10) tag, exe, arg, opt = args + opt['env'] = env2 try: # check for executable case if isinstance(exe,str): diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/common_run_interface.py index 9ff7390cf5..69291df0d4 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/common_run_interface.py @@ -750,8 +750,8 @@ def __init__(self, me_dir, options, *args, **opts): else: self.ninitial = self.proc_characteristics['ninitial'] - def make_make_all_html_results(self, folder_names = [], jobs=[]): - return sum_html.make_all_html_results(self, folder_names, jobs) + def make_make_all_html_results(self, folder_names = [], jobs=[], get_attr=None): + return sum_html.make_all_html_results(self, folder_names, jobs, get_attr) def write_RunWeb(self, me_dir): @@ -1463,11 +1463,15 @@ def create_plot(self, mode='parton', event_path=None, output=None, tag=None): self.run_name, '%s_pts.dat' % tag) for observable_name, data_path in [('djr',djr_path), ('pt',pt_path)]: - if not self.generate_Pythia8_HwU_plots( + try: + if not self.generate_Pythia8_HwU_plots( PY8_plots_root_path, merging_scale_name, observable_name,data_path): - return False - + return False + except Exception as error: + if os.path.exists(data_path): + logger.info('plot information present in %s' % data_path) + return True if mode == 'Pythia8': plot_files = 
glob.glob(pjoin(PY8_plots_root_path,'*.gnuplot')) if not misc.which('gnuplot'): @@ -1964,12 +1968,16 @@ def do_systematics(self, line): self.cluster.wait(os.path.dirname(output), update_status, update_first=update_status) except Exception: self.cluster.remove() + for i in range(nb_submit): + os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) old_run_mode = self.options['run_mode'] self.options['run_mode'] =0 + out =False try: out = self.do_systematics(line) finally: self.options['run_mode'] = old_run_mode + return out #collect the data all_cross = [] for i in range(nb_submit): @@ -1995,18 +2003,21 @@ def do_systematics(self, line): self.run_card['event_norm'] in ['unity']: all_cross= [cross/nb_event for cross in all_cross] - sys_obj = systematics.call_systematics([input, None] + opts, - log=lambda x: logger.info(str(x)), - result=result_file, - running=False - ) + + sys_obj = systematics.call_systematics([input, None] + opts, + log=lambda x: logger.info(str(x)), + result=result_file, + running=False + ) + sys_obj.print_cross_sections(all_cross, nb_event, result_file) - + #concatenate the output file subprocess.call(['cat']+\ ['./tmp_%s_%s' % (i, os.path.basename(output)) for i in range(nb_submit)], stdout=open(output,'w'), cwd=os.path.dirname(output)) + for i in range(nb_submit): os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) # os.remove('%s/log_sys_%s.txt' % (os.path.dirname(output),i)) @@ -3831,7 +3842,7 @@ def store_scan_result(self): """return the information that need to be kept for the scan summary. Auto-width are automatically added.""" - return {'cross': self.results.current['cross']} + return {'cross': self.results.current['cross'], 'error': self.results.current['error']} def add_error_log_in_html(self, errortype=None): @@ -5135,10 +5146,10 @@ def init_run(self, cards): self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), - 'lhc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), - 'lcc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), @@ -6740,7 +6751,15 @@ def postcmd(self, stop, line): return ending_question - + def help_update(self): + logger.info(""" syntax: update dependent: Change the mass/width of particles which are not free parameter for the model. + update missing: add to the current param_card missing blocks/parameters. + update to_slha1: pass SLHA2 card to SLHA1 convention. (beta) + update to_slha2: pass SLHA1 card to SLHA2 convention. 
(beta) + update to_full [run_card] + update XXX [where XXX corresponds to a hidden block of the run_card]: + supported blocks are %s + """, ', '.join(self.update_block)) def do_update(self, line, timer=0): @@ -6756,6 +6775,8 @@ logger.warning('miss an argument (dependent or missing). Please retry') return + args[0] = args[0].lower() + if args[0] == 'dependent': if not self.mother_interface: logger.warning('Failed to update dependent parameter. This might create trouble for external program (like MadSpin/shower/...)') @@ -6805,10 +6826,11 @@ self.modified_card.add('run') # delay writting of the run_card logger.info('add optional block %s to the run_card', args[0]) else: - self.help_update() + self.do_help('update') logger.warning('unvalid options for update command. Please retry') + + def update_to_full(self, line): """ trigger via update to_full LINE""" diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/extended_cmd.py index 789976beee..c321fd88e5 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/extended_cmd.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/extended_cmd.py @@ -1317,6 +1317,8 @@ def nice_error_handling(self, error, line): debug_file = open(self.debug_output, 'a') traceback.print_exc(file=debug_file) + if __debug__: + traceback.print_exc() if hasattr(error, 'filename'): debug_file.write("Related File: %s\n" % error.filename) # Create a nice error output @@ -1928,7 +1930,8 @@ def do_display(self, line, output=sys.stdout): for i, name in enumerate(split): try: __import__('.'.join(split[:i+1])) - exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1]))) + tmp = {} + exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1])), globals(),tmp) except ImportError: try: var = eval(args[1]) @@ -1939,7 +1942,7 @@ outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) else: - var = eval(args[1]) + var = eval(args[1], globals(), tmp) outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/file_writers.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/file_writers.py index 526756129f..74ba0d195c 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/file_writers.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/file_writers.py @@ -140,10 +140,6 @@ def preprocess_template(self, input_lines, context={}): else: raise self.FileWriterError("%s not string" % repr(input_lines)) - # Setup the contextual environment - for contextual_variable, value in context.items(): - exec('%s=%s'%(str(contextual_variable),repr(value))) - res = [] # The variable below tracks the conditional statements structure if_stack = [] @@ -166,7 +162,7 @@ def preprocess_template(self, input_lines, context={}): # Treat an if statement elif preproc_command.group('command')=='if': try: - if_stack.append(eval(preproc_command.group('body'))==True) + if_stack.append(eval(preproc_command.group('body'), globals(), context)==True) except Exception as e: raise self.FilePreProcessingError('Could not evaluate'+\ "python expression '%s' given the context %s provided."%\ diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/files.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/files.py index 551b71ddb6..3061b007e7 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/files.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/files.py @@ -147,9 +147,14 @@ def cp(path1, path2,
log=True, error=False): path2 = format_path(path2) try: shutil.copy(path1, path2) + except shutil.Error as why: + logger.debug('no cp since identical: %s', why) + return except IOError as why: import madgraph.various.misc as misc try: + if 'same file' in str(why): + return if os.path.exists(path2): path2 = os.path.join(path2, os.path.split(path1)[1]) misc.copytree(path1, path2) @@ -157,12 +162,10 @@ def cp(path1, path2, log=True, error=False): if error: raise if log: - logger.warning(why) + logger.warning("fail to cp", path1, path2, why) else: - misc.sprint("fail to cp", why) - except shutil.Error: - # idetical file - pass + misc.sprint("fail to cp",path1,path2, why) + def rm(path, log=True): """removes path, that can be a single element or a list""" diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_cardhtml-pl b/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_cardhtml-pl index 1810c6c082..6e0e06533d 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_cardhtml-pl +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_cardhtml-pl @@ -137,7 +137,7 @@ until($listpos>$#incard){ print PAGE " Model: $model \n"; print PAGE " \n \n
\n"; print PAGE " \n"; - print PAGE "\"\" \n"; + print PAGE "\"\" \n"; print PAGE "
\n"; print PAGE " \n \n \n"; print PAGE " \n"; diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_crossxhtml.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_crossxhtml.py index 681bf9d09b..3114a4350c 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_crossxhtml.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_crossxhtml.py @@ -133,7 +133,7 @@ class AllResults(dict): web = False - _run_entries = ['cross', 'error','nb_event_pythia','run_mode','run_statistics', + _run_entries = ['cross', 'error','axsec','nb_event_pythia','run_mode','run_statistics', 'nb_event','cross_pythia','error_pythia', 'nb_event_pythia8','cross_pythia8','error_pythia8', 'shower_dir'] diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_jpeg-pl b/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_jpeg-pl index 87d03da394..31b7e9fe55 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_jpeg-pl +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_jpeg-pl @@ -1,16 +1,16 @@ #!/usr/bin/perl -w #--------------------------------------------------------------------- -# Run GS to create jpeg files defined as $gs +# Run GS to create PNG files defined as $gs #--------------------------------------------------------------------- -system("/bin/bash -c \"rm -f matrix*.jpg\" "); +system("/bin/bash -c \"rm -f matrix*.png\" "); $imatrix = ""; if (! -e "matrix.ps") {$imatrix = 1;} -$max_jpg = 2; -if ($imatrix eq "") {$max_jpg = 5;} -# add 1 to max_jpg, to get max_jpg pages -$max_jpg += 1; +$max_png = 2; +if ($imatrix eq "") {$max_png = 5;} +# add 1 to max_png, to get max_png pages +$max_png += 1; open(PAGE,"> diagrams.html") || die "Error creating diagrams.html"; print PAGE "\ \n"; print PAGE "\ \n"; @@ -21,22 +21,22 @@ while ( -e "matrix$imatrix.ps"){ open(IN, "< matrix$imatrix.ps") || die "No file matrix$imatrix.ps"; open(OUT, "> matrix-1.ps") || die "Could not open file matrix-1.ps"; while () { - if ($_ =~ m/^%%Page: $max_jpg $max_jpg/) {last;} + if ($_ =~ m/^%%Page: $max_png $max_png/) {last;} else {print OUT $_, "\n";} } close(OUT); close(IN); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=matrix$imatrix\%00d.jpg \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-r150 \-sOutputFile\=matrix$imatrix\%00d.png \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; system "rm -f matrix-1.ps"; -# Determine how many jpg files we have +# Determine how many png files we have $pages=1; - while(-e "matrix$imatrix$pages.jpg"){ + while(-e "matrix$imatrix$pages.png"){ $pages++; }#end of while #reduce it by one - if ($pages > $max_jpg){ + if ($pages > $max_png){ $pages -= 1; } # Find name of process @@ -45,24 +45,24 @@ while ( -e "matrix$imatrix.ps"){ if ($proc =~ /Process: (.+?)(\s\w+=\d+)*$/) { $proc = $1; } print PAGE "
<BR><BR>
To save bandwidth not all diagrams were converted to jpeg."; + if (-e "matrix$imatrix$max_png.png" ) { + print PAGE "
<BR><BR>
To save bandwidth not all diagrams were converted to PNG."; print PAGE "
<BR><BR>
To view all diagrams click on "; print PAGE "\ postscript. \<\/A\> \ \n"; # # Delete files which aren't included in diagrams.html # - system ("/bin/bash -c \"rm -f matrix$max_jpg.jpg\" "); + system ("/bin/bash -c \"rm -f matrix$max_png.png\" "); } # -# Now create jpeg file for card +# Now create PNG file for card # - if (! -e "../../HTML/card.jpg") { + if (! -e "../../HTML/card.png") { system ("/bin/bash -c \"head -352 matrix$imatrix.ps >& junk.ps\" "); open(JUNK,">> junk.ps") || die "Error opening junk.ps"; @@ -72,7 +72,7 @@ while ( -e "matrix$imatrix.ps"){ system ("/bin/bash -c \"cat matrix$imatrix.ps | sed 1,352d >> junk.ps\" "); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=card.jpg \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.jpg ../../HTML/card.jpg > /dev/null\" "; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-sOutputFile\=card.png \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.png ../../HTML/card.png > /dev/null\" "; } if ($imatrix eq "") {$imatrix = 0;} $imatrix = $imatrix + 1; @@ -82,3 +82,4 @@ print PAGE "\n"; print PAGE "\<\/BODY\> \n"; print PAGE "\<\/HTML\> \n"; close(PAGE); + diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_ximprove.py index 415ecc9de0..d5d7fc8faf 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/gen_ximprove.py @@ -30,6 +30,7 @@ import stat import sys import six +import time from six.moves import range from six.moves import zip @@ -304,6 +305,7 @@ def get_helicity(self, to_submit=True, clean=True): logger.debug('(%s) nb_hel: %s zero amp: %s bad_amps_hel: %s/%s', split_file[-1], len(good_hels),len(bad_amps),len(bad_amps_perhel), len(good_hels)*nb_amp ) if len(good_hels) == 1: files.cp(matrix_file, matrix_file.replace('orig','optim')) + files.cp(matrix_file.replace('.f','.o'), matrix_file.replace('orig','optim').replace('.f','.o')) continue # avoid optimization if onlye one helicity gauge = self.cmd.proc_characteristics['gauge'] @@ -1059,6 +1061,7 @@ def __init__(self, cmd, opt=None): # parameter for the gridpack run self.nreq = 2000 self.iseed = 4321 + self.maxevts = 2500 # placeholder for information self.results = 0 #updated in launch/update_html @@ -1200,6 +1203,10 @@ def reset_multijob(self): def write_multijob(self, Channel, nb_split): """ """ if nb_split <=1: + try: + os.remove(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat')) + except OSError: + pass return f = open(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat'), 'w') f.write('%i\n' % nb_split) @@ -1828,17 +1835,17 @@ class gen_ximprove_gridpack(gen_ximprove_v4): max_request_event = 1e12 # split jobs if a channel if it needs more than that max_event_in_iter = 4000 min_event_in_iter = 500 - combining_job = sys.maxsize gen_events_security = 1.00 - def __new__(cls, *args, **opts): + def __new__(cls, cmd, opts): cls.force_class = 'gridpack' - return super(gen_ximprove_gridpack, cls).__new__(cls, *args, **opts) + return super(gen_ximprove_gridpack, cls).__new__(cls, cmd, opts) - def __init__(self, *args, **opts): + def __init__(self, cmd, opts): self.ngran = -1 + self.nprocs = 1 self.gscalefact = {} self.readonly = False if 'ngran' in opts: @@ -1846,9 +1853,18 @@ def __init__(self, *args, **opts): # del opts['ngran'] if 'readonly' in opts: self.readonly = opts['readonly'] - super(gen_ximprove_gridpack,self).__init__(*args, **opts) + if 'nprocs' in opts: + 
self.nprocs = int(opts['nprocs']) + if 'maxevts' in opts and self.nprocs > 1: + self.max_request_event = int(opts['maxevts']) + super(gen_ximprove_gridpack,self).__init__(cmd, opts) if self.ngran == -1: self.ngran = 1 + + if self.nprocs > 1: + self.combining_job = 0 + else: + self.combining_job = sys.maxsize def find_job_for_event(self): """return the list of channel that need to be improved""" @@ -1876,8 +1892,8 @@ def find_job_for_event(self): continue # no event to generate events self.gscalefact[tag] = max(1, 1/(goal_lum * C.get('axsec')/ self.ngran)) #need to generate events - logger.debug('request events for ', C.get('name'), 'cross=', - C.get('axsec'), 'needed events = ', goal_lum * C.get('axsec')) + logger.debug('request events for %s cross=%d needed events = %d', + C.get('name'), C.get('axsec'), goal_lum * C.get('axsec')) to_refine.append(C) logger.info('need to improve %s channels' % len(to_refine)) @@ -1897,8 +1913,13 @@ def get_job_for_event(self): for C in to_refine: #1. Compute the number of points are needed to reach target needed_event = max(goal_lum*C.get('axsec'), self.ngran) - nb_split = 1 - + nb_split = int(max(1,((needed_event-1)// self.max_request_event) +1)) + if not self.split_channels: + nb_split = 1 + if nb_split > self.max_splitting: + nb_split = self.max_splitting + nb_split=max(1, nb_split) + #2. estimate how many points we need in each iteration if C.get('nunwgt') > 0: nevents = needed_event / nb_split * (C.get('nevents') / C.get('nunwgt')) @@ -1908,13 +1929,16 @@ def get_job_for_event(self): nevents = self.max_event_in_iter if nevents < self.min_event_in_iter: + nb_split = int(nb_split * nevents / self.min_event_in_iter) + 1 # sr dangerous? nevents = self.min_event_in_iter # # forbid too low/too large value nevents = max(self.min_event_in_iter, min(self.max_event_in_iter, nevents)) logger.debug("%s : need %s event. Need %s split job of %s points", C.name, needed_event, nb_split, nevents) - + # write the multi-job information + self.write_multijob(C, nb_split) + #create the info dict assume no splitting for the default info = {'name': self.cmd.results.current['run_name'], 'script_name': 'unknown', @@ -1925,7 +1949,7 @@ def get_job_for_event(self): 'nevents': nevents, #int(nevents*self.gen_events_security)+1, 'maxiter': self.max_iter, 'miniter': self.min_iter, - 'precision': -1*int(needed_event)/C.get('axsec'), + 'precision': -goal_lum/nb_split, # -1*int(needed_event)/C.get('axsec'), 'requested_event': needed_event, 'nhel': self.run_card['nhel'], 'channel': C.name.replace('G',''), @@ -1938,27 +1962,59 @@ def get_job_for_event(self): basedir = pjoin(os.path.dirname(__file__), '..','..','SubProcesses', info['P_dir'], info['directory']) info['base_directory'] = basedir - jobs.append(info) - + if nb_split == 1: + jobs.append(info) + else: + for i in range(nb_split): + new_info = dict(info) + new_info['offset'] = i+1 + new_info['directory'] += self.alphabet[i % 26] + str((i+1)//26) + new_info['base_directory'] = info['directory'] + jobs.append(new_info) write_dir = '.' 
if self.readonly else None self.create_ajob(pjoin(self.me_dir, 'SubProcesses', 'refine.sh'), jobs, write_dir) + if self.nprocs > 1: + nprocs_cluster = cluster.MultiCore(nb_core=self.nprocs) + gridpack_start = time.time() + def gridpack_wait_monitoring(Idle, Running, Done): + if Idle+Running+Done == 0: + return + logger.info("Gridpack event generation: %s Idle, %s Running, %s Done [%s]" + % (Idle, Running, Done, misc.format_time(time.time()-gridpack_start))) + done = [] for j in jobs: - if j['P_dir'] in done: - continue - done.append(j['P_dir']) + if self.nprocs == 1: + if j['P_dir'] in done: + continue + done.append(j['P_dir']) + # Give a little status. Sometimes these jobs run very long, and having hours without any + # console output can be a bit frightening and make users think we are looping. + if len(done)%5==0: + logger.info(f"Working on job {len(done)} of {len(jobs)}") + # set the working directory path. pwd = pjoin(os.getcwd(),j['P_dir']) if self.readonly else pjoin(self.me_dir, 'SubProcesses', j['P_dir']) - exe = pjoin(pwd, 'ajob1') + exe = pjoin(pwd, j['script_name']) st = os.stat(exe) os.chmod(exe, st.st_mode | stat.S_IEXEC) # run the code\ - cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + if self.nprocs == 1: + cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + else: + nprocs_cluster.cluster_submit(exe, cwd=pwd, packet_member=j['packet']) write_dir = '.' if self.readonly else pjoin(self.me_dir, 'SubProcesses') + if self.nprocs > 1: + nprocs_cluster.wait(self.me_dir, gridpack_wait_monitoring) + + if self.readonly: + combine_runs.CombineRuns(write_dir) + else: + combine_runs.CombineRuns(self.me_dir) self.check_events(goal_lum, to_refine, jobs, write_dir) def check_events(self, goal_lum, to_refine, jobs, Sdir): diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/hel_recycle.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/hel_recycle.py index 1471de4bcb..978ba6575e 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/hel_recycle.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/hel_recycle.py @@ -550,7 +550,7 @@ def get_jamp_lines(self, line): def get_amp2_lines(self, line): if line.startswith(' DO I = 1, NCOLOR'): self.in_amp2 = False - elif not line.isspace(): + elif not line.isspace() and 'DENOM' not in line: self.template_dict['amp2_lines'] += f'{line[0:6]} {self.add_indices(line[6:])}' def prepare_bools(self): diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/histograms.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/histograms.py index 51ae2914fc..6fbdd98100 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/histograms.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/histograms.py @@ -1149,11 +1149,8 @@ def parse_one_histo_from_stream(self, stream, all_weight_header, boundaries = [0.0,0.0] for j, weight in \ enumerate(HwU.histo_bin_weight_re.finditer(line_bin)): - if (j == len(weight_header)): - continue - if j == len(all_weight_header): - raise HwU.ParseError("There is more bin weights"+\ - " specified than expected (%i)"%len(weight_header)) + #if (j == len(weight_header)): + # continue if selected_central_weight == all_weight_header[j]: bin_weights['central'] = float(weight.group('weight')) if all_weight_header[j] == 'boundary_xmin': @@ -1858,6 +1855,8 @@ def parse_histos_from_PY8_XML_stream(self, stream, run_id=None, # If merging cut is negative, then pick only the one of the central scale # If not specified, then take them all but use the PDF and scale weight # of the central merging_scale for the variation. 
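The comment above spells out how the central merging scale is chosen from the parsed weight headers; the sketch below restates that choice as a standalone function (pick_merging_scale and the dict layout are illustrative assumptions, not the actual HwU internals).

# A minimal sketch of the merging-scale selection described above: each entry
# of all_weights is assumed to be a dict carrying a 'MERGING' key, and a
# negative or unspecified merging cut falls back to the central entry.
def pick_merging_scale(all_weights, merging_scale=None):
    if not all_weights:
        raise ValueError('No weights were found in the HwU XML source.')
    if merging_scale is None or merging_scale < 0.0:
        return all_weights[2]['MERGING']  # central weight, as in the code below
    return merging_scale

print(pick_merging_scale([{'MERGING': 0.}, {'MERGING': 0.}, {'MERGING': 30.}]))  # 30.0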
+ if not all_weights: + raise MadGraph5Error('No weights were found in the HwU XML source.') if merging_scale is None or merging_scale < 0.0: merging_scale_chosen = all_weights[2]['MERGING'] else: @@ -2480,14 +2479,14 @@ def get_main_central_plot_lines(HwU_name, block_position, color_index, # return [template_no_stat%rep_dic]+\ # ([template%rep_dic] if show_mc_uncertainties else []) - # The use of sqrt(-1) is just a trick to prevent the line to display + # The use of 1/0 is just a trick to prevent the line to display res = [] - rep_dic['data'] = '($3 < 0 ? sqrt(-1) : $3)' + rep_dic['data'] = '($3 < 0 ? 1/0 : $3)' res.append(template_no_stat%rep_dic) rep_dic['title'] = " title ''" if show_mc_uncertainties: res.append(template%rep_dic) - rep_dic['data'] = '($3 >= 0 ? sqrt(-1) : abs($3))' + rep_dic['data'] = '($3 >= 0 ? 1/0 : abs($3))' rep_dic['ls'] = ' ls %d'%(100+color_index) res.append(template_no_stat%rep_dic) if show_mc_uncertainties: @@ -2739,13 +2738,13 @@ def ratio_no_correlations(wgtsA, wgtsB): """#-- rendering subhistograms '%(subhistogram_type)s' %(unset label)s %(set_format_y)s +%(set_yscale)s set yrange [%(ymin).4e:%(ymax).4e] set origin %(origin_x).4e, %(origin_y).4e set size %(size_x).4e, %(size_y).4e set mytics %(mytics)d %(set_ytics)s %(set_format_x)s -%(set_yscale)s %(set_ylabel)s %(set_histo_label)s plot \\""" @@ -2878,7 +2877,7 @@ def ratio_no_correlations(wgtsA, wgtsB): # We decide to show uncertainties in the main plot only if they # are part of a monocolor band. Otherwise, they will only be - # shown in the first subplot. Notice that plotting 'sqrt(-1)' + # shown in the first subplot. Notice that plotting '1/0' # is just a trick so as to have only the key printed with no # line @@ -2890,7 +2889,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, scale variation'%title, band='scale' in use_band) else: uncertainty_plot_lines[-1]['scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] # And now PDF_variation if available if not PDF_var_pos is None and len(PDF_var_pos)>0: if 'pdf' in use_band: @@ -2899,7 +2898,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, PDF variation'%title, band='pdf' in use_band) else: uncertainty_plot_lines[-1]['pdf'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] # And now merging variation if available if not merging_var_pos is None and len(merging_var_pos)>0: if 'merging_scale' in use_band: @@ -2908,7 +2907,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, merging scale variation'%title, band='merging_scale' in use_band) else: uncertainty_plot_lines[-1]['merging_scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] # And now alpsfact variation if available if not alpsfact_var_pos is None and len(alpsfact_var_pos)>0: if 'alpsfact' in use_band: @@ -2917,7 +2916,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, alpsfact variation'%title, band='alpsfact' in use_band) else: uncertainty_plot_lines[-1]['alpsfact'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] # plot_lines.append( # "'%s' index %d using (($1+$2)/2):3 ls %d title '%s'"\ diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py 
b/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py index 0924927785..262d39a736 100644 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Aug 2023) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. import logging import os @@ -33,7 +33,7 @@ def compile(self, *args, **opts): if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') common_run_interface.CommonRunCmd.update_make_opts_full(path, - {'FPTYPE': self.run_card['floating_type'] }) + {'override FPTYPE': self.run_card['floating_type'] }) misc.sprint('FPTYPE checked') cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): @@ -76,7 +76,7 @@ def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + common_run_interface.CommonRunCmd.update_make_opts_full({'override FPTYPE': new_value}) else: raise Exception Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') @@ -133,7 +133,8 @@ def default_setup(self): super().default_setup() # change default value: self['cudacpp_backend'] = 'cuda' - self['vector_size'] = 16384 # already setup in default class (just change value) + self['vector_size'] = 32 # ZW: default to 32, might want to change to 64 to utilise AMD GPUs better as well # 16384 # already setup in default class (just change value) + self['nb_warp'] = 512 # number of warps per kernel call, for now setting to 16 384 / vector_size MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/lhe_parser.py index f6e47956cd..81d17f7cb1 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/lhe_parser.py @@ -1035,12 +1035,12 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): from_init = True if not from_init: - if group in grouped_cross: - grouped_cross[group] += self.allcross[i] - grouped_error[group] += self.error[i]**2 + if int(group) in grouped_cross: + grouped_cross[int(group)] += self.allcross[i] + grouped_error[int(group)] += self.error[i]**2 else: - grouped_cross[group] = self.allcross[i] - grouped_error[group] = self.error[i]**2 + grouped_cross[int(group)] = self.allcross[i] + grouped_error[int(group)] = self.error[i]**2 else: ban = banner_mod.Banner(ff.banner) for line in ban['init'].split('\n'): @@ -1048,11 +1048,11 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): if len(splitline)==4: cross, error, _, group = splitline if int(group) in grouped_cross: - grouped_cross[group] += float(cross) - grouped_error[group] += float(error)**2 + grouped_cross[int(group)] += float(cross) + grouped_error[int(group)] += float(error)**2 else: - grouped_cross[group] = float(cross) - grouped_error[group] = float(error)**2 + 
grouped_cross[int(group)] = float(cross) + grouped_error[int(group)] = float(error)**2 nb_group = len(grouped_cross) # compute the information for the first line @@ -1086,6 +1086,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): self.seek(0) if init_information["idbmup2"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup2"] = event[1].pdg self.seek(0) @@ -2166,10 +2168,13 @@ def check(self): abspz += abs(particle.pz) # check mass fourmass = FourMomentum(particle).mass - - if particle.mass and (abs(particle.mass) - fourmass)/ abs(particle.mass) > threshold: - raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) - + if particle.mass: + expected = (particle.E - math.sqrt(particle.E**2 -particle.mass**2))/particle.E + if expected > 1e-8: + mass_threshold = particle.E**2 - (particle.E-threshold)**2 + if (abs(particle.mass) - fourmass)/ mass_threshold > 5: + raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) + if E/absE > threshold: logger.critical(self) @@ -2953,8 +2958,8 @@ def pt(self): @property def pseudorapidity(self): - norm = math.sqrt(self.px**2 + self.py**2+self.pz**2) - return 0.5* math.log((norm - self.pz) / (norm + self.pz)) + norm = math.sqrt(self.px**2 + self.py**2 + self.pz**2) + return 0.5* math.log((norm + self.pz) / (norm - self.pz)) @property def rapidity(self): diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/madevent_interface.py index 85e5bcf5e3..4d5597c722 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/madevent_interface.py @@ -1171,10 +1171,10 @@ def check_survey(self, args, cmd='survey'): for opt,value in self._survey_options.items(): if arg.startswith('--%s=' % opt): exec('self.opts[\'%s\'] = %s(arg.split(\'=\')[-1])' % \ - (opt, value[0])) + (opt, value[0]), globals(), {'self':self, 'arg':arg}) arg = "" if arg != "": raise Exception - except Exception: + except Exception as error: self.help_survey() raise self.InvalidCmd('invalid %s argument'% arg) @@ -2827,10 +2827,10 @@ def print_results_in_shell(self, data): logger.info(" Nb of events after matching/merging : %d" % int(data['nb_event_pythia'])) if self.run_card['use_syst'] in self.true and \ (int(self.run_card['ickkw'])==1 or self.run_card['ktdurham']>0.0 - or self.run_card['ptlund']>0.0): + or self.run_card['ptlund']>0.0) and data['cross_pythia'] == -1: logger.info(" Notice that because Systematics computation is turned on, the merging did not veto events but modified their weights instead.\n"+\ " The resulting hepmc/stdhep file should therefore be use with those weights.") - else: + elif data['cross_pythia'] == -1: logger.info(" Nb of events after merging : %s" % data['nb_event_pythia']) logger.info(" " ) @@ -3055,6 +3055,7 @@ def do_multi_run(self, line): crossoversig = 0 inv_sq_err = 0 nb_event = 0 + madspin = False for i in range(nb_run): self.nb_refine = 0 self.exec_cmd('generate_events %s_%s -f' % (main_name, i), postcmd=False) @@ -3067,6 +3068,8 @@ def do_multi_run(self, line): inv_sq_err+=1.0/error**2 self.results[main_name][-1]['cross'] = crossoversig/inv_sq_err self.results[main_name][-1]['error'] = math.sqrt(1.0/inv_sq_err) + if 'decayed' in self.run_name: + madspin = True 
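The accumulation above merges the per-run results with inverse-variance weighting; a self-contained sketch of that formula follows (combine_cross_sections and the sample numbers are illustrative only).

import math

# Standalone restatement of the combination used by do_multi_run above: an
# inverse-variance weighted mean of the per-run cross sections,
# cross = sum(x_i/e_i^2) / sum(1/e_i^2), error = sqrt(1/sum(1/e_i^2)).
def combine_cross_sections(runs):
    crossoversig = sum(cross / error**2 for cross, error in runs)
    inv_sq_err = sum(1.0 / error**2 for _, error in runs)
    return crossoversig / inv_sq_err, math.sqrt(1.0 / inv_sq_err)

cross, error = combine_cross_sections([(10.2, 0.3), (9.8, 0.4)])  # toy numbers (pb)
print('%.3f +- %.3f' % (cross, error))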
self.results.def_current(main_name) self.run_name = main_name self.update_status("Merging LHE files", level='parton') @@ -3074,9 +3077,12 @@ def do_multi_run(self, line): os.mkdir(pjoin(self.me_dir,'Events', self.run_name)) except Exception: pass - os.system('%(bin)s/merge.pl %(event)s/%(name)s_*/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' + + os.system('%(bin)s/merge.pl %(event)s/%(name)s_*%(madspin)s/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' % {'bin': self.dirbin, 'event': pjoin(self.me_dir,'Events'), - 'name': self.run_name}) + 'name': self.run_name, + 'madspin': '_decayed_*' if madspin else '' + }) eradir = self.options['exrootanalysis_path'] if eradir and misc.is_executable(pjoin(eradir,'ExRootLHEFConverter')): @@ -3656,9 +3662,11 @@ def do_refine(self, line): else: self.refine_mode = "new" - cross, error = self.make_make_all_html_results() + cross, error, across = self.make_make_all_html_results(get_attr=('xsec','xerru','axsec')) + self.results.add_detail('cross', cross) self.results.add_detail('error', error) + self.results.add_detail('axsec', across) self.results.add_detail('run_statistics', dict(self.results.get_detail('run_statistics'))) @@ -3757,6 +3765,8 @@ def split(a, n): k, m = divmod(len(a), n) return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + Gdirs = self.remove_empty_events(Gdirs) + partials_info = [] if len(Gdirs) >= max_G: start_unweight= time.perf_counter() @@ -3786,7 +3796,7 @@ def split(a, n): for i, local_G in enumerate(split(Gdirs, nb_chunk)): line = [pjoin(self.me_dir, "Events", self.run_name, "partials%d.lhe.gz" % i)] line.append(pjoin(self.me_dir, 'Events', self.run_name, '%s_%s_banner.txt' % (self.run_name, tag))) - line.append(str(self.results.current['cross'])) + line.append(str(self.results.current.get('axsec'))) line += local_G partials_info.append(self.do_combine_events_partial(' '.join(line), preprocess_only=True)) mycluster.submit(sys.executable, @@ -4223,7 +4233,7 @@ def mg5amc_py8_interface_consistency_warning(options): return None - def setup_Pythia8RunAndCard(self, PY8_Card, run_type): + def setup_Pythia8RunAndCard(self, PY8_Card, run_type, use_mg5amc_py8_interface): """ Setup the Pythia8 Run environment and card. In particular all the process and run specific parameters of the card are automatically set here. This function returns the path where HEPMC events will be output, if any.""" @@ -4338,10 +4348,10 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.systemSet('Beams:setProductionScalesFromLHEF',True) # Automatically set qWeed to xqcut if not defined by the user. - if PY8_Card['SysCalc:qWeed']==-1.0: + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qWeed']==-1.0: PY8_Card.MadGraphSet('SysCalc:qWeed',self.run_card['xqcut'], force=True) - if PY8_Card['SysCalc:qCutList']=='auto': + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qCutList']=='auto': if self.run_card['use_syst']: if self.run_card['sys_matchscale']=='auto': qcut = PY8_Card['JetMatching:qCut'] @@ -4368,7 +4378,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): # Specific MLM settings # PY8 should not implement the MLM veto since the driver should do it # if merging scale variation is turned on - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. 
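With the new use_mg5amc_py8_interface flag this veto handling reduces to a single gate, sketched here (gate_matching_veto and the dict-based card are illustrative assumptions; the real code goes through PY8_Card.MadGraphSet as below).

# Illustration of the gating added in this patch: when the external
# MG5aMC_PY8_interface drives the shower it applies the matching veto itself,
# so PY8's own veto is switched off; with the plain main164 driver the card
# is left untouched and PY8 performs the veto.
def gate_matching_veto(card, use_mg5amc_py8_interface, use_syst):
    if use_mg5amc_py8_interface and use_syst:
        card['JetMatching:doVeto'] = False
        card['Merging:applyVeto'] = False
    return card

print(gate_matching_veto({}, True, True))   # both vetoes disabled in the card
print(gate_matching_veto({}, False, True))  # {} (PY8 keeps its own defaults)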
PY8_Card.MadGraphSet('JetMatching:doVeto',False) @@ -4444,7 +4454,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.MadGraphSet('SpaceShower:pTmaxMatch',1) PY8_Card.MadGraphSet('SpaceShower:rapidityOrder',False) # PY8 should not implement the CKKW veto since the driver should do it. - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('Merging:applyVeto',False) @@ -4516,6 +4526,12 @@ def do_pythia8(self, line): else: no_default = False + if '--old_interface' in args: + use_mg5amc_py8_interface = True + args.remove('--old_interface') + else: + use_mg5amc_py8_interface = False + if not self.run_name: self.check_pythia8(args) self.configure_directory(html_opening =False) @@ -4545,20 +4561,27 @@ def do_pythia8(self, line): #"Please use 'event_norm = average' in the run_card to avoid this problem.") - - if not self.options['mg5amc_py8_interface_path'] or not \ - os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface')): - raise self.InvalidCmd( -"""The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. -Please install this tool with the following MG5_aMC command: - MG5_aMC> install mg5amc_py8_interface_path""") + if use_mg5amc_py8_interface: + if not self.options['mg5amc_py8_interface_path'] or not \ + os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface')): + raise self.InvalidCmd( + """The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. + Please install this tool with the following MG5_aMC command: + MG5_aMC> install mg5amc_py8_interface_path""") + else: + pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface') + warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) + if warnings: + logger.warning(warnings) else: - pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface') - warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) - if warnings: - logger.warning(warnings) + pythia_main = pjoin(self.options['pythia8_path'], 'share', 'Pythia8', 'examples', 'main164') + if not os.path.exists(pythia_main): + pythia_main = pjoin(self.options['pythia8_path'], 'examples', 'main164') + if not os.path.exists(pythia_main): + logger.warning('main164 not found (or not compiled). Will try the old interface instead.') + return self.do_pythia8(line + ' --old_interface') self.results.add_detail('run_mode', 'madevent') @@ -4583,14 +4606,19 @@ def do_pythia8(self, line): run_type = 'CKKW' # Edit the card and run environment according to the run specification - HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type) + HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type, use_mg5amc_py8_interface=use_mg5amc_py8_interface) + + if not use_mg5amc_py8_interface and self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + PY8_Card['Main:numberOfEvents']= self.run_card['nevents'] + # Now write the card. 
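Just above, Main:numberOfEvents is pinned only for single-core running; the decision is sketched here as a standalone predicate (runs_single_core is an illustrative name; run_mode 0 is single core and 2 is multicore, as in the surrounding options).

# Single-core PY8 runs shower the whole LHE file in one pass, so the event
# count can be fixed up front; parallel runs split the file and set it per job.
def runs_single_core(run_mode, nb_core):
    return run_mode == 0 or (run_mode == 2 and nb_core == 1)

assert runs_single_core(0, 8)
assert runs_single_core(2, 1)
assert not runs_single_core(2, 4)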
pythia_cmd_card = pjoin(self.me_dir, 'Events', self.run_name , '%s_pythia8.cmd' % tag) cmd_card = StringIO.StringIO() PY8_Card.write(cmd_card,pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Now setup the preamble to make sure that everything will use the locally # installed tools (if present) even if the user did not add it to its @@ -4632,7 +4660,7 @@ def do_pythia8(self, line): " command '/usr/bin/env %s' exists and returns a valid path."%shell) exe_cmd = "#!%s\n%s"%(shell_exe,' '.join( - [preamble+pythia_main, + [preamble+pythia_main, '' if use_mg5amc_py8_interface else '-c', os.path.basename(pythia_cmd_card)])) wrapper.write(exe_cmd) @@ -4699,6 +4727,7 @@ def do_pythia8(self, line): n_cores = max(min(min_n_core,n_cores),1) if self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + # No need for parallelization anymore self.cluster = None logger.info('Follow Pythia8 shower by running the '+ @@ -4744,20 +4773,22 @@ def do_pythia8(self, line): ParallelPY8Card.subruns[0].systemSet('Beams:LHEF','events.lhe.gz') ParallelPY8Card.write(pjoin(parallelization_dir,'PY8Card.dat'), pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Write the wrapper wrapper_path = pjoin(parallelization_dir,'run_PY8.sh') wrapper = open(wrapper_path,'w') if self.options['cluster_temp_path'] is None: exe_cmd = \ -"""#!%s -./%s PY8Card.dat >& PY8_log.txt -""" +"""#!%%s +./%%s %s PY8Card.dat >& PY8_log.txt +""" % ('' if use_mg5amc_py8_interface else '-c') + else: exe_cmd = \ -"""#!%s +"""#!%%s ln -s ./events_$1.lhe.gz ./events.lhe.gz -./%s PY8Card_$1.dat >& PY8_log.txt +./%%s %s PY8Card_$1.dat >& PY8_log.txt mkdir split_$1 if [ -f ./events.hepmc ]; then @@ -4776,7 +4807,7 @@ def do_pythia8(self, line): mv ./PY8_log.txt ./split_$1/ fi tar -czf split_$1.tar.gz split_$1 -""" +""" % ('' if use_mg5amc_py8_interface else '-c') exe_cmd = exe_cmd%(shell_exe,os.path.basename(pythia_main)) wrapper.write(exe_cmd) wrapper.close() @@ -4812,19 +4843,27 @@ def do_pythia8(self, line): pjoin(parallelization_dir,split_files[-1])) logger.info('Submitting Pythia8 jobs...') + for i, split_file in enumerate(split_files): # We must write a PY8Card tailored for each split so as to correct the normalization # HEPMCoutput:scaling of each weight since the lhe showered will not longer contain the # same original number of events - split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat')) + split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat'), setter='user') + assert split_PY8_Card['JetMatching:nJetMax'] == PY8_Card['JetMatching:nJetMax'] + + + # Make sure to sure the number of split_events determined during the splitting. - split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i]) + split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i], force=True) + assert split_PY8_Card['Main:numberOfEvents'] == partition_for_PY8[i] split_PY8_Card.systemSet('HEPMCoutput:scaling',split_PY8_Card['HEPMCoutput:scaling']* - (float(partition_for_PY8[i]))) + (float(partition_for_PY8[i])), force=True) # Add_missing set to False so as to be sure not to add any additional parameter w.r.t # the ones in the original PY8 param_card copied. 
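The rescaling above keeps the HepMC weight normalization consistent across splits; a small worked sketch follows (make_split_card and the numbers are illustrative, the real code sets the values through PY8Card.systemSet).

# If the parent card scales each weight by 1/N_total, a split that showers
# n_i events must multiply that factor by n_i so that the splits still add
# up to the full-sample normalization.
def make_split_card(parent_card, n_events_split):
    card = dict(parent_card)
    card['Main:numberOfEvents'] = n_events_split
    card['HEPMCoutput:scaling'] = parent_card['HEPMCoutput:scaling'] * float(n_events_split)
    return card

parent = {'Main:numberOfEvents': 0, 'HEPMCoutput:scaling': 1.0 / 10000}
print(make_split_card(parent, 2500))  # scaling becomes 0.25 for this split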
split_PY8_Card.write(pjoin(parallelization_dir,'PY8Card_%d.dat'%i), - pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False) + pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False, + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) in_files = [pjoin(parallelization_dir,os.path.basename(pythia_main)), pjoin(parallelization_dir,'PY8Card_%d.dat'%i), pjoin(parallelization_dir,split_file)] @@ -5073,7 +5112,7 @@ def wait_monitoring(Idle, Running, Done): # works both for fixed number of generated events and fixed accepted events self.results.add_detail('error_pythia', error_m) - if self.run_card['use_syst']: + if self.run_card['use_syst'] and use_mg5amc_py8_interface: self.results.add_detail('cross_pythia', -1) self.results.add_detail('error_pythia', 0) @@ -6132,7 +6171,102 @@ def get_Gdir(self, Pdir=None, symfact=None): mfactors[pjoin(P, "G%s" % tag)] = mfactor self.Gdirs = (Gdirs, mfactors) return self.get_Gdir(Pdir, symfact=symfact) + + ############################################################################ + def remove_empty_events(self, Gdir): + """return Gdir strip from the one providing empty events.lhe files.""" + + reasons = collections.defaultdict(list) + Gdirs = Gdir[:] + for G in Gdirs[:]: + try: + size = os.path.getsize(pjoin(G, 'events.lhe')) + except Exception as error: + size = 0 + if size <10: + Gdirs.remove(G) + try: + log = misc.BackRead(pjoin(G, 'log.txt')) + except Exception as error: + log = misc.BackRead(pjoin(G, 'run1_app.log')) + found = -1 + for line in log: + if 'Deleting file events.lhe' in line: + found = 0 + elif "Impossible BW configuration" in line: + reasons['bwconfig'].append(G) + break + elif found < -150: + reasons['not found'].append(G) + Gdirs.append(G) + break + elif found < 0: + found -= 1 + elif 'Loosen cuts or increase max_events' in line: + reasons['cuts'].append(G) + break + elif 'all returned zero' in line: + reasons['zero'].append(G) + break + elif found > 5: + reasons['unknown'].append(G) + break + else: + found += 1 + + if len(reasons): + logger.debug('Reasons for empty events.lhe:') + if len(reasons['unknown']): + logger.debug(' - unknown: %s' % len(reasons['unknown'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['unknown'][:10]])) + if len(reasons['not found']): + logger.debug(' - not found in log: %s' % len(reasons['not found'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['not found'][:10]])) + if len(reasons['zero']): + logger.debug(' - zero amplitudes: %s' % len(reasons['zero'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit( os.sep)[-2:]) for G in reasons['zero'][:10]])) + if len(reasons['bwconfig']): + critical_bwconfig = set() + for G in reasons['bwconfig']: + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_bwconfig.add(os.sep.join(base.rsplit(os.sep)[-2:])) + for G in critical_bwconfig: + logger.warning('Gdirectory %s has no events.lhe file. (no BW config found %s times)' % G) + + logger.debug(' - impossible BW configuration: %s' % len(reasons['bwconfig'])) + logger.debug(' - channel with no possible BW configuration: %s' % len(critical_bwconfig)) + + if len(reasons['cuts']): + critical_nb_cuts = collections.defaultdict(int) + for G in reasons['cuts']: + if '.' 
in os.path.basename(G): + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_nb_cuts[os.sep.join(base.rsplit(os.sep)[-2:])] += 1 + else: + critical_nb_cuts[''] += 1 + logger.warning('Gdirectory %s has no events.lhe file. (no points passed cuts found)' % G) + for G, nb in critical_nb_cuts.items(): + if not G: + continue + else: + logger.warning('%s channel %s.XXX has no events.lhe file. (no points passed cuts). No %s with events detected' % (nb, G, G)) + logger.debug(' - no points passed cuts: %s' % len(reasons['cuts'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['cuts'][:10]])) + logger.debug(' - without any BW handling (critical): %s' % critical_nb_cuts['']) + logger.debug(' - with BW but all zero (critical): %s' % sum([nb for v, nb in critical_nb_cuts.items() if v!=''], 0)) + #logger.debug(' - cuts (with BW conflict where other channel contributes): %s' % (len(reasons['cuts'])- critical_nb_cuts)) + + + return Gdirs + + ############################################################################ def set_run_name(self, name, tag=None, level='parton', reload_card=False, allow_new_tag=True): @@ -6749,7 +6883,7 @@ def get_subP_ids(path): class GridPackCmd(MadEventCmd): """The command for the gridpack --Those are not suppose to be use interactively--""" - def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **stdin): + def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, nprocs=1, maxevts=2500, *completekey, **stdin): """Initialize the command and directly run""" # Initialize properly @@ -6759,6 +6893,8 @@ def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **s self.random = seed self.random_orig = self.random self.granularity = gran + self.nprocs = nprocs + self.maxevts = maxevts self.options['automatic_html_opening'] = False #write the grid_card.dat on disk @@ -6874,7 +7010,7 @@ def launch(self, nb_event, seed): #misc.call([pjoin(self.me_dir,'bin','refine4grid'), # str(nb_event), '0', 'Madevent','1','GridRun_%s' % seed], # cwd=self.me_dir) - self.refine4grid(nb_event) + self.gridpack_cross = self.refine4grid(nb_event) # 3) Combine the events/pythia/... self.exec_cmd('combine_events') @@ -6902,6 +7038,8 @@ def refine4grid(self, nb_event): precision = nb_event + across= self.make_make_all_html_results(get_attr='axsec') + self.opts = dict([(key,value[1]) for (key,value) in \ self._survey_options.items()]) @@ -6915,8 +7053,9 @@ def refine4grid(self, nb_event): self.update_status('Refine results to %s' % precision, level=None) logger.info("Using random number seed offset = %s" % self.random) - refine_opt = {'err_goal': nb_event, 'split_channels': False, - 'ngran':self.granularity, 'readonly': self.readonly} + refine_opt = {'err_goal': nb_event, 'split_channels': True, + 'ngran':self.granularity, 'readonly': self.readonly, + 'nprocs': self.nprocs, 'maxevts': self.maxevts} x_improve = gen_ximprove.gen_ximprove_gridpack(self, refine_opt) x_improve.launch() # create the ajob for the refinment and run those! self.gscalefact = x_improve.gscalefact #store jacobian associate to the gridpack @@ -6926,7 +7065,7 @@ def refine4grid(self, nb_event): #print 'run combine!!!' 
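The gridpack refine introduced above splits each channel into jobs of at most maxevts events; the job count is a plain ceiling division, sketched here (n_jobs and the max_splitting default are illustrative names and values).

# Ceiling division matching nb_split in get_job_for_event: enough jobs that
# none generates more than max_request_event events, floored at one job and
# capped by an overall splitting limit.
def n_jobs(needed_event, max_request_event, max_splitting=1000):
    nb_split = max(1, (int(needed_event) - 1) // int(max_request_event) + 1)
    return min(nb_split, max_splitting)

print(n_jobs(2000, 2500))   # 1
print(n_jobs(10000, 2500))  # 4
print(n_jobs(10001, 2500))  # 5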
#combine_runs.CombineRuns(self.me_dir) - return + return across #update html output Presults = sum_html.collect_result(self) cross, error = Presults.xsec, Presults.xerru @@ -7051,10 +7190,13 @@ def do_combine_events(self, line): sum_axsec += result.get('axsec')*gscalefact[Gdir] if len(AllEvent) >= 80: #perform a partial unweighting - if self.results.current['cross'] == 0 and self.run_card['gridpack']: - nb_event= self.nb_event + if not self.results.current.get('axsec'): + if self.run_card['gridpack'] and self.gridpack_cross: + nb_event = min(abs(1.05*self.nb_event*sum_axsec/self.gridpack_cross),self.nb_event) + else: + nb_event= self.nb_event else: - nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current['cross']),self.run_card['nevents']) + nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current.get('axsec')),self.run_card['nevents'], self.nb_event, self.gridpack_cross, sum_axsec) AllEvent.unweight(pjoin(outdir, self.run_name, "partials%s.lhe.gz" % partials), get_wgt, log_level=5, trunc_error=1e-2, event_target=nb_event) AllEvent = lhe_parser.MultiEventFile() @@ -7068,6 +7210,7 @@ def do_combine_events(self, line): for data in partials_info: AllEvent.add(*data) + sum_xsec += data[1] if not hasattr(self,'proc_characteristic'): self.proc_characteristic = self.get_characteristics() diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/restore_data b/epochX/cudacpp/gg_ttg.mad/bin/internal/restore_data index 6205bb9567..407ed7aa91 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/restore_data +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/restore_data @@ -48,8 +48,17 @@ for i in `cat subproc.mg` ; do cd ../ done +# check if we are on a Mac, otherwise assume Linux +if [[ "$OSTYPE" == "darwin"* ]]; then + # no nproc on Mac, so use sysctl instead + # use -S1024 because there is a limit on the length of the command + xargs_opts="-P $(sysctl -n hw.ncpu) -S1024" +else + xargs_opts="-P $(nproc --all)" +fi + find . -mindepth 2 -maxdepth 2 -type d -name 'G*' -print0 \ - | xargs --null -P "$(nproc --all)" -I{} bash -c " + | xargs --null ${xargs_opts} -I{} bash -c " cd {} for j in $1_results.dat ; do if [[ -e \$j ]] ; then diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/sum_html.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/sum_html.py index 9dd5826f71..fb8dd3a74a 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/sum_html.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/sum_html.py @@ -770,7 +770,7 @@ def collect_result(cmd, folder_names=[], jobs=None, main_dir=None): return all -def make_all_html_results(cmd, folder_names = [], jobs=[]): +def make_all_html_results(cmd, folder_names = [], jobs=[], get_attr=None): """ folder_names and jobs have been added for the amcatnlo runs """ run = cmd.results.current['run_name'] if not os.path.exists(pjoin(cmd.me_dir, 'HTML', run)): @@ -794,7 +794,12 @@ def make_all_html_results(cmd, folder_names = [], jobs=[]): fsock.write('%s
<br><br>
' % Presults.get_html(run, unit, cmd.me_dir)) fsock.write('%s
' % P_text) - return Presults.xsec, Presults.xerru + if not get_attr: + return Presults.xsec, Presults.xerru + else: + if isinstance(get_attr, tuple): + return [getattr(Presults, _) for _ in get_attr] + return getattr(Presults, get_attr) diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/ufomodel/write_param_card.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/ufomodel/write_param_card.py index 57a85b0614..33a89259f8 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/ufomodel/write_param_card.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/ufomodel/write_param_card.py @@ -116,9 +116,10 @@ def write_param(self, param, lhablock): def write_dep_param_block(self, lhablock): import cmath from parameters import all_parameters + param_values = {'cmath':cmath} for parameter in all_parameters: try: - exec("%s = %s" % (parameter.name, parameter.value)) + exec("%s = %s" % (parameter.name, parameter.value), globals(), param_values) except Exception: pass text = "## Not dependent paramater.\n" @@ -134,7 +135,7 @@ def write_dep_param_block(self, lhablock): prefix = "DECAY " for part, param in data: if isinstance(param.value, str): - value = complex(eval(param.value)).real + value = complex(eval(param.value, globals(), param_values)).real else: value = param.value diff --git a/epochX/cudacpp/gg_ttg.mad/bin/madevent b/epochX/cudacpp/gg_ttg.mad/bin/madevent index dff9711b73..9c5363e682 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/madevent +++ b/epochX/cudacpp/gg_ttg.mad/bin/madevent @@ -178,6 +178,17 @@ force_run = False if (args and args[0] == 'treatcards'): force_run=True + +# check that madgraph is not in PYTHONPATH +try: + import madgraph +except ImportError: + pass +else: + logger.getLogger('madgraph').error('Looks like you do have madgraph in your PYTHONPATH (or you run this executable from the main MG5aMC directory). This executable will likely not work in such case.') + + + # Call the cmd interface main loop try: if '-h' in args or '--help' in args: diff --git a/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h index ff9f0d7f00..cb66251689 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h @@ -2,13 +2,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Sep 2010) for the MG5aMC backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -860,7 +860,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6] - template + template __device__ INLINE void VVV1_0( const fptype allV1[], const fptype allV2[], @@ -872,7 +872,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ INLINE void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -885,7 +885,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ INLINE void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -897,7 +897,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ INLINE void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -910,7 +910,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ INLINE void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -923,7 +923,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ INLINE void FFV1P0_3( const fptype allF1[], const fptype allF2[], @@ -936,7 +936,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -950,7 +950,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV3P0_1( const fptype allV2[], const fptype allV3[], @@ -964,7 +964,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV4P0_1( const fptype allV2[], const fptype allV3[], @@ -978,7 +978,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6] - template + template __device__ void VVV1_0( const fptype allV1[], const fptype allV2[], @@ -991,7 +991,7 @@ namespace mg5amcCpu const cxtype_sv* V1 = W_ACCESS::kernelAccessConst( allV1 ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = 
CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P1[4] = { +cxreal( V1[0] ), +cxreal( V1[1] ), +cximag( V1[1] ), +cximag( V1[0] ) }; @@ -1014,7 +1014,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -1027,7 +1027,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P2[4] = { +cxreal( V2[0] ), +cxreal( V2[1] ), +cximag( V2[1] ), +cximag( V2[0] ) }; @@ -1052,7 +1052,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -1065,7 +1065,7 @@ namespace mg5amcCpu const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const cxtype_sv TMP9 = ( F1[2] * ( F2[4] * ( V3[2] + V3[5] ) + F2[5] * ( V3[3] + cI * V3[4] ) ) + ( F1[3] * ( F2[4] * ( V3[3] - cI * V3[4] ) + F2[5] * ( V3[2] - V3[5] ) ) + ( F1[4] * ( F2[2] * ( V3[2] - V3[5] ) - F2[3] * ( V3[3] + cI * V3[4] ) ) + F1[5] * ( F2[2] * ( -V3[3] + cI * V3[4] ) + F2[3] * ( V3[2] + V3[5] ) ) ) ) ); @@ -1077,7 +1077,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -1090,7 +1090,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F1 = W_ACCESS::kernelAccess( allF1 ); const cxtype cI = cxmake( 0., 1. ); F1[0] = +F2[0] + V3[0]; @@ -1109,7 +1109,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -1122,7 +1122,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F2 = W_ACCESS::kernelAccess( allF2 ); const cxtype cI = cxmake( 0., 1. 
); F2[0] = +F1[0] + V3[0]; @@ -1141,7 +1141,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ void FFV1P0_3( const fptype allF1[], const fptype allF2[], @@ -1154,7 +1154,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V3 = W_ACCESS::kernelAccess( allV3 ); const cxtype cI = cxmake( 0., 1. ); V3[0] = +F1[0] + F2[0]; @@ -1172,7 +1172,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ void VVVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -1187,7 +1187,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); V1[0] = +V2[0] + V3[0] + V4[0]; @@ -1207,7 +1207,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ void VVVV3P0_1( const fptype allV2[], const fptype allV3[], @@ -1222,7 +1222,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); V1[0] = +V2[0] + V3[0] + V4[0]; @@ -1242,7 +1242,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ void VVVV4P0_1( const fptype allV2[], const fptype allV3[], @@ -1257,7 +1257,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); V1[0] = +V2[0] + V3[0] + V4[0]; diff --git a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc index 47a3a011b8..fd5642f3e3 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. 
//========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h index 76066c7bb1..f4b086fc96 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -310,7 +310,7 @@ namespace mg5amcCpu #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #endif // Compute the output couplings (e.g. 
gc10 and gc11) from the input gs
- template<class G_ACCESS, class C_ACCESS>
+ template<class G_ACCESS, class CD_ACCESS>
__device__ inline void
G2COUP( const fptype gs[],
fptype couplings[],
@@ -320,12 +320,12 @@ namespace mg5amcCpu
using namespace Parameters_sm_dependentCouplings;
const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs );
DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr );
- fptype* GC_10s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 );
- fptype* GC_11s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 );
- fptype* GC_12s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_12 );
- cxtype_sv_ref GC_10s_sv = C_ACCESS::kernelAccess( GC_10s );
- cxtype_sv_ref GC_11s_sv = C_ACCESS::kernelAccess( GC_11s );
- cxtype_sv_ref GC_12s_sv = C_ACCESS::kernelAccess( GC_12s );
+ fptype* GC_10s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 );
+ fptype* GC_11s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 );
+ fptype* GC_12s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_12 );
+ cxtype_sv_ref GC_10s_sv = CD_ACCESS::kernelAccess( GC_10s );
+ cxtype_sv_ref GC_11s_sv = CD_ACCESS::kernelAccess( GC_11s );
+ cxtype_sv_ref GC_12s_sv = CD_ACCESS::kernelAccess( GC_12s );
GC_10s_sv = couplings_sv.GC_10;
GC_11s_sv = couplings_sv.GC_11;
GC_12s_sv = couplings_sv.GC_12;
diff --git a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h
index 7c6a082392..ca859a602e 100644
--- a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.
#ifndef MGONGPUCONFIG_H
#define MGONGPUCONFIG_H 1
@@ -74,6 +74,7 @@
#define MGONGPU_FPTYPE2_DOUBLE 1 // default
//#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster
#endif
+
// Choose whether to inline all HelAmps functions
// This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229)
// By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS
@@ -108,10 +109,23 @@
#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float)
#endif
+// Choose if cuBLAS and hipBLAS are supported for generating random numbers
+// For both CUDA and HIP, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS
+// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?)
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
+//#undef MGONGPU_HAS_NO_BLAS // default
+////#define MGONGPU_HAS_NO_BLAS 1
+#elif defined __HIPCC__
+//#undef MGONGPU_HAS_NO_BLAS // default
+////#define MGONGPU_HAS_NO_BLAS 1
+#else
+#define MGONGPU_HAS_NO_BLAS 1
+#endif
+
// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
#undef MGONGPU_NSIGHT_DEBUG // default in CUDA
-//#define MGONGPU_NSIGHT_DEBUG 1
+//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED!
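The new MGONGPU_HAS_NO_BLAS block just above defaults to "BLAS available" under nvcc and hipcc and forces the noBLAS path in plain C++ builds, while still allowing an external -DMGONGPU_HAS_NO_BLAS override. A minimal stand-alone sketch of the same gating logic (illustrative only, compiled outside the plugin):

```cpp
// Default to "BLAS available" under GPU compilers, force noBLAS in plain C++,
// mirroring the mgOnGpuConfig.h hunk above (which also allows -D overrides).
#include <cstdio>

#if defined __CUDACC__ || defined __HIPCC__
// GPU compilers: cuBLAS/hipBLAS assumed available unless -DMGONGPU_HAS_NO_BLAS is passed
#else
#define MGONGPU_HAS_NO_BLAS 1
#endif

int main()
{
#ifndef MGONGPU_HAS_NO_BLAS
  std::printf( "BLAS color sums can be enabled at runtime\n" );
#else
  std::printf( "noBLAS build: color sums always use the kernel implementation\n" );
#endif
  return 0;
}
```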
#else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h index 92d74fd6db..e98e925f2a 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -717,12 +717,24 @@ namespace mg5amcCpu : m_preal( &r ), m_pimag( &i ) {} // copy (create from) const refs cxtype_ref& operator=( const cxtype_ref& ) = delete; //__host__ __device__ cxtype_ref& operator=( cxtype_ref&& c ) {...} // REMOVED! Should copy refs or copy values? 
No longer needed in cxternary - __host__ __device__ cxtype_ref& operator=( const cxtype& c ) + __host__ __device__ cxtype_ref& operator=( const cxtype& c ) // copy (assign) const values { *m_preal = cxreal( c ); *m_pimag = cximag( c ); return *this; - } // copy (assign) non-const values + } + __host__ __device__ cxtype_ref& operator+=( const cxtype& c ) + { + *m_preal += cxreal( c ); + *m_pimag += cximag( c ); + return *this; + } + __host__ __device__ cxtype_ref& operator-=( const cxtype& c ) + { + *m_preal -= cxreal( c ); + *m_pimag -= cximag( c ); + return *this; + } __host__ __device__ operator cxtype() const { return cxmake( *m_preal, *m_pimag ); } private: fptype* const m_preal; // const pointer to non-const fptype R diff --git a/epochX/cudacpp/gg_ttg.mad/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttg.mad/test/cudacpp_test.mk index f703a1ae7c..1f9f8bbc46 100644 --- a/epochX/cudacpp/gg_ttg.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_ttg.mad/test/cudacpp_test.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) @@ -19,7 +19,7 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt index 882c93c3a5..cd5159694a 100644 --- a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.3 2025-06-12 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -49,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -58,7 +58,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006311178207397461  +DEBUG: model prefixing takes 0.00543212890625  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
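Stepping back to the mgOnGpuCxtypes.h hunk above: it adds operator+= and operator-= to the cxtype_ref proxy, so complex accumulation can write through split real/imaginary storage without first materialising an lvalue cxtype. A self-contained sketch of the idea (std::complex stands in here for the plugin's cxtype; the member layout mirrors the diff):

```cpp
#include <complex>
#include <cstdio>

using fptype = double;
using cxtype = std::complex<double>;

// Proxy holding pointers to separately stored real and imaginary parts,
// with the compound assignments added in this patch.
class cxtype_ref
{
public:
  cxtype_ref( fptype& r, fptype& i ) : m_preal( &r ), m_pimag( &i ) {}
  cxtype_ref& operator=( const cxtype& c ) { *m_preal = c.real(); *m_pimag = c.imag(); return *this; }
  cxtype_ref& operator+=( const cxtype& c ) { *m_preal += c.real(); *m_pimag += c.imag(); return *this; }
  cxtype_ref& operator-=( const cxtype& c ) { *m_preal -= c.real(); *m_pimag -= c.imag(); return *this; }
  operator cxtype() const { return cxtype( *m_preal, *m_pimag ); }
private:
  fptype* const m_preal; // const pointer to non-const real part
  fptype* const m_pimag; // const pointer to non-const imaginary part
};

int main()
{
  fptype re = 1., im = 2.;              // split storage, e.g. SOA real/imag planes
  cxtype_ref ref( re, im );
  ref += cxtype( 0.5, -1. );            // accumulate in place through the proxy
  std::printf( "( %f, %f )\n", re, im ); // ( 1.5, 1.0 )
  return 0;
}
```

This is what lets the new color-sum code accumulate jamps with "+=" directly into split buffers.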
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,21 +155,21 @@ INFO: Process has 16 diagrams Total: 1 processes with 16 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttg Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 218]  -DEBUG: type(subproc_group)= [output.py at line 219]  -DEBUG: type(fortran_model)= [output.py at line 220]  -DEBUG: type(me)= me=0 [output.py at line 221]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 222]  -INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. 
+DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  +DEBUG: type(subproc_group)= [output.py at line 223]  +DEBUG: type(fortran_model)= [output.py at line 224]  +DEBUG: type(me)= me=0 [output.py at line 225]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'diagram_boilerplate.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  +INFO: Creating files in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. Generated helas calls for 1 subprocesses (16 diagrams) in 0.037 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -177,7 +177,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.323 s +ALOHA: aloha creates 5 routines in 0.318 s VVV1 VVV1 FFV1 @@ -187,17 +187,17 @@ ALOHA: aloha creates 5 routines in 0.323 s VVVV1 VVVV3 VVVV4 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. quit -real 0m0.791s -user 0m0.727s -sys 0m0.049s -Code generation completed in 0 seconds +real 0m0.792s +user 0m0.730s +sys 0m0.045s +Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h index 87aa648dd2..a45024704a 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -255,11 +255,15 @@ namespace mg5amcCpu throw std::logic_error( "Bridge constructor: FIXME! cannot choose gputhreads" ); // this should never happen! m_gpublocks = m_nevt / m_gputhreads; } +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters @@ -290,8 +294,10 @@ namespace mg5amcCpu throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! 
Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads
<< ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl;
+#endif
m_pmek->setGrid( m_gpublocks, m_gputhreads );
}
#endif
@@ -347,7 +353,9 @@ namespace mg5amcCpu
if( goodHelOnly ) return;
m_pmek->computeMatrixElements( useChannelIds );
copyHostFromDevice( m_hstMEs, m_devMEs );
+#ifdef MGONGPUCPP_VERBOSE
flagAbnormalMEs( m_hstMEs.data(), m_nevt );
+#endif
copyHostFromDevice( m_hstSelHel, m_devSelHel );
copyHostFromDevice( m_hstSelCol, m_devSelCol );
if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> )
@@ -400,7 +408,9 @@ namespace mg5amcCpu
}
if( goodHelOnly ) return;
m_pmek->computeMatrixElements( useChannelIds );
+#ifdef MGONGPUCPP_VERBOSE
flagAbnormalMEs( m_hstMEs.data(), m_nevt );
+#endif
if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> )
{
memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() );
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h
index 1afb14d668..8a37d1f947 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h
@@ -1,17 +1,23 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.
#ifndef MG5AMC_GPUABSTRACTION_H
#define MG5AMC_GPUABSTRACTION_H 1
+#include "mgOnGpuConfig.h"
+
#include
//--------------------------------------------------------------------------
#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
+#ifndef MGONGPU_HAS_NO_BLAS
+#include "cublas_v2.h"
+#endif
+
#define gpuError_t cudaError_t
#define gpuPeekAtLastError cudaPeekAtLastError
#define gpuGetErrorString cudaGetErrorString
@@ -21,24 +27,61 @@
#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) )
#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) )
#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
+#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice
#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) )
#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
+#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) )
+
#define gpuSetDevice cudaSetDevice
#define gpuDeviceSynchronize cudaDeviceSynchronize
#define gpuDeviceReset cudaDeviceReset
#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
-#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ )
+
+#define gpuStream_t cudaStream_t
+#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) )
+#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) )
+
+#define gpuBlasStatus_t cublasStatus_t
+#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS
+#ifndef MGONGPU_HAS_NO_BLAS
+#define gpuBlasHandle_t cublasHandle_t
+#else
+#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds
+#endif
+#define gpuBlasCreate cublasCreate
+#define gpuBlasDestroy cublasDestroy
+#define gpuBlasSetStream cublasSetStream
+
+#define gpuBlasSaxpy cublasSaxpy
+#define gpuBlasSdot cublasSdot
+#define gpuBlasSgemv cublasSgemv
+#define gpuBlasSgemm cublasSgemm
+#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched
+#define gpuBlasDaxpy cublasDaxpy
+#define gpuBlasDdot cublasDdot
+#define gpuBlasDgemv cublasDgemv
+#define gpuBlasDgemm cublasDgemm
+#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched
+#define GPUBLAS_OP_N CUBLAS_OP_N
+#define GPUBLAS_OP_T CUBLAS_OP_T
//--------------------------------------------------------------------------
#elif defined __HIPCC__
+#ifndef MGONGPU_HAS_NO_BLAS
+#include "hipblas/hipblas.h"
+#endif
+
#define gpuError_t hipError_t
#define gpuPeekAtLastError hipPeekAtLastError
#define gpuGetErrorString hipGetErrorString
@@ -48,22 +91,69 @@
#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )
#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) )
#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice
#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) )
#define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
+#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) )
+
#define gpuSetDevice hipSetDevice
#define gpuDeviceSynchronize hipDeviceSynchronize
#define gpuDeviceReset hipDeviceReset
#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
-#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ )
+
+#define gpuStream_t hipStream_t
+#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) )
+#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) )
+
+#define gpuBlasStatus_t hipblasStatus_t
+#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
+#ifndef MGONGPU_HAS_NO_BLAS
+#define gpuBlasHandle_t hipblasHandle_t
+#else
+#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds
+#endif
+#define gpuBlasCreate hipblasCreate
+#define gpuBlasDestroy hipblasDestroy
+#define gpuBlasSetStream hipblasSetStream
+
+#define gpuBlasSaxpy hipblasSaxpy
+#define gpuBlasSdot hipblasSdot
+#define gpuBlasSgemv hipblasSgemv
+#define gpuBlasSgemm hipblasSgemm
+#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched
+#define gpuBlasDaxpy hipblasDaxpy
+#define gpuBlasDdot hipblasDdot
+#define gpuBlasDgemv hipblasDgemv
+#define gpuBlasDgemm hipblasDgemm
+#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched
+#define GPUBLAS_OP_N HIPBLAS_OP_N
+#define GPUBLAS_OP_T HIPBLAS_OP_T
+
+#endif
//--------------------------------------------------------------------------
+#ifdef MGONGPU_FPTYPE2_FLOAT
+#define gpuBlasTaxpy gpuBlasSaxpy
+#define gpuBlasTdot gpuBlasSdot
+#define gpuBlasTgemv gpuBlasSgemv
+#define gpuBlasTgemm gpuBlasSgemm
+#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched
+#else
+#define gpuBlasTaxpy gpuBlasDaxpy
+#define gpuBlasTdot gpuBlasDdot
+#define gpuBlasTgemv gpuBlasDgemv
+#define gpuBlasTgemm gpuBlasDgemm
+#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched
#endif
#endif // MG5AMC_GPUABSTRACTION_H
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuRuntime.h
index 860c7fde16..086aa6a616 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuRuntime.h
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuRuntime.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin.
-// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin.
#ifndef MG5AMC_GPURUNTIME_H
#define MG5AMC_GPURUNTIME_H 1
@@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort =
//--------------------------------------------------------------------------
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#ifndef MGONGPU_HAS_NO_BLAS
+#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); }
+inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true )
+{
+ if ( code != GPUBLAS_STATUS_SUCCESS )
+ {
+ printf( "ERROR!
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc index f463977c1a..a68ae314eb 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
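The GpuAbstraction.h hunks above introduce a generic gpuBlasT* name family that resolves to the S- or D-precision cuBLAS/hipBLAS entry points according to the fptype2 choice, so BLAS call sites are written once. A toy sketch of this preprocessing-time dispatch (fakeSgemm/fakeDgemm are stand-ins of my own, not real BLAS symbols):

```cpp
#include <cstdio>

//#define MGONGPU_FPTYPE2_FLOAT 1 // toggle to dispatch to the S-precision stand-in

void fakeSgemm( const float* ) { std::printf( "fakeSgemm (fptype2=float)\n" ); }
void fakeDgemm( const double* ) { std::printf( "fakeDgemm (fptype2=double)\n" ); }

#ifdef MGONGPU_FPTYPE2_FLOAT
typedef float fptype2;
#define gpuBlasTgemm fakeSgemm // the real code maps this to cublasSgemm/hipblasSgemm
#else
typedef double fptype2;
#define gpuBlasTgemm fakeDgemm // the real code maps this to cublasDgemm/hipblasDgemm
#endif

int main()
{
  fptype2 dummy = 0;
  gpuBlasTgemm( &dummy ); // one call site, precision resolved by the preprocessor
  return 0;
}
```

The same single-name trick covers axpy, dot, gemv and the strided-batched gemm used by the color sums.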
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,28 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() + , m_pHelWfs() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_helBlasHandles() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +353,82 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); + // Create the "one-helicity" wavefunction buffer that will be used for helicity filtering + m_pHelWfs.reset( new DeviceBufferSimple( CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? 
+ std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { +#ifndef MGONGPU_HAS_NO_BLAS + if( m_helBlasHandles[ihel] ) gpuBlasDestroy( m_helBlasHandles[ihel] ); +#endif + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +445,61 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. 
Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity + // Attach a different stream to each cuBLAS/hipBLAS handle + if( m_blasColorSum ) + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + checkGpuBlas( gpuBlasCreate( &m_helBlasHandles[ighel] ) ); + checkGpuBlas( gpuBlasSetStream( m_helBlasHandles[ighel], m_helStreams[ighel] ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_helBlasHandles[ighel], CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelWfs.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +507,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* ghelBlasHandles = ( m_blasColorSum ? m_helBlasHandles : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* ghelBlasHandles = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +527,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? 
m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h index 7acff4b308..c901874333 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] - static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,24 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + + // The super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelWfs; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +220,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary 
buffers + std::unique_ptr m_pHelBlasTmp; + + // The array of cuBLAS/hipBLAS handles (one for each good helicity) + gpuBlasHandle_t m_helBlasHandles[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessAmplitudes.h index 0d92f69c43..a49f041e05 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessAmplitudes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessAmplitudes_H #define MemoryAccessAmplitudes_H 1 @@ -10,10 +10,6 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_AMPLITUDES 1 - // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -23,120 +19,11 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // A class describing the internal layout of memory buffers for amplitudes - // This implementation uses an AOSOA[npagA][nx2][neppA] where nevt=npagA*neppA - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessAmplitudesBase //_AOSOAv1 - { - public: - - // Number of Events Per Page in the amplitude AOSOA memory buffer layout - static constexpr int neppA = 1; // AOS (just a test...) 
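The MatrixElementKernels.h members above reserve CPPProcess::ncomb slots for per-helicity GPU streams and BLAS handles, of which only nGoodHel are actually created (see the gpuStreamCreate loop in the MatrixElementKernels.cc hunk earlier). A CUDA sketch of that lifecycle, with an illustrative ncomb value in place of the real CPPProcess constant:

```cpp
#include <cuda_runtime.h>
#include <cassert>

constexpr int ncomb = 16;                // illustrative; the real value is CPPProcess::ncomb
cudaStream_t helStreams[ncomb] = {};     // zero-initialised: unused slots stay nullptr

void createGoodHelStreams( int nGoodHel )
{
  assert( nGoodHel <= ncomb );           // only the good helicities get a stream
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
    cudaStreamCreate( &helStreams[ighel] );
}

void destroyHelStreams()
{
  for( int ihel = 0; ihel < ncomb; ihel++ )
    if( helStreams[ihel] ) cudaStreamDestroy( helStreams[ihel] ); // skip never-created slots
}

int main()
{
  createGoodHelStreams( 4 ); // e.g. 4 good helicities found by the filtering pass
  destroyHelStreams();
  return 0;
}
```

Reserving ncomb fixed slots avoids a dynamic allocation whose size (nGoodHel) is only known after helicity filtering.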
- - private: - - friend class MemoryAccessHelper; - friend class KernelAccessHelper; - friend class KernelAccessHelper; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) - { - const int ipagA = ievt / neppA; // #event "A-page" - const int ieppA = ievt % neppA; // #event in the current event A-page - constexpr int ix2 = 0; - return &( buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA] ); // AOSOA[ipagA][ix2][ieppA] - } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... args" to "const int ix2" and rename "Field" as "Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int ix2 ) - { - constexpr int ipagA = 0; - constexpr int ieppA = 0; - return buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA]; // AOSOA[ipagA][ix2][ieppA] - } - }; - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessAmplitudes : public MemoryAccessAmplitudesBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2 = MemoryAccessHelper::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2Const = - MemoryAccessHelper::template decodeRecordConst; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt, 
const int ix2 ) <===] - static constexpr auto ieventAccessIx2 = - MemoryAccessHelper::template ieventAccessField; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===] - static constexpr auto ieventAccessIx2Const = - MemoryAccessHelper::template ieventAccessFieldConst; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations + // A class providing trivial access to amplitude memory buffers template class KernelAccessAmplitudes { public: - -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2 = - KernelAccessHelper::template kernelAccessField; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2Const = - KernelAccessHelper::template kernelAccessFieldConst; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { @@ -148,8 +35,6 @@ namespace mg5amcCpu { return reinterpret_cast( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES }; //---------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessCouplings.h index 55504a2b90..caee99a7fd 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessCouplings.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. 
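The MemoryAccessAmplitudes.h hunk above deletes the AOSOA[npagA][nx2][neppA] indexing machinery, which was dead code behind MGONGPU_TRIVIAL_AMPLITUDES, and keeps only the trivial reinterpret_cast accessor. For reference, a compact sketch of the removed AOSOA index arithmetic (neppA=2 here for illustration; the deleted class used neppA=1, i.e. plain AOS):

```cpp
#include <cstdio>

using fptype = double;
constexpr int nx2 = 2;   // real and imaginary components
constexpr int neppA = 2; // events per "page" (the removed code used 1)

// AOSOA access: buffer[ipagA][ix2][ieppA], flattened
fptype& aosoaAccess( fptype* buffer, int ievt, int ix2 )
{
  const int ipagA = ievt / neppA; // which event page
  const int ieppA = ievt % neppA; // event within the page
  return buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA];
}

int main()
{
  fptype buf[8] = { 0 };
  aosoaAccess( buf, 3, 1 ) = 7.;           // event 3, imaginary part
  std::printf( "buf[7] = %f\n", buf[7] );  // lands in page 1, slot [1][1] -> index 7
  return 0;
}
```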
#ifndef MemoryAccessCouplings_H #define MemoryAccessCouplings_H 1 @@ -235,7 +235,7 @@ namespace mg5amcCpu /* fptype_sv& real = kernelAccessIx2( buffer, 0 ); fptype_sv& imag = kernelAccessIx2( buffer, 1 ); - printf( "C_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv_ref( real, imag ); */ return cxtype_sv_ref( kernelAccessIx2( buffer, 0 ), @@ -250,7 +250,7 @@ namespace mg5amcCpu /* const fptype_sv& real = kernelAccessIx2Const( buffer, 0 ); const fptype_sv& imag = kernelAccessIx2Const( buffer, 1 ); - printf( "C_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv( real, imag ); */ return cxtype_sv( kernelAccessIx2Const( buffer, 0 ), diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessWavefunctions.h index 9f4c620bc7..bbffc1fb36 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessWavefunctions.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessWavefunctions_H #define MemoryAccessWavefunctions_H 1 @@ -10,9 +10,7 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 +#include "CPPProcess.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL @@ -23,147 +21,44 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // A class describing the internal layout of memory buffers for wavefunctions - // This implementation uses an AOSOA[npagW][nw6][nx2][neppW] where nevt=npagW*neppW - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessWavefunctionsBase //_AOSOAv1 +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessWavefunctions { public: - - // Number of Events Per Page in the wavefunction AOSOA memory buffer layout - static constexpr int neppW = 1; // AOS (just a test...) 
- - private: - - friend class MemoryAccessHelper<MemoryAccessWavefunctionsBase>; - friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, true>; - friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, false>; - - // The number of components of a (fermion or vector) wavefunction - static constexpr int nw6 = mgOnGpu::nw6; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) + static __host__ __device__ inline cxtype_sv* + kernelAccess( fptype* buffer ) { - const int ipagW = ievt / neppW; // #event "W-page" - const int ieppW = ievt % neppW; // #event in the current event W-page - constexpr int iw6 = 0; - constexpr int ix2 = 0; - return &( buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW] ); // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast<cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... args" to "const int iw6, const int ix2" and rename "Field" as "Iw6Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int iw6, - const int ix2 ) + static __host__ __device__ inline const cxtype_sv* + kernelAccessConst( const fptype* buffer ) { - constexpr int ipagW = 0; - constexpr int ieppW = 0; - return buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW]; // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast<const cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } }; +#endif //---------------------------------------------------------------------------- - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessWavefunctions : public MemoryAccessWavefunctionsBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2 = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2Const = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template decodeRecordConst<int, int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIw6Ix2( fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2 = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessField<int, int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIw6Ix2Const( const fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2Const = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessFieldConst<int, int>; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations - template<bool onDevice> - class KernelAccessWavefunctions + class HostAccessWavefunctions { public: - -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const) ===> fptype& kernelAccessIw6Ix2( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2 = - KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessField<int, int>; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIw6Ix2Const( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2Const = - KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessFieldConst<int, int>; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { return reinterpret_cast<cxtype_sv*>( buffer ); } - static __host__ __device__ inline const cxtype_sv* kernelAccessConst( const fptype* buffer ) { return reinterpret_cast<const cxtype_sv*>( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS }; //---------------------------------------------------------------------------- - typedef KernelAccessWavefunctions<false> HostAccessWavefunctions; - typedef KernelAccessWavefunctions<true> DeviceAccessWavefunctions; - - //---------------------------------------------------------------------------- - } // end namespace mg5amcGpu/mg5amcCpu #endif // MemoryAccessWavefunctions_H diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h index 65a101888d..c8db607db6 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
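
In the MemoryBuffers.h hunk below, the virtual ~BufferBase destructor moves from protected to public. A simplified sketch (stand-in classes, not the plugin's) of why that matters once buffers are owned and deleted through base-class pointers:

  // buffer_sketch.cc: public virtual destructor for polymorphic buffer ownership
  #include <cstddef>
  template<typename T>
  class BufferBaseSketch // stand-in for BufferBase<T>
  {
  public:
    virtual ~BufferBaseSketch() {} // public virtual: "delete base*" is well defined
    T* data() { return m_data; }
  protected:
    BufferBaseSketch( const std::size_t size ) : m_size( size ), m_data( nullptr ) {}
    const std::size_t m_size;
    T* m_data;
  };
  class HostBufferSketch final : public BufferBaseSketch<double>
  {
  public:
    HostBufferSketch( const std::size_t size ) : BufferBaseSketch<double>( size ) { m_data = new double[size]; }
    ~HostBufferSketch() override { delete[] m_data; }
  };
  int main()
  {
    BufferBaseSketch<double>* b = new HostBufferSketch( 16 );
    delete b; // dispatches to ~HostBufferSketch; with a protected base dtor this would not compile
    return 0;
  }

With the destructor protected, 'delete b' through the base pointer is ill-formed; making it public while keeping it virtual is the usual idiom when heterogeneous buffers are owned and released polymorphically.
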
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase<T>( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase<T>( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template<typename T, size_t sizePerEvent> - class DeviceBuffer : public DeviceBufferBase<T>, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase<T>, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase<T>( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase<T>( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase<fptype> BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer<fptype, sizePerEventNumerators> HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer<fptype, sizePerEventNumerators> PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer<fptype, sizePerEventNumerators> DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer<fptype, sizePerEventDenominators> HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer<fptype, sizePerEventDenominators> PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer<fptype, sizePerEventDenominators> DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer<fptype, sizePerEventCouplings> HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer<fptype, sizePerEventCouplings> PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer<fptype, sizePerEventCouplings> DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer<fptype, sizePerEventJamps> DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template<class Tdst, class Tsrc> void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc index bf77ac9970..1db10f1e09 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,20 +98,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 6; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,57 +166,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the 
current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - 
using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -226,494 +279,165 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 12; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
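
The TRIVIAL vs non-trivial access distinction in the comments above reduces to where the event index comes from. A small CUDA sketch of the two patterns (hypothetical names; nw6/nx2 values illustrative, not the plugin's): trivial access reinterprets a buffer that already belongs to one event, while device access locates the current thread's record inside a buffer holding all events:

  // access_sketch.cu: trivial vs thread-indexed buffer access
  #include <cuda_runtime.h>
  struct ComplexSketch { double r, i; }; // stand-in for a scalar complex type
  // Trivial access: the buffer passed in already belongs to a single event
  __host__ __device__ inline ComplexSketch* trivialAccess( double* buffer )
  {
    return reinterpret_cast<ComplexSketch*>( buffer );
  }
  // Thread-indexed access: locate this thread's event record inside a
  // global-memory buffer of nevt records of nw6*nx2 doubles each
  __device__ inline ComplexSketch* deviceAccess( double* buffer, const int nw6, const int nx2 )
  {
    const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
    return reinterpret_cast<ComplexSketch*>( buffer + ievt * nw6 * nx2 );
  }
  __global__ void touchFirstComponent( double* buffer )
  {
    ComplexSketch* w = deviceAccess( buffer, 6, 2 ); // 6 components x (re,im)
    w[0].r = 1.;
  }
  int main()
  {
    double* d = nullptr;
    cudaMalloc( &d, 256 * 6 * 2 * sizeof( double ) );
    touchFirstComponent<<<2, 128>>>( d ); // 256 threads = 256 events
    cudaDeviceSynchronize();
    cudaFree( d );
    return 0;
  }

Kernel splitting is what forces the switch in the hunks that follow: once each diagram runs in its own kernel, wavefunctions must persist in global memory between kernels, so they need the thread-indexed pattern instead of the trivial one.
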
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g. <<warning #186-D: pointless comparison of unsigned integer with zero>> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 16 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx<M_ACCESS, W_ACCESS>( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - vxxxxx<M_ACCESS, W_ACCESS>( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx<M_ACCESS, W_ACCESS>( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx<M_ACCESS, W_ACCESS>( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - vxxxxx<M_ACCESS, W_ACCESS>( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); - - VVV1P0_1<W_ACCESS, CD_ACCESS>( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[5] ); - FFV1P0_3<W_ACCESS, CD_ACCESS>( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[6] ); - - // Amplitude(s) for diagram number 1 - VVV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[5], w_fp[6], w_fp[4], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - - // *** DIAGRAM 2 OF 16 *** - - // Wavefunction(s) for diagram number 2 - FFV1_1<W_ACCESS, CD_ACCESS>( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[7] ); - - // Amplitude(s) for diagram number 2 - FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[3], w_fp[7], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 3 OF 16 *** - - // Wavefunction(s) for diagram number 3 - FFV1_2<W_ACCESS, CD_ACCESS>( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); - - // Amplitude(s) for diagram number 3 - FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[8], w_fp[2], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 4 OF 16 *** - - // Wavefunction(s) for diagram number 4 - FFV1_1<W_ACCESS, CD_ACCESS>( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] ); - FFV1_2<W_ACCESS, CD_ACCESS>( w_fp[3], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); - - // Amplitude(s) for diagram number 4 - FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[9], w_fp[5], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= amp_sv[0]; - - // *** DIAGRAM 5 OF 16 *** - - // Wavefunction(s) for diagram number 5 - VVV1P0_1<W_ACCESS, CD_ACCESS>( w_fp[1], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 5 - FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[3], w_fp[5], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 6 OF 16 *** - - // Wavefunction(s) for diagram number 6 - // (none) - - // Amplitude(s) for diagram number 6 - FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[8], w_fp[5], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= amp_sv[0]; - - // *** DIAGRAM 7 OF 16 *** - - // Wavefunction(s) for diagram number 7 - FFV1_2<W_ACCESS, CD_ACCESS>( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] ); - FFV1_1<W_ACCESS, CD_ACCESS>( w_fp[2], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); - - // Amplitude(s) for diagram number 7 - FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[5], w_fp[11], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] -= amp_sv[0]; - - // *** DIAGRAM 8 OF 16 *** - - // Wavefunction(s) for diagram number 8 - // (none) - - // Amplitude(s) for diagram number 8 - FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[5], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 9 OF 16 *** - - // Wavefunction(s) for diagram number 9 - // (none) - - // Amplitude(s) for diagram number 9 - FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[5], w_fp[7], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[5] -= amp_sv[0]; - - // *** DIAGRAM 10 OF 16 *** - - // Wavefunction(s) for diagram number 10 - VVV1P0_1<W_ACCESS, CD_ACCESS>( w_fp[0], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[5] ); - - // Amplitude(s) for diagram number 10 - FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[3], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 11 OF 16 *** - - // Wavefunction(s) for diagram number 11 - // (none) - - // Amplitude(s) for diagram number 11 - FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[9], w_fp[2], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 12 OF 16 *** - - // Wavefunction(s) for diagram number 12 - // (none) - - // Amplitude(s) for diagram number 12 - VVV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[2] -= amp_sv[0]; - jamp_sv[3] += amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - - // *** DIAGRAM 13 OF 16 *** - - // Wavefunction(s) for diagram number 13 - // (none) - - // Amplitude(s) for diagram number 13 - FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[8], w_fp[11], w_fp[0], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - jamp_sv[2] -= amp_sv[0]; - - // *** DIAGRAM 14 OF 16 *** - - // Wavefunction(s) for diagram number 14 - // (none) - // Amplitude(s) for diagram number 14 - FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[9], w_fp[7], w_fp[0], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - jamp_sv[4] -= amp_sv[0]; - - // *** DIAGRAM 15 OF 16 *** - - // Wavefunction(s) for diagram number 15 - // (none) - - // Amplitude(s) for diagram number 15 - VVV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[0], w_fp[10], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - - // *** DIAGRAM 16 OF 16 *** - - // Wavefunction(s) for diagram number 16 - VVVV1P0_1<W_ACCESS, CD_ACCESS>( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[10] ); - VVVV3P0_1<W_ACCESS, CD_ACCESS>( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[6] ); - VVVV4P0_1<W_ACCESS, CD_ACCESS>( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[9] ); - // Amplitude(s) for diagram number 16 - FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[3], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[3], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[3], w_fp[2], w_fp[9], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_ttxg()?)
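
The color algebra in the removed lines that follow computes |M|^2 as the quadratic form sum over icol,jcol of jamp[icol]* cf[icol][jcol] jamp[jcol] / denom[icol]. Because cf is real and symmetric, the complex quadratic form splits into two real ones (AMA + BMB, see #475), and the triangular variant with pre-divided denominators gives the same result while touching each off-diagonal element once. A standalone numerical check with an illustrative 2-color matrix (not the gg_ttxg one):

  // colorsum_sketch.cc: full vs triangular color quadratic form
  #include <cassert>
  #include <cmath>
  #include <complex>
  #include <cstdio>
  int main()
  {
    constexpr int ncol = 2;
    const double denom[ncol] = { 9, 9 };
    const double cf[ncol][ncol] = { { 64, -8 }, { -8, 64 } };
    const std::complex<double> jamp[ncol] = { { 1.5, -0.5 }, { 0.25, 2.0 } };
    // Full double loop (the CUDA-style implementation below)
    double me2full = 0;
    for( int i = 0; i < ncol; i++ )
    {
      double ztR = 0, ztI = 0;
      for( int j = 0; j < ncol; j++ )
      {
        ztR += cf[i][j] * jamp[j].real();
        ztI += cf[i][j] * jamp[j].imag();
      }
      me2full += ( ztR * jamp[i].real() + ztI * jamp[i].imag() ) / denom[i];
    }
    // Triangular form (the C++-style implementation below): diagonal once, off-diagonal doubled
    double me2tri = 0;
    for( int i = 0; i < ncol; i++ )
    {
      double ztR = ( cf[i][i] / denom[i] ) * jamp[i].real();
      double ztI = ( cf[i][i] / denom[i] ) * jamp[i].imag();
      for( int j = i + 1; j < ncol; j++ )
      {
        ztR += ( 2 * cf[i][j] / denom[i] ) * jamp[j].real();
        ztI += ( 2 * cf[i][j] / denom[i] ) * jamp[j].imag();
      }
      me2tri += ztR * jamp[i].real() + ztI * jamp[i].imag();
    }
    assert( std::abs( me2full - me2tri ) < 1e-9 );
    printf( "|M|^2 = %f from both forms\n", me2full );
    return 0;
  }

Note that folding the two off-diagonal terms into one doubled term relies on equal denominators within each symmetric pair (true here and for the gg_ttxg matrix below, where every denom entry is 9), and the imaginary cross terms cancel exactly because cf is real and symmetric.
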
- - // The color denominators (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 9, 9, 9, 9, 9, 9 }; // 1-D array[6] - - // The color matrix (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 64, -8, -8, 1, 1, 10 }, - { -8, 64, 1, 10, -8, 1 }, - { -8, 1, 64, -8, 10, 1 }, - { 1, 10, -8, 64, 1, -8 }, - { 1, -8, 10, 1, 64, -8 }, - { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... 
icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - 
// NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 16 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram8, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram9, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram10, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram11, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram12, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram13, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram14, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram15, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram16, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators ); + 
diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram8( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram9( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram10( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram11( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram12( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram13( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram14( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram15( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram16( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif -#endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -768,7 +492,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -802,6 +530,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -843,6 +575,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -945,26 +681,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel
denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + // Loop over only nevt events if nevt is < 16 (note that nevt is always >= neppV) + assert( nevt >= neppV ); + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt < maxtry0) + } + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + add_and_select_hel( int* allselhel, // output: helicity selection[nevt] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* allMEs, // output: allMEs[nevt], sum of |M|^2 over all good helicities + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + // ME sum over all good helicities for this event + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- +
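For context: update_jamp2s above is launched once per good helicity, and each helicity runs on its own CUDA/HIP stream, so several instances may be in flight concurrently against the same colAllJamp2s running sums; that is why the accumulation must use atomicAdd. A minimal self-contained CUDA sketch of the same pattern (hypothetical names and sizes, not the plugin's API; double-precision atomicAdd requires compute capability 6.0 or later):

    #include <cuda_runtime.h>

    // One kernel instance per "helicity", all launched on different streams and
    // all accumulating into the same per-color running sums: the read-modify-write
    // must be atomic, or concurrent streams would lose updates.
    __global__ void accumulateJamp2( double* jamp2Sum, const double* jamp2OneHel, int ncolor )
    {
      const int nevt = gridDim.x * blockDim.x;
      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
      for( int icol = 0; icol < ncolor; icol++ )
        atomicAdd( &jamp2Sum[icol * nevt + ievt], jamp2OneHel[icol * nevt + ievt] );
    }

    int main()
    {
      const int ncolor = 6, nevt = 256, ngoodhel = 4;
      double *sum, *part;
      cudaMalloc( &sum, ncolor * nevt * sizeof( double ) );
      cudaMalloc( &part, ncolor * nevt * sizeof( double ) );
      cudaMemset( sum, 0, ncolor * nevt * sizeof( double ) );
      cudaMemset( part, 0, ncolor * nevt * sizeof( double ) );
      cudaStream_t streams[ngoodhel];
      for( int ighel = 0; ighel < ngoodhel; ighel++ ) cudaStreamCreate( &streams[ighel] );
      for( int ighel = 0; ighel < ngoodhel; ighel++ ) // one helicity per stream
        accumulateJamp2<<<nevt / 32, 32, 0, streams[ighel]>>>( sum, part, ncolor );
      cudaDeviceSynchronize(); // as in sigmaKin: wait for all streams before consuming the sums
      for( int ighel = 0; ighel < ngoodhel; ighel++ ) cudaStreamDestroy( streams[ighel] );
      cudaFree( sum );
      cudaFree( part );
      return 0;
    }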
#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //--------------------------------------------------------------------------
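The selection rule used by select_col (and by the helicity selection in add_and_select_hel) is standard inverse-CDF sampling over a running sum of weights; icolamp simply masks out color flows that do not contribute to the chosen SDE configuration. A host-side C++ sketch of the rule, with hypothetical plain arrays rather than the plugin's accessors:

    #include <cassert>

    // Pick the first index whose cumulative weight fraction exceeds a random
    // number in [0,1); disabled entries contribute zero weight. Returns a
    // Fortran-style index in [1,n], mirroring the allselcol/allselhel conventions.
    int selectIndex( const double* weights, const bool* enabled, int n, double rnd )
    {
      const int nmax = 64;
      assert( n <= nmax ); // fixed-size scratch for this sketch
      double cumul[nmax];
      double running = 0;
      for( int i = 0; i < n; i++ )
      {
        if( enabled[i] ) running += weights[i]; // running sum, as in targetamp[icolC]
        cumul[i] = running;
      }
      for( int i = 0; i < n; i++ )
        if( rnd < cumul[i] / cumul[n - 1] ) return i + 1;
      return n; // numerical safety net (rnd close to 1 plus rounding)
    }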
// Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -1135,13 +1075,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // 
non-trivial access: buffer includes all events @@ -1153,17 +1087,20 @@ #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1189,93 +1126,63 @@ #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamps for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], 
gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } - } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) Then, in multichannel mode, also compute the running sums over helicities of the squared jamp2s within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? 
&( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1317,7 +1224,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1340,7 +1247,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1349,25 +1256,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1377,8 +1290,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1394,11 +1309,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1500,14 +1416,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h index 2acfa000a7..8e87baf8e2 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The 
MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include <vector> @@ -75,17 +76,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 32; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 16; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 6; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 12; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 5; // CPPProcess::npar (nexternal was nioparticles) //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 18; //static const int ncomb = 32; // CPPProcess::ncomb @@ -122,23 +123,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -152,34 +156,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */
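For orientation before the new color_sum.cc below: for each event and helicity the color sum evaluates deltaME = J^dagger (C/d) J, where J is the vector of ncolor partial amplitudes (jamps), C the real symmetric color matrix and d the color denominators; because C is real, the complex quadratic form reduces to Re(J)^T M Re(J) + Im(J)^T M Im(J) (the "AMA + BMB" rewrite cited in the comments, #475). A naive scalar reference of that formula, using plain std::complex arrays rather than the plugin's vectorized types:

    #include <complex>

    // Naive reference for the color sum: deltaME = J^dagger (C/d) J with a real
    // symmetric color matrix C. Because C is real, the imaginary cross terms
    // cancel and deltaME = Re(J)^T M Re(J) + Im(J)^T M Im(J), which is what the
    // optimized CPU, GPU kernel and BLAS paths below compute in different layouts.
    template<int NCOLOR>
    double colorSumReference( const std::complex<double> jamp[NCOLOR],
                              const double colorMatrix[NCOLOR][NCOLOR],
                              const double colorDenom[NCOLOR] )
    {
      double deltaME = 0;
      for( int i = 0; i < NCOLOR; i++ )
      {
        double ztempR = 0, ztempI = 0;
        for( int j = 0; j < NCOLOR; j++ )
        {
          ztempR += colorMatrix[i][j] / colorDenom[i] * jamp[j].real();
          ztempI += colorMatrix[i][j] / colorDenom[i] * jamp[j].imag();
        }
        deltaME += jamp[i].real() * ztempR + jamp[i].imag() * ztempI;
      }
      return deltaME;
    }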
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/color_sum.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/color_sum.cc new file mode 100644 index 0000000000..b76aa16029 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/color_sum.cc @@ -0,0 +1,387 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 9, 9, 9, 9, 9, 9 }; // 1-D array[6] + + // The color matrix (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 64, -8, -8, 1, 1, 10 }, + { -8, 64, 1, 10, -8, 1 }, + { -8, 1, 64, -8, 10, 1 }, + { 1, 10, -8, 64, 1, -8 }, + { 1, -8, 10, 1, 64, -8 }, + { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template<typename T> + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the upper-diagonal part of the matrix. 
+ // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = 
jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one 
fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + 
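+ // In matrix form, steps 1 and 2 compute, for each event column J = jamp(:,ievt): + // ztemp(:,ievt) = M * J (step 1: one GEMM over all nevt events, M = normalized color matrix) + // ME(ievt) += J^T * ztemp(:,ievt) (step 2: nevt 1x1 dot products via one strided-batched GEMM; beta=1 accumulates) + // done separately for the real and imaginary parts of the jamps, which suffices because M is real. + 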
+ // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/color_sum.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/diagram_boilerplate.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/diagrams.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/diagrams.h new file mode 100644 index 0000000000..91d334bc4e --- /dev/null +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/diagrams.h @@ -0,0 +1,509 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
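The per-diagram kernels in diagrams.h below all share one calling convention, with the common setup factored into diagram_boilerplate.h (which this diff does not show: it is added as a symlink above). A hypothetical sketch of the kind of glue that header must provide, inferred only from the names used inside diagramN (w_fp, amp_sv, amp_fp, J_ACCESS and the nullptr sanity asserts); the real header and its buffer striding may differ:

    // HYPOTHETICAL reconstruction for orientation only - not the actual header.
    #ifdef MGONGPUCPP_GPUIMPL
    using J_ACCESS = DeviceAccessJamp; // jamps buffer spans all events
    #else
    using J_ACCESS = HostAccessJamp; // jamps buffer spans one SIMD event page
    #endif
    fptype* w_fp[nwf]; // per-wavefunction views into the wfs super-buffer
    for( int iwf = 0; iwf < nwf; iwf++ )
      w_fp[iwf] = wfs + iwf * nw6 * mgOnGpu::nx2; // illustrative striding only
    cxtype_sv amp_sv[1]; // the amplitude of the current diagram
    fptype* amp_fp = reinterpret_cast<fptype*>( amp_sv ); // fptype view passed to HELAS calls as &amp_fp[0]
    #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
    // uniform interface: the three multichannel pointers must then be nullptr
    assert( channelIds == nullptr && numerators == nullptr && denominators == nullptr );
    #endif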
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb-1) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 16 *** + // Wavefunction(s) for diagram number 1 + vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); + vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + vxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); + VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[5] ); + FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[6] ); + // Amplitude(s) for diagram number 1 + VVV1_0( w_fp[5], w_fp[6], w_fp[4], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 16 *** + // Wavefunction(s) for diagram number 2 + FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[7] ); + // Amplitude(s) for diagram number 2 + FFV1_0( w_fp[3], w_fp[7], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 16 *** + // Wavefunction(s) for diagram number 3 + FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); + // Amplitude(s) for diagram number 3 + FFV1_0( w_fp[8], w_fp[2], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 16 *** + // Wavefunction(s) for diagram number 4 + FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] ); + FFV1_2( w_fp[3], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); + // Amplitude(s) for diagram number 4 + FFV1_0( w_fp[9], w_fp[5], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 16 *** + // Wavefunction(s) for diagram number 5 + VVV1P0_1( w_fp[1], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 5 + FFV1_0( w_fp[3], w_fp[5], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 6 OF 16 *** + // Wavefunction(s) for diagram number 6 + // (none) + // Amplitude(s) for diagram number 6 + FFV1_0( w_fp[8], w_fp[5], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram7( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU 
or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 7 OF 16 *** + // Wavefunction(s) for diagram number 7 + FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] ); + FFV1_1( w_fp[2], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); + // Amplitude(s) for diagram number 7 + FFV1_0( w_fp[5], w_fp[11], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram8( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 8 OF 16 *** + // Wavefunction(s) for diagram number 8 + // (none) + // Amplitude(s) for diagram number 8 + FFV1_0( w_fp[5], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram9( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity 
ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 9 OF 16 *** + // Wavefunction(s) for diagram number 9 + // (none) + // Amplitude(s) for diagram number 9 + FFV1_0( w_fp[5], w_fp[7], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram10( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 10 OF 16 *** + // Wavefunction(s) for diagram number 10 + VVV1P0_1( w_fp[0], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[5] ); + // Amplitude(s) for diagram number 10 + FFV1_0( w_fp[3], w_fp[11], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram11( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 11 OF 16 *** + // Wavefunction(s) for 
diagram number 11 + // (none) + // Amplitude(s) for diagram number 11 + FFV1_0( w_fp[9], w_fp[2], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram12( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 12 OF 16 *** + // Wavefunction(s) for diagram number 12 + // (none) + // Amplitude(s) for diagram number 12 + VVV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram13( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 13 OF 16 *** + // Wavefunction(s) for diagram number 13 + // (none) + // Amplitude(s) for diagram number 13 + FFV1_0( w_fp[8], w_fp[11], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 
amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram14( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 14 OF 16 *** + // Wavefunction(s) for diagram number 14 + // (none) + // Amplitude(s) for diagram number 14 + FFV1_0( w_fp[9], w_fp[7], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram15( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 15 OF 16 *** + // Wavefunction(s) for diagram number 15 + // (none) + // Amplitude(s) for diagram number 15 + VVV1_0( w_fp[0], w_fp[10], w_fp[6], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram16( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* 
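For orientation, the split kernels above are designed to be invoked once per helicity, in diagram order, on the same wavefunction and jamp buffers; diagram1 additionally computes the external wavefunctions. A minimal host-side sketch of such a chain in the GPU build follows; the wrapper name and launch parameters are hypothetical and this is not the plugin's actual scheduling code:

  // Sketch only (assumed names): chain the generated diagram kernels for one helicity
  void launchDiagramsOneHelicity_sketch( fptype* wfs, fptype* jamps, const unsigned int* channelIds,
                                         const fptype* couplings, fptype* numerators, fptype* denominators,
                                         const fptype* momenta, int ihel, int gpublocks, int gputhreads )
  {
    // diagram1 also fills the external wavefunctions for this helicity
    diagram1<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel );
    // diagrams 2..16 reuse wavefunctions computed by earlier kernels and accumulate into jamps
    diagram2<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
    // ... diagram3 up to diagram15 launched in the same way ...
    diagram16<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
    // the accumulated jamps are then reduced to |M|^2 by the colour sum (see color_sum.h below)
  }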
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/color_sum.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/color_sum.h
new file mode 100644
index 0000000000..71b5b1eaad
--- /dev/null
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/color_sum.h
@@ -0,0 +1,105 @@
+// Copyright (C) 2020-2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin.
+
+#ifndef COLOR_SUM_H
+#define COLOR_SUM_H 1
+
+#include "mgOnGpuConfig.h"
+
+#include "mgOnGpuVectors.h"
+
+#include "CPPProcess.h"
+#include "GpuAbstraction.h"
+
+#ifdef MGONGPUCPP_GPUIMPL
+namespace mg5amcGpu
+#else
+namespace mg5amcCpu
+#endif
+{
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  class DeviceAccessJamp
+  {
+  public:
+    static __device__ inline cxtype_ref
+    kernelAccessIcol( fptype* buffer, const int icol )
+    {
+      const int ncolor = CPPProcess::ncolor; // the number of leading colors
+      const int nevt = gridDim.x * blockDim.x;
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last)
+      //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old"
+      // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last)
+      // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS
+      return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1"
+      // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last)
+      //return cxtype_ref( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2"
+    }
+    static __device__ inline const cxtype
+    kernelAccessIcolConst( const fptype* buffer, const int icol )
+    {
+      const int ncolor = CPPProcess::ncolor; // the number of leading colors
+      const int nevt = gridDim.x * blockDim.x;
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last)
+      //return cxtype( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old"
+      // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last)
+      // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS
+      return cxtype( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1"
+      // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last)
+      //return cxtype( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2"
+    }
+  };
+#else
+  class HostAccessJamp
+  {
+  public:
+    static inline cxtype_sv&
+    kernelAccessIcol( cxtype_sv* buffer, const int icol )
+    {
+      return buffer[icol];
+    }
+    static inline cxtype_sv&
+    kernelAccessIcol( fptype* buffer, const int icol )
+    {
+      return reinterpret_cast<cxtype_sv*>( buffer )[icol];
+    }
+  };
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  void createNormalizedColorMatrix();
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifndef MGONGPUCPP_GPUIMPL
+  void
+  color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity
+                 const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity
+                 const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid)
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  void
+  color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity
+                 const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity
+                 fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity
+                 gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream)
+                 gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+                 const int gpublocks, // input: cuda gpublocks
+                 const int gputhreads ); // input: cuda gputhreads
+#endif
+
+  //--------------------------------------------------------------------------
+}
+
+#endif // COLOR_SUM_H
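To make the "new1" striding chosen above concrete: all real parts are stored first (one length-nevt row per colour), followed by all imaginary parts in the same layout, so that each (real/imag, icol) row is contiguous across events. A small self-contained sketch of the index arithmetic, with a hypothetical helper name for illustration:

  // Sketch of the "new1" jamp striding: part is 0 for real, 1 for imag
  inline int jampIndexNew1( int part, int icol, int ievt, int ncolor, int nevt )
  {
    return part * ncolor * nevt + icol * nevt + ievt;
  }
  // Example: with nevt=4 and ncolor=6, the imaginary part of colour 2 of event 3
  // sits at index 1*6*4 + 2*4 + 3 = 35; the commented-out "old" striding would
  // instead give icol*2*nevt + nevt + ievt = 2*8 + 4 + 3 = 23.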
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk
index 20d8ded718..e7360b29e2 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk
@@ -1,7 +1,7 @@
-# Copyright (C) 2020-2024 CERN and UCLouvain.
+# Copyright (C) 2020-2025 CERN and UCLouvain.
 # Licensed under the GNU Lesser General Public License (version 3 or later).
 # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin.
-# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.
 
 #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html)
 #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts
@@ -114,7 +114,7 @@ export CXXFLAGS
 override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null))
 
 # Set HIP_HOME from the path to hipcc, if it exists
-override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null))
+override HIP_HOME = $(shell hipconfig --rocmpath)
 
 # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965)
 ifeq ($(CUDA_HOME),)
@@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda)
 
 else ifeq ($(BACKEND),hip)
 
+  # example architecture values MI200:gfx90a, MI300X:gfx942
+  MADGRAPH_HIP_ARCHITECTURE ?= gfx942
   # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists)
   GPUCC = $(HIP_HOME)/bin/hipcc
   XCOMPILERFLAG =
@@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip)
   ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY
 
   # AMD HIP architecture flags
-  GPUARCHFLAGS = --offload-arch=gfx90a
+  GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE}
   GPUFLAGS += $(GPUARCHFLAGS)
 
   # Other AMD-specific flags
@@ -477,6 +479,34 @@ endif
 
 #-------------------------------------------------------------------------------
 
+#=== Configure defaults and check if user-defined choices exist for HASBLAS
+
+# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS
+
+ifeq ($(HASBLAS),)
+  ifeq ($(GPUCC),) # CPU-only build
+    override HASBLAS = hasNoBlas
+  else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+    ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),)
+      # cuBLAS headers do not exist??
+      override HASBLAS = hasNoBlas
+    else
+      override HASBLAS = hasBlas
+    endif
+  else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build
+    ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),)
+      # hipBLAS headers do not exist??
+      override HASBLAS = hasNoBlas
+    else
+      override HASBLAS = hasBlas
+    endif
+  else
+    override HASBLAS = hasNoBlas
+  endif
+endif
+
+#-------------------------------------------------------------------------------
+
 #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD
 
 # Set the build flags appropriate to OMPFLAGS
@@ -597,6 +627,30 @@ endif
 #$(info RNDCXXFLAGS=$(RNDCXXFLAGS))
 #$(info RNDLIBFLAGS=$(RNDLIBFLAGS))
 
+#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS
+
+$(info HASBLAS=$(HASBLAS))
+override BLASCXXFLAGS=
+override BLASLIBFLAGS=
+
+# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas")
+ifeq ($(HASBLAS),hasNoBlas)
+  override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS
+else ifeq ($(HASBLAS),hasBlas)
+  ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+    override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas
+  else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build
+    override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas
+  endif
+else
+  $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported)
+endif
+CXXFLAGS += $(BLASCXXFLAGS)
+GPUFLAGS += $(BLASCXXFLAGS)
+
+#$(info BLASCXXFLAGS=$(BLASCXXFLAGS))
+#$(info BLASLIBFLAGS=$(BLASLIBFLAGS))
+
 #-------------------------------------------------------------------------------
 
 #=== Configure Position-Independent Code
@@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
 MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX)
-gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o
+gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o
 gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o
 endif
@@ -799,7 +853,7 @@ ifneq ($(GPUCC),)
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib)
-	$(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB)
+	$(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS)
 # Bypass std::filesystem completely to ease portability on LUMI #803
 #ifneq ($(findstring hipcc,$(GPUCC)),)
 #	$(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs
@@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
 $(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc
 endif
 $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
+$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS)
 $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o
 	$(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS)
 endif
@@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin)
 $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375
 endif
 $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
+$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS)
 $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe)
 ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802
-	$(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64
+	$(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64
 else
 	$(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe)
 endif
@@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob
 else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o)
 ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN)
 $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
+$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS)
 $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS)
 ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802
-	$(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64
+	$(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64
 else
 	$(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda
 endif
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/diagram_boilerplate.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/diagram_boilerplate.h
new file mode 100644
index 0000000000..96a34fb1bf
--- /dev/null
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/diagram_boilerplate.h
@@ -0,0 +1,103 @@
+// Copyright (C) 2020-2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: A. Valassi (Nov 2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin.
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-local-typedefs" // for CI_ACCESS and CD_ACCESS
+
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+
+  //-------------
+  // GPU only
+  //-------------
+
+  //using namespace mg5amcGpu;
+  using W_ACCESS = DeviceAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events
+  using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event
+  using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events
+  using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event
+  using J_ACCESS = DeviceAccessJamp;
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events
+  using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events
+  // SCALAR channelId for the current event (CUDA)
+  unsigned int channelId = gpu_channelId( channelIds );
+#endif
+
+  // Wavefunctions
+  // Buffer wfs for one helicity and nevt events is a DeviceBufferSimple with ( nwf * nevt * nw6 * nx2 ) fptypes
+  // The striding between the nwf wavefunction buffers is ( nevt * nw6 * nx2 ) fptypes
+  // Internally diagramXXX methods pass a w_fp[iwf] to ixx/FFV methods (as argument 'fptype wavefunctions[]')
+  // Internally ixx/FFV methods call 'cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions )' and then use fi[iw6]
+  // This means that the fi pointer must point to a [RIRIRIRIRIRI] contiguous buffer of size nw6*nx2=12
+  // The striding between events is nw6*nx2=12 and this is what W_ACCESS::kernelAccess must respect
+  // (En passant, note that this means that events cannot be contiguous in the present code, memory is not coalesced)
+  const int nevt = gridDim.x * blockDim.x;
+  fptype* w_fp[nwf];
+  for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = wfs + iwf * nevt * nw6 * mgOnGpu::nx2;
+
+  // Couplings
+  constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup)
+  const fptype* allCOUPs[nxcoup];
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
+#pragma nv_diagnostic push
+#pragma nv_diag_suppress 186 // e.g. <<warning #186-D: pointless comparison of unsigned integer with zero>>
+#endif
+  // Dependent couplings, vary event-by-event
+  for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
+    allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup );
+  // Independent couplings, fixed for all events
+  for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup)
+    allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup );
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
+#pragma nv_diagnostic pop
+#endif
+  const fptype* COUPs[nxcoup];
+  for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
+
+#else
+
+  //-------------
+  // C++ only
+  //-------------
+
+  //using namespace mg5amcCpu;
+  using W_ACCESS = HostAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events
+  using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event
+  using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events
+  using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event
+  using J_ACCESS = HostAccessJamp;
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events
+  using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events
+  // SCALAR channelId for the current SIMD event page (C++)
+  unsigned int channelId = *channelIds;
+#endif
+
+  // Wavefunctions
+  // Reinterpret wfs as "cxtype_sv w_sv[nwf][nw6]" and build "fptype* w_fp[nwf]" where "w_fp[iwf] = (fptype*)( w_sv[iwf] )"
+  fptype (*w_fp)[nw6 * neppV * mgOnGpu::nx2] = (fptype (*)[nw6 * neppV * mgOnGpu::nx2])(wfs);
+
+#endif
+
+  //-------------
+  // GPU or C++
+  //-------------
+
+  // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV)
+  cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram
+  fptype* amp_fp; // proof of concept for using fptype* in the interface
+  amp_fp = reinterpret_cast<fptype*>( amp_sv );
+
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
+  fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+  fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
+#else
+  assert( channelIds == nullptr );
+  assert( numerators == nullptr );
+  assert( denominators == nullptr );
+#endif /* clang-format on */
+
+#pragma GCC diagnostic pop
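One detail worth spelling out: on GPU the channelIds buffer holds one entry per event, while in C++ the whole SIMD event page shares a single scalar (hence the plain *channelIds dereference above). The following is a sketch of the per-thread lookup that gpu_channelId is assumed to perform; the function body is an illustration of the assumed behaviour, not the plugin's actual implementation:

  // Sketch (assumed behaviour) of the per-event channelId lookup on GPU
  __device__ inline unsigned int
  gpu_channelId_sketch( const unsigned int* channelIds )
  {
    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // same indexing as DeviceAccessJamp
    return channelIds[ievt]; // 1 to #diagrams, 0 to disable SDE
  }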
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/runTest.cc
index 4eec5db13c..216a90a302 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/runTest.cc
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/runTest.cc
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.
 //----------------------------------------------------------------------------
 // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests
 //----------------------------------------------------------------------------
@@ -22,6 +22,8 @@
 #endif
 #include "epoch_process_id.h"
+#include 
+
 #ifdef MGONGPUCPP_GPUIMPL
 using namespace mg5amcGpu;
 #else
diff --git a/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h
index ff9f0d7f00..cb66251689 100644
--- a/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h
+++ b/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h
@@ -2,13 +2,13 @@
 // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors.
 // Created by: J. Alwall (Sep 2010) for the MG5aMC backend.
 //==========================================================================
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
@@ -860,7 +860,7 @@ namespace mg5amcCpu
 //==========================================================================
 
   // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6]
-  template<class W_ACCESS, class A_ACCESS, class C_ACCESS>
+  template<class W_ACCESS, class A_ACCESS, class CD_ACCESS>
   __device__ INLINE void
   VVV1_0( const fptype allV1[],
           const fptype allV2[],
@@ -872,7 +872,7 @@
 //--------------------------------------------------------------------------
 
   // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6]
-  template<class W_ACCESS, class C_ACCESS>
+  template<class W_ACCESS, class CD_ACCESS>
   __device__ INLINE void
   VVV1P0_1( const fptype allV2[],
             const fptype allV3[],
@@ -885,7 +885,7 @@
 //--------------------------------------------------------------------------
 
   // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6]
-  template<class W_ACCESS, class A_ACCESS, class C_ACCESS>
+  template<class W_ACCESS, class A_ACCESS, class CD_ACCESS>
   __device__ INLINE void
   FFV1_0( const fptype allF1[],
           const fptype allF2[],
@@ -897,7 +897,7 @@
 //--------------------------------------------------------------------------
 
   // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6]
-  template<class W_ACCESS, class C_ACCESS>
+  template<class W_ACCESS, class CD_ACCESS>
   __device__ INLINE void
   FFV1_1( const fptype allF2[],
           const fptype allV3[],
@@ -910,7 +910,7 @@
 //--------------------------------------------------------------------------
 
   // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6]
-  template<class W_ACCESS, class C_ACCESS>
+  template<class W_ACCESS, class CD_ACCESS>
   __device__ INLINE void
   FFV1_2( const fptype allF1[],
           const fptype allV3[],
@@ -923,7 +923,7 @@
 //--------------------------------------------------------------------------
 
   // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6]
-  template<class W_ACCESS, class C_ACCESS>
+  template<class W_ACCESS, class CD_ACCESS>
   __device__ INLINE void
   FFV1P0_3( const fptype allF1[],
             const fptype allF2[],
@@ -936,7 +936,7 @@
 //--------------------------------------------------------------------------
 
   // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6]
-  template<class W_ACCESS, class C_ACCESS>
+  template<class W_ACCESS, class CD_ACCESS>
   __device__ INLINE void
   VVVV1P0_1( const fptype allV2[],
              const fptype allV3[],
@@ -950,7 +950,7 @@
 //--------------------------------------------------------------------------
 
   // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6]
-  template<class W_ACCESS, class C_ACCESS>
+  template<class W_ACCESS, class CD_ACCESS>
   __device__ INLINE void
   VVVV3P0_1( const fptype allV2[],
              const fptype allV3[],
@@ -964,7 +964,7 @@
 //--------------------------------------------------------------------------
 
   // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6]
-  template<class W_ACCESS, class C_ACCESS>
+  template<class W_ACCESS, class CD_ACCESS>
   __device__ INLINE void
   VVVV4P0_1( const fptype allV2[],
              const fptype allV3[],
@@ -978,7 +978,7 @@
 //==========================================================================
 
   // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6]
-  template<class W_ACCESS, class A_ACCESS, class C_ACCESS>
+  template<class W_ACCESS, class A_ACCESS, class CD_ACCESS>
   __device__ void
   VVV1_0( const fptype allV1[],
           const fptype allV2[],
@@ -991,7 +991,7 @@ namespace mg5amcCpu
     const cxtype_sv* V1 = W_ACCESS::kernelAccessConst( allV1 );
     const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 );
     const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 );
-    const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP );
+    const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP );
     cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes );
     const cxtype cI = cxmake( 0., 1. );
     const fptype_sv P1[4] = { +cxreal( V1[0] ), +cxreal( V1[1] ), +cximag( V1[1] ), +cximag( V1[0] ) };
@@ -1014,7 +1014,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
 
   // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6]
-  template<class W_ACCESS, class C_ACCESS>
+  template<class W_ACCESS, class CD_ACCESS>
   __device__ void
   VVV1P0_1( const fptype allV2[],
             const fptype allV3[],
@@ -1027,7 +1027,7 @@ namespace mg5amcCpu
     mgDebug( 0, __FUNCTION__ );
     const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 );
     const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 );
-    const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP );
+    const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP );
     cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 );
     const cxtype cI = cxmake( 0., 1. );
     const fptype_sv P2[4] = { +cxreal( V2[0] ), +cxreal( V2[1] ), +cximag( V2[1] ), +cximag( V2[0] ) };
@@ -1052,7 +1052,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
 
   // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6]
-  template<class W_ACCESS, class A_ACCESS, class C_ACCESS>
+  template<class W_ACCESS, class A_ACCESS, class CD_ACCESS>
   __device__ void
   FFV1_0( const fptype allF1[],
           const fptype allF2[],
@@ -1065,7 +1065,7 @@ namespace mg5amcCpu
     const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 );
     const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 );
     const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 );
-    const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP );
+    const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP );
     cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes );
     const cxtype cI = cxmake( 0., 1. );
     const cxtype_sv TMP9 = ( F1[2] * ( F2[4] * ( V3[2] + V3[5] ) + F2[5] * ( V3[3] + cI * V3[4] ) ) + ( F1[3] * ( F2[4] * ( V3[3] - cI * V3[4] ) + F2[5] * ( V3[2] - V3[5] ) ) + ( F1[4] * ( F2[2] * ( V3[2] - V3[5] ) - F2[3] * ( V3[3] + cI * V3[4] ) ) + F1[5] * ( F2[2] * ( -V3[3] + cI * V3[4] ) + F2[3] * ( V3[2] + V3[5] ) ) ) ) );
@@ -1077,7 +1077,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
 
   // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6]
-  template<class W_ACCESS, class C_ACCESS>
+  template<class W_ACCESS, class CD_ACCESS>
   __device__ void
   FFV1_1( const fptype allF2[],
           const fptype allV3[],
@@ -1090,7 +1090,7 @@ namespace mg5amcCpu
     mgDebug( 0, __FUNCTION__ );
     const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 );
     const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 );
-    const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP );
+    const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP );
     cxtype_sv* F1 = W_ACCESS::kernelAccess( allF1 );
     const cxtype cI = cxmake( 0., 1. );
     F1[0] = +F2[0] + V3[0];
@@ -1109,7 +1109,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
 
   // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6]
-  template<class W_ACCESS, class C_ACCESS>
+  template<class W_ACCESS, class CD_ACCESS>
   __device__ void
   FFV1_2( const fptype allF1[],
           const fptype allV3[],
@@ -1122,7 +1122,7 @@ namespace mg5amcCpu
     mgDebug( 0, __FUNCTION__ );
     const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 );
     const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 );
-    const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP );
+    const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP );
     cxtype_sv* F2 = W_ACCESS::kernelAccess( allF2 );
     const cxtype cI = cxmake( 0., 1. );
     F2[0] = +F1[0] + V3[0];
@@ -1141,7 +1141,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
 
   // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6]
-  template<class W_ACCESS, class C_ACCESS>
+  template<class W_ACCESS, class CD_ACCESS>
   __device__ void
   FFV1P0_3( const fptype allF1[],
             const fptype allF2[],
@@ -1154,7 +1154,7 @@ namespace mg5amcCpu
     mgDebug( 0, __FUNCTION__ );
     const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 );
     const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 );
-    const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP );
+    const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP );
     cxtype_sv* V3 = W_ACCESS::kernelAccess( allV3 );
     const cxtype cI = cxmake( 0., 1. );
     V3[0] = +F1[0] + F2[0];
@@ -1172,7 +1172,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
 
   // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6]
-  template<class W_ACCESS, class C_ACCESS>
+  template<class W_ACCESS, class CD_ACCESS>
   __device__ void
   VVVV1P0_1( const fptype allV2[],
              const fptype allV3[],
@@ -1187,7 +1187,7 @@ namespace mg5amcCpu
     const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 );
     const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 );
     const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 );
-    const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP );
+    const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP );
     cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 );
     const cxtype cI = cxmake( 0., 1. );
     V1[0] = +V2[0] + V3[0] + V4[0];
@@ -1207,7 +1207,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
 
   // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6]
-  template<class W_ACCESS, class C_ACCESS>
+  template<class W_ACCESS, class CD_ACCESS>
   __device__ void
   VVVV3P0_1( const fptype allV2[],
             const fptype allV3[],
@@ -1222,7 +1222,7 @@ namespace mg5amcCpu
     const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 );
     const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 );
     const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 );
-    const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP );
+    const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP );
     cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 );
     const cxtype cI = cxmake( 0., 1. );
     V1[0] = +V2[0] + V3[0] + V4[0];
@@ -1242,7 +1242,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
 
   // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6]
-  template<class W_ACCESS, class C_ACCESS>
+  template<class W_ACCESS, class CD_ACCESS>
   __device__ void
   VVVV4P0_1( const fptype allV2[],
             const fptype allV3[],
@@ -1257,7 +1257,7 @@ namespace mg5amcCpu
     const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 );
     const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 );
     const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 );
-    const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP );
+    const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP );
     cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 );
     const cxtype cI = cxmake( 0., 1. );
     V1[0] = +V2[0] + V3[0] + V4[0];
diff --git a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc
index 47a3a011b8..fd5642f3e3 100644
--- a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc
+++ b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc
@@ -1,13 +1,13 @@
 // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors.
 // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend.
 //==========================================================================
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
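The rename from C_ACCESS to CD_ACCESS in the HelAmps templates above is cosmetic but deliberate: the declaration now uses the same spelling as the aliases that the diagram boilerplate defines at the call sites, which distinguish event-dependent couplings from fixed ones. For illustration, with the aliases from diagram_boilerplate.h, a call in a diagram kernel reads:

  // Aliases from the boilerplate (GPU branch shown):
  using CD_ACCESS = DeviceAccessCouplings;      // dependent couplings, vary per event
  using CI_ACCESS = DeviceAccessCouplingsFixed; // independent couplings, fixed for all events
  // Instantiation with a gs-dependent coupling, as in the diagram kernels above:
  VVV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[5], w_fp[6], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );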
// Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -310,7 +310,7 @@ namespace mg5amcCpu #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #endif // Compute the output couplings (e.g. gc10 and gc11) from the input gs - template<class G_ACCESS, class C_ACCESS> + template<class G_ACCESS, class CD_ACCESS> __device__ inline void G2COUP( const fptype gs[], fptype couplings[], @@ -320,12 +320,12 @@ namespace mg5amcCpu using namespace Parameters_sm_dependentCouplings; const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr ); - fptype* GC_10s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); - fptype* GC_11s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); - fptype* GC_12s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_12 ); - cxtype_sv_ref GC_10s_sv = C_ACCESS::kernelAccess( GC_10s ); - cxtype_sv_ref GC_11s_sv = C_ACCESS::kernelAccess( GC_11s ); - cxtype_sv_ref GC_12s_sv = C_ACCESS::kernelAccess( GC_12s ); + fptype* GC_10s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); + fptype* GC_11s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); + fptype* GC_12s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_12 ); + cxtype_sv_ref GC_10s_sv = CD_ACCESS::kernelAccess( GC_10s ); + cxtype_sv_ref GC_11s_sv = CD_ACCESS::kernelAccess( GC_11s ); + cxtype_sv_ref GC_12s_sv = CD_ACCESS::kernelAccess( GC_12s ); GC_10s_sv = couplings_sv.GC_10; GC_11s_sv = couplings_sv.GC_11; GC_12s_sv = couplings_sv.GC_12; diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h index d3c4ca5695..8a2804aa04 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g.
-DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for generating random numbers +// For both CUDA and HIP, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h index 92d74fd6db..e98e925f2a 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -717,12 +717,24 @@ namespace mg5amcCpu : m_preal( &r ), m_pimag( &i ) {} // copy (create from) const refs cxtype_ref& operator=( const cxtype_ref& ) = delete; //__host__ __device__ cxtype_ref& operator=( cxtype_ref&& c ) {...} // REMOVED! Should copy refs or copy values? No longer needed in cxternary - __host__ __device__ cxtype_ref& operator=( const cxtype& c ) + __host__ __device__ cxtype_ref& operator=( const cxtype& c ) // copy (assign) const values { *m_preal = cxreal( c ); *m_pimag = cximag( c ); return *this; - } // copy (assign) non-const values + } + __host__ __device__ cxtype_ref& operator+=( const cxtype& c ) + { + *m_preal += cxreal( c ); + *m_pimag += cximag( c ); + return *this; + } + __host__ __device__ cxtype_ref& operator-=( const cxtype& c ) + { + *m_preal -= cxreal( c ); + *m_pimag -= cximag( c ); + return *this; + } __host__ __device__ operator cxtype() const { return cxmake( *m_preal, *m_pimag ); } private: fptype* const m_preal; // const pointer to non-const fptype R diff --git a/epochX/cudacpp/gg_ttg.sa/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttg.sa/test/cudacpp_test.mk index f703a1ae7c..1f9f8bbc46 100644 --- a/epochX/cudacpp/gg_ttg.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_ttg.sa/test/cudacpp_test.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) @@ -19,7 +19,7 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index 78cdfd68b2..4b41dc7b62 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.3 2025-06-12 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -49,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
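The two new compound-assignment operators on cxtype_ref above let callers accumulate complex values directly through the reference proxy, instead of a read-modify-write round trip through a cxtype temporary. A self-contained sketch of the same proxy idea (ToyCxRef is hypothetical; std::complex stands in for cxtype, and the split real/imag slots mimic the strided storage the real class points into):

```cpp
#include <cassert>
#include <complex>

using fptype = double;
using cxtype = std::complex<fptype>;

// Minimal stand-in for cxtype_ref: a write-through proxy over real and
// imaginary parts that live in two separate fptype slots.
class ToyCxRef
{
public:
  ToyCxRef( fptype& r, fptype& i ) : m_preal( &r ), m_pimag( &i ) {}
  ToyCxRef& operator=( const cxtype& c ) { *m_preal = c.real(); *m_pimag = c.imag(); return *this; }
  // The two operators added in the diff: in-place accumulation on the proxy
  ToyCxRef& operator+=( const cxtype& c ) { *m_preal += c.real(); *m_pimag += c.imag(); return *this; }
  ToyCxRef& operator-=( const cxtype& c ) { *m_preal -= c.real(); *m_pimag -= c.imag(); return *this; }
  operator cxtype() const { return cxtype( *m_preal, *m_pimag ); }
private:
  fptype* const m_preal; // const pointer to non-const real part
  fptype* const m_pimag; // const pointer to non-const imaginary part
};

int main()
{
  fptype re[1] = { 1. }, im[1] = { 2. }; // split real/imag storage
  ToyCxRef ref( re[0], im[0] );
  ref += cxtype( 1., 3. ); // accumulate straight into the split storage
  ref -= cxtype( 0., 1. );
  assert( cxtype( ref ) == cxtype( 2., 4. ) );
  return 0;
}
```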
Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -58,7 +58,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006547212600708008  +DEBUG: model prefixing takes 0.005442380905151367  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -151,21 +151,21 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.171 s +1 processes with 123 diagrams generated in 0.158 s Total: 1 processes with 123 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4334]  +DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4166]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  INFO: initialize a new directory: CODEGEN_mad_gg_ttgg INFO: remove old information in CODEGEN_mad_gg_ttgg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 @@ -177,25 +177,25 @@ FileWriter t t~ g g WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -DEBUG: len(subproc_diagrams_for_config) =  105 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 18: 19, 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 26, 26: 27, 27: 28, 28: 29, 29: 30, 30: 31, 31: 33, 32: 34, 33: 35, 
34: 36, 35: 37, 36: 38, 37: 39, 38: 40, 39: 41, 40: 42, 41: 43, 42: 44, 43: 45, 44: 46, 45: 47, 46: 49, 47: 50, 48: 51, 49: 52, 50: 53, 51: 54, 52: 55, 53: 56, 54: 57, 55: 59, 56: 60, 57: 61, 58: 62, 59: 63, 60: 64, 61: 65, 62: 66, 63: 67, 64: 68, 65: 69, 66: 70, 67: 71, 68: 72, 69: 73, 70: 75, 71: 76, 72: 77, 73: 78, 74: 79, 75: 80, 76: 81, 77: 82, 78: 83, 79: 84, 80: 85, 81: 86, 82: 87, 83: 88, 84: 89, 85: 90, 86: 91, 87: 92, 88: 94, 89: 95, 90: 96, 91: 97, 92: 98, 93: 99, 94: 101, 95: 102, 96: 103, 97: 104, 98: 105, 99: 106, 100: 108, 101: 109, 102: 110, 103: 111, 104: 112, 105: 113} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 33: 31, 34: 32, 35: 33, 36: 34, 37: 35, 38: 36, 39: 37, 40: 38, 41: 39, 42: 40, 43: 41, 44: 42, 45: 43, 46: 44, 47: 45, 49: 46, 50: 47, 51: 48, 52: 49, 53: 50, 54: 51, 55: 52, 56: 53, 57: 54, 59: 55, 60: 56, 61: 57, 62: 58, 63: 59, 64: 60, 65: 61, 66: 62, 67: 63, 68: 64, 69: 65, 70: 66, 71: 67, 72: 68, 73: 69, 75: 70, 76: 71, 77: 72, 78: 73, 79: 74, 80: 75, 81: 76, 82: 77, 83: 78, 84: 79, 85: 80, 86: 81, 87: 82, 88: 83, 89: 84, 90: 85, 91: 86, 92: 87, 94: 88, 95: 89, 96: 90, 97: 91, 98: 92, 99: 93, 101: 94, 102: 95, 103: 96, 104: 97, 105: 98, 106: 99, 108: 100, 109: 101, 110: 102, 111: 103, 112: 104, 113: 105} [model_handling.py at line 1552]  -Generated helas calls for 1 subprocesses (123 diagrams) in 0.423 s -Wrote files for 222 helas calls in 0.660 s +DEBUG: len(subproc_diagrams_for_config) =  105 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 18: 19, 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 26, 26: 27, 27: 28, 28: 29, 29: 30, 30: 31, 31: 33, 32: 34, 33: 35, 34: 36, 35: 37, 36: 38, 37: 39, 38: 40, 39: 41, 40: 42, 41: 43, 42: 44, 43: 45, 44: 46, 45: 47, 46: 49, 47: 50, 48: 51, 49: 52, 50: 53, 51: 54, 52: 55, 53: 56, 54: 57, 55: 59, 56: 60, 57: 61, 58: 62, 59: 63, 60: 64, 61: 65, 62: 66, 63: 67, 64: 68, 65: 69, 66: 70, 67: 71, 68: 72, 69: 73, 70: 75, 71: 76, 72: 77, 73: 78, 74: 79, 75: 80, 76: 81, 77: 82, 78: 83, 79: 84, 80: 85, 81: 86, 82: 87, 83: 88, 84: 89, 85: 90, 86: 91, 87: 92, 88: 94, 89: 95, 90: 96, 91: 97, 92: 98, 93: 99, 94: 101, 95: 102, 96: 103, 97: 104, 98: 105, 99: 106, 100: 108, 101: 109, 102: 110, 103: 111, 104: 112, 105: 113} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 33: 31, 34: 32, 35: 33, 36: 34, 37: 35, 38: 36, 39: 37, 40: 38, 41: 39, 42: 40, 43: 41, 44: 42, 45: 43, 46: 44, 47: 45, 49: 46, 50: 47, 51: 48, 52: 49, 53: 50, 54: 51, 55: 52, 56: 53, 57: 54, 59: 55, 60: 56, 61: 57, 62: 58, 63: 59, 64: 60, 65: 61, 66: 62, 67: 63, 68: 64, 69: 65, 70: 66, 71: 67, 72: 68, 73: 69, 75: 70, 76: 71, 77: 72, 78: 73, 79: 74, 80: 75, 81: 76, 82: 77, 83: 78, 84: 79, 85: 80, 86: 81, 87: 82, 88: 83, 89: 84, 90: 85, 91: 86, 92: 87, 94: 88, 95: 89, 96: 90, 97: 91, 98: 92, 99: 93, 101: 94, 102: 95, 103: 96, 104: 97, 105: 98, 106: 99, 108: 100, 109: 101, 110: 102, 111: 103, 112: 104, 113: 105} [model_handling.py at line 1665]  +Generated helas calls 
for 1 subprocesses (123 diagrams) in 0.424 s +Wrote files for 222 helas calls in 0.655 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.328 s +ALOHA: aloha creates 5 routines in 0.327 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.313 s +ALOHA: aloha creates 10 routines in 0.306 s VVV1 VVV1 FFV1 @@ -208,38 +208,38 @@ ALOHA: aloha creates 10 routines in 0.313 s VVVV3 VVVV4 VVVV4 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg; patch -p4 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file SubProcesses/makefile -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/SubProcesses/P1_gg_ttxgg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/SubProcesses/P1_gg_ttxgg; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #2 succeeded at 275 (offset 48 lines). -DEBUG: p.returncode =  0 [output.py at line 263]  -Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg done. +DEBUG: p.returncode =  0 [output.py at line 268]  +Output to directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg done. Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/README +/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/README Run "open index.html" to see more information about this process. quit -real 0m4.934s -user 0m3.516s -sys 0m0.277s -Code generation completed in 5 seconds +real 0m3.826s +user 0m3.524s +sys 0m0.296s +Code generation completed in 4 seconds ************************************************************ * * * W E L C O M E to * @@ -252,7 +252,7 @@ Code generation completed in 5 seconds * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.3 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -260,9 +260,9 @@ Code generation completed in 5 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt Using default text editor "vi". 
Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt @@ -282,7 +282,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.3 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -290,9 +290,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt diff --git a/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt index 68b4c46295..07d8d59d1b 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat index 1fa5e235b3..dcbb38ba34 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.6.0 2024-09-30 * +#* VERSION 3.6.3 2025-06-12 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/gg_ttgg.mad/Cards/run_card.dat b/epochX/cudacpp/gg_ttgg.mad/Cards/run_card.dat index ecdc7fd25c..964b954d74 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Cards/run_card.dat +++ b/epochX/cudacpp/gg_ttgg.mad/Cards/run_card.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/gg_ttgg.mad/Cards/run_card_default.dat b/epochX/cudacpp/gg_ttgg.mad/Cards/run_card_default.dat index 7ec841d6c2..308f5bed4f 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/gg_ttgg.mad/Cards/run_card_default.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! 
minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/gg_ttgg.mad/MGMEVersion.txt b/epochX/cudacpp/gg_ttgg.mad/MGMEVersion.txt index 084e244cea..1ac53bb4bd 100644 --- a/epochX/cudacpp/gg_ttgg.mad/MGMEVersion.txt +++ b/epochX/cudacpp/gg_ttgg.mad/MGMEVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.3 \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.mad/Source/.make_opts b/epochX/cudacpp/gg_ttgg.mad/Source/.make_opts index de3864242b..56ba259c56 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Source/.make_opts +++ b/epochX/cudacpp/gg_ttgg.mad/Source/.make_opts @@ -102,6 +102,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf + alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -113,10 +114,11 @@ ifneq ($(lhapdf),) endif else alfas_functions=alfas_functions + alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif # Helper function to check MG5 version define CHECK_MG5AMC_VERSION python -c 'import re; from distutils.version import StrictVersion; print StrictVersion("$(MG5AMC_VERSION)") >= StrictVersion("$(1)") if re.match("^[\d\.]+$$","$(MG5AMC_VERSION)") else True;' -endef \ No newline at end of file +endef diff --git a/epochX/cudacpp/gg_ttgg.mad/Source/alfas_functions.f b/epochX/cudacpp/gg_ttgg.mad/Source/alfas_functions.f index bb69a6384e..84aeff369c 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Source/alfas_functions.f +++ b/epochX/cudacpp/gg_ttgg.mad/Source/alfas_functions.f @@ -188,6 +188,10 @@ SUBROUTINE NEWTON1(T,A_IN,A_OUT,NLOOP,NF) A_OUT=A_IN/(1D0+A_IN*B0(NF)*T) IF (NLOOP .EQ. 1) RETURN + if (1D0+A_IN*B0(NF)*T.le.0d0)THEN + A_OUT = 9d98 + RETURN + ENDIF A_OUT=A_IN/(1D0+B0(NF)*A_IN*T+C1(NF)*A_IN*LOG(1D0+A_IN*B0(NF)*T)) IF (A_OUT .LT. 
0D0) AS=0.3D0 30 AS=A_OUT diff --git a/epochX/cudacpp/gg_ttgg.mad/Source/cuts.inc b/epochX/cudacpp/gg_ttgg.mad/Source/cuts.inc index 23d099e5f7..a8ccc7420d 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Source/cuts.inc +++ b/epochX/cudacpp/gg_ttgg.mad/Source/cuts.inc @@ -37,7 +37,7 @@ C REAL*8 misset,missetmax,ptheavy REAL*8 ptllmin,ptllmax integer maxjetflavor - REAl*8 dsqrt_shat + REAl*8 dsqrt_shat,dsqrt_shatmax COMMON /to_min_max_cuts/ & PTJmax,PTBmax,PTAmax,PTLmax, @@ -60,7 +60,7 @@ C & ht2max,ht3max,ht4max, & htjmin,htjmax,ihtmin,ihtmax, & misset,missetmax,ptheavy, - & ptllmin,ptllmax,dsqrt_shat, + & ptllmin,ptllmax,dsqrt_shat,dsqrt_shatmax, & maxjetflavor C diff --git a/epochX/cudacpp/gg_ttgg.mad/Source/make_opts b/epochX/cudacpp/gg_ttgg.mad/Source/make_opts index e4b87ee6ad..f10336e42e 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Source/make_opts +++ b/epochX/cudacpp/gg_ttgg.mad/Source/make_opts @@ -103,6 +103,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf +alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -114,6 +115,7 @@ endif endif else alfas_functions=alfas_functions +alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif diff --git a/epochX/cudacpp/gg_ttgg.mad/Source/makefile b/epochX/cudacpp/gg_ttgg.mad/Source/makefile index 291ca907ee..87a9e61723 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Source/makefile +++ b/epochX/cudacpp/gg_ttgg.mad/Source/makefile @@ -37,10 +37,12 @@ all: $(LIBRARIES) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDI $(LIBDIR)libdsample.$(libext): $(DSAMPLE) $(call CREATELIB, $@, $^) $(LIBDIR)libgeneric.$(libext): $(GENERIC) + rm -f $@ 2>/dev/null $(call CREATELIB, $@, $^) + rm -f $(alfas_to_clean) 2>/dev/null $(LIBDIR)libdhelas.$(libext): DHELAS cd DHELAS; make; cd .. -$(LIBDIR)libpdf.$(libext): PDF make_opts +$(LIBDIR)libpdf.$(libext): PDF $(alfas_functions).o cd PDF; make; cd .. ifneq (,$(filter edff chff, $(pdlabel1) $(pdlabel2))) $(LIBDIR)libgammaUPC.$(libext): PDF/gammaUPC @@ -73,6 +75,7 @@ $(BINDIR)gensudgrid: $(GENSUDGRID) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUP # Dependencies dsample.o: DiscreteSampler.o dsample.f genps.inc StringCast.o vector.inc +pawgraph.o: vector.inc DiscreteSampler.o: StringCast.o invarients.o: invarients.f genps.inc gen_ximprove.o: gen_ximprove.f run_config.inc run_card.inc diff --git a/epochX/cudacpp/gg_ttgg.mad/Source/run_card.inc b/epochX/cudacpp/gg_ttgg.mad/Source/run_card.inc index 1a1bc782bd..8bd5f73840 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Source/run_card.inc +++ b/epochX/cudacpp/gg_ttgg.mad/Source/run_card.inc @@ -88,6 +88,8 @@ DSQRT_SHAT = 0.000000000000000D+00 + DSQRT_SHATMAX = -1 + LIMHEL = 0.000000000000000D+00 PTJ = 2.000000000000000D+01 diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h index 87aa648dd2..a45024704a 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. 
Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -255,11 +255,15 @@ namespace mg5amcCpu throw std::logic_error( "Bridge constructor: FIXME! cannot choose gputhreads" ); // this should never happen! m_gpublocks = m_nevt / m_gputhreads; } +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters @@ -290,8 +294,10 @@ namespace mg5amcCpu throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -347,7 +353,9 @@ namespace mg5amcCpu if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> ) @@ -400,7 +408,9 @@ namespace mg5amcCpu } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.
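The Bridge.h hunks above wrap the instantiation and abnormal-ME warnings in a new MGONGPUCPP_VERBOSE compile-time guard, so the default build is silent. A minimal sketch of that gating pattern (only the macro name comes from the diff; the function is illustrative):

```cpp
#include <iostream>

// Compile with -DMGONGPUCPP_VERBOSE to restore the banners; silent otherwise.
void instantiateToyBridge( int nevt )
{
#ifdef MGONGPUCPP_VERBOSE
  std::cout << "WARNING! Instantiate toy Bridge (nevt=" << nevt << ")" << std::endl;
#endif
  // ... the actual construction work stays unconditional ...
}

int main()
{
  instantiateToyBridge( 16 );
  return 0;
}
```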
#ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1,
type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin.
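The gpuBlasT* aliases closing GpuAbstraction.h pick the single-precision (S) or double-precision (D) BLAS entry points at preprocessing time, keyed on MGONGPU_FPTYPE2_FLOAT, so a call site such as color_sum_blas can use one precision-neutral name. A host-only sketch of the same dispatch, with toy functions in place of cuBLAS/hipBLAS calls (only the macro names come from the diff):

```cpp
#include <cstdio>

// Toy stand-ins for the S/D BLAS entry points
static void toySgemm() { std::printf( "sgemm (FP32)\n" ); }
static void toyDgemm() { std::printf( "dgemm (FP64)\n" ); }

// Mirror of the MGONGPU_FPTYPE2_FLOAT dispatch in GpuAbstraction.h:
// one generic name, resolved at preprocessing time to the right precision.
#ifdef MGONGPU_FPTYPE2_FLOAT
#define toyTgemm toySgemm
#else
#define toyTgemm toyDgemm
#endif

int main()
{
  toyTgemm(); // prints "dgemm (FP64)" unless built with -DMGONGPU_FPTYPE2_FLOAT
  return 0;
}
```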
#ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MGVersion.txt index 084e244cea..1ac53bb4bd 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.3 \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc index f463977c1a..a68ae314eb 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
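The new checkGpuBlas/assertGpuBlas pair in GpuRuntime.h follows the existing checkGpu idiom: a macro captures __FILE__ and __LINE__ at the call site, and a helper prints the failing status and asserts. A standalone sketch of that idiom (the Toy status enum replaces gpuBlasStatus_t, which would require a CUDA/HIP toolkit):

```cpp
#include <cassert>
#include <cstdio>

// Toy status type replacing gpuBlasStatus_t / GPUBLAS_STATUS_SUCCESS
enum ToyBlasStatus { TOYBLAS_STATUS_SUCCESS = 0 };

// Same shape as checkGpuBlas/assertGpuBlas: the macro records the call site,
// the helper reports a non-success status and aborts via assert if requested.
#define checkToyBlas( code ) { assertToyBlas( code, __FILE__, __LINE__ ); }
inline void assertToyBlas( ToyBlasStatus code, const char* file, int line, bool abort = true )
{
  if( code != TOYBLAS_STATUS_SUCCESS )
  {
    std::printf( "ERROR! assertToyBlas: '%d' in %s:%d\n", code, file, line );
    if( abort ) assert( code == TOYBLAS_STATUS_SUCCESS );
  }
}

int main()
{
  checkToyBlas( TOYBLAS_STATUS_SUCCESS ); // success: no output, execution continues
  return 0;
}
```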
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,28 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() + , m_pHelWfs() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_helBlasHandles() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +353,82 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); + // Create the "one-helicity" wavefunction buffer that will be used for helicity filtering + m_pHelWfs.reset( new DeviceBufferSimple( CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? 
+ std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { +#ifndef MGONGPU_HAS_NO_BLAS + if( m_helBlasHandles[ihel] ) gpuBlasDestroy( m_helBlasHandles[ihel] ); +#endif + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +445,61 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. 
Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity + // Attach a different stream to each cuBLAS/hipBLAS handle + if( m_blasColorSum ) + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + checkGpuBlas( gpuBlasCreate( &m_helBlasHandles[ighel] ) ); + checkGpuBlas( gpuBlasSetStream( m_helBlasHandles[ighel], m_helStreams[ighel] ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_helBlasHandles[ighel], CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelWfs.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +507,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* ghelBlasHandles = ( m_blasColorSum ? m_helBlasHandles : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* ghelBlasHandles = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +527,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? 
m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h index 7acff4b308..c901874333 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include <memory> #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] - static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,24 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelJamps; + + // The super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelWfs; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif @@ -205,6 +220,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr<DeviceBufferSimple2> m_pHelBlasTmp; + + // The array of cuBLAS/hipBLAS handles (one for each good helicity) + gpuBlasHandle_t m_helBlasHandles[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessAmplitudes.h index 0d92f69c43..a49f041e05 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessAmplitudes_H #define MemoryAccessAmplitudes_H 1 @@ -10,10 +10,6 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_AMPLITUDES 1 - // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -23,120 +19,11 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // A class describing the internal layout of memory buffers for amplitudes - // This implementation uses an AOSOA[npagA][nx2][neppA] where nevt=npagA*neppA - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessAmplitudesBase //_AOSOAv1 - { - public: - - // Number of Events Per Page in the amplitude AOSOA memory buffer layout - static constexpr int neppA = 1; // AOS (just a test...)
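Stepping back to the MatrixElementKernels.h hunk just above before continuing with the memory-access changes: the header reserves CPPProcess::ncomb slots for per-helicity GPU streams and cuBLAS/hipBLAS handles, and computeGoodHelicities() only creates the first nGoodHel of them, binding one handle to one stream and optionally enabling TF32 tensor-core math. A minimal standalone sketch of that pattern in plain CUDA/cuBLAS follows; the function name createPerHelicityStreamsAndHandles and the explicit error handling are illustrative only, since the plugin itself goes through its gpuXxx/checkGpuBlas abstraction layer.

#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

// Create one stream per good helicity and bind one cuBLAS handle to each stream,
// so that later per-helicity GEMM color sums can run concurrently (sketch only)
static void createPerHelicityStreamsAndHandles( int nGoodHel, cudaStream_t* streams, cublasHandle_t* handles, bool useTf32 )
{
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
  {
    if( cudaStreamCreate( &streams[ighel] ) != cudaSuccess ) { fprintf( stderr, "cudaStreamCreate failed\n" ); exit( 1 ); }
    if( cublasCreate( &handles[ighel] ) != CUBLAS_STATUS_SUCCESS ) { fprintf( stderr, "cublasCreate failed\n" ); exit( 1 ); }
    cublasSetStream( handles[ighel], streams[ighel] ); // attach the per-helicity stream to the handle
    if( useTf32 ) cublasSetMathMode( handles[ighel], CUBLAS_TF32_TENSOR_OP_MATH ); // optional TF32 tensor cores (CUDA only)
  }
}

Binding one handle to one stream is what later allows each helicity's color-sum GEMM to overlap with those of the other helicities.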
- - private: - - friend class MemoryAccessHelper<MemoryAccessAmplitudesBase>; - friend class KernelAccessHelper<MemoryAccessAmplitudesBase, false>; - friend class KernelAccessHelper<MemoryAccessAmplitudesBase, true>; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) - { - const int ipagA = ievt / neppA; // #event "A-page" - const int ieppA = ievt % neppA; // #event in the current event A-page - constexpr int ix2 = 0; - return &( buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA] ); // AOSOA[ipagA][ix2][ieppA] - } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... args" to "const int ix2" and rename "Field" as "Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int ix2 ) - { - constexpr int ipagA = 0; - constexpr int ieppA = 0; - return buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA]; // AOSOA[ipagA][ix2][ieppA] - } - }; - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessAmplitudes : public MemoryAccessAmplitudesBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper<MemoryAccessAmplitudesBase>::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper<MemoryAccessAmplitudesBase>::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2 = MemoryAccessHelper<MemoryAccessAmplitudesBase>::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2Const = - MemoryAccessHelper<MemoryAccessAmplitudesBase>::template decodeRecordConst<int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt,
const int ix2 ) <===] - static constexpr auto ieventAccessIx2 = - MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessField<int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===] - static constexpr auto ieventAccessIx2Const = - MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessFieldConst<int>; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations + // A class providing trivial access to amplitude memory buffers template<bool onDevice> class KernelAccessAmplitudes { public: - -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2 = - KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessField<int>; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2Const = - KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessFieldConst<int>; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { @@ -148,8 +35,6 @@ namespace mg5amcCpu { return reinterpret_cast<cxtype_sv*>( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES }; //---------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessCouplings.h index 55504a2b90..caee99a7fd 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessCouplings.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
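With the MemoryAccessAmplitudes.h change above, only the "trivial" access path that the MGONGPU_TRIVIAL_AMPLITUDES flag used to select is kept: the buffer pointer already addresses one event's record, so reading the complex amplitude is a plain reinterpretation of the underlying fptype storage. A simplified, self-contained illustration of the idiom, with plain doubles and a stand-in cxtype struct instead of the plugin's fptype/cxtype_sv:

#include <cassert>

struct cxtype { double real, imag; }; // stand-in for the plugin's complex type

// "Trivial" access: no event indexing, just reinterpret the raw [re, im] storage
inline cxtype* kernelAccess( double* buffer )
{
  return reinterpret_cast<cxtype*>( buffer );
}

int main()
{
  double buffer[2] = { 1.5, -0.5 }; // one complex amplitude stored as two doubles
  cxtype* amp = kernelAccess( buffer );
  assert( amp->real == 1.5 && amp->imag == -0.5 );
  return 0;
}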
#ifndef MemoryAccessCouplings_H #define MemoryAccessCouplings_H 1 @@ -235,7 +235,7 @@ namespace mg5amcCpu /* fptype_sv& real = kernelAccessIx2( buffer, 0 ); fptype_sv& imag = kernelAccessIx2( buffer, 1 ); - printf( "C_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv_ref( real, imag ); */ return cxtype_sv_ref( kernelAccessIx2( buffer, 0 ), @@ -250,7 +250,7 @@ namespace mg5amcCpu /* const fptype_sv& real = kernelAccessIx2Const( buffer, 0 ); const fptype_sv& imag = kernelAccessIx2Const( buffer, 1 ); - printf( "C_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv( real, imag ); */ return cxtype_sv( kernelAccessIx2Const( buffer, 0 ), diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessWavefunctions.h index 9f4c620bc7..bbffc1fb36 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessWavefunctions_H #define MemoryAccessWavefunctions_H 1 @@ -10,9 +10,7 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 +#include "CPPProcess.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL @@ -23,147 +21,44 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // A class describing the internal layout of memory buffers for wavefunctions - // This implementation uses an AOSOA[npagW][nw6][nx2][neppW] where nevt=npagW*neppW - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessWavefunctionsBase //_AOSOAv1 +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessWavefunctions { public: - - // Number of Events Per Page in the wavefunction AOSOA memory buffer layout - static constexpr int neppW = 1; // AOS (just a test...) 
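For reference, the AOSOA[npagW][nw6][nx2][neppW] indexing implemented by the deleted MemoryAccessWavefunctionsBase code around this hunk can be sanity-checked in a few self-contained lines; the sizes and indices below are arbitrary examples, and neppW = 1 reduces the layout to a plain AOS:

#include <cassert>

int main()
{
  const int neppW = 4, nw6 = 6, nx2 = 2; // example AOSOA dimensions
  const int ievt = 9, iw6 = 3, ix2 = 1;  // example event and field indices
  const int ipagW = ievt / neppW;        // which "W-page" the event lives on
  const int ieppW = ievt % neppW;        // position of the event within its page
  const int index = ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW;
  assert( index == 125 ); // = 2*48 + 3*8 + 1*4 + 1
  return 0;
}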
- - private: - - friend class MemoryAccessHelper<MemoryAccessWavefunctionsBase>; - friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, false>; - friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, true>; - - // The number of components of a (fermion or vector) wavefunction - static constexpr int nw6 = mgOnGpu::nw6; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) + static __host__ __device__ inline cxtype_sv* + kernelAccess( fptype* buffer ) { - const int ipagW = ievt / neppW; // #event "W-page" - const int ieppW = ievt % neppW; // #event in the current event W-page - constexpr int iw6 = 0; - constexpr int ix2 = 0; - return &( buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW] ); // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast<cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... args" to "const int iw6, const int ix2" and rename "Field" as "Iw6Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int iw6, - const int ix2 ) + static __host__ __device__ inline const cxtype_sv* + kernelAccessConst( const fptype* buffer ) { - constexpr int ipagW = 0; - constexpr int ieppW = 0; - return buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW]; // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast<const cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } }; +#endif //---------------------------------------------------------------------------- - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessWavefunctions : public MemoryAccessWavefunctionsBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2 = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2Const = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template decodeRecordConst<int, int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIw6Ix2( fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2 = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessField<int, int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIw6Ix2Const( const fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2Const = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessFieldConst<int, int>; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations - template<bool onDevice> - class KernelAccessWavefunctions + class HostAccessWavefunctions { public: - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const) ===> fptype& kernelAccessIw6Ix2( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2 = - KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessField<int, int>; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIw6Ix2Const( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2Const = - KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessFieldConst<int, int>; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { return reinterpret_cast<cxtype_sv*>( buffer ); } static __host__ __device__ inline const cxtype_sv* kernelAccessConst( const fptype* buffer ) { return reinterpret_cast<const cxtype_sv*>( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS }; //---------------------------------------------------------------------------- - typedef KernelAccessWavefunctions<false> HostAccessWavefunctions; - typedef KernelAccessWavefunctions<true> DeviceAccessWavefunctions; - - //---------------------------------------------------------------------------- - } // end namespace mg5amcGpu/mg5amcCpu #endif // MemoryAccessWavefunctions_H diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h index 65a101888d..c8db607db6 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
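The new DeviceAccessWavefunctions introduced in MemoryAccessWavefunctions.h above replaces the AOSOA decode with a per-thread offset into a single global wavefunction buffer: CUDA thread ievt owns the contiguous slice of nw6*nx2 fptype values starting at ievt*nw6*nx2, which is what lets the split diagram kernels share wavefunctions through global memory. A standalone CUDA sketch of the same access pattern; the kernel name fillWavefunction and its zero-fill body are hypothetical:

#include <cuda_runtime.h>

constexpr int nw6 = 6, nx2 = 2; // one wavefunction: 6 complex components = 12 reals

__global__ void fillWavefunction( double* allWfs ) // layout: [nevt][nw6][nx2]
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one thread = one event
  double* wf = allWfs + ievt * nw6 * nx2;                 // this event's contiguous slice
  for( int i = 0; i < nw6 * nx2; i++ ) wf[i] = 0.;        // e.g. zero-initialise the slice
}

int main()
{
  const int gpublocks = 2, gputhreads = 128, nevt = gpublocks * gputhreads;
  double* d_wfs = nullptr;
  cudaMalloc( &d_wfs, nevt * nw6 * nx2 * sizeof( double ) );
  fillWavefunction<<<gpublocks, gputhreads>>>( d_wfs );
  cudaDeviceSynchronize();
  cudaFree( d_wfs );
  return 0;
}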
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase<T>( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase<T>( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template<typename T, size_t sizePerEvent> - class DeviceBuffer : public DeviceBufferBase<T>, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase<T>, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase<T>( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase<T>( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer<fptype, 1> DeviceBufferSimple; + typedef DeviceBuffer<fptype2, 1> DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase<fptype> BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer<fptype, sizePerEventNumerators> HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer<fptype, sizePerEventNumerators> PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer<fptype, sizePerEventNumerators> DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer<fptype, sizePerEventDenominators> HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer<fptype, sizePerEventDenominators>
PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer<fptype, sizePerEventDenominators> DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer<fptype, sizePerEventCouplings> HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer<fptype, sizePerEventCouplings> PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer<fptype, sizePerEventCouplings> DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer<fptype, sizePerEventJamps> DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template<class Tdst, class Tsrc> void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc index c508e73f26..5e1fba0c34 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,20 +98,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 24; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,57 +166,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the 
current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - 
using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -226,2412 +279,379 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 26; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
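As the comments above describe, calculate_jamps is now a CUDA kernel that is launched once per good helicity, on that helicity's own stream, with the color sum that turns the jamps into |M|^2 moved to a separate step. A schematic, standalone sketch of the host-side launch pattern follows; the kernel name, its toy body and the plain double type are hypothetical placeholders for the generated code and fptype:

#include <cuda_runtime.h>

__global__ void computeJampsForHelicity( int ihel, const double* momenta, double* jamps )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  jamps[ievt] = momenta[ievt] + ihel; // stand-in for the real Feynman-diagram computation
}

int main()
{
  const int gpublocks = 2, gputhreads = 128, nevt = gpublocks * gputhreads;
  const int nGoodHel = 4; // pretend 4 of the ncomb helicity combinations survived the filter
  cudaStream_t streams[4];
  for( int ighel = 0; ighel < nGoodHel; ighel++ ) cudaStreamCreate( &streams[ighel] );
  double *d_momenta = nullptr, *d_helJamps = nullptr;
  cudaMalloc( &d_momenta, nevt * sizeof( double ) );
  cudaMemset( d_momenta, 0, nevt * sizeof( double ) );
  cudaMalloc( &d_helJamps, nGoodHel * nevt * sizeof( double ) ); // one jamp slice per good helicity
  // One launch per good helicity, each on its own stream, so independent helicities
  // may overlap on the GPU; a later color-sum step would combine the per-helicity slices
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
    computeJampsForHelicity<<<gpublocks, gputhreads, 0, streams[ighel]>>>( ighel, d_momenta, d_helJamps + ighel * nevt );
  cudaDeviceSynchronize(); // wait for all helicity streams before any color sum
  cudaFree( d_momenta );
  cudaFree( d_helJamps );
  for( int ighel = 0; ighel < nGoodHel; ighel++ ) cudaStreamDestroy( streams[ighel] );
  return 0;
}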
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g. 
<> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 123 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - vxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); - - vxxxxx( momenta, 0., cHel[ihel][5], +1, w_fp[5], 5 ); - - VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[6] ); - FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[7] ); - - // Amplitude(s) for diagram number 1 - VVVV1_0( w_fp[6], w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[0] += 
cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - VVVV3_0( w_fp[6], w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - VVVV4_0( w_fp[6], w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 2 OF 123 *** - - // Wavefunction(s) for diagram number 2 - VVV1P0_1( w_fp[6], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 2 - VVV1_0( w_fp[7], w_fp[5], w_fp[8], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 3 OF 123 *** - - // Wavefunction(s) for diagram number 3 - VVV1P0_1( w_fp[6], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[9] ); - - // Amplitude(s) for diagram number 3 - VVV1_0( w_fp[7], w_fp[4], w_fp[9], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 4 OF 123 *** - - // Wavefunction(s) for diagram number 4 - VVV1P0_1( w_fp[4], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 4 - VVV1_0( w_fp[6], w_fp[7], w_fp[10], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 5 OF 123 *** - - 
// Wavefunction(s) for diagram number 5 - FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); - FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); - - // Amplitude(s) for diagram number 5 - FFV1_0( w_fp[12], w_fp[11], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 6 OF 123 *** - - // Wavefunction(s) for diagram number 6 - // (none) - - // Amplitude(s) for diagram number 6 - FFV1_0( w_fp[3], w_fp[11], w_fp[9], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[12] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - - // *** DIAGRAM 7 OF 123 *** - - // Wavefunction(s) for diagram number 7 - FFV1_2( w_fp[3], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); - - // Amplitude(s) for diagram number 7 - FFV1_0( w_fp[13], w_fp[11], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 8 OF 123 *** - - // Wavefunction(s) for diagram number 8 - FFV1_1( w_fp[2], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] ); - - // Amplitude(s) for diagram number 8 - FFV1_0( w_fp[12], w_fp[14], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 9 OF 123 *** - - // Wavefunction(s) for diagram number 9 - // (none) - - // Amplitude(s) for diagram number 9 - FFV1_0( w_fp[3], w_fp[14], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[18] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - - // *** DIAGRAM 10 OF 123 *** - - // Wavefunction(s) for diagram number 10 - FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[15] ); - - // Amplitude(s) for diagram number 10 - FFV1_0( w_fp[15], w_fp[14], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 11 OF 123 *** - - // Wavefunction(s) for diagram number 11 - FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); - - // Amplitude(s) for diagram number 11 - FFV1_0( w_fp[15], w_fp[16], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= cxtype( 0, 1 ) 
-
- // *** DIAGRAM 11 OF 123 ***
-
- // Wavefunction(s) for diagram number 11
- FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] );
-
- // Amplitude(s) for diagram number 11
- FFV1_0( w_fp[15], w_fp[16], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 12 OF 123 ***
-
- // Wavefunction(s) for diagram number 12
- // (none)
-
- // Amplitude(s) for diagram number 12
- FFV1_0( w_fp[15], w_fp[2], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] += amp_sv[0];
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[20] += amp_sv[0];
-
- // *** DIAGRAM 13 OF 123 ***
-
- // Wavefunction(s) for diagram number 13
- // (none)
-
- // Amplitude(s) for diagram number 13
- FFV1_0( w_fp[13], w_fp[16], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 14 OF 123 ***
-
- // Wavefunction(s) for diagram number 14
- // (none)
-
- // Amplitude(s) for diagram number 14
- FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += amp_sv[0];
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[12] -= amp_sv[0];
- jamp_sv[14] += amp_sv[0];
-
- // *** DIAGRAM 15 OF 123 ***
-
- // Wavefunction(s) for diagram number 15
- // (none)
-
- // Amplitude(s) for diagram number 15
- FFV1_0( w_fp[3], w_fp[16], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += amp_sv[0];
- jamp_sv[1] -= amp_sv[0];
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
-
- // *** DIAGRAM 16 OF 123 ***
-
- // Wavefunction(s) for diagram number 16
- // (none)
-
- // Amplitude(s) for diagram number 16
- FFV1_0( w_fp[12], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[16] += amp_sv[0];
- jamp_sv[17] -= amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
-
- // *** DIAGRAM 17 OF 123 ***
-
- // Wavefunction(s) for diagram number 17
- FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] );
- FFV1_2( w_fp[3], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] );
- FFV1_1( w_fp[12], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] );
-
- // Amplitude(s) for diagram number 17
- FFV1_0( w_fp[16], w_fp[8], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 17 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[3] -= amp_sv[0];
-
- // *** DIAGRAM 18 OF 123 ***
-
- // Wavefunction(s) for diagram number 18
- FFV1_1( w_fp[12], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] );
-
- // Amplitude(s) for diagram number 18
- FFV1_0( w_fp[16], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[5] -= amp_sv[0];
-
- // *** DIAGRAM 19 OF 123 ***
-
- // Wavefunction(s) for diagram number 19
- // (none)
-
- // Amplitude(s) for diagram number 19
- FFV1_0( w_fp[16], w_fp[12], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 20 OF 123 ***
-
- // Wavefunction(s) for diagram number 20
- VVV1P0_1( w_fp[1], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[6] );
- FFV1P0_3( w_fp[3], w_fp[12], COUPs[1], 1.0, 0., 0., w_fp[17] );
-
- // Amplitude(s) for diagram number 20
- VVV1_0( w_fp[6], w_fp[5], w_fp[17], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += amp_sv[0];
- jamp_sv[2] -= amp_sv[0];
- jamp_sv[4] -= amp_sv[0];
- jamp_sv[5] += amp_sv[0];
-
- // *** DIAGRAM 21 OF 123 ***
-
- // Wavefunction(s) for diagram number 21
- // (none)
-
- // Amplitude(s) for diagram number 21
- FFV1_0( w_fp[3], w_fp[9], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 21 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 22 OF 123 ***
-
- // Wavefunction(s) for diagram number 22
- // (none)
-
- // Amplitude(s) for diagram number 22
- FFV1_0( w_fp[13], w_fp[12], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 23 OF 123 ***
-
- // Wavefunction(s) for diagram number 23
- VVV1P0_1( w_fp[1], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[18] );
-
- // Amplitude(s) for diagram number 23
- VVV1_0( w_fp[18], w_fp[4], w_fp[17], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] += amp_sv[0];
- jamp_sv[2] -= amp_sv[0];
- jamp_sv[3] += amp_sv[0];
- jamp_sv[4] -= amp_sv[0];
-
- // *** DIAGRAM 24 OF 123 ***
-
- // Wavefunction(s) for diagram number 24
- // (none)
-
- // Amplitude(s) for diagram number 24
- FFV1_0( w_fp[3], w_fp[8], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 25 OF 123 ***
-
- // Wavefunction(s) for diagram number 25
- // (none)
-
- // Amplitude(s) for diagram number 25
- FFV1_0( w_fp[15], w_fp[12], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 25 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
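The helper names above follow the usual ALOHA/HELAS convention (stated here as background, not taken from this diff): a _0 suffix computes the amplitude of a fully attached vertex, while suffixes like _1, _2, P0_1 or P0_3 return the off-shell wavefunction of the corresponding leg, propagator included, which is stored in a w_fp slot and reused by later diagrams. The slots themselves are scratch buffers that are overwritten as the diagram sequence proceeds; a minimal sketch of their layout, with sizes inferred from this file (the highest slot referenced below is w_fp[25]):

#include <complex>
using fptype = double;
using cxtype = std::complex<fptype>;
constexpr int nwf = 26; // number of wavefunction slots (w_fp[0]..w_fp[25] appear in this file)
constexpr int nw6 = 6;  // six complex components per spin-1/2 or spin-1 wavefunction

int main()
{
  cxtype w_sv[nwf][nw6]{};  // wavefunction scratch buffers (one event, scalar sketch)
  fptype* w_fp[nwf];        // fptype* views of the same storage, as used by the helas calls
  for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] );
  return 0;
}
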
-
- // *** DIAGRAM 26 OF 123 ***
-
- // Wavefunction(s) for diagram number 26
- FFV1_1( w_fp[12], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[19] );
-
- // Amplitude(s) for diagram number 26
- FFV1_0( w_fp[15], w_fp[19], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] -= amp_sv[0];
-
- // *** DIAGRAM 27 OF 123 ***
-
- // Wavefunction(s) for diagram number 27
- // (none)
-
- // Amplitude(s) for diagram number 27
- FFV1_0( w_fp[15], w_fp[9], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[4] -= amp_sv[0];
-
- // *** DIAGRAM 28 OF 123 ***
-
- // Wavefunction(s) for diagram number 28
- // (none)
-
- // Amplitude(s) for diagram number 28
- FFV1_0( w_fp[13], w_fp[19], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] -= amp_sv[0];
-
- // *** DIAGRAM 29 OF 123 ***
-
- // Wavefunction(s) for diagram number 29
- // (none)
-
- // Amplitude(s) for diagram number 29
- FFV1_0( w_fp[13], w_fp[8], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[2] -= amp_sv[0];
-
- // *** DIAGRAM 30 OF 123 ***
-
- // Wavefunction(s) for diagram number 30
- // (none)
-
- // Amplitude(s) for diagram number 30
- FFV1_0( w_fp[3], w_fp[19], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 31 OF 123 ***
-
- // Wavefunction(s) for diagram number 31
- // (none)
-
- // Amplitude(s) for diagram number 31
- VVV1_0( w_fp[1], w_fp[10], w_fp[17], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += amp_sv[0];
- jamp_sv[1] -= amp_sv[0];
- jamp_sv[3] -= amp_sv[0];
- jamp_sv[5] += amp_sv[0];
-
- // *** DIAGRAM 32 OF 123 ***
-
- // Wavefunction(s) for diagram number 32
- VVVV1P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[17] );
- VVVV3P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[19] );
- VVVV4P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[8] );
-
- // Amplitude(s) for diagram number 32
- FFV1_0( w_fp[3], w_fp[12], w_fp[17], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[0] += amp_sv[0];
- jamp_sv[1] -= amp_sv[0];
- jamp_sv[3] -= amp_sv[0];
- jamp_sv[5] += amp_sv[0];
- FFV1_0( w_fp[3], w_fp[12], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[1] -= amp_sv[0];
- jamp_sv[2] += amp_sv[0];
- jamp_sv[3] -= amp_sv[0];
- jamp_sv[4] += amp_sv[0];
- FFV1_0( w_fp[3], w_fp[12], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[0] -= amp_sv[0];
- jamp_sv[2] += amp_sv[0];
- jamp_sv[4] += amp_sv[0];
- jamp_sv[5] -= amp_sv[0];
-
- // *** DIAGRAM 33 OF 123 ***
-
- // Wavefunction(s) for diagram number 33
- FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] );
- FFV1_1( w_fp[2], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] );
- FFV1_2( w_fp[12], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[20] );
-
- // Amplitude(s) for diagram number 33
- FFV1_0( w_fp[20], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[11] -= amp_sv[0];
-
- // *** DIAGRAM 34 OF 123 ***
-
- // Wavefunction(s) for diagram number 34
- FFV1_2( w_fp[12], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] );
-
- // Amplitude(s) for diagram number 34
- FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 34 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[9] -= amp_sv[0];
-
- // *** DIAGRAM 35 OF 123 ***
-
- // Wavefunction(s) for diagram number 35
- // (none)
-
- // Amplitude(s) for diagram number 35
- FFV1_0( w_fp[12], w_fp[9], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 36 OF 123 ***
-
- // Wavefunction(s) for diagram number 36
- FFV1P0_3( w_fp[12], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[22] );
-
- // Amplitude(s) for diagram number 36
- VVV1_0( w_fp[6], w_fp[5], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[9] += amp_sv[0];
- jamp_sv[15] -= amp_sv[0];
- jamp_sv[21] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
-
- // *** DIAGRAM 37 OF 123 ***
-
- // Wavefunction(s) for diagram number 37
- // (none)
-
- // Amplitude(s) for diagram number 37
- FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 37 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 38 OF 123 ***
-
- // Wavefunction(s) for diagram number 38
- // (none)
-
- // Amplitude(s) for diagram number 38
- FFV1_0( w_fp[12], w_fp[14], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 38 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 39 OF 123 ***
-
- // Wavefunction(s) for diagram number 39
- // (none)
-
- // Amplitude(s) for diagram number 39
- VVV1_0( w_fp[18], w_fp[4], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 39 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+ allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup );
+ // Independent couplings, fixed for all events
+ for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup)
+ allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup );
+ // Dependent couplings, vary event-by-event
+ for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
+ COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 );
+ // Independent couplings, fixed for all events
+ for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup)
+ COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup];
 #endif
- jamp_sv[11] += amp_sv[0];
- jamp_sv[15] -= amp_sv[0];
- jamp_sv[17] += amp_sv[0];
- jamp_sv[21] -= amp_sv[0];
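The added lines in the hunk above are the visible part of FIX #823: the loops over independent couplings must run up to nIPC rather than nicoup. The resulting COUPs table is a two-tier array of pointers, whose first ndcoup entries point into the per-event record of dependent couplings (those that vary with the running alphas) while the last nIPC entries alias the fixed independent couplings in cIPC. A hedged sketch of that layout (buffer shapes and sizes here are invented for illustration; only the loop structure mirrors the hunk):

#include <complex>
#include <cstddef>
using fptype = double;
using cxtype = std::complex<fptype>;

int main()
{
  constexpr std::size_t ndcoup = 2;    // dependent couplings, vary event-by-event
  constexpr std::size_t nIPC = 1;      // independent couplings, fixed for all events (FIX #823 bound)
  constexpr std::size_t nevt = 16;     // invented event-record size, illustration only
  cxtype dependentBuf[ndcoup][nevt]{}; // assumed per-coupling event records
  cxtype cIPC[nIPC]{};                 // fixed independent couplings
  const std::size_t ievt0 = 0;

  const cxtype* COUPs[ndcoup + nIPC];
  for( std::size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
    COUPs[idcoup] = &dependentBuf[idcoup][ievt0]; // dependent: point into this event's record
  for( std::size_t iicoup = 0; iicoup < nIPC; iicoup++ )
    COUPs[ndcoup + iicoup] = &cIPC[iicoup];       // independent: shared across all events
  return 0;
}
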
-
- // *** DIAGRAM 40 OF 123 ***
-
- // Wavefunction(s) for diagram number 40
- // (none)
-
- // Amplitude(s) for diagram number 40
- FFV1_0( w_fp[20], w_fp[2], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 40 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 41 OF 123 ***
-
- // Wavefunction(s) for diagram number 41
- // (none)
-
- // Amplitude(s) for diagram number 41
- FFV1_0( w_fp[12], w_fp[11], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 41 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 42 OF 123 ***
-
- // Wavefunction(s) for diagram number 42
- FFV1_2( w_fp[12], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] );
-
- // Amplitude(s) for diagram number 42
- FFV1_0( w_fp[23], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 42 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[17] -= amp_sv[0];
-
- // *** DIAGRAM 43 OF 123 ***
-
- // Wavefunction(s) for diagram number 43
- // (none)
-
- // Amplitude(s) for diagram number 43
- FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 43 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[15] -= amp_sv[0];
-
- // *** DIAGRAM 44 OF 123 ***
-
- // Wavefunction(s) for diagram number 44
- // (none)
-
- // Amplitude(s) for diagram number 44
- FFV1_0( w_fp[23], w_fp[14], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 44 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[23] -= amp_sv[0];
-
- // *** DIAGRAM 45 OF 123 ***
-
- // Wavefunction(s) for diagram number 45
- // (none)
-
- // Amplitude(s) for diagram number 45
- FFV1_0( w_fp[20], w_fp[14], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 45 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[21] -= amp_sv[0];
-
- // *** DIAGRAM 46 OF 123 ***
-
- // Wavefunction(s) for diagram number 46
- // (none)
-
- // Amplitude(s) for diagram number 46
- FFV1_0( w_fp[23], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 46 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 47 OF 123 ***
-
- // Wavefunction(s) for diagram number 47
- // (none)
-
- // Amplitude(s) for diagram number 47
- VVV1_0( w_fp[1], w_fp[10], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 47 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[9] += amp_sv[0];
- jamp_sv[11] -= amp_sv[0];
- jamp_sv[17] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
-
- // *** DIAGRAM 48 OF 123 ***
-
- // Wavefunction(s) for diagram number 48
- // (none)
-
- // Amplitude(s) for diagram number 48
- FFV1_0( w_fp[12], w_fp[2], w_fp[17], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[9] += amp_sv[0];
- jamp_sv[11] -= amp_sv[0];
- jamp_sv[17] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
- FFV1_0( w_fp[12], w_fp[2], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[11] -= amp_sv[0];
- jamp_sv[15] += amp_sv[0];
- jamp_sv[17] -= amp_sv[0];
- jamp_sv[21] += amp_sv[0];
- FFV1_0( w_fp[12], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[9] -= amp_sv[0];
- jamp_sv[15] += amp_sv[0];
- jamp_sv[21] += amp_sv[0];
- jamp_sv[23] -= amp_sv[0];
-
- // *** DIAGRAM 49 OF 123 ***
-
- // Wavefunction(s) for diagram number 49
- VVV1P0_1( w_fp[0], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[12] );
- FFV1_2( w_fp[3], w_fp[12], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[22] );
-
- // Amplitude(s) for diagram number 49
- FFV1_0( w_fp[22], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 49 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 50 OF 123 ***
-
- // Wavefunction(s) for diagram number 50
- VVV1P0_1( w_fp[12], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[23] );
-
- // Amplitude(s) for diagram number 50
- FFV1_0( w_fp[3], w_fp[9], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 50 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[6] += amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[11] += amp_sv[0];
-
- // *** DIAGRAM 51 OF 123 ***
-
- // Wavefunction(s) for diagram number 51
- // (none)
-
- // Amplitude(s) for diagram number 51
- FFV1_0( w_fp[13], w_fp[9], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 51 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 52 OF 123 ***
-
- // Wavefunction(s) for diagram number 52
- FFV1_1( w_fp[2], w_fp[12], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[20] );
-
- // Amplitude(s) for diagram number 52
- FFV1_0( w_fp[16], w_fp[20], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 52 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 53 OF 123 ***
-
- // Wavefunction(s) for diagram number 53
- // (none)
-
- // Amplitude(s) for diagram number 53
- FFV1_0( w_fp[16], w_fp[2], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 53 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[3] += amp_sv[0];
- jamp_sv[13] -= amp_sv[0];
- jamp_sv[19] -= amp_sv[0];
- jamp_sv[22] += amp_sv[0];
-
- // *** DIAGRAM 54 OF 123 ***
-
- // Wavefunction(s) for diagram number 54
- // (none)
-
- // Amplitude(s) for diagram number 54
- FFV1_0( w_fp[16], w_fp[14], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 54 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 55 OF 123 ***
-
- // Wavefunction(s) for diagram number 55
- // (none)
-
- // Amplitude(s) for diagram number 55
- FFV1_0( w_fp[3], w_fp[20], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 55 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[2] += amp_sv[0];
- jamp_sv[3] -= amp_sv[0];
- jamp_sv[12] -= amp_sv[0];
- jamp_sv[13] += amp_sv[0];
-
- // *** DIAGRAM 56 OF 123 ***
-
- // Wavefunction(s) for diagram number 56
- // (none)
-
- // Amplitude(s) for diagram number 56
- FFV1_0( w_fp[22], w_fp[2], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 56 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[10] += amp_sv[0];
- jamp_sv[11] -= amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[21] += amp_sv[0];
-
- // *** DIAGRAM 57 OF 123 ***
-
- // Wavefunction(s) for diagram number 57
- // (none)
-
- // Amplitude(s) for diagram number 57
- VVV1_0( w_fp[12], w_fp[18], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 57 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 58 OF 123 ***
-
- // Wavefunction(s) for diagram number 58
- // (none)
-
- // Amplitude(s) for diagram number 58
- VVVV1_0( w_fp[12], w_fp[1], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
- VVVV3_0( w_fp[12], w_fp[1], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
- VVVV4_0( w_fp[12], w_fp[1], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
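Diagrams like 58 above (and 74, 93, 100, 107 below) evaluate the same four-gluon vertex three times, via VVVV1_0, VVVV3_0 and VVVV4_0, because the QCD quartic coupling decomposes into three independent color/Lorentz structures; each call overwrites amp_sv[0] and scatters it into jamp_sv with its own index/sign pattern, and no multichannel counters are updated for these multi-amplitude diagrams. A schematic sketch of that scatter step (the Contribution helper is invented for this sketch; the index/sign table is copied from the VVVV1_0 piece of diagram 58):

#include <complex>
using cxtype = std::complex<double>;

struct Contribution { int iflow; cxtype factor; }; // invented helper, illustration only

inline void scatterAmp( const cxtype& amp, const Contribution* c, int n, cxtype jamp[24] )
{
  for( int i = 0; i < n; i++ ) jamp[c[i].iflow] += c[i].factor * amp; // accumulate into color flows
}

int main()
{
  cxtype jamp[24]{};
  const cxtype I( 0, 1 );
  // index/sign pattern of the VVVV1_0 piece of diagram 58 above
  const Contribution d58v1[] = { { 2, I }, { 6, -I }, { 8, I }, { 12, -I },
                                 { 19, -I }, { 20, I }, { 21, -I }, { 22, I } };
  scatterAmp( cxtype( 0.1, 0.2 ), d58v1, 8, jamp ); // placeholder amplitude value
  return 0;
}
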
-
- // *** DIAGRAM 59 OF 123 ***
-
- // Wavefunction(s) for diagram number 59
- VVV1P0_1( w_fp[12], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[21] );
-
- // Amplitude(s) for diagram number 59
- VVV1_0( w_fp[7], w_fp[5], w_fp[21], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 59 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 60 OF 123 ***
-
- // Wavefunction(s) for diagram number 60
- // (none)
-
- // Amplitude(s) for diagram number 60
- VVV1_0( w_fp[1], w_fp[7], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 60 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 61 OF 123 ***
-
- // Wavefunction(s) for diagram number 61
- // (none)
-
- // Amplitude(s) for diagram number 61
- FFV1_0( w_fp[3], w_fp[14], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 61 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[19] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[21] += amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
-
- // *** DIAGRAM 62 OF 123 ***
-
- // Wavefunction(s) for diagram number 62
- // (none)
-
- // Amplitude(s) for diagram number 62
- FFV1_0( w_fp[22], w_fp[14], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 62 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 63 OF 123 ***
-
- // Wavefunction(s) for diagram number 63
- // (none)
-
- // Amplitude(s) for diagram number 63
- FFV1_0( w_fp[13], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 63 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[2] += amp_sv[0];
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[8] += amp_sv[0];
- jamp_sv[12] -= amp_sv[0];
-
- // *** DIAGRAM 64 OF 123 ***
-
- // Wavefunction(s) for diagram number 64
- // (none)
-
- // Amplitude(s) for diagram number 64
- FFV1_0( w_fp[13], w_fp[20], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 64 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 65 OF 123 ***
-
- // Wavefunction(s) for diagram number 65
- VVV1P0_1( w_fp[0], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[20] );
- FFV1_2( w_fp[3], w_fp[20], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] );
-
- // Amplitude(s) for diagram number 65
- FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 65 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 66 OF 123 ***
-
- // Wavefunction(s) for diagram number 66
- VVV1P0_1( w_fp[20], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[22] );
-
- // Amplitude(s) for diagram number 66
- FFV1_0( w_fp[3], w_fp[9], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 66 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[7] += amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[9] += amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
-
- // *** DIAGRAM 67 OF 123 ***
-
- // Wavefunction(s) for diagram number 67
- // (none)
-
- // Amplitude(s) for diagram number 67
- FFV1_0( w_fp[15], w_fp[9], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 67 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 68 OF 123 ***
-
- // Wavefunction(s) for diagram number 68
- FFV1_1( w_fp[2], w_fp[20], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] );
-
- // Amplitude(s) for diagram number 68
- FFV1_0( w_fp[16], w_fp[23], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 68 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 69 OF 123 ***
-
- // Wavefunction(s) for diagram number 69
- // (none)
-
- // Amplitude(s) for diagram number 69
- FFV1_0( w_fp[16], w_fp[2], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 69 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[5] += amp_sv[0];
- jamp_sv[13] -= amp_sv[0];
- jamp_sv[16] += amp_sv[0];
- jamp_sv[19] -= amp_sv[0];
-
- // *** DIAGRAM 70 OF 123 ***
-
- // Wavefunction(s) for diagram number 70
- // (none)
-
- // Amplitude(s) for diagram number 70
- FFV1_0( w_fp[16], w_fp[11], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 70 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 71 OF 123 ***
-
- // Wavefunction(s) for diagram number 71
- // (none)
-
- // Amplitude(s) for diagram number 71
- FFV1_0( w_fp[3], w_fp[23], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 71 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[4] += amp_sv[0];
- jamp_sv[5] -= amp_sv[0];
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[19] += amp_sv[0];
-
- // *** DIAGRAM 72 OF 123 ***
-
- // Wavefunction(s) for diagram number 72
- // (none)
-
- // Amplitude(s) for diagram number 72
- FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 72 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[8] += amp_sv[0];
- jamp_sv[9] -= amp_sv[0];
- jamp_sv[14] -= amp_sv[0];
- jamp_sv[15] += amp_sv[0];
-
- // *** DIAGRAM 73 OF 123 ***
-
- // Wavefunction(s) for diagram number 73
- // (none)
-
- // Amplitude(s) for diagram number 73
- VVV1_0( w_fp[20], w_fp[6], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 73 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 74 OF 123 ***
-
- // Wavefunction(s) for diagram number 74
- // (none)
-
- // Amplitude(s) for diagram number 74
- VVVV1_0( w_fp[20], w_fp[1], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
- VVVV3_0( w_fp[20], w_fp[1], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- VVVV4_0( w_fp[20], w_fp[1], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 75 OF 123 ***
-
- // Wavefunction(s) for diagram number 75
- VVV1P0_1( w_fp[20], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[12] );
-
- // Amplitude(s) for diagram number 75
- VVV1_0( w_fp[7], w_fp[4], w_fp[12], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 75 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 76 OF 123 ***
-
- // Wavefunction(s) for diagram number 76
- // (none)
-
- // Amplitude(s) for diagram number 76
- VVV1_0( w_fp[1], w_fp[7], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 76 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 77 OF 123 ***
-
- // Wavefunction(s) for diagram number 77
- // (none)
-
- // Amplitude(s) for diagram number 77
- FFV1_0( w_fp[3], w_fp[11], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 77 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[13] += amp_sv[0];
- jamp_sv[14] -= amp_sv[0];
- jamp_sv[15] += amp_sv[0];
- jamp_sv[16] -= amp_sv[0];
-
- // *** DIAGRAM 78 OF 123 ***
-
- // Wavefunction(s) for diagram number 78
- // (none)
-
- // Amplitude(s) for diagram number 78
- FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 78 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 79 OF 123 ***
-
- // Wavefunction(s) for diagram number 79
- // (none)
-
- // Amplitude(s) for diagram number 79
- FFV1_0( w_fp[15], w_fp[2], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 79 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[4] += amp_sv[0];
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[10] += amp_sv[0];
- jamp_sv[18] -= amp_sv[0];
-
- // *** DIAGRAM 80 OF 123 ***
-
- // Wavefunction(s) for diagram number 80
- // (none)
-
- // Amplitude(s) for diagram number 80
- FFV1_0( w_fp[15], w_fp[23], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 80 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 81 OF 123 ***
-
- // Wavefunction(s) for diagram number 81
- FFV1_1( w_fp[9], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] );
-
- // Amplitude(s) for diagram number 81
- FFV1_0( w_fp[15], w_fp[23], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 81 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[7] -= amp_sv[0];
-
- // *** DIAGRAM 82 OF 123 ***
-
- // Wavefunction(s) for diagram number 82
- FFV1_2( w_fp[15], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] );
-
- // Amplitude(s) for diagram number 82
- FFV1_0( w_fp[12], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 82 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[10] -= amp_sv[0];
-
- // *** DIAGRAM 83 OF 123 ***
-
- // Wavefunction(s) for diagram number 83
- // (none)
-
- // Amplitude(s) for diagram number 83
- FFV1_0( w_fp[13], w_fp[23], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 83 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[6] -= amp_sv[0];
-
- // *** DIAGRAM 84 OF 123 ***
-
- // Wavefunction(s) for diagram number 84
- FFV1_2( w_fp[13], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] );
-
- // Amplitude(s) for diagram number 84
- FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 84 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[8] -= amp_sv[0];
-
- // *** DIAGRAM 85 OF 123 ***
-
- // Wavefunction(s) for diagram number 85
- // (none)
-
- // Amplitude(s) for diagram number 85
- FFV1_0( w_fp[3], w_fp[23], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 85 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 86 OF 123 ***
-
- // Wavefunction(s) for diagram number 86
- VVV1P0_1( w_fp[0], w_fp[10], COUPs[0], 1.0, 0., 0., w_fp[23] );
-
- // Amplitude(s) for diagram number 86
- FFV1_0( w_fp[3], w_fp[9], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 86 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[6] += amp_sv[0];
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[9] -= amp_sv[0];
- jamp_sv[11] += amp_sv[0];
-
- // *** DIAGRAM 87 OF 123 ***
-
- // Wavefunction(s) for diagram number 87
- FFV1_2( w_fp[16], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[22] );
-
- // Amplitude(s) for diagram number 87
- FFV1_0( w_fp[22], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 87 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[16] -= amp_sv[0];
-
- // *** DIAGRAM 88 OF 123 ***
-
- // Wavefunction(s) for diagram number 88
- FFV1_1( w_fp[11], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[20] );
-
- // Amplitude(s) for diagram number 88
- FFV1_0( w_fp[16], w_fp[20], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 88 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[13] -= amp_sv[0];
-
- // *** DIAGRAM 89 OF 123 ***
-
- // Wavefunction(s) for diagram number 89
- // (none)
-
- // Amplitude(s) for diagram number 89
- FFV1_0( w_fp[22], w_fp[14], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 89 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[22] -= amp_sv[0];
-
- // *** DIAGRAM 90 OF 123 ***
-
- // Wavefunction(s) for diagram number 90
- FFV1_1( w_fp[14], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[24] );
-
- // Amplitude(s) for diagram number 90
- FFV1_0( w_fp[16], w_fp[24], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 90 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[19] -= amp_sv[0];
-
- // *** DIAGRAM 91 OF 123 ***
-
- // Wavefunction(s) for diagram number 91
- // (none)
-
- // Amplitude(s) for diagram number 91
- FFV1_0( w_fp[22], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 91 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 92 OF 123 ***
-
- // Wavefunction(s) for diagram number 92
- // (none)
-
- // Amplitude(s) for diagram number 92
- FFV1_0( w_fp[16], w_fp[2], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 92 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[3] += amp_sv[0];
- jamp_sv[5] -= amp_sv[0];
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[22] += amp_sv[0];
-
- // *** DIAGRAM 93 OF 123 ***
-
- // Wavefunction(s) for diagram number 93
- // (none)
-
- // Amplitude(s) for diagram number 93
- VVVV1_0( w_fp[0], w_fp[6], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
- VVVV3_0( w_fp[0], w_fp[6], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
- VVVV4_0( w_fp[0], w_fp[6], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 94 OF 123 ***
-
- // Wavefunction(s) for diagram number 94
- VVV1P0_1( w_fp[0], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[22] );
-
- // Amplitude(s) for diagram number 94
- VVV1_0( w_fp[7], w_fp[5], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 94 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 95 OF 123 ***
-
- // Wavefunction(s) for diagram number 95
- VVV1P0_1( w_fp[0], w_fp[7], COUPs[0], 1.0, 0., 0., w_fp[25] );
-
- // Amplitude(s) for diagram number 95
- VVV1_0( w_fp[6], w_fp[5], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 95 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 96 OF 123 ***
-
- // Wavefunction(s) for diagram number 96
- // (none)
-
- // Amplitude(s) for diagram number 96
- FFV1_0( w_fp[3], w_fp[14], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 96 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[18] += amp_sv[0];
- jamp_sv[19] -= amp_sv[0];
- jamp_sv[21] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
-
- // *** DIAGRAM 97 OF 123 ***
-
- // Wavefunction(s) for diagram number 97
- // (none)
-
- // Amplitude(s) for diagram number 97
- FFV1_0( w_fp[3], w_fp[24], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 97 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 98 OF 123 ***
-
- // Wavefunction(s) for diagram number 98
- // (none)
-
- // Amplitude(s) for diagram number 98
- FFV1_0( w_fp[13], w_fp[2], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 98 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += amp_sv[0];
- jamp_sv[2] -= amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[14] += amp_sv[0];
-
- // *** DIAGRAM 99 OF 123 ***
-
- // Wavefunction(s) for diagram number 99
- // (none)
-
- // Amplitude(s) for diagram number 99
- FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 99 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 100 OF 123 ***
-
- // Wavefunction(s) for diagram number 100
- // (none)
-
- // Amplitude(s) for diagram number 100
- VVVV1_0( w_fp[0], w_fp[18], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
- VVVV3_0( w_fp[0], w_fp[18], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- VVVV4_0( w_fp[0], w_fp[18], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 101 OF 123 ***
-
- // Wavefunction(s) for diagram number 101
- VVV1P0_1( w_fp[0], w_fp[18], COUPs[0], 1.0, 0., 0., w_fp[6] );
-
- // Amplitude(s) for diagram number 101
- VVV1_0( w_fp[7], w_fp[4], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 101 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 102 OF 123 ***
-
- // Wavefunction(s) for diagram number 102
- // (none)
-
- // Amplitude(s) for diagram number 102
- VVV1_0( w_fp[18], w_fp[4], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 102 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 103 OF 123 ***
-
- // Wavefunction(s) for diagram number 103
- // (none)
-
- // Amplitude(s) for diagram number 103
- FFV1_0( w_fp[3], w_fp[11], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 103 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[12] += amp_sv[0];
- jamp_sv[13] -= amp_sv[0];
- jamp_sv[15] -= amp_sv[0];
- jamp_sv[17] += amp_sv[0];
-
- // *** DIAGRAM 104 OF 123 ***
-
- // Wavefunction(s) for diagram number 104
- // (none)
-
- // Amplitude(s) for diagram number 104
- FFV1_0( w_fp[3], w_fp[20], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 104 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 105 OF 123 ***
-
- // Wavefunction(s) for diagram number 105
- // (none)
-
- // Amplitude(s) for diagram number 105
- FFV1_0( w_fp[15], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 105 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] += amp_sv[0];
- jamp_sv[4] -= amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[20] += amp_sv[0];
-
- // *** DIAGRAM 106 OF 123 ***
-
- // Wavefunction(s) for diagram number 106
- // (none)
-
- // Amplitude(s) for diagram number 106
- FFV1_0( w_fp[12], w_fp[2], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 106 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 107 OF 123 ***
-
- // Wavefunction(s) for diagram number 107
- // (none)
-
- // Amplitude(s) for diagram number 107
- VVVV1_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
- VVVV3_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
- VVVV4_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 108 OF 123 ***
-
- // Wavefunction(s) for diagram number 108
- // (none)
-
- // Amplitude(s) for diagram number 108
- VVV1_0( w_fp[1], w_fp[10], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 108 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 109 OF 123 ***
-
- // Wavefunction(s) for diagram number 109
- // (none)
-
- // Amplitude(s) for diagram number 109
- VVV1_0( w_fp[1], w_fp[7], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 109 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 110 OF 123 ***
-
- // Wavefunction(s) for diagram number 110
- // (none)
-
- // Amplitude(s) for diagram number 110
- FFV1_0( w_fp[13], w_fp[20], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 110 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[12] -= amp_sv[0];
-
- // *** DIAGRAM 111 OF 123 ***
-
- // Wavefunction(s) for diagram number 111
- // (none)
-
- // Amplitude(s) for diagram number 111
- FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 111 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[14] -= amp_sv[0];
-
- // *** DIAGRAM 112 OF 123 ***
-
- // Wavefunction(s) for diagram number 112
- // (none)
-
- // Amplitude(s) for diagram number 112
- FFV1_0( w_fp[15], w_fp[24], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 112 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[18] -= amp_sv[0];
-
- // *** DIAGRAM 113 OF 123 ***
-
- // Wavefunction(s) for diagram number 113
- // (none)
-
- // Amplitude(s) for diagram number 113
- FFV1_0( w_fp[12], w_fp[14], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 113 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[20] -= amp_sv[0];
-
- // *** DIAGRAM 114 OF 123 ***
-
- // Wavefunction(s) for diagram number 114
- VVVV1P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[12] );
- VVVV3P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[24] );
- VVVV4P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[21] );
-
- // Amplitude(s) for diagram number 114
- VVV1_0( w_fp[12], w_fp[7], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
- VVV1_0( w_fp[24], w_fp[7], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
- VVV1_0( w_fp[21], w_fp[7], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 115 OF 123 ***
-
- // Wavefunction(s) for diagram number 115
- // (none)
-
- // Amplitude(s) for diagram number 115
- FFV1_0( w_fp[3], w_fp[14], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[18] += amp_sv[0];
- jamp_sv[19] -= amp_sv[0];
- jamp_sv[21] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
- FFV1_0( w_fp[3], w_fp[14], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[19] -= amp_sv[0];
- jamp_sv[20] += amp_sv[0];
- jamp_sv[21] -= amp_sv[0];
- jamp_sv[22] += amp_sv[0];
- FFV1_0( w_fp[3], w_fp[14], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[20] += amp_sv[0];
- jamp_sv[22] += amp_sv[0];
- jamp_sv[23] -= amp_sv[0];
-
- // *** DIAGRAM 116 OF 123 ***
-
- // Wavefunction(s) for diagram number 116
- // (none)
-
- // Amplitude(s) for diagram number 116
- FFV1_0( w_fp[13], w_fp[2], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[0] += amp_sv[0];
- jamp_sv[2] -= amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[14] += amp_sv[0];
- FFV1_0( w_fp[13], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[2] -= amp_sv[0];
- jamp_sv[6] += amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[12] += amp_sv[0];
- FFV1_0( w_fp[13], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[0] -= amp_sv[0];
- jamp_sv[6] += amp_sv[0];
- jamp_sv[12] += amp_sv[0];
- jamp_sv[14] -= amp_sv[0];
-
- // *** DIAGRAM 117 OF 123 ***
-
- // Wavefunction(s) for diagram number 117
- VVVV1P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[21] );
- VVVV3P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[13] );
- VVVV4P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[24] );
-
- // Amplitude(s) for diagram number 117
- VVV1_0( w_fp[21], w_fp[7], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
- VVV1_0( w_fp[13], w_fp[7], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- VVV1_0( w_fp[24], w_fp[7], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 118 OF 123 ***
-
- // Wavefunction(s) for diagram number 118
- // (none)
-
- // Amplitude(s) for diagram number 118
- FFV1_0( w_fp[3], w_fp[11], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[12] += amp_sv[0];
- jamp_sv[13] -= amp_sv[0];
- jamp_sv[15] -= amp_sv[0];
- jamp_sv[17] += amp_sv[0];
- FFV1_0( w_fp[3], w_fp[11], w_fp[13], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[13] -= amp_sv[0];
- jamp_sv[14] += amp_sv[0];
- jamp_sv[15] -= amp_sv[0];
- jamp_sv[16] += amp_sv[0];
- FFV1_0( w_fp[3], w_fp[11], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[12] -= amp_sv[0];
- jamp_sv[14] += amp_sv[0];
- jamp_sv[16] += amp_sv[0];
- jamp_sv[17] -= amp_sv[0];
-
- // *** DIAGRAM 119 OF 123 ***
-
- // Wavefunction(s) for diagram number 119
- // (none)
-
- // Amplitude(s) for diagram number 119
- FFV1_0( w_fp[15], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[1] += amp_sv[0];
- jamp_sv[4] -= amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[20] += amp_sv[0];
- FFV1_0( w_fp[15], w_fp[2], w_fp[13], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[4] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[18] += amp_sv[0];
- FFV1_0( w_fp[15], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[1] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
- jamp_sv[18] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
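After the last diagram, jamp_sv[0..23] holds the 24 leading-color flow amplitudes of this process. They are consumed twice: just below, for the leading-color choice, and in the color sum that contracts them with the static color matrix to produce the squared matrix element. A hedged sketch of that final contraction (cf and the common denominator are placeholders, not the real color data of this process):

#include <complex>
using fptype = double;
using cxtype = std::complex<fptype>;
constexpr int ncolor = 24; // the jamp_sv[0..23] flows above

fptype colorSum( const cxtype jamp[ncolor] )
{
  static const fptype cf[ncolor][ncolor] = {}; // placeholder: the real color matrix is process-specific
  constexpr fptype denom = 1;                  // placeholder: the real code has one denominator per color
  fptype deltaME = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    cxtype ztemp = 0;
    for( int jcol = 0; jcol < ncolor; jcol++ ) ztemp += cf[icol][jcol] * jamp[jcol];
    deltaME += ( ztemp * std::conj( jamp[icol] ) ).real() / denom; // |M|^2 contribution
  }
  return deltaME;
}
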
amp_sv[0]; - - // *** DIAGRAM 121 OF 123 *** - - // Wavefunction(s) for diagram number 121 - // (none) - - // Amplitude(s) for diagram number 121 - FFV1_0( w_fp[16], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[3] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - FFV1_0( w_fp[16], w_fp[2], w_fp[15], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[5] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[19] += amp_sv[0]; - FFV1_0( w_fp[16], w_fp[2], w_fp[13], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[3] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[19] += amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - - // *** DIAGRAM 122 OF 123 *** - - // Wavefunction(s) for diagram number 122 - // (none) - - // Amplitude(s) for diagram number 122 - VVV1_0( w_fp[24], w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[15], w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[13], w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 123 OF 123 *** - - // Wavefunction(s) for diagram number 123 - // (none) - - // Amplitude(s) for diagram number 123 - VVV1_0( w_fp[0], w_fp[17], w_fp[7], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[0], w_fp[19], w_fp[7], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[0], w_fp[8], w_fp[7], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( 
jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_ttxgg()?) - - // The color denominators (initialize all array elements, with ncolor=24) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54 }; // 1-D array[24] - - // The color matrix (initialize all array elements, with ncolor=24) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 512, -64, -64, 8, 8, 80, -64, 8, 8, -1, -1, -10, 8, -1, 80, -10, 71, 62, -1, -10, -10, 62, 62, -28 }, - { -64, 512, 8, 80, -64, 8, 8, -64, -1, -10, 8, -1, -1, -10, -10, 62, 62, -28, 8, -1, 80, -10, 71, 62 }, - { -64, 8, 512, -64, 80, 8, 8, -1, 80, -10, 71, 62, -64, 8, 8, -1, -1, -10, -10, -1, 62, -28, -10, 62 }, - { 8, 80, -64, 512, 8, -64, -1, -10, -10, 62, 62, -28, 8, -64, -1, -10, 8, -1, -1, 8, 71, 62, 80, -10 }, - { 8, -64, 80, 8, 512, -64, -1, 8, 71, 62, 80, -10, -10, -1, 62, -28, -10, 62, -64, 8, 8, -1, -1, -10 }, - { 80, 8, 8, -64, -64, 512, -10, -1, 62, -28, -10, 62, -1, 8, 71, 62, 80, -10, 8, -64, -1, -10, 8, -1 }, - { -64, 8, 8, -1, -1, -10, 512, -64, -64, 8, 8, 80, 80, -10, 8, -1, 62, 71, -10, 62, -1, -10, -28, 62 }, - { 8, -64, -1, -10, 8, -1, -64, 512, 8, 80, -64, 8, -10, 62, -1, -10, -28, 62, 80, -10, 8, -1, 62, 71 }, - { 8, -1, 80, -10, 71, 62, -64, 8, 512, -64, 80, 8, 8, -1, -64, 8, -10, -1, 62, -28, -10, -1, 62, -10 }, - { -1, -10, -10, 62, 62, -28, 8, 80, -64, 512, 8, -64, -1, -10, 8, -64, -1, 8, 71, 62, -1, 8, -10, 80 }, - { -1, 8, 71, 62, 80, -10, 8, -64, 80, 8, 512, -64, 62, -28, -10, -1, 62, -10, 8, -1, -64, 8, -10, -1 }, - { -10, -1, 62, -28, -10, 62, 80, 8, 8, -64, -64, 512, 71, 62, -1, 8, -10, 80, -1, -10, 8, -64, -1, 8 }, - { 8, -1, -64, 8, -10, -1, 80, -10, 8, -1, 62, 71, 512, -64, -64, 8, 8, 80, 62, -10, -28, 62, -1, -10 }, - { -1, -10, 8, -64, -1, 8, -10, 62, -1, -10, -28, 62, -64, 512, 8, 80, -64, 8, -10, 80, 62, 71, 8, -1 }, - { 80, -10, 8, -1, 62, 71, 8, -1, -64, 8, -10, -1, -64, 8, 512, -64, 80, 8, -28, 62, 62, -10, -10, -1 }, - { -10, 62, -1, -10, -28, 62, -1, -10, 8, -64, -1, 8, 8, 80, -64, 512, 8, -64, 62, 71, -10, 80, -1, 8 }, - { 71, 62, -1, 8, -10, 80, 62, -28, -10, -1, 62, -10, 8, -64, 80, 8, 512, -64, -1, 8, -10, -1, -64, 8 }, - { 62, -28, -10, -1, 62, -10, 71, 62, -1, 8, -10, 80, 80, 8, 8, -64, -64, 512, -10, -1, -1, 8, 8, -64 }, - { -1, 8, -10, -1, -64, 8, -10, 80, 62, 71, 8, -1, 62, -10, -28, 62, -1, -10, 512, -64, -64, 8, 8, 80 }, - { -10, -1, -1, 8, 8, -64, 62, -10, -28, 62, -1, -10, -10, 80, 62, 71, 8, -1, -64, 512, 8, 80, -64, 8 }, - { -10, 80, 62, 71, 8, -1, -1, 8, -10, -1, -64, 8, -28, 62, 62, -10, -10, -1, -64, 8, 512, -64, 80, 8 }, - { 62, -10, -28, 62, -1, -10, -10, -1, -1, 8, 8, -64, 62, 71, -10, 80, -1, 8, 8, 80, -64, 512, 8, -64 }, - { 62, 71, -10, 80, -1, 8, -28, 62, 62, -10, -10, -1, -1, 8, -10, -1, -64, 8, 8, -64, 80, 8, 512, -64 }, - { -28, 62, 62, -10, -10, -1, 62, 71, -10, 80, -1, 8, -10, -1, -1, 8, 8, -64, 80, 8, 8, -64, -64, 512 } }; // 2-D array[24][24] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - 
__host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA + iAMB - iBMA + BMB = AMA + BMB (the cross terms cancel since M is symmetric) - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the upper-diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif
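The deleted comments above compress a few algebraic steps. Written out (this is only a restatement of the reasoning in those comments, with A and B the real and imaginary parts of the color flows jamp, and M the normalized color matrix, which is real and symmetric here since all denom entries are equal):

```latex
% Color sum for one event: jamp_i = A_i + i B_i, with M_{ij} = cf_{ij} / denom_i real and symmetric
\Delta\mathrm{ME} = \sum_{i,j} \mathrm{jamp}_i^{*}\, M_{ij}\, \mathrm{jamp}_j
                  = (A - iB)^{T} M (A + iB)
                  = A^{T} M A + i\, A^{T} M B - i\, B^{T} M A + B^{T} M B
                  = A^{T} M A + B^{T} M B \qquad ( M^{T}=M \Rightarrow A^{T} M B = B^{T} M A )
% Symmetry also gives the triangular form precomputed at compile time in cf2:
A^{T} M A = \sum_{i} M_{ii} A_i^{2} \; + \; 2 \sum_{i<j} M_{ij} A_i A_j
```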
- for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? 
jamp_sv : &( jamp_sv[ncolor] ) ); #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------ + // --- CHANNELIDS --- + // ------------------ +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - } - // *** STORE THE RESULTS *** - - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); +#endif #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif
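The boilerplate above funnels momenta, jamps, channelIds, couplings, numerators and denominators into one uniform argument list for the generated per-diagram kernels called below. A self-contained toy (illustrative types and names only, not the plugin's real API) of why a uniform signature keeps the 123 call sites mechanical:

```cpp
// Toy illustration of the uniform diagram interface: every diagramN shares the
// same argument bundle, so the long call lists below stay codegen-friendly.
#include <cstdio>
using fptype = double;
struct Buffers { fptype* wfs; fptype* jamps; const unsigned int* channelIds; const fptype* couplings; fptype* numerators; fptype* denominators; };
using DiagramFn = void ( * )( Buffers& );
static void diagram2( Buffers& b ) { b.jamps[0] += 1.0; } // stand-in for FFV/VVV amplitude code
static void diagram3( Buffers& b ) { b.jamps[1] -= 1.0; }
int main()
{
  fptype jamps[2] = { 0, 0 };
  Buffers b{ nullptr, jamps, nullptr, nullptr, nullptr, nullptr };
  const DiagramFn diagrams[] = { diagram2, diagram3 };
  for( DiagramFn f : diagrams ) f( b ); // mechanical call list, as in the generated code
  std::printf( "jamps = %f %f\n", jamps[0], jamps[1] );
  return 0;
}
```

The same uniformity is what allows the nullptr/assert sanity checks above: unused pointers are simply passed as nullptr instead of changing the signature.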
+ + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ + + // *** DIAGRAMS 1 TO 123 *** +#ifdef MGONGPUCPP_GPUIMPL + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram8, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram9, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram10, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram11, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram12, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram13, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram14, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram15, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram16, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram17, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram18, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram19, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram20, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram21, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram22, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram23, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram24, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram25, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + 
gpuLaunchKernelStream( diagram26, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram27, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram28, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram29, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram30, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram31, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram32, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram33, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram34, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram35, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram36, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram37, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram38, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram39, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram40, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram41, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram42, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram43, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram44, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram45, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram46, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram47, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram48, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram49, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram50, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram51, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram52, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, 
couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram53, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram54, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram55, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram56, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram57, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram58, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram59, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram60, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram61, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram62, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram63, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram64, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram65, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram66, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram67, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram68, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram69, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram70, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram71, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram72, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram73, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram74, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram75, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram76, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram77, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram78, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram79, gpublocks, 
gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram80, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram81, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram82, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram83, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram84, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram85, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram86, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram87, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram88, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram89, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram90, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram91, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram92, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram93, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram94, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram95, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram96, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram97, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram98, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram99, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram100, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram101, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram102, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram103, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram104, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram105, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + 
gpuLaunchKernelStream( diagram106, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram107, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram108, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram109, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram110, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram111, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram112, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram113, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram114, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram115, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram116, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram117, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram118, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram119, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram120, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram121, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram122, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram123, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); +#else + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram8( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram9( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram10( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram11( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram12( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram13( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram14( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram15( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram16( wfs, jamps, channelIds, COUPs, 
numerators, denominators ); + diagram17( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram18( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram19( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram20( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram21( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram22( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram23( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram24( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram25( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram26( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram27( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram28( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram29( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram30( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram31( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram32( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram33( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram34( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram35( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram36( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram37( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram38( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram39( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram40( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram41( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram42( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram43( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram44( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram45( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram46( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram47( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram48( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram49( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram50( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram51( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram52( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram53( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram54( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram55( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram56( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram57( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram58( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram59( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram60( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram61( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram62( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram63( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram64( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram65( wfs, jamps, channelIds, COUPs, numerators, denominators 
); + diagram66( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram67( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram68( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram69( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram70( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram71( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram72( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram73( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram74( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram75( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram76( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram77( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram78( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram79( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram80( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram81( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram82( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram83( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram84( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram85( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram86( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram87( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram88( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram89( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram90( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram91( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram92( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram93( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram94( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram95( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram96( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram97( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram98( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram99( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram100( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram101( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram102( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram103( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram104( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram105( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram106( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram107( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram108( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram109( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram110( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram111( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram112( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram113( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram114( wfs, jamps, channelIds, COUPs, numerators, denominators ); + 
diagram115( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram116( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram117( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram118( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram119( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram120( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram121( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram122( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram123( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -2718,7 +738,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -2753,6 +777,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -2795,6 +823,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -2897,26 +929,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //--------------------------------------------------------------------------
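The C++ branch above walks SIMD "event pages" with ieventAccessRecord-style helpers before calling the templated G2COUP. A minimal self-contained sketch of that paging pattern (illustrative constants and helper only, not the plugin's real memory accessors):

```cpp
// Toy AOSOA paging sketch: nevt events are stored so that neppV consecutive
// values form one page (one SIMD vector); an ieventAccessRecord-style helper
// returns the base of the page containing a page-aligned event index ievt0.
#include <cstdio>
using fptype = double;
constexpr int neppV = 4; // events per SIMD page (illustrative value)
inline const fptype* ieventAccessRecordConst( const fptype* buffer, int ievt0 ) { return buffer + ievt0; }
int main()
{
  constexpr int nevt = 8;
  fptype gs[nevt] = { 1, 2, 3, 4, 5, 6, 7, 8 };
  for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV )
  {
    const int ievt0 = ipagV * neppV; // page-aligned event index
    const fptype* page = ieventAccessRecordConst( gs, ievt0 );
    std::printf( "page %d starts at g=%f\n", ipagV, page[0] ); // one SIMD vector per page
  }
  return 0;
}
```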
-#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -2924,25 +956,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif
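The atomicAdd in update_jamp2s above is what keeps the one-helicity-per-stream design safe: several streams may accumulate into the same colAllJamp2s slot concurrently, and a plain "+=" would race. A self-contained toy kernel (plain CUDA, illustrative names only) demonstrating the pattern:

```cuda
// Toy CUDA sketch: two kernels launched on different streams accumulate into
// the same buffer; atomicAdd makes the concurrent read-modify-write safe.
#include <cstdio>
__global__ void accumulate( float* sums, const float* contrib, int n )
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if( i < n ) atomicAdd( &sums[i % 4], contrib[i] ); // 4 "colors", many events
}
int main()
{
  const int n = 1024;
  float *sums, *contrib;
  cudaMallocManaged( &sums, 4 * sizeof( float ) );
  cudaMallocManaged( &contrib, n * sizeof( float ) );
  for( int i = 0; i < n; i++ ) contrib[i] = 1.f;
  for( int i = 0; i < 4; i++ ) sums[i] = 0.f;
  cudaStream_t s0, s1;
  cudaStreamCreate( &s0 );
  cudaStreamCreate( &s1 );
  accumulate<<<4, 256, 0, s0>>>( sums, contrib, n ); // "helicity 0"
  accumulate<<<4, 256, 0, s1>>>( sums, contrib, n ); // "helicity 1", possibly concurrent
  cudaDeviceSynchronize();
  printf( "sums[0]=%f (expect 512)\n", sums[0] ); // 2 kernels * 1024/4 adds of 1.f
  cudaStreamDestroy( s0 );
  cudaStreamDestroy( s1 );
  cudaFree( sums );
  cudaFree( contrib );
  return 0;
}
```

Replacing the atomicAdd by `sums[i % 4] += contrib[i]` would give nondeterministic undercounts once the two streams overlap, which is exactly the hazard the comment in update_jamp2s warns about.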
+ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -3080,20 +1316,14 @@ namespace mg5amcCpu // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 512 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = 
DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -3105,17 +1335,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -3141,93 +1374,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } - } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) Then, in multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? 
&( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (deferred until after the helicity loop to avoid breaking stream parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -3269,7 +1472,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -3292,7 +1495,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId!
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -3301,25 +1504,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -3329,8 +1538,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -3346,11 +1557,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -3452,14 +1664,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h index 2b75e0f842..f142e7ef7d 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team 
and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -75,17 +76,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 123; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 24; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 26; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 6; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 159; //static const int ncomb = 64; // CPPProcess::ncomb @@ -122,23 +123,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -152,34 +156,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig.f index c087f3f747..a2dfd42919 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig1.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig1.f index ce5493be9b..2d8197d859 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig1.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -137,14 +137,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -219,7 +219,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -290,6 +290,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -373,12 +377,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -484,6 +488,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/color_sum.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/color_sum.cc new file mode 100644 index 0000000000..02db3d0204 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/color_sum.cc @@ -0,0 +1,405 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
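(For orientation before the new file's contents: color_sum.cc reduces the ncolor QCD partial amplitudes "jamps" of one helicity to a |M|^2 contribution per event, via the quadratic form |M|^2 += sum_ij jamp_i^* * ( colorMatrix[i][j] / colorDenom[i] ) * jamp_j. A minimal self-contained C++ sketch of that reduction follows; it is illustrative only, with a made-up 2x2 matrix, denominators and jamp values rather than the 24x24 color matrix of this process.)

#include <complex>
#include <cstdio>

// Sketch: color sum for ONE event and ONE helicity (made-up ncolor=2 inputs)
int main()
{
  constexpr int ncolor = 2;                                // made up (the real process uses 24)
  const double colorDenom[ncolor] = { 3, 3 };              // made-up denominators
  const double colorMatrix[ncolor][ncolor] = { { 16, -2 },
                                               { -2, 16 } }; // made-up real symmetric matrix
  const std::complex<double> jamp[ncolor] = { { 0.1, 0.2 }, { -0.3, 0.05 } }; // made-up jamps
  double deltaME = 0; // |M|^2 contribution of this helicity for this event
  for( int i = 0; i < ncolor; i++ )
  {
    std::complex<double> ztemp = 0;
    for( int j = 0; j < ncolor; j++ )
      ztemp += colorMatrix[i][j] / colorDenom[i] * jamp[j];
    deltaME += ( std::conj( jamp[i] ) * ztemp ).real(); // imaginary parts cancel for a real symmetric matrix
  }
  std::printf( "deltaME = %f\n", deltaME );
  return 0;
}

(The BLAS path further below performs the same two steps in bulk: one gemm for ztemp over all events, then batched dot products for the per-event contraction.)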
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=24) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54 }; // 1-D array[24] + + // The color matrix (initialize all array elements, with ncolor=24) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 512, -64, -64, 8, 8, 80, -64, 8, 8, -1, -1, -10, 8, -1, 80, -10, 71, 62, -1, -10, -10, 62, 62, -28 }, + { -64, 512, 8, 80, -64, 8, 8, -64, -1, -10, 8, -1, -1, -10, -10, 62, 62, -28, 8, -1, 80, -10, 71, 62 }, + { -64, 8, 512, -64, 80, 8, 8, -1, 80, -10, 71, 62, -64, 8, 8, -1, -1, -10, -10, -1, 62, -28, -10, 62 }, + { 8, 80, -64, 512, 8, -64, -1, -10, -10, 62, 62, -28, 8, -64, -1, -10, 8, -1, -1, 8, 71, 62, 80, -10 }, + { 8, -64, 80, 8, 512, -64, -1, 8, 71, 62, 80, -10, -10, -1, 62, -28, -10, 62, -64, 8, 8, -1, -1, -10 }, + { 80, 8, 8, -64, -64, 512, -10, -1, 62, -28, -10, 62, -1, 8, 71, 62, 80, -10, 8, -64, -1, -10, 8, -1 }, + { -64, 8, 8, -1, -1, -10, 512, -64, -64, 8, 8, 80, 80, -10, 8, -1, 62, 71, -10, 62, -1, -10, -28, 62 }, + { 8, -64, -1, -10, 8, -1, -64, 512, 8, 80, -64, 8, -10, 62, -1, -10, -28, 62, 80, -10, 8, -1, 62, 71 }, + { 8, -1, 80, -10, 71, 62, -64, 8, 512, -64, 80, 8, 8, -1, -64, 8, -10, -1, 62, -28, -10, -1, 62, -10 }, + { -1, -10, -10, 62, 62, -28, 8, 80, -64, 512, 8, -64, -1, -10, 8, -64, -1, 8, 71, 62, -1, 8, -10, 80 }, + { -1, 8, 71, 62, 80, -10, 8, -64, 80, 8, 512, -64, 62, -28, -10, -1, 62, -10, 8, -1, -64, 8, -10, -1 }, + { -10, -1, 62, -28, -10, 62, 80, 8, 8, -64, -64, 512, 71, 62, -1, 8, -10, 80, -1, -10, 8, -64, -1, 8 }, + { 8, -1, -64, 8, -10, -1, 80, -10, 8, -1, 62, 71, 512, -64, -64, 8, 8, 80, 62, -10, -28, 62, -1, -10 }, + { -1, -10, 8, -64, -1, 8, -10, 62, -1, -10, -28, 62, -64, 512, 8, 80, -64, 8, -10, 80, 62, 71, 8, -1 }, + { 80, -10, 8, -1, 62, 71, 8, -1, -64, 8, -10, -1, -64, 8, 512, -64, 80, 8, -28, 62, 62, -10, -10, -1 }, + { -10, 62, -1, -10, -28, 62, -1, -10, 8, -64, -1, 8, 8, 80, -64, 512, 8, -64, 62, 71, -10, 80, -1, 8 }, + { 71, 62, -1, 8, -10, 80, 62, -28, -10, -1, 62, -10, 8, -64, 80, 8, 512, -64, -1, 8, -10, -1, -64, 8 }, + { 62, -28, -10, -1, 62, -10, 71, 62, -1, 8, -10, 80, 80, 8, 8, -64, -64, 512, -10, -1, -1, 8, 8, -64 }, + { -1, 8, -10, -1, -64, 8, -10, 80, 62, 71, 8, -1, 62, -10, -28, 62, -1, -10, 512, -64, -64, 8, 8, 80 }, + { -10, -1, -1, 8, 8, -64, 62, -10, -28, 62, -1, -10, -10, 80, 62, 71, 8, -1, -64, 512, 8, 80, -64, 8 }, + { -10, 80, 62, 71, 8, -1, -1, 8, -10, -1, -64, 8, -28, 62, 62, -10, -10, -1, -64, 8, 512, -64, 80, 8 }, + { 62, -10, -28, 62, -1, -10, -10, -1, -1, 8, 8, -64, 62, 71, -10, 80, -1, 8, 8, 80, -64, 512, 8, -64 }, + { 62, 71, -10, 80, -1, 8, -28, 62, 62, -10, -10, -1, -1, 8, -10, -1, -64, 8, 8, -64, 80, 8, 512, -64 }, + { -28, 62, 62, -10, -10, -1, 62, 71, -10, 80, -1, 8, -10, -1, -1, 8, 8, -64, 80, 8, 8, -64, -64, 512 } }; // 2-D array[24][24] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + 
template<typename T> + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the upper triangular part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
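+    // Note: with M real and symmetric and jamp = A + iB, the cross terms of (A-iB)M(A+iB) cancel, so only
+    // the real arrays enter below; e.g. for ncolor=2, the sum over icol/jcol reduces to
+    // deltaMEs = cf2[0][0]*(A0^2+B0^2) + cf2[0][1]*(A0*A1+B0*B1) + cf2[1][1]*(A1^2+B1^2),
+    // where the factor 2 on off-diagonal terms and the 1/colorDenom normalization are already folded into cf2.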
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 
s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = 
allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use gpuBlasTgemmStridedBatched (the cuBLAS/hipBLAS strided-batched gemm) to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e.
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/color_sum.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/diagram_boilerplate.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/diagrams.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/diagrams.h new file mode 100644 index 0000000000..08f07c1187 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/diagrams.h @@ -0,0 +1,4120 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
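(For orientation before the new diagrams.h: every diagramN kernel below follows the same pattern as diagram1 and diagram2. It reuses the wavefunctions w_fp filled by earlier diagrams, computes one amplitude into amp_fp/amp_sv, adds it to the color-ordered jamps, and, in multichannel mode, adds |amp|^2 to the per-event numerator (only if channelId selects this diagram) and to the denominator (for any nonzero channelId). A minimal sketch of that single-diagram-enhancement bookkeeping follows; this is hypothetical standalone code with made-up values, not the plugin's API.)

#include <cstdio>

// Sketch: per-event single-diagram enhancement sums (scalar, made-up ndiagrams=3)
int main()
{
  const unsigned int channelId = 2;         // 1-based selected channel; 0 disables the enhancement
  const double amp2[3] = { 0.4, 1.5, 0.1 }; // made-up |amp|^2 of each diagram for one event/helicity
  double numerator = 0, denominator = 0;
  for( unsigned int idiag = 1; idiag <= 3; idiag++ )
  {
    if( channelId == idiag ) numerator += amp2[idiag - 1]; // only the diagram matching the channel
    if( channelId != 0 ) denominator += amp2[idiag - 1];   // all diagrams
  }
  // The final |M|^2 is later reweighted by numerator/denominator (see normalise_output in the CPPProcess.cc hunk above)
  std::printf( "weight = %f\n", numerator / denominator );
  return 0;
}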
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb-1) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 123 *** + // Wavefunction(s) for diagram number 1 + vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); + vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + vxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); + vxxxxx( momenta, 0., cHel[ihel][5], +1, w_fp[5], 5 ); + VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[6] ); + FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[7] ); + // Amplitude(s) for diagram number 1 + VVVV1_0( w_fp[6], w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVVV3_0( w_fp[6], w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVVV4_0( w_fp[6], w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + }
+ + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 123 *** + // Wavefunction(s) for diagram number 2 + VVV1P0_1( w_fp[6], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 2 + VVV1_0( w_fp[7], w_fp[5], w_fp[8], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + }
+ + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 123 *** + // Wavefunction(s) for diagram number 3 + VVV1P0_1( w_fp[6], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[9] ); + // Amplitude(s) for diagram number 3 + VVV1_0( w_fp[7], w_fp[4], w_fp[9], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + }
+ + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 123 *** + // Wavefunction(s) for diagram number 4 + VVV1P0_1( w_fp[4], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 4 + VVV1_0( w_fp[6], w_fp[7], w_fp[10], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + }
+ + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 123 *** + // Wavefunction(s) for diagram number 5 + FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); + FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); + // Amplitude(s) for diagram number 5 + FFV1_0( w_fp[12], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + }
+ + //-------------------------------------------------------------------------- + + __global__ void + diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 6 OF 123 *** + // Wavefunction(s) for diagram number 6 + // (none) + // Amplitude(s) for diagram number 6 + FFV1_0( w_fp[3], w_fp[11], w_fp[9], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + }
+ + //-------------------------------------------------------------------------- + + __global__ void + diagram7( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 7 OF 123 *** + // Wavefunction(s) for diagram number 7 + FFV1_2( w_fp[3], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); + // Amplitude(s) for diagram number 7 + FFV1_0( w_fp[13], w_fp[11], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + }
+ + //-------------------------------------------------------------------------- + + __global__ void + diagram8( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 8 OF 123 *** + // Wavefunction(s) for diagram number 8 + FFV1_1( w_fp[2], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] ); + // Amplitude(s) for diagram number 8 + FFV1_0( w_fp[12], w_fp[14], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + }
+ + //-------------------------------------------------------------------------- + + __global__ void + diagram9( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 9 OF 123 *** + // Wavefunction(s) for diagram number 9 + // (none) + // Amplitude(s) for diagram number 9 + FFV1_0( w_fp[3], w_fp[14], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + }
+ + //-------------------------------------------------------------------------- + + __global__ void + diagram10( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 10 OF 123 *** + // Wavefunction(s) for diagram number 10 + FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[15] ); + // Amplitude(s) for diagram number 10 + FFV1_0( w_fp[15], w_fp[14], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + }
+ + //-------------------------------------------------------------------------- + + __global__ void + diagram11( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 11 OF 123 ***
+ // Wavefunction(s) for diagram number 11 + FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); + // Amplitude(s) for diagram number 11 + FFV1_0( w_fp[15], w_fp[16], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram12( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 12 OF 123 *** + // Wavefunction(s) for diagram number 12 + // (none) + // Amplitude(s) for diagram number 12 + FFV1_0( w_fp[15], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram13( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 13 OF 123 *** + // Wavefunction(s) for diagram number 13 + // (none) + // Amplitude(s) for diagram number 13 + FFV1_0( w_fp[13], w_fp[16], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 13 ) numerators_sv += 
cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram14( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 14 OF 123 *** + // Wavefunction(s) for diagram number 14 + // (none) + // Amplitude(s) for diagram number 14 + FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram15( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 15 OF 123 *** + // Wavefunction(s) for diagram number 15 + // (none) + // Amplitude(s) for diagram number 15 + FFV1_0( w_fp[3], w_fp[16], w_fp[10], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram16( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 16 OF 123 ***
+    // Wavefunction(s) for diagram number 16
+    // (none)
+    // Amplitude(s) for diagram number 16
+    FFV1_0( w_fp[12], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram17( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 17 OF 123 ***
+    // Wavefunction(s) for diagram number 17
+    FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] );
+    FFV1_2( w_fp[3], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] );
+    FFV1_1( w_fp[12], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] );
+    // Amplitude(s) for diagram number 17
+    FFV1_0( w_fp[16], w_fp[8], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 17 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram18( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 18 OF 123 ***
+    // Wavefunction(s) for diagram number 18
+    FFV1_1( w_fp[12], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] );
+    // Amplitude(s) for diagram number 18
+    FFV1_0( w_fp[16], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram19( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 19 OF 123 ***
+    // Wavefunction(s) for diagram number 19
+    // (none)
+    // Amplitude(s) for diagram number 19
+    FFV1_0( w_fp[16], w_fp[12], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram20( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 20 OF 123 ***
+    // Wavefunction(s) for diagram number 20
+    VVV1P0_1( w_fp[1], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[6] );
+    FFV1P0_3( w_fp[3], w_fp[12], COUPs[1], 1.0, 0., 0., w_fp[17] );
+    // Amplitude(s) for diagram number 20
+    VVV1_0( w_fp[6], w_fp[5], w_fp[17], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram21( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 21 OF 123 ***
+    // Wavefunction(s) for diagram number 21
+    // (none)
+    // Amplitude(s) for diagram number 21
+    FFV1_0( w_fp[3], w_fp[9], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 21 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram22( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 22 OF 123 ***
+    // Wavefunction(s) for diagram number 22
+    // (none)
+    // Amplitude(s) for diagram number 22
+    FFV1_0( w_fp[13], w_fp[12], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram23( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 23 OF 123 ***
+    // Wavefunction(s) for diagram number 23
+    VVV1P0_1( w_fp[1], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[18] );
+    // Amplitude(s) for diagram number 23
+    VVV1_0( w_fp[18], w_fp[4], w_fp[17], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram24( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 24 OF 123 ***
+    // Wavefunction(s) for diagram number 24
+    // (none)
+    // Amplitude(s) for diagram number 24
+    FFV1_0( w_fp[3], w_fp[8], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram25( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 25 OF 123 ***
+    // Wavefunction(s) for diagram number 25
+    // (none)
+    // Amplitude(s) for diagram number 25
+    FFV1_0( w_fp[15], w_fp[12], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 25 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram26( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 26 OF 123 ***
+    // Wavefunction(s) for diagram number 26
+    FFV1_1( w_fp[12], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[19] );
+    // Amplitude(s) for diagram number 26
+    FFV1_0( w_fp[15], w_fp[19], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram27( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 27 OF 123 ***
+    // Wavefunction(s) for diagram number 27
+    // (none)
+    // Amplitude(s) for diagram number 27
+    FFV1_0( w_fp[15], w_fp[9], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram28( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 28 OF 123 ***
+    // Wavefunction(s) for diagram number 28
+    // (none)
+    // Amplitude(s) for diagram number 28
+    FFV1_0( w_fp[13], w_fp[19], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram29( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 29 OF 123 ***
+    // Wavefunction(s) for diagram number 29
+    // (none)
+    // Amplitude(s) for diagram number 29
+    FFV1_0( w_fp[13], w_fp[8], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram30( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 30 OF 123 ***
+    // Wavefunction(s) for diagram number 30
+    // (none)
+    // Amplitude(s) for diagram number 30
+    FFV1_0( w_fp[3], w_fp[19], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram31( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 31 OF 123 ***
+    // Wavefunction(s) for diagram number 31
+    // (none)
+    // Amplitude(s) for diagram number 31
+    VVV1_0( w_fp[1], w_fp[10], w_fp[17], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram32( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 32 OF 123 ***
+    // Wavefunction(s) for diagram number 32
+    VVVV1P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[17] );
+    VVVV3P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[19] );
+    VVVV4P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[8] );
+    // Amplitude(s) for diagram number 32
+    FFV1_0( w_fp[3], w_fp[12], w_fp[17], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[12], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[12], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram33( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 33 OF 123 ***
+    // Wavefunction(s) for diagram number 33
+    FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] );
+    FFV1_1( w_fp[2], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] );
+    FFV1_2( w_fp[12], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[20] );
+    // Amplitude(s) for diagram number 33
+    FFV1_0( w_fp[20], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram34( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 34 OF 123 ***
+    // Wavefunction(s) for diagram number 34
+    FFV1_2( w_fp[12], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] );
+    // Amplitude(s) for diagram number 34
+    FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 34 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram35( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 35 OF 123 ***
+    // Wavefunction(s) for diagram number 35
+    // (none)
+    // Amplitude(s) for diagram number 35
+    FFV1_0( w_fp[12], w_fp[9], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram36( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 36 OF 123 ***
+    // Wavefunction(s) for diagram number 36
+    FFV1P0_3( w_fp[12], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[22] );
+    // Amplitude(s) for diagram number 36
+    VVV1_0( w_fp[6], w_fp[5], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram37( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 37 OF 123 ***
+    // Wavefunction(s) for diagram number 37
+    // (none)
+    // Amplitude(s) for diagram number 37
+    FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 37 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram38( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 38 OF 123 ***
+    // Wavefunction(s) for diagram number 38
+    // (none)
+    // Amplitude(s) for diagram number 38
+    FFV1_0( w_fp[12], w_fp[14], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 38 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram39( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 39 OF 123 ***
+    // Wavefunction(s) for diagram number 39
+    // (none)
+    // Amplitude(s) for diagram number 39
+    VVV1_0( w_fp[18], w_fp[4], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 39 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram40( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 40 OF 123 ***
+    // Wavefunction(s) for diagram number 40
+    // (none)
+    // Amplitude(s) for diagram number 40
+    FFV1_0( w_fp[20], w_fp[2], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 40 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram41( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 41 OF 123 ***
+    // Wavefunction(s) for diagram number 41
+    // (none)
+    // Amplitude(s) for diagram number 41
+    FFV1_0( w_fp[12], w_fp[11], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 41 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
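// NB: the two-line comment repeated in each kernel above refers to the shared setup in diagram_boilerplate.h.
// The sketch below is a hypothetical reading of what such a header could expand to, inferred only from the
// names visible in this diff (channelId, amp_sv, amp_fp, numerators_sv, denominators_sv); the accessor
// spellings NUM_ACCESS/DEN_ACCESS and the scalar channelId derivation are assumptions, not the plugin's code.
//
// #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
//   const unsigned int channelId = channelIds[0];                        // simplified: scalar channel (real code may index per event)
//   fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );   // assumed accessor
//   fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); // assumed accessor
// #else
//   assert( channelIds == nullptr );   // uniform interface without multichannel:
//   assert( numerators == nullptr );   // the three pointers must all be nullptr
//   assert( denominators == nullptr ); // (requires <cassert>)
// #endif
//   cxtype_sv amp_sv[1] = {};                             // one amplitude at a time, reset per diagram
//   fptype* amp_fp = reinterpret_cast<fptype*>( amp_sv ); // fptype view passed to HELAS calls as &amp_fp[0]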
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram42( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 42 OF 123 ***
+    // Wavefunction(s) for diagram number 42
+    FFV1_2( w_fp[12], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] );
+    // Amplitude(s) for diagram number 42
+    FFV1_0( w_fp[23], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 42 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram43( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 43 OF 123 ***
+    // Wavefunction(s) for diagram number 43
+    // (none)
+    // Amplitude(s) for diagram number 43
+    FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 43 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram44( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 44 OF 123 ***
+    // Wavefunction(s) for diagram number 44
+    // (none)
+    // Amplitude(s) for diagram number 44
+    FFV1_0( w_fp[23], w_fp[14], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 44 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram45( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 45 OF 123 ***
+    // Wavefunction(s) for diagram number 45
+    // (none)
+    // Amplitude(s) for diagram number 45
+    FFV1_0( w_fp[20], w_fp[14], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 45 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram46( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 46 OF 123 ***
+    // Wavefunction(s) for diagram number 46
+    // (none)
+    // Amplitude(s) for diagram number 46
+    FFV1_0( w_fp[23], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 46 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram47( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 47 OF 123 ***
+    // Wavefunction(s) for diagram number 47
+    // (none)
+    // Amplitude(s) for diagram number 47
+    VVV1_0( w_fp[1], w_fp[10], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 47 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram48( fptype* wfs, // input/output: wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output: jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 48 OF 123 ***
+    // Wavefunction(s) for diagram number 48
+    // (none)
+    // Amplitude(s) for diagram number 48
+    FFV1_0( w_fp[12], w_fp[2], w_fp[17], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+
J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + FFV1_0( w_fp[12], w_fp[2], w_fp[19], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + FFV1_0( w_fp[12], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram49( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 49 OF 123 *** + // Wavefunction(s) for diagram number 49 + VVV1P0_1( w_fp[0], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[12] ); + FFV1_2( w_fp[3], w_fp[12], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[22] ); + // Amplitude(s) for diagram number 49 + FFV1_0( w_fp[22], w_fp[9], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 49 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram50( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** 
DIAGRAM 50 OF 123 *** + // Wavefunction(s) for diagram number 50 + VVV1P0_1( w_fp[12], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[23] ); + // Amplitude(s) for diagram number 50 + FFV1_0( w_fp[3], w_fp[9], w_fp[23], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 50 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram51( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 51 OF 123 *** + // Wavefunction(s) for diagram number 51 + // (none) + // Amplitude(s) for diagram number 51 + FFV1_0( w_fp[13], w_fp[9], w_fp[12], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 51 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram52( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 52 OF 123 *** + // Wavefunction(s) for diagram number 52 + FFV1_1( w_fp[2], w_fp[12], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[20] ); + // Amplitude(s) for diagram number 52 + FFV1_0( w_fp[16], w_fp[20], w_fp[5], COUPs[1], 1.0, &_fp[0] ); 
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 52 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram53( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 53 OF 123 *** + // Wavefunction(s) for diagram number 53 + // (none) + // Amplitude(s) for diagram number 53 + FFV1_0( w_fp[16], w_fp[2], w_fp[23], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 53 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram54( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 54 OF 123 *** + // Wavefunction(s) for diagram number 54 + // (none) + // Amplitude(s) for diagram number 54 + FFV1_0( w_fp[16], w_fp[14], w_fp[12], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 54 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 
0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram55( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 55 OF 123 *** + // Wavefunction(s) for diagram number 55 + // (none) + // Amplitude(s) for diagram number 55 + FFV1_0( w_fp[3], w_fp[20], w_fp[18], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 55 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram56( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 56 OF 123 *** + // Wavefunction(s) for diagram number 56 + // (none) + // Amplitude(s) for diagram number 56 + FFV1_0( w_fp[22], w_fp[2], w_fp[18], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 56 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram57( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, 
// output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 57 OF 123 *** + // Wavefunction(s) for diagram number 57 + // (none) + // Amplitude(s) for diagram number 57 + VVV1_0( w_fp[12], w_fp[18], w_fp[7], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 57 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram58( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 58 OF 123 *** + // Wavefunction(s) for diagram number 58 + // (none) + // Amplitude(s) for diagram number 58 + VVVV1_0( w_fp[12], w_fp[1], w_fp[7], w_fp[5], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + VVVV3_0( w_fp[12], w_fp[1], w_fp[7], w_fp[5], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVVV4_0( w_fp[12], w_fp[1], w_fp[7], w_fp[5], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram59( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 59 OF 123 *** + // Wavefunction(s) for diagram number 59 + VVV1P0_1( w_fp[12], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[21] ); + // Amplitude(s) for diagram number 59 + VVV1_0( w_fp[7], w_fp[5], w_fp[21], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 59 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + 
//-------------------------------------------------------------------------- + + __global__ void + diagram60( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 60 OF 123 *** + // Wavefunction(s) for diagram number 60 + // (none) + // Amplitude(s) for diagram number 60 + VVV1_0( w_fp[1], w_fp[7], w_fp[23], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 60 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram61( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 61 OF 123 *** + // Wavefunction(s) for diagram number 61 + // (none) + // Amplitude(s) for diagram number 61 + FFV1_0( w_fp[3], w_fp[14], w_fp[21], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 61 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 
20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram62( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 62 OF 123 *** + // Wavefunction(s) for diagram number 62 + // (none) + // Amplitude(s) for diagram number 62 + FFV1_0( w_fp[22], w_fp[14], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 62 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram63( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 63 OF 123 *** + // Wavefunction(s) for diagram number 63 + // (none) + // Amplitude(s) for diagram number 63 + FFV1_0( w_fp[13], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 63 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram64( fptype* wfs, // input/output 
wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 64 OF 123 *** + // Wavefunction(s) for diagram number 64 + // (none) + // Amplitude(s) for diagram number 64 + FFV1_0( w_fp[13], w_fp[20], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 64 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram65( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 65 OF 123 *** + // Wavefunction(s) for diagram number 65 + VVV1P0_1( w_fp[0], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[20] ); + FFV1_2( w_fp[3], w_fp[20], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] ); + // Amplitude(s) for diagram number 65 + FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 65 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram66( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef 
MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 66 OF 123 *** + // Wavefunction(s) for diagram number 66 + VVV1P0_1( w_fp[20], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[22] ); + // Amplitude(s) for diagram number 66 + FFV1_0( w_fp[3], w_fp[9], w_fp[22], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 66 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram67( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 67 OF 123 *** + // Wavefunction(s) for diagram number 67 + // (none) + // Amplitude(s) for diagram number 67 + FFV1_0( w_fp[15], w_fp[9], w_fp[20], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 67 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram68( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // 
input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 68 OF 123 *** + // Wavefunction(s) for diagram number 68 + FFV1_1( w_fp[2], w_fp[20], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] ); + // Amplitude(s) for diagram number 68 + FFV1_0( w_fp[16], w_fp[23], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 68 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram69( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 69 OF 123 *** + // Wavefunction(s) for diagram number 69 + // (none) + // Amplitude(s) for diagram number 69 + FFV1_0( w_fp[16], w_fp[2], w_fp[22], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 69 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram70( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, 
numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 70 OF 123 *** + // Wavefunction(s) for diagram number 70 + // (none) + // Amplitude(s) for diagram number 70 + FFV1_0( w_fp[16], w_fp[11], w_fp[20], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 70 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram71( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 71 OF 123 *** + // Wavefunction(s) for diagram number 71 + // (none) + // Amplitude(s) for diagram number 71 + FFV1_0( w_fp[3], w_fp[23], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 71 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram72( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 72 OF 123 *** + // Wavefunction(s) for diagram number 72 
+ // (none) + // Amplitude(s) for diagram number 72 + FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 72 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram73( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 73 OF 123 *** + // Wavefunction(s) for diagram number 73 + // (none) + // Amplitude(s) for diagram number 73 + VVV1_0( w_fp[20], w_fp[6], w_fp[7], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 73 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram74( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all 
nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 74 OF 123 *** + // Wavefunction(s) for diagram number 74 + // (none) + // Amplitude(s) for diagram number 74 + VVVV1_0( w_fp[20], w_fp[1], w_fp[7], w_fp[4], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVVV3_0( w_fp[20], w_fp[1], w_fp[7], w_fp[4], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + VVVV4_0( w_fp[20], w_fp[1], w_fp[7], w_fp[4], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram75( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 75 OF 123 *** + // Wavefunction(s) for diagram number 75 + VVV1P0_1( w_fp[20], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[12] ); + // Amplitude(s) for diagram number 75 + VVV1_0( w_fp[7], w_fp[4], w_fp[12], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 75 ) numerators_sv += 
cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram76( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 76 OF 123 *** + // Wavefunction(s) for diagram number 76 + // (none) + // Amplitude(s) for diagram number 76 + VVV1_0( w_fp[1], w_fp[7], w_fp[22], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 76 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram77( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, 
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram77( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 77 OF 123 ***
+    // Wavefunction(s) for diagram number 77
+    // (none)
+    // Amplitude(s) for diagram number 77
+    FFV1_0( w_fp[3], w_fp[11], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 77 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram78( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 78 OF 123 ***
+    // Wavefunction(s) for diagram number 78
+    // (none)
+    // Amplitude(s) for diagram number 78
+    FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 78 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
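Each kernel adds |amp|^2 into numerators_sv only when this event's channelId selects that diagram, and into denominators_sv for every diagram whenever multichannel is enabled. How the two sums are combined happens downstream, outside this diff; a hedged sketch of the usual single-diagram-enhancement (SDE) rescaling, with a hypothetical helper name, is:

  // Hedged sketch, not repo code: combine the accumulated multichannel sums
  // into an SDE weight for one event (sdeRescale is a hypothetical name)
  inline double sdeRescale( double me2, double numerator, double denominator )
  {
    // Illustrative guard for the case where no diagram contributed
    return denominator != 0. ? me2 * numerator / denominator : me2;
  }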
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram79( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 79 OF 123 ***
+    // Wavefunction(s) for diagram number 79
+    // (none)
+    // Amplitude(s) for diagram number 79
+    FFV1_0( w_fp[15], w_fp[2], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 79 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram80( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 80 OF 123 ***
+    // Wavefunction(s) for diagram number 80
+    // (none)
+    // Amplitude(s) for diagram number 80
+    FFV1_0( w_fp[15], w_fp[23], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 80 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram81( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 81 OF 123 ***
+    // Wavefunction(s) for diagram number 81
+    FFV1_1( w_fp[9], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] );
+    // Amplitude(s) for diagram number 81
+    FFV1_0( w_fp[15], w_fp[23], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 81 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram82( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 82 OF 123 ***
+    // Wavefunction(s) for diagram number 82
+    FFV1_2( w_fp[15], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] );
+    // Amplitude(s) for diagram number 82
+    FFV1_0( w_fp[12], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 82 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram83( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 83 OF 123 ***
+    // Wavefunction(s) for diagram number 83
+    // (none)
+    // Amplitude(s) for diagram number 83
+    FFV1_0( w_fp[13], w_fp[23], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 83 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram84( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 84 OF 123 ***
+    // Wavefunction(s) for diagram number 84
+    FFV1_2( w_fp[13], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] );
+    // Amplitude(s) for diagram number 84
+    FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 84 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram85( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 85 OF 123 ***
+    // Wavefunction(s) for diagram number 85
+    // (none)
+    // Amplitude(s) for diagram number 85
+    FFV1_0( w_fp[3], w_fp[23], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 85 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram86( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 86 OF 123 ***
+    // Wavefunction(s) for diagram number 86
+    VVV1P0_1( w_fp[0], w_fp[10], COUPs[0], 1.0, 0., 0., w_fp[23] );
+    // Amplitude(s) for diagram number 86
+    FFV1_0( w_fp[3], w_fp[9], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 86 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram87( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 87 OF 123 ***
+    // Wavefunction(s) for diagram number 87
+    FFV1_2( w_fp[16], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[22] );
+    // Amplitude(s) for diagram number 87
+    FFV1_0( w_fp[22], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 87 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+  }
"diagram_boilerplate.h" + // *** DIAGRAM 88 OF 123 *** + // Wavefunction(s) for diagram number 88 + FFV1_1( w_fp[11], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[20] ); + // Amplitude(s) for diagram number 88 + FFV1_0( w_fp[16], w_fp[20], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 88 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram89( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 89 OF 123 *** + // Wavefunction(s) for diagram number 89 + // (none) + // Amplitude(s) for diagram number 89 + FFV1_0( w_fp[22], w_fp[14], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 89 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram90( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 90 OF 123 *** + // Wavefunction(s) for diagram number 90 + FFV1_1( w_fp[14], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[24] ); + // Amplitude(s) for diagram number 90 + FFV1_0( w_fp[16], w_fp[24], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 90 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= 
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram90( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 90 OF 123 ***
+    // Wavefunction(s) for diagram number 90
+    FFV1_1( w_fp[14], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[24] );
+    // Amplitude(s) for diagram number 90
+    FFV1_0( w_fp[16], w_fp[24], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 90 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram91( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 91 OF 123 ***
+    // Wavefunction(s) for diagram number 91
+    // (none)
+    // Amplitude(s) for diagram number 91
+    FFV1_0( w_fp[22], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 91 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram92( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 92 OF 123 ***
+    // Wavefunction(s) for diagram number 92
+    // (none)
+    // Amplitude(s) for diagram number 92
+    FFV1_0( w_fp[16], w_fp[2], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 92 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+  }
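The jamp updates above go through J_ACCESS::kernelAccessIcol( jamps, icol ), which returns a (possibly SIMD) complex reference for color flow icol inside the buffer declared as jamps[ncolor*2*nevtORneppV]. The real J_ACCESS implementation is not in this diff; a hedged sketch of the index arithmetic that layout implies, with a hypothetical helper name, is:

  // Hypothetical accessor, consistent with jamps[ncolor*2*nevtORneppV] but not
  // necessarily identical to the repo's J_ACCESS: [icol][reim][ievt] ordering
  inline double& jampAccess( double* jamps, int icol, int reim, int ievt, int nevt )
  {
    return jamps[( icol * 2 + reim ) * nevt + ievt];
  }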
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram93( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 93 OF 123 ***
+    // Wavefunction(s) for diagram number 93
+    // (none)
+    // Amplitude(s) for diagram number 93
+    VVVV1_0( w_fp[0], w_fp[6], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVVV3_0( w_fp[0], w_fp[6], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVVV4_0( w_fp[0], w_fp[6], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
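Diagram 93 (like diagrams 100, 107 and 114 below) contains a four-gluon vertex: the three VVVV*_0 calls evaluate its three independent color structures, each overwriting amp_sv[0] and scattering it into a different subset of color flows with +/- i weights, and no multichannel numerator/denominator lines are emitted for these multi-amplitude diagrams. A self-contained scalar analogue of the scatter step (std::complex standing in for cxtype; the helper name and index lists are purely illustrative):

  #include <complex>
  #include <initializer_list>
  #include <vector>
  using cxd = std::complex<double>;
  // Illustration only: add +i*amp to the "plus" flows and -i*amp to the "minus"
  // flows, as each block of kernelAccessIcol updates above does for one amplitude
  void scatterAmp( std::vector<cxd>& jamp, cxd amp, std::initializer_list<int> plus, std::initializer_list<int> minus )
  {
    const cxd I( 0., 1. );
    for( int i : plus ) jamp[i] += I * amp;
    for( int i : minus ) jamp[i] -= I * amp;
  }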
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram94( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 94 OF 123 ***
+    // Wavefunction(s) for diagram number 94
+    VVV1P0_1( w_fp[0], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[22] );
+    // Amplitude(s) for diagram number 94
+    VVV1_0( w_fp[7], w_fp[5], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 94 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram95( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 95 OF 123 ***
+    // Wavefunction(s) for diagram number 95
+    VVV1P0_1( w_fp[0], w_fp[7], COUPs[0], 1.0, 0., 0., w_fp[25] );
+    // Amplitude(s) for diagram number 95
+    VVV1_0( w_fp[6], w_fp[5], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 95 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram96( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 96 OF 123 ***
+    // Wavefunction(s) for diagram number 96
+    // (none)
+    // Amplitude(s) for diagram number 96
+    FFV1_0( w_fp[3], w_fp[14], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 96 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram97( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 97 OF 123 ***
+    // Wavefunction(s) for diagram number 97
+    // (none)
+    // Amplitude(s) for diagram number 97
+    FFV1_0( w_fp[3], w_fp[24], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 97 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
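Every kernel signature above switches its coupling argument by build: the GPU build receives one flat array of per-event dependent couplings, while the vectorized C++ build receives per-coupling pointers already positioned on the current event page. A hedged sketch of the GPU-side indexing that the layout comment couplings[nevt*ndcoup*2] implies (the helper name is hypothetical and fptype stands in for the repo's floating-point typedef):

  using fptype = double; // stand-in for the repo's fptype typedef
  // Hypothetical helper, not repo code: locate one event's dependent couplings
  // inside the flat array couplings[nevt*ndcoup*2] (2 fptypes = re, im per coupling)
  inline const fptype* eventCouplings( const fptype* couplings, int ievt, int ndcoup )
  {
    return couplings + ievt * ndcoup * 2;
  }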
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram98( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 98 OF 123 ***
+    // Wavefunction(s) for diagram number 98
+    // (none)
+    // Amplitude(s) for diagram number 98
+    FFV1_0( w_fp[13], w_fp[2], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 98 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram99( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 99 OF 123 ***
+    // Wavefunction(s) for diagram number 99
+    // (none)
+    // Amplitude(s) for diagram number 99
+    FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 99 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram100( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 100 OF 123 ***
+    // Wavefunction(s) for diagram number 100
+    // (none)
+    // Amplitude(s) for diagram number 100
+    VVVV1_0( w_fp[0], w_fp[18], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVVV3_0( w_fp[0], w_fp[18], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVVV4_0( w_fp[0], w_fp[18], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram101( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 101 OF 123 ***
+    // Wavefunction(s) for diagram number 101
+    VVV1P0_1( w_fp[0], w_fp[18], COUPs[0], 1.0, 0., 0., w_fp[6] );
+    // Amplitude(s) for diagram number 101
+    VVV1_0( w_fp[7], w_fp[4], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 101 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram102( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 102 OF 123 ***
+    // Wavefunction(s) for diagram number 102
+    // (none)
+    // Amplitude(s) for diagram number 102
+    VVV1_0( w_fp[18], w_fp[4], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 102 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram103( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 103 OF 123 ***
+    // Wavefunction(s) for diagram number 103
+    // (none)
+    // Amplitude(s) for diagram number 103
+    FFV1_0( w_fp[3], w_fp[11], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 103 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram104( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 104 OF 123 ***
+    // Wavefunction(s) for diagram number 104
+    // (none)
+    // Amplitude(s) for diagram number 104
+    FFV1_0( w_fp[3], w_fp[20], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 104 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram105( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 105 OF 123 ***
+    // Wavefunction(s) for diagram number 105
+    // (none)
+    // Amplitude(s) for diagram number 105
+    FFV1_0( w_fp[15], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 105 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram106( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 106 OF 123 ***
+    // Wavefunction(s) for diagram number 106
+    // (none)
+    // Amplitude(s) for diagram number 106
+    FFV1_0( w_fp[12], w_fp[2], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 106 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram107( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 107 OF 123 ***
+    // Wavefunction(s) for diagram number 107
+    // (none)
+    // Amplitude(s) for diagram number 107
+    VVVV1_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVVV3_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVVV4_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
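The multichannel sums in the single-amplitude kernels use cxabs2, i.e. the squared modulus |amp|^2 without a square root. Its exact definition lives elsewhere in the plugin (mgOnGpu's complex-type headers); a self-contained scalar equivalent, with std::complex standing in for cxtype and a hypothetical name to avoid clashing with the real one, is:

  #include <complex>
  // Scalar stand-in for cxabs2: |z|^2 = re^2 + im^2, no sqrt needed
  inline double cxabs2_sketch( const std::complex<double>& z )
  {
    return z.real() * z.real() + z.imag() * z.imag();
  }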
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram108( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 108 OF 123 ***
+    // Wavefunction(s) for diagram number 108
+    // (none)
+    // Amplitude(s) for diagram number 108
+    VVV1_0( w_fp[1], w_fp[10], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 108 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram109( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 109 OF 123 ***
+    // Wavefunction(s) for diagram number 109
+    // (none)
+    // Amplitude(s) for diagram number 109
+    VVV1_0( w_fp[1], w_fp[7], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 109 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 110 OF 123 *** + // Wavefunction(s) for diagram number 110 + // (none) + // Amplitude(s) for diagram number 110 + FFV1_0( w_fp[13], w_fp[20], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 110 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram111( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 111 OF 123 *** + // Wavefunction(s) for diagram number 111 + // (none) + // Amplitude(s) for diagram number 111 + FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 111 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram112( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 112 OF 123 *** + // Wavefunction(s) for diagram number 112 + // (none) + // Amplitude(s) for diagram number 112 + FFV1_0( w_fp[15], w_fp[24], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 112 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) 
denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram113( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 113 OF 123 *** + // Wavefunction(s) for diagram number 113 + // (none) + // Amplitude(s) for diagram number 113 + FFV1_0( w_fp[12], w_fp[14], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 113 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram114( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 114 OF 123 *** + // Wavefunction(s) for diagram number 114 + VVVV1P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[12] ); + VVVV3P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[24] ); + VVVV4P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[21] ); + // Amplitude(s) for diagram number 114 + VVV1_0( w_fp[12], w_fp[7], w_fp[5], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[24], w_fp[7], w_fp[5], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[21], w_fp[7], w_fp[5], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram115( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 115 OF 123 *** + // Wavefunction(s) for diagram number 115 + // (none) + // Amplitude(s) for diagram number 115 + FFV1_0( w_fp[3], w_fp[14], w_fp[12], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[14], w_fp[24], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[14], w_fp[21], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + } + + 
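Each generated diagramNNN kernel above follows the same pattern: compute one or more amplitudes from previously filled wavefunctions, optionally accumulate the multichannel numerator and denominator, and scatter the amplitude into a fixed subset of the ncolor color amplitudes, with signs (and a factor i for the triple and quadruple gluon vertices) fixed by the color-flow decomposition. The following is a minimal standalone sketch of that pattern, using std::complex in place of the plugin's vectorized cxtype_sv and J_ACCESS types; all names here are illustrative placeholders, not the plugin's API.

// Sketch only: scatter one diagram's amplitude into the color amplitudes
// ("jamps") and, when multichannel single-diagram enhancement (SDE) is
// active, accumulate |amp|^2 into numerator (own channel) and denominator.
#include <complex>
using cxtype = std::complex<double>;

void accumulateDiagram( cxtype* jamps,                 // [ncolor] color amplitudes, one event
                        double& numerator,             // SDE numerator for this event
                        double& denominator,           // SDE denominator for this event
                        const unsigned int channelId,  // 1..ndiagrams, 0 disables SDE
                        const unsigned int thisDiagram,
                        const cxtype amp,              // amplitude of this diagram
                        const int* icols,              // indices of the affected jamps
                        const int* signs,              // +1/-1 per affected jamp
                        const int n,                   // number of affected jamps
                        const bool timesI )            // true for VVV/VVVV vertices (factor i)
{
  if( channelId == thisDiagram ) numerator += std::norm( amp ); // std::norm = |amp|^2, cxabs2 analogue
  if( channelId != 0 ) denominator += std::norm( amp );
  const cxtype term = timesI ? cxtype( 0, 1 ) * amp : amp;
  for( int k = 0; k < n; k++ )
    jamps[icols[k]] += double( signs[k] ) * term;
}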
//-------------------------------------------------------------------------- + + __global__ void + diagram116( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 116 OF 123 *** + // Wavefunction(s) for diagram number 116 + // (none) + // Amplitude(s) for diagram number 116 + FFV1_0( w_fp[13], w_fp[2], w_fp[12], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + FFV1_0( w_fp[13], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + FFV1_0( w_fp[13], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram117( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 117 OF 123 *** + // Wavefunction(s) for diagram number 117 + VVVV1P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[21] ); + VVVV3P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[13] ); + VVVV4P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[24] ); + // Amplitude(s) for diagram number 117 + VVV1_0( w_fp[21], w_fp[7], w_fp[4], COUPs[0], 1.0, &_fp[0] ); + 
J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[13], w_fp[7], w_fp[4], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[24], w_fp[7], w_fp[4], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram118( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 118 OF 123 *** + // Wavefunction(s) for diagram number 118 + // (none) + // Amplitude(s) for diagram number 118 + FFV1_0( w_fp[3], w_fp[11], w_fp[21], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[11], w_fp[13], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[11], w_fp[24], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram119( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 119 OF 123 *** + // Wavefunction(s) for diagram number 119 + // (none) + // Amplitude(s) for diagram number 119 + FFV1_0( w_fp[15], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + FFV1_0( w_fp[15], w_fp[2], w_fp[13], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + FFV1_0( w_fp[15], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram120( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" 
+ // *** DIAGRAM 120 OF 123 *** + // Wavefunction(s) for diagram number 120 + VVVV1P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[24] ); + VVVV3P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[15] ); + VVVV4P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[13] ); + // Amplitude(s) for diagram number 120 + FFV1_0( w_fp[3], w_fp[9], w_fp[24], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[9], w_fp[15], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[9], w_fp[13], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram121( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 121 OF 123 *** + // Wavefunction(s) for diagram number 121 + // (none) + // Amplitude(s) for diagram number 121 + FFV1_0( w_fp[16], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + FFV1_0( w_fp[16], w_fp[2], w_fp[15], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + FFV1_0( w_fp[16], w_fp[2], w_fp[13], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram122( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // 
input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 122 OF 123 *** + // Wavefunction(s) for diagram number 122 + // (none) + // Amplitude(s) for diagram number 122 + VVV1_0( w_fp[24], w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[15], w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[13], w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram123( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity 
ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 123 OF 123 *** + // Wavefunction(s) for diagram number 123 + // (none) + // Amplitude(s) for diagram number 123 + VVV1_0( w_fp[0], w_fp[17], w_fp[7], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[0], w_fp[19], w_fp[7], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[0], w_fp[8], w_fp[7], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/driver.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/driver.f index f7f23196eb..fc9392238e 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/driver.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. 
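All of the diagramNNN kernels above include "diagram_boilerplate.h", whose contents this diff does not show. Based only on the comment repeated in every kernel, a plausible sketch of the sanity check it expands to is the following; this is an assumption made to illustrate the comment, not the header's actual contents.

// Hypothetical sketch of part of diagram_boilerplate.h (not shown in this
// diff): when multichannel SDE support is compiled out, the uniform kernel
// signature keeps the channelIds/numerators/denominators arguments, and the
// boilerplate merely verifies that callers pass nullptr for all three.
#ifndef MGONGPU_SUPPORTS_MULTICHANNEL
  assert( channelIds == nullptr );   // SDE disabled at build time
  assert( numerators == nullptr );
  assert( denominators == nullptr );
#endif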
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/matrix1.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/matrix1.f index 3ea53d8b21..7b6fa85360 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/matrix1.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -355,7 +355,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -398,7 +398,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(155) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -441,407 +442,81 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /9.481481481481481D+00, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01,1.481481481481481D - $ +00/ - DATA (CF(I, 1),I= 7, 12) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02, - $ -1.851851851851852D-01/ - DATA (CF(I, 1),I= 13, 18) /1.481481481481481D-01, - $ -1.851851851851852D-02,1.481481481481481D+00, - $ -1.851851851851852D-01,1.314814814814815D+00,1.148148148148148D - $ +00/ - DATA (CF(I, 1),I= 19, 24) /-1.851851851851852D-02, - $ -1.851851851851852D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -5.185185185185185D-01/ + DATA DENOM/54/ + DATA (CF(I),I= 1, 24) /512,-128,-128,16,16,160,-128,16,16,-2,-2 + $ ,-20,16,-2,160,-20,142,124,-2,-20,-20,124,124,-56/ C 1 T(1,2,5,6,3,4) - DATA (CF(I, 2),I= 1, 6) /-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01,1.481481481481481D - $ +00,-1.185185185185185D+00,1.481481481481481D-01/ - DATA (CF(I, 2),I= 7, 12) /1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 2),I= 13, 18) /-1.851851851851852D-02, - $ -1.851851851851852D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -5.185185185185185D-01/ - DATA (CF(I, 2),I= 19, 24) /1.481481481481481D-01, - $ -1.851851851851852D-02,1.481481481481481D+00, - $ -1.851851851851852D-01,1.314814814814815D+00,1.148148148148148D - $ +00/ + DATA (CF(I),I= 25, 47) /512,16,160,-128,16,16,-128,-2,-20,16,-2, + $ -2,-20,-20,124,124,-56,16,-2,160,-20,142,124/ C 1 T(1,2,6,5,3,4) - DATA (CF(I, 3),I= 1, 6) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,9.481481481481481D+00, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01/ - DATA (CF(I, 3),I= 7, 12) /1.481481481481481D-01, - $ -1.851851851851852D-02,1.481481481481481D+00, - $ -1.851851851851852D-01,1.314814814814815D+00,1.148148148148148D - $ +00/ - DATA (CF(I, 3),I= 13, 18) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, 
- $ -1.851851851851852D-02,-1.851851851851852D-02, - $ -1.851851851851852D-01/ - DATA (CF(I, 3),I= 19, 24) /-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00/ + DATA (CF(I),I= 48, 69) /512,-128,160,16,16,-2,160,-20,142,124, + $ -128,16,16,-2,-2,-20,-20,-2,124,-56,-20,124/ C 1 T(1,5,2,6,3,4) - DATA (CF(I, 4),I= 1, 6) /1.481481481481481D-01 - $ ,1.481481481481481D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01, - $ -1.185185185185185D+00/ - DATA (CF(I, 4),I= 7, 12) /-1.851851851851852D-02, - $ -1.851851851851852D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -5.185185185185185D-01/ - DATA (CF(I, 4),I= 13, 18) /1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 4),I= 19, 24) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.314814814814815D+00,1.148148148148148D - $ +00,1.481481481481481D+00,-1.851851851851852D-01/ + DATA (CF(I),I= 70, 90) /512,16,-128,-2,-20,-20,124,124,-56,16, + $ -128,-2,-20,16,-2,-2,16,142,124,160,-20/ C 1 T(1,5,6,2,3,4) - DATA (CF(I, 5),I= 1, 6) /1.481481481481481D-01, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01,9.481481481481481D+00,-1.185185185185185D+00/ - DATA (CF(I, 5),I= 7, 12) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.314814814814815D+00,1.148148148148148D - $ +00,1.481481481481481D+00,-1.851851851851852D-01/ - DATA (CF(I, 5),I= 13, 18) /-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00/ - DATA (CF(I, 5),I= 19, 24) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02, - $ -1.851851851851852D-01/ + DATA (CF(I),I= 91,110) /512,-128,-2,16,142,124,160,-20,-20,-2 + $ ,124,-56,-20,124,-128,16,16,-2,-2,-20/ C 1 T(1,6,2,5,3,4) - DATA (CF(I, 6),I= 1, 6) /1.481481481481481D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00/ - DATA (CF(I, 6),I= 7, 12) /-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00/ - DATA (CF(I, 6),I= 13, 18) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.314814814814815D+00,1.148148148148148D - $ +00,1.481481481481481D+00,-1.851851851851852D-01/ - DATA (CF(I, 6),I= 19, 24) /1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02/ + DATA (CF(I),I=111,129) /512,-20,-2,124,-56,-20,124,-2,16,142,124 + $ ,160,-20,16,-128,-2,-20,16,-2/ C 1 T(1,6,5,2,3,4) - DATA (CF(I, 7),I= 1, 6) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02, - $ -1.851851851851852D-01/ - DATA (CF(I, 7),I= 7, 12) /9.481481481481481D+00, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01,1.481481481481481D - $ +00/ - DATA (CF(I, 7),I= 13, 18) /1.481481481481481D+00, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,1.148148148148148D+00,1.314814814814815D - $ +00/ - DATA (CF(I, 7),I= 19, 24) /-1.851851851851852D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ 
-1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00/ + DATA (CF(I),I=130,147) /512,-128,-128,16,16,160,160,-20,16,-2 + $ ,124,142,-20,124,-2,-20,-56,124/ C 1 T(2,1,5,6,3,4) - DATA (CF(I, 8),I= 1, 6) /1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 8),I= 7, 12) /-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01,1.481481481481481D - $ +00,-1.185185185185185D+00,1.481481481481481D-01/ - DATA (CF(I, 8),I= 13, 18) /-1.851851851851852D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00/ - DATA (CF(I, 8),I= 19, 24) /1.481481481481481D+00, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,1.148148148148148D+00,1.314814814814815D - $ +00/ + DATA (CF(I),I=148,164) /512,16,160,-128,16,-20,124,-2,-20,-56 + $ ,124,160,-20,16,-2,124,142/ C 1 T(2,1,6,5,3,4) - DATA (CF(I, 9),I= 1, 6) /1.481481481481481D-01, - $ -1.851851851851852D-02,1.481481481481481D+00, - $ -1.851851851851852D-01,1.314814814814815D+00,1.148148148148148D - $ +00/ - DATA (CF(I, 9),I= 7, 12) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,9.481481481481481D+00, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01/ - DATA (CF(I, 9),I= 13, 18) /1.481481481481481D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 9),I= 19, 24) /1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -1.851851851851852D-01/ + DATA (CF(I),I=165,180) /512,-128,160,16,16,-2,-128,16,-20,-2,124 + $ ,-56,-20,-2,124,-20/ C 1 T(2,5,1,6,3,4) - DATA (CF(I, 10),I= 1, 6) /-1.851851851851852D-02, - $ -1.851851851851852D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -5.185185185185185D-01/ - DATA (CF(I, 10),I= 7, 12) /1.481481481481481D-01 - $ ,1.481481481481481D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01, - $ -1.185185185185185D+00/ - DATA (CF(I, 10),I= 13, 18) /-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ - DATA (CF(I, 10),I= 19, 24) /1.314814814814815D+00 - $ ,1.148148148148148D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01 - $ ,1.481481481481481D+00/ + DATA (CF(I),I=181,195) /512,16,-128,-2,-20,16,-128,-2,16,142,124 + $ ,-2,16,-20,160/ C 1 T(2,5,6,1,3,4) - DATA (CF(I, 11),I= 1, 6) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.314814814814815D+00,1.148148148148148D - $ +00,1.481481481481481D+00,-1.851851851851852D-01/ - DATA (CF(I, 11),I= 7, 12) /1.481481481481481D-01, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01,9.481481481481481D+00,-1.185185185185185D+00/ - DATA (CF(I, 11),I= 13, 18) /1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -1.851851851851852D-01/ - DATA (CF(I, 11),I= 19, 24) /1.481481481481481D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ + DATA (CF(I),I=196,209) /512,-128,124,-56,-20,-2,124,-20,16,-2, + $ -128,16,-20,-2/ C 1 T(2,6,1,5,3,4) - DATA (CF(I, 12),I= 1, 6) /-1.851851851851852D-01, - $ 
-1.851851851851852D-02,1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00/ - DATA (CF(I, 12),I= 7, 12) /1.481481481481481D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00/ - DATA (CF(I, 12),I= 13, 18) /1.314814814814815D+00 - $ ,1.148148148148148D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01 - $ ,1.481481481481481D+00/ - DATA (CF(I, 12),I= 19, 24) /-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ + DATA (CF(I),I=210,222) /512,142,124,-2,16,-20,160,-2,-20,16,-128 + $ ,-2,16/ C 1 T(2,6,5,1,3,4) - DATA (CF(I, 13),I= 1, 6) /1.481481481481481D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 13),I= 7, 12) /1.481481481481481D+00, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,1.148148148148148D+00,1.314814814814815D - $ +00/ - DATA (CF(I, 13),I= 13, 18) /9.481481481481481D+00, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01,1.481481481481481D - $ +00/ - DATA (CF(I, 13),I= 19, 24) /1.148148148148148D+00, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01/ + DATA (CF(I),I=223,234) /512,-128,-128,16,16,160,124,-20,-56,124, + $ -2,-20/ C 1 T(5,1,2,6,3,4) - DATA (CF(I, 14),I= 1, 6) /-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ - DATA (CF(I, 14),I= 7, 12) /-1.851851851851852D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00/ - DATA (CF(I, 14),I= 13, 18) /-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01,1.481481481481481D - $ +00,-1.185185185185185D+00,1.481481481481481D-01/ - DATA (CF(I, 14),I= 19, 24) /-1.851851851851852D-01 - $ ,1.481481481481481D+00,1.148148148148148D+00,1.314814814814815D - $ +00,1.481481481481481D-01,-1.851851851851852D-02/ + DATA (CF(I),I=235,245) /512,16,160,-128,16,-20,160,124,142,16,-2/ C 1 T(5,1,6,2,3,4) - DATA (CF(I, 15),I= 1, 6) /1.481481481481481D+00, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,1.148148148148148D+00,1.314814814814815D - $ +00/ - DATA (CF(I, 15),I= 7, 12) /1.481481481481481D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 15),I= 13, 18) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,9.481481481481481D+00, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01/ - DATA (CF(I, 15),I= 19, 24) /-5.185185185185185D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -1.851851851851852D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ + DATA (CF(I),I=246,255) /512,-128,160,16,-56,124,124,-20,-20,-2/ C 1 T(5,2,1,6,3,4) - DATA (CF(I, 16),I= 1, 6) /-1.851851851851852D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00/ - DATA (CF(I, 16),I= 7, 12) /-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ - DATA 
(CF(I, 16),I= 13, 18) /1.481481481481481D-01 - $ ,1.481481481481481D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01, - $ -1.185185185185185D+00/ - DATA (CF(I, 16),I= 19, 24) /1.148148148148148D+00 - $ ,1.314814814814815D+00,-1.851851851851852D-01 - $ ,1.481481481481481D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ + DATA (CF(I),I=256,264) /512,16,-128,124,142,-20,160,-2,16/ C 1 T(5,2,6,1,3,4) - DATA (CF(I, 17),I= 1, 6) /1.314814814814815D+00 - $ ,1.148148148148148D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01 - $ ,1.481481481481481D+00/ - DATA (CF(I, 17),I= 7, 12) /1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -1.851851851851852D-01/ - DATA (CF(I, 17),I= 13, 18) /1.481481481481481D-01, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01,9.481481481481481D+00,-1.185185185185185D+00/ - DATA (CF(I, 17),I= 19, 24) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01/ + DATA (CF(I),I=265,272) /512,-128,-2,16,-20,-2,-128,16/ C 1 T(5,6,1,2,3,4) - DATA (CF(I, 18),I= 1, 6) /1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -1.851851851851852D-01/ - DATA (CF(I, 18),I= 7, 12) /1.314814814814815D+00 - $ ,1.148148148148148D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01 - $ ,1.481481481481481D+00/ - DATA (CF(I, 18),I= 13, 18) /1.481481481481481D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00/ - DATA (CF(I, 18),I= 19, 24) /-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00/ + DATA (CF(I),I=273,279) /512,-20,-2,-2,16,16,-128/ C 1 T(5,6,2,1,3,4) - DATA (CF(I, 19),I= 1, 6) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01/ - DATA (CF(I, 19),I= 7, 12) /-1.851851851851852D-01 - $ ,1.481481481481481D+00,1.148148148148148D+00,1.314814814814815D - $ +00,1.481481481481481D-01,-1.851851851851852D-02/ - DATA (CF(I, 19),I= 13, 18) /1.148148148148148D+00, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01/ - DATA (CF(I, 19),I= 19, 24) /9.481481481481481D+00, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01,1.481481481481481D - $ +00/ + DATA (CF(I),I=280,285) /512,-128,-128,16,16,160/ C 1 T(6,1,2,5,3,4) - DATA (CF(I, 20),I= 1, 6) /-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00/ - DATA (CF(I, 20),I= 7, 12) /1.148148148148148D+00, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01/ - DATA (CF(I, 20),I= 13, 18) /-1.851851851851852D-01 - $ ,1.481481481481481D+00,1.148148148148148D+00,1.314814814814815D - $ +00,1.481481481481481D-01,-1.851851851851852D-02/ - DATA (CF(I, 20),I= 19, 24) /-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01,1.481481481481481D - $ +00,-1.185185185185185D+00,1.481481481481481D-01/ + DATA (CF(I),I=286,290) 
/512,16,160,-128,16/ C 1 T(6,1,5,2,3,4) - DATA (CF(I, 21),I= 1, 6) /-1.851851851851852D-01 - $ ,1.481481481481481D+00,1.148148148148148D+00,1.314814814814815D - $ +00,1.481481481481481D-01,-1.851851851851852D-02/ - DATA (CF(I, 21),I= 7, 12) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01/ - DATA (CF(I, 21),I= 13, 18) /-5.185185185185185D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -1.851851851851852D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 21),I= 19, 24) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,9.481481481481481D+00, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01/ + DATA (CF(I),I=291,294) /512,-128,160,16/ C 1 T(6,2,1,5,3,4) - DATA (CF(I, 22),I= 1, 6) /1.148148148148148D+00, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01/ - DATA (CF(I, 22),I= 7, 12) /-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00/ - DATA (CF(I, 22),I= 13, 18) /1.148148148148148D+00 - $ ,1.314814814814815D+00,-1.851851851851852D-01 - $ ,1.481481481481481D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ - DATA (CF(I, 22),I= 19, 24) /1.481481481481481D-01 - $ ,1.481481481481481D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01, - $ -1.185185185185185D+00/ + DATA (CF(I),I=295,297) /512,16,-128/ C 1 T(6,2,5,1,3,4) - DATA (CF(I, 23),I= 1, 6) /1.148148148148148D+00 - $ ,1.314814814814815D+00,-1.851851851851852D-01 - $ ,1.481481481481481D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ - DATA (CF(I, 23),I= 7, 12) /-5.185185185185185D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -1.851851851851852D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 23),I= 13, 18) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01/ - DATA (CF(I, 23),I= 19, 24) /1.481481481481481D-01, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01,9.481481481481481D+00,-1.185185185185185D+00/ + DATA (CF(I),I=298,299) /512,-128/ C 1 T(6,5,1,2,3,4) - DATA (CF(I, 24),I= 1, 6) /-5.185185185185185D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -1.851851851851852D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 24),I= 7, 12) /1.148148148148148D+00 - $ ,1.314814814814815D+00,-1.851851851851852D-01 - $ ,1.481481481481481D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ - DATA (CF(I, 24),I= 13, 18) /-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00/ - DATA (CF(I, 24),I= 19, 24) /1.481481481481481D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00/ + DATA (CF(I),I=300,300) /512/ C 1 T(6,5,2,1,3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. 
- IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -1547,10 +1222,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -1559,6 +1236,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(2)=AMP2(2)+AMP(4)*DCONJG(AMP(4)) diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/addmothers.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/addmothers.f index 9a31ed201d..6f32477b9e 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/addmothers.f @@ -21,7 +21,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, integer icol ! color selected integer isym(nexternal,99), jsym - integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,nc,ic + integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,ic integer mo_color,da_color(2),itmp integer ito(-nexternal+3:nexternal),iseed,maxcolor,maxorg integer icolalt(2,-nexternal+2:2*nexternal-3) @@ -113,14 +113,15 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif lconfig = vec_igraph1(ivec) endif - + is_LC=.true. + maxcolor=0 c c Choose a color flow which is certain to work with the propagator c structure of the chosen diagram and use that as an alternative c if (icol.eq.0) then do i=1,nexternal - icolalt(1,i)=0 + icolalt(1,i)=0 icolalt(2,i)=0 enddo else diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cluster.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cluster.f index b8995283ed..907894ea89 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cluster.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cluster.f @@ -556,6 +556,8 @@ logical function cluster(p, ivec) jwin = 0 cluster=.false. clustered=.false. + iwin =0 + jwin =0 do i=0,3 pcmsp(i)=0 enddo @@ -665,8 +667,11 @@ logical function cluster(p, ivec) c initialize graph storage igraphs(0)=0 nleft=nexternal -c cluster - if (iwin.eq.0.or.jwin.eq.0) stop 21 + if(iwin.eq.0.or.jwin.eq.0)then + cluster=.false. + return + endif +c cluster do n=1,nexternal-2 c combine winner imocl(n)=imap(iwin,2)+imap(jwin,2) diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/color_sum.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/color_sum.h new file mode 100644 index 0000000000..71b5b1eaad --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/color_sum.h @@ -0,0 +1,105 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
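The new color_sum.h below centralizes on the C++/GPU side the same color algebra that the matrix1.f hunk above restructures on the Fortran side. There, the dense REAL*8 CF(NCOLOR,NCOLOR) color matrix is replaced by its integer upper triangle CF(NCOLOR*(NCOLOR+1)/2) plus a common denominator DENOM=54: the matrix is symmetric, the off-diagonal entries are stored pre-doubled (for example the packed -128 corresponds to the old -1.185185... = -64/54), so a single triangular pass with a real part recovers the full quadratic form, and the division by DENOM happens once at the end. A minimal C++ sketch of the same traversal follows; it is illustrative only, mirroring the new Fortran CF_INDEX loop, and is not the generated code.

// Packed triangular color sum: cf holds the upper triangle of the symmetric
// color matrix row by row (diagonal entry first), with off-diagonal entries
// already doubled; denom is the common integer denominator factored out.
#include <complex>
#include <vector>

double colorSumPacked( const std::vector<int>& cf,                      // ncolor*(ncolor+1)/2 packed entries
                       const std::vector<std::complex<double>>& jamp,   // ncolor color amplitudes
                       const int ncolor,
                       const int denom )
{
  double me2 = 0;
  int idx = 0; // walks the packed triangle exactly like CF_INDEX in matrix1.f
  for( int i = 0; i < ncolor; i++ )
  {
    std::complex<double> ztemp( 0, 0 );
    for( int j = i; j < ncolor; j++ ) ztemp += double( cf[idx++] ) * jamp[j];
    me2 += std::real( ztemp * std::conj( jamp[i] ) ); // real part, as MATRIX1 (a REAL*8) keeps
  }
  return me2 / denom; // MATRIX1 = MATRIX1/DENOM
}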
+ +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype_ref( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + static __device__ inline const cxtype + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + }; +#else + class HostAccessJamp + { + public: + static inline cxtype_sv& + kernelAccessIcol( cxtype_sv* buffer, const int icol ) + { + return buffer[icol]; + } + static inline cxtype_sv& + kernelAccessIcol( fptype* buffer, const int icol ) + { + return reinterpret_cast<cxtype_sv*>( buffer )[icol]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + 
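The "old"/"new1"/"new2" comments in DeviceAccessJamp above describe three layouts of the same (icol, ievt, real/imag) triple in the flat jamps buffer. "new1" keeps the real and imaginary parts as two contiguous ncolor-by-nevt matrices with the event index fastest, which is what lets the color sum be handed to cuBLAS/hipBLAS as plain real matrices. A small host-side sketch of the three index maps, mirroring the device code above (illustrative helper names, not part of the plugin):

// Flat offsets for one (icol, ievt, part) entry, part = 0 (real) or 1 (imag).
inline int offsetOld( int icol, int ievt, int part, int /*ncolor*/, int nevt )
{
  return icol * 2 * nevt + part * nevt + ievt;      // per-color [re|im] blocks, ievt last
}
inline int offsetNew1( int icol, int ievt, int part, int ncolor, int nevt )
{
  return part * ncolor * nevt + icol * nevt + ievt; // re/im planes of ncolor x nevt, ievt last
}
inline int offsetNew2( int icol, int ievt, int part, int ncolor, int nevt )
{
  return part * nevt * ncolor + ievt * ncolor + icol; // re/im planes, icol last
}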
//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cuts.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cuts.f index 7898714201..bd50ab1357 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cuts.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cuts.f @@ -307,12 +307,18 @@ LOGICAL FUNCTION PASSCUTS(P, VECSIZE_USED) c c Limit S_hat c - if (dsqrt_shat.ne.0d0)then - if (nincoming.eq.2.and.sumdot(p(0,1),p(0,2),1d0) .lt. dsqrt_shat**2) then - passcuts=.false. - return - endif - endif + if(nincoming.eq.2) then + if (dsqrt_shat.ne.0d0.or.dsqrt_shatmax.ne.-1d0)then + xvar = sumdot(p(0,1),p(0,2),1d0) + if (xvar .lt. dsqrt_shat**2)then + passcuts=.false. 
+ return + else if (dsqrt_shatmax.ne.-1d0 .and. xvar .gt. dsqrt_shatmax**2)then + passcuts = .false. + return + endif + endif + endif C $B$ DESACTIVATE_CUT $E$ !This is a tag for MadWeight if(debug) write (*,*) '=============================' diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/diagram_boilerplate.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/diagram_boilerplate.h new file mode 100644 index 0000000000..96a34fb1bf --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/diagram_boilerplate.h @@ -0,0 +1,103 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin. + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-local-typedefs" // for CI_ACCESS and CD_ACCESS + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + + //------------- + // GPU only + //------------- + + //using namespace mg5amcGpu; + using W_ACCESS = DeviceAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = DeviceAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( channelIds ); +#endif + + // Wavefunctions + // Buffer wfs for one helicity and nevt events is a DeviceBufferSimple with ( nwf * nevt * nw6 * nx2 ) fptypes + // The striding between the nwf wavefunction buffers is ( nevt * nw6 * nx2 ) fptypes + // Internally diagramXXX methods pass a w_fp[iwf] to ixx/FFV methods (as argument 'fptype wavefunctions[]') + // Internally ixx/FFV methods call 'cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions )' and then use fi[iw6] + // This means that the fi pointer must point to a [RIRIRIRIRIRI] contiguous buffer of size nw6*nx2=12 + // The striding between events is nw6*nx2=12 and this is what W_ACCESS::kernelAccess must respect + // (En passant, note that this means that events cannot be contiguous in the present code, memory is not coalesced) + const int nevt = gridDim.x * blockDim.x; + fptype* w_fp[nwf]; + for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = wfs + iwf * nevt * nw6 * mgOnGpu::nx2; + + // Couplings + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic push +#pragma nv_diag_suppress 186 // e.g. 
<<warning #186-D: pointless comparison of unsigned integer with zero>> +#endif + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic pop +#endif + const fptype* COUPs[nxcoup]; + for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; + +#else + + //------------- + // C++ only + //------------- + + //using namespace mg5amcCpu; + using W_ACCESS = HostAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = HostAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current SIMD event page (C++) + unsigned int channelId = *channelIds; +#endif + + // Wavefunctions + // Reinterpret wfs as "cxtype_sv w_sv[nwf][nw6]" and build "fptype* w_fp[nwf]" where "w_fp[iwf] = (fptype*)( w_sv[iwf] )" + fptype (*w_fp)[nw6 * neppV * mgOnGpu::nx2] = (fptype (*)[nw6 * neppV * mgOnGpu::nx2])(wfs); + +#endif + + //------------- + // GPU or C++ + //------------- + + // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) + cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram + fptype* amp_fp; // proof of concept for using fptype* in the interface + amp_fp = reinterpret_cast<fptype*>( amp_sv ); + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) + fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); +#else + assert( channelIds == nullptr ); + assert( numerators == nullptr ); + assert( denominators == nullptr ); +#endif /* clang-format on */ + +#pragma GCC diagnostic pop diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/genps.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/genps.f index 1c32e93f5d..8503bdbec8 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/genps.f @@ -1373,6 +1373,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) double precision smin,smax,spole,swidth,s,jac double precision x logical pass + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' c c Local c @@ -1384,6 +1388,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1d0)then + smax = min(smax, dsqrt_shatmax**2) + endif + pass=.true. if (jac .eq. 0 .and. .not.
warned0) then print*,'Input jacobian 0 in genps' @@ -1628,7 +1636,10 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) DOUBLE PRECISION ETA,ETAMIN,ETAMAX logical warned data warned/.false./ - + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' C------------ C BEGIN CODE C------------ @@ -1645,7 +1656,11 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) C IF THERE IS NO S CHANNEL POLE USE BELOW: TAUMIN = 0d0 !SMIN/S !keep scale fix - TAUMAX = 1D0 + if (dsqrt_shatmax.ne.-1d0)then + TAUMAX=dsqrt_shatmax**2/S + else + TAUMAX = 1D0 + endif TAU = (TAUMAX-TAUMIN)*X(1)+TAUMIN SJACOBI= sjacobi*(TAUMAX-TAUMIN) @@ -1915,7 +1930,7 @@ double precision function get_channel_cut(p, config) if(sde_strat.eq.2)then t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) - get_channel_cut = get_channel_cut / ((t-Mass)*(t+Mass)+stot*1d-10)**2 + get_channel_cut = get_channel_cut / (t-Mass**2+stot*1d-10)**2 endif c write(*,*) i, "t, Mass, fact", t, Mass, ((t-Mass)*(t+Mass))**2,get_channel_cut t = t/stot @@ -1930,9 +1945,9 @@ double precision function get_channel_cut(p, config) t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) Width = prwidth(-i, config) - tmp = (t-Mass)*(t+Mass) + tmp = (t-Mass**2) tmp2 = Mass*Width - get_channel_cut = get_channel_cut* (tmp**2 - tmp2**2)/(tmp**2 + tmp2**2)**2 + get_channel_cut = get_channel_cut/(tmp**2 + tmp2**2) endif c write(*,*) i, "s, Mass, Width, fact", t, Mass, Width, (((t-Mass)*(t+Mass) )**2 + Width**2*Mass**2), get_channel_cut endif diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/myamp.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/myamp.f index 9e5f8d44dd..5360566ef4 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/myamp.f @@ -231,6 +231,7 @@ subroutine set_peaks double precision x1,x2,xk(nexternal) double precision dr,mtot,etot,xqfact double precision spmass + double precision stot ! technically the min with dsqrt_shatmax**2 with the physical one integer i, iconfig, l1, l2, j, nt, nbw, iproc, k integer iden_part(-nexternal+1:nexternal) @@ -285,8 +286,8 @@ subroutine set_peaks integer lbw(0:nexternal) !Use of B.W. common /to_BW/ lbw - double precision stot,m1,m2 - common/to_stot/stot,m1,m2 + double precision real_stot,m1,m2 + common/to_stot/real_stot,m1,m2 include 'coupl.inc' ! 
needs VECSIZE_MEMMAX (defined in vector.inc) include 'cuts.inc' @@ -309,6 +310,12 @@ subroutine set_peaks c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1)then + stot = min(real_stot, dsqrt_shatmax**2) + else + stot = real_stot + endif + iconfig = this_config c needs to be initialise to avoid segfault do i = -nexternal,-1 diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/reweight.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/reweight.f index 0a0bafa7c1..189ba41449 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/reweight.f @@ -1186,7 +1186,7 @@ logical function setclscales(p, keepq2bck, ivec) if(nexternal.gt.3) pt2ijcl(nexternal-3)=q2fact(2) else if(.not.fixed_fac_scale1) q2fact(1)=scalefact**2*pt2ijcl(nexternal-2) - if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*q2fact(1) + if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*pt2ijcl(nexternal-2) endif elseif(jcentral(1).eq.0)then if(.not.fixed_fac_scale1) q2fact(1) = scalefact**2*pt2ijcl(jfirst(1)) diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc index 4eec5db13c..216a90a302 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/symmetry.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/symmetry.f index 309540a0a2..2afd9e9f75 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/symmetry.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/symmetry.f @@ -232,7 +232,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, c write(*,*) 'mapping',ic,mapconfig(i),icode if (icode .eq. 
0) then c Create format string based on number of digits - write(formstr,'(a,i1,a)') '(I',nconf,'$)' + write(formstr,'(a,i1,a)') '(I',nconf,',$)' write(*,formstr) mapconfig(i) c Write symmetry factors write(formstr2,'(a,i2,a)') '(2i',nsym,')' @@ -242,10 +242,10 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode if(nconf+ncode+1.lt.10) then write(formstr,'(a,i1,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' else write(formstr,'(a,i2,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' endif write(*,formstr) dconfig c Write symmetry factors @@ -260,7 +260,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode write(27,formstr2) dconfig,use_config(i) endif - write(*,'(a$)') ' ' + write(*,'(a,$)') ' ' 100 call bw_increment_array(iarray,imax,ibase,done) enddo else diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/unwgt.f b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/unwgt.f index f602511c94..d1247f1849 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/unwgt.f +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/unwgt.f @@ -497,6 +497,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer ip, np, ic, nc integer ida(2),ito(-nexternal+3:nexternal),ns,nres,ires,icloop integer iseed + double precision beam_mass double precision pboost(0:3) double precision beta, get_betaz double precision ebi(0:3), ebo(0:3) @@ -506,7 +507,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer idup(nexternal,maxproc,maxsproc) integer mothup(2,nexternal) integer icolup(2,nexternal,maxflow,maxsproc) - + double precision eta integer nsym integer ievent @@ -638,21 +639,20 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) if (nincoming.eq.2) then if (xbk(1) .gt. 0d0 .and. xbk(1) .le. 1d0 .and. $ xbk(2) .gt. 0d0 .and. xbk(2) .le. 1d0) then - if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0).and.xbk(2).ne.1d0) then - ! construct the beam momenta in each frame and compute the related (z)boost - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4).and.ebeam(1).gt.10d0*m1)then - local_mass = 0d0 - else - local_mass = m1 - endif + if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0)) then + if((abs(lpp(1)).gt.2.and.abs(lpp(1)).ne.9).or.xbk(1).eq.1d0)then + beam_mass = pmass(1) + else + beam_mass = m1 + endif ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(1) ebo(1) = 0 ebo(2) = 0 - ebo(3) = DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(1).eq.1d0) then pb(0,isym(1,jsym)) = ebo(0) @@ -668,20 +668,19 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo else - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4.and.ebeam(2).gt.10d0*m2))then - local_mass = 0d0 - else - local_mass = m2 - endif - ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam + if((abs(lpp(2)).gt.2.and.abs(lpp(2)).ne.9).or.xbk(2).eq.1d0)then + beam_mass = pmass(2) + else + beam_mass = m2 + endif ebi(0) = p(0,2)/xbk(2) ! 
this assumes that particle 2 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = -1d0*DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = -1d0*DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(2) ebo(1) = 0 ebo(2) = 0 - ebo(3) = -1d0*DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = -1d0*DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(2).eq.1d0) then pb(0,isym(2,jsym)) = ebo(0) @@ -701,6 +700,21 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) write(*,*) 'Warning bad x1 or x2 in write_leshouche', $ xbk(1),xbk(2) endif + do j=1,nexternal + call zboost_with_beta(p(0,j),beta,pb(0,isym(j,jsym))) + pb(4,isym(j,jsym))=pmass(j) + enddo + + ! check for numerical_accuracy + if (pb(0,1).gt.ebeam(1).or.pb(0,2).gt.ebeam(2))then + ! go back to old method --more accurate when boosting with xbk close to one-- + eta = sqrt(xbk(1)*ebeam(1)/(xbk(2)*ebeam(2))) + pboost(0)=p(0,1)*(eta + 1d0/eta) + pboost(3)=p(0,1)*(eta - 1d0/eta) + do j=1,nexternal + call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) + enddo + endif else do j=1,nexternal call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) @@ -709,6 +723,8 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo endif + + if (IMIRROR.eq.2.and.pmass(1).ne.pmass(2)) then c Note that in this context isym(1,jsym) should never be "2" since the mass differ pb(4,isym(1,jsym))=pmass(2) diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/Gridpack/gridrun b/epochX/cudacpp/gg_ttgg.mad/bin/internal/Gridpack/gridrun index 8c8f7d3940..01d4ab53f5 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/Gridpack/gridrun +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/Gridpack/gridrun @@ -91,7 +91,7 @@ import internal.madevent_interface as cmd_interface try: - cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2]) + cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2], nprocs=args[3], maxevts=args[4]) except KeyboardInterrupt: print('Quit on KeyboardInterrupt') diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/Gridpack/run.sh b/epochX/cudacpp/gg_ttgg.mad/bin/internal/Gridpack/run.sh index 20adf572c2..2d149f96be 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/Gridpack/run.sh +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/Gridpack/run.sh @@ -14,6 +14,18 @@ # USAGE : run [num_events] [iseed] ## ############################################################################# +function usage() { + local retcode="${1:-1}" # default return code is 1 + echo "Usage:" + echo " run.sh [options] [num events] [seed]" + echo " run.sh [options] [num events] [seed] [granularity]" + echo "Options:" + echo " -h, --help print this message and exit" + echo " -p, --parallel [num procs] number of processes to run in parallel" + echo " -m, --maxevts [num events] maximum number of unweighted events per job" + exit $retcode +} + if [[ -d ./madevent ]]; then DIR='./madevent' else @@ -32,23 +44,46 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib # For Mac OS X export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib +pos_args=() +nprocs=1 +maxevts=2500 -if [[ ($1 != "") && ("$2" != "") && ("$3" == "") ]]; then - num_events=$1 - seed=$2 - gran=1 -elif [[ ($1 != "") && ("$2" != "") && ("$3" != "") ]]; then - num_events=$1 - seed=$2 - gran=$3 -else - echo "Warning: input is not correct. 
script requires two arguments: NB_EVENT SEED" -fi +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage 0 ;; + -p|--parallel) + nprocs="$2" && shift && shift ;; + -m|--maxevts) + maxevts="$2" && shift && shift ;; + -*) + echo "Error: Unknown option $1" && usage ;; + *) + pos_args+=("$1") && shift ;; + esac +done + +case `echo "${pos_args[@]}" | wc -w | tr -d " "` in + "2") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=1 + ;; + "3") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=${pos_args[2]} + ;; + *) + echo "Error: number of arguments is not correct" + usage + ;; +esac -echo "Now generating $num_events events with random seed $seed and granularity $gran" +echo "Now generating $num_events events with random seed $seed and granularity $gran using $nprocs processes" ############ RUN THE PYTHON CODE ##################### -${DIR}/bin/gridrun $num_events $seed $gran +${DIR}/bin/gridrun $num_events $seed $gran $nprocs $maxevts ######################################################## ########### POSTPROCESSING ##################### diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py index 42d82818d0..2efb5954a6 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py @@ -353,7 +353,7 @@ def modify_init_cross(self, cross, allow_zero=False): assert "init" in self cross = dict(cross) - for key in cross.keys(): + for key in list(cross.keys()): if isinstance(key, str) and key.isdigit() and int(key) not in cross: cross[int(key)] = cross[key] @@ -1991,6 +1991,11 @@ def default_setup(self): self.add_param("PartonLevel:FSRinResonances", True, hidden=True, always_write_to_card=False, comment="Do not allow shower to run from decay product of unstable particle") self.add_param("ProcessLevel:resonanceDecays", True, hidden=True, always_write_to_card=False, comment="Do not allow unstable particle to decay.") + # Parameters only needed for main164 type of run (not pythia8/MG5 interface) + self.add_param("Main:HepMC", True, hidden=True, always_write_to_card=False, + comment="""Specify the type of output to be used by the main164 run. """) + self.add_param("HepMC:output", 'hepmc.gz', hidden=True, always_write_to_card=False, + comment="Specify the HepMC output file to be used by the main164 run.") # Add parameters controlling the subruns execution flow. # These parameters should not be part of PY8SubRun daughter. self.add_default_subruns('parameters') @@ -2087,8 +2092,10 @@ def MadGraphSet(self, name, value, **opts): force = False if name.lower() not in self or (force or name.lower() not in self.user_set): self.__setitem__(name, value, change_userdefine=False, **opts) - self.system_set.add(name.lower()) - + self.system_set.add(name.lower()) + else: + raise Exception("The parameter %s is already set to %s. You can not change it." 
% (name, self[name])) + def defaultSet(self, name, value, **opts): self.__setitem__(name, value, change_userdefine=False, **opts) @@ -2144,9 +2151,19 @@ def pythia8_formatting(value, formatv=None): else: return ','.join([PY8Card.pythia8_formatting(arg) for arg in value]) + #change of name convention between MG5 old interface and main164 from Pythia8 + interface_to_164 = {'HEPMCoutput:file': 'HepMC:output', + 'SysCalc:fullCutVariation': '!SysCalc:fullCutVariation (not supported with 164)', + 'SysCalc:qCutList': '!SysCalc:qCutList (not supported with 164)', + 'SysCalc:qWeed': '!SysCalc:qWeed (not supported with 164)', + 'SysCalc:tmsList': '!SysCalc:tmsList (not supported with 164)', + 'HEPMCoutput:scaling' : '!HEPMCoutput :scaling (not supported with 164)', + 'LHEFInputs:nSubruns' : 'Main:numberOfSubruns'} + def write(self, output_file, template, read_subrun=False, - print_only_visible=False, direct_pythia_input=False, add_missing=True): + print_only_visible=False, direct_pythia_input=False, add_missing=True, + use_mg5amc_py8_interface=False): """ Write the card to output_file using a specific template. > 'print_only_visible' specifies whether or not the hidden parameters should be written out if they are in the hidden_params_to_always_write @@ -2155,7 +2172,12 @@ def write(self, output_file, template, read_subrun=False, in the self.visible_params_to_always_write list and are not user_set or system_set are commented. > If 'add_missing' is False then parameters that should be written_out but are absent - from the template will not be written out.""" + from the template will not be written out. + > use_mg5amc_py8_interface is a flag to indicate that the MG5aMC-PY8 interface is used or not + if not used some parameters need to be translated from the old convention to the new one + """ + + self.use_mg5amc_py8_interface = use_mg5amc_py8_interface # First list the visible parameters visible_param = [p for p in self if p.lower() not in self.hidden_param @@ -2297,7 +2319,16 @@ def group_params(params): else: # Just copy parameters which don't need to be specified if param.lower() not in self.params_to_never_write: - output.write(line) + + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param.strip()] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + output.write('%s=%s\n'%(param_entry,new_value)) + else: + output.write(line) else: output.write('! The following parameter was forced to be commented out by MG5aMC.\n') output.write('! 
%s'%line) @@ -2313,6 +2344,7 @@ def group_params(params): if ((not direct_pythia_input) or (param.lower() in self.visible_params_to_always_write) or (param.lower() in self.user_set) or + (param.lower() in self.hidden_params_to_always_write) or (param.lower() in self.system_set)): template = '%s=%s' else: @@ -2321,6 +2353,19 @@ def group_params(params): # then they shouldn't be passed to Pythia template = '!%s=%s' + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + if 'Main:InternalAnalysis'.lower() in self.user_set and \ + self['Main:InternalAnalysis'].lower() == 'on': + output.write('InternalAnalysis:output = ./djrs.dat\n') + + #elif param in self.interface_to_164.values() and not direct_pythia_input: + # misc.sprint(use_mg5amc_py8_interface, direct_pythia_input,param) + # raise Exception('The parameter %s is not supported in the MG5aMC-PY8 interface. Please use the new interface.'%param_entry output.write(template%(param_entry, value_entry.replace(value,new_value))) @@ -2365,6 +2410,8 @@ def group_params(params): comment = '\n'.join('! %s'%c for c in self.comments[param.lower()].split('\n')) output.write(comment+'\n') + if not use_mg5amc_py8_interface and param in self.interface_to_164: + continue output.write('%s=%s\n'%(param,PY8Card.pythia8_formatting(self[param]))) # Don't close the file if we were reading a subrun, but simply write @@ -3318,7 +3365,6 @@ def retro_compatible_custom_fct(lines, mode=None): for i,line in enumerate(lines[:]): if search and re.search(include_pat, line): name = re.findall(include_pat, line)[0] - misc.sprint('DETECTED INCLUDE', name) if 'vector.inc' in name: search = False if 'run.inc' in name: @@ -3326,7 +3372,6 @@ def retro_compatible_custom_fct(lines, mode=None): search = False sol.append(line) if re.search(function_pat, line): - misc.sprint("DETECTED FCT") search = True return sol @@ -4050,8 +4095,8 @@ def post_set_fixed_fac_scale(card, value, change_userdefine, raiseerror, **opt): if 'fixed_fac_scale2' in card.user_set: card.user_set.remove('fixed_fac_scale2') - # #card['pdlabel1'] = value - # #card['pdlabel2'] = value + dict.__setitem__(card, 'fixed_fac_scale1', card['fixed_fac_scale']) + dict.__setitem__(card, 'fixed_fac_scale2', card['fixed_fac_scale']) @staticmethod def post_set(card, value, change_userdefine, raiseerror, name='unknown', **opt): @@ -4201,6 +4246,7 @@ def default_setup(self): self.add_param("bwcutoff", 15.0) self.add_param("cut_decays", False, cut='d') self.add_param('dsqrt_shat',0., cut=True) + self.add_param('dsqrt_shatmax', -1, cut=True) self.add_param("nhel", 0, include=False) self.add_param("limhel", 1e-8, hidden=True, comment="threshold to determine if an helicity contributes when not MC over helicity.") #pt cut @@ -4451,11 +4497,11 @@ def check_validity(self): time.sleep(5) if self['drjj'] != 0: if 'drjj' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjj\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjj\' to 0') self['drjj'] = 0 if self['drjl'] != 0: if 'drjl' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjl\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjl\' to 0') self['drjl'] = 0 if not self['auto_ptj_mjj']: if self['mmjj'] > self['xqcut']: @@ -4753,7 +4799,6 @@ def create_default_for_process(self, 
proc_characteristic, history, proc_def): self['fixed_fac_scale1'] = True self['nhel'] = 1 for i in beam_id_split[1]: - exit if abs(i) == 11: self['lpp1'] = -math.copysign(3,i) self['lpp2'] = math.copysign(3,i) @@ -5577,6 +5622,9 @@ def default_setup(self): #technical self.add_param('folding', [1,1,1], include=False) + + #bias + self.add_param('flavour_bias',[5,1], hidden=True, comment="Example: '5,100' means that the probability to generate an event with a bottom (or anti-bottom) quark is increased by a factor 100, but the weight of those events is reduced by a factor 100. Requires that the 'event_norm' is set to 'bias'.") #merging self.add_param('ickkw', 0, allowed=[-1,0,3,4], comment=" - 0: No merging\n - 3: FxFx Merging : http://amcatnlo.cern.ch/FxFx_merging.htm\n - 4: UNLOPS merging (No interface within MG5aMC)\n - -1: NNLL+NLO jet-veto computation. See arxiv:1412.8408 [hep-ph]") @@ -5790,6 +5838,17 @@ def check_validity(self): if self['mcatnlo_delta'] and not self['parton_shower'].lower() == 'pythia8': raise InvalidRunCard("MC@NLO-DELTA only possible with matching to Pythia8") + # check that the flavour_bias is consistent + if len(self['flavour_bias']) != 2: + raise InvalidRunCard("'flavour_bias' should contain exactly two numbers: the abs(PDG) of the flavour to enhance, and the enhancement multiplication factor.") + for i in self['flavour_bias']: + if i < 0: + raise InvalidRunCard("flavour and multiplication factor should be positive in the flavour_bias parameter") + if self['flavour_bias'][1] != 1 and self['event_norm'] != 'bias': + logger.warning('Non-trivial flavour enhancement factor: setting event normalisation to "bias"') + self['event_norm']='bias' + + # check that ebeam is bigger than the proton mass. for i in [1,2]: # do not for proton mass if not proton PDF (or when scan initialization) diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/check_param_card.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/check_param_card.py index bc785b5de6..a34705f6bc 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/check_param_card.py @@ -1092,11 +1092,11 @@ def write_summary(self, path, order=None, lastline=False, nbcol=20): to_print = self.cross[-1:] for info in to_print: name = info['run_name'] - bench = info['bench'] + bench = [float(x) for x in info['bench']] data = [] for k in keys: if k in info: - data.append(info[k]) + data.append(float(info[k])) else: data.append(0.) 
ff.write(formatting % tuple([name] + bench + data)) diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/cluster.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/cluster.py index 95ef45b5f3..66069293d2 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/cluster.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/cluster.py @@ -602,6 +602,7 @@ def __init__(self, *args, **opt): self.submitted = six.moves.queue.Queue() # one entry by job submitted self.stoprequest = threading.Event() #flag to ensure everything to close self.demons = [] + self.gpus_list = [] self.nb_done =0 if 'nb_core' in opt: self.nb_core = opt['nb_core'] @@ -623,23 +624,46 @@ def __init__(self, *args, **opt): self.done_pid_queue = six.moves.queue.Queue() self.fail_msg = None + mg5_gpu_env_str = 'MG5_GPU_VISIBLE_DEVICES' + gpu_variables = [['NVIDIA_VISIBLE_DEVICES', 'CUDA_VISIBLE_DEVICES'], + ['ROCR_VISIBLE_DEVICES', 'HIP_VISIBLE_DEVICES'],] + if mg5_gpu_env_str in os.environ: + new_var = os.environ[mg5_gpu_env_str].split(',') + if len(new_var) == 2: + gpu_variables.insert(0, new_var) + else: + logger.error('Invalid format for %s=%s, it should be a comma-separated list of two elements' % (mg5_gpu_env_str, os.environ[mg5_gpu_env_str])) + for get_var,set_var in gpu_variables: + if get_var in os.environ: + self.gpus_list = os.environ.get(get_var).split(',') + self.gpu_set_var = set_var + self.gpus_count = len(self.gpus_list) + logger.info('Found %s GPUs: %s' % (self.gpus_count, self.gpus_list)) def start_demon(self): import threading - t = threading.Thread(target=self.worker) + env2 = None + if len(self.gpus_list): + env2 = os.environ.copy() + this_gpu_idx = len(self.demons) % self.gpus_count + env2[self.gpu_set_var] = self.gpus_list[this_gpu_idx] + t = threading.Thread(target=self.worker, kwargs={'env2': env2}) + else: + t = threading.Thread(target=self.worker) t.daemon = True t.start() self.demons.append(t) - def worker(self): + def worker(self, env2=None): import six.moves.queue import six.moves._thread while not self.stoprequest.isSet(): try: args = self.queue.get(timeout=10) tag, exe, arg, opt = args + opt['env'] = env2 try: # check for executable case if isinstance(exe,str): diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/common_run_interface.py index 9ff7390cf5..69291df0d4 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/common_run_interface.py @@ -750,8 +750,8 @@ def __init__(self, me_dir, options, *args, **opts): else: self.ninitial = self.proc_characteristics['ninitial'] - def make_make_all_html_results(self, folder_names = [], jobs=[]): - return sum_html.make_all_html_results(self, folder_names, jobs) + def make_make_all_html_results(self, folder_names = [], jobs=[], get_attr=None): + return sum_html.make_all_html_results(self, folder_names, jobs, get_attr) def write_RunWeb(self, me_dir): @@ -1463,11 +1463,15 @@ def create_plot(self, mode='parton', event_path=None, output=None, tag=None): self.run_name, '%s_pts.dat' % tag) for observable_name, data_path in [('djr',djr_path), ('pt',pt_path)]: - if not self.generate_Pythia8_HwU_plots( + try: + if not self.generate_Pythia8_HwU_plots( PY8_plots_root_path, merging_scale_name, observable_name,data_path): - return False - + return False + except Exception as error: + if os.path.exists(data_path): + logger.info('plot information present in %s' % data_path) + return True if mode == 'Pythia8': plot_files = 
glob.glob(pjoin(PY8_plots_root_path,'*.gnuplot')) if not misc.which('gnuplot'): @@ -1964,12 +1968,16 @@ def do_systematics(self, line): self.cluster.wait(os.path.dirname(output), update_status, update_first=update_status) except Exception: self.cluster.remove() + for i in range(nb_submit): + os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) old_run_mode = self.options['run_mode'] self.options['run_mode'] =0 + out =False try: out = self.do_systematics(line) finally: self.options['run_mode'] = old_run_mode + return out #collect the data all_cross = [] for i in range(nb_submit): @@ -1995,18 +2003,21 @@ def do_systematics(self, line): self.run_card['event_norm'] in ['unity']: all_cross= [cross/nb_event for cross in all_cross] - sys_obj = systematics.call_systematics([input, None] + opts, - log=lambda x: logger.info(str(x)), - result=result_file, - running=False - ) + + sys_obj = systematics.call_systematics([input, None] + opts, + log=lambda x: logger.info(str(x)), + result=result_file, + running=False + ) + sys_obj.print_cross_sections(all_cross, nb_event, result_file) - + #concatenate the output file subprocess.call(['cat']+\ ['./tmp_%s_%s' % (i, os.path.basename(output)) for i in range(nb_submit)], stdout=open(output,'w'), cwd=os.path.dirname(output)) + for i in range(nb_submit): os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) # os.remove('%s/log_sys_%s.txt' % (os.path.dirname(output),i)) @@ -3831,7 +3842,7 @@ def store_scan_result(self): """return the information that need to be kept for the scan summary. Auto-width are automatically added.""" - return {'cross': self.results.current['cross']} + return {'cross': self.results.current['cross'], 'error': self.results.current['error']} def add_error_log_in_html(self, errortype=None): @@ -5135,10 +5146,10 @@ def init_run(self, cards): self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), - 'lhc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), - 'lcc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), @@ -6740,7 +6751,15 @@ def postcmd(self, stop, line): return ending_question - + def help_update(self): + logger.info(""" syntax: update dependent: Change the mass/width of particles which are not free parameter for the model. + update missing: add to the current param_card missing blocks/parameters. + update to_slha1: pass SLHA2 card to SLHA1 convention. (beta) + update to_slha2: pass SLHA1 card to SLHA2 convention. 
(beta) + update to_full [run_card] + update XXX [where XXX correspond to a hidden block of the run_card]: + supported block are %s + """, ', '.join(self.update_block)) def do_update(self, line, timer=0): @@ -6756,6 +6775,8 @@ def do_update(self, line, timer=0): logger.warning('miss an argument (dependent or missing). Please retry') return + args[0] = args[0].lower() + if args[0] == 'dependent': if not self.mother_interface: logger.warning('Failed to update dependent parameter. This might create trouble for external program (like MadSpin/shower/...)') @@ -6805,10 +6826,11 @@ def do_update(self, line, timer=0): self.modified_card.add('run') # delay writting of the run_card logger.info('add optional block %s to the run_card', args[0]) else: - self.help_update() + self.do_help('update') logger.warning('unvalid options for update command. Please retry') + def update_to_full(self, line): """ trigger via update to_full LINE""" diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/extended_cmd.py index 789976beee..c321fd88e5 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/extended_cmd.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/extended_cmd.py @@ -1317,6 +1317,8 @@ def nice_error_handling(self, error, line): debug_file = open(self.debug_output, 'a') traceback.print_exc(file=debug_file) + if __debug__: + traceback.print_exc() if hasattr(error, 'filename'): debug_file.write("Related File: %s\n" % error.filename) # Create a nice error output @@ -1928,7 +1930,8 @@ def do_display(self, line, output=sys.stdout): for i, name in enumerate(split): try: __import__('.'.join(split[:i+1])) - exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1]))) + tmp = {} + exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1])), globals(),tmp) except ImportError: try: var = eval(args[1]) @@ -1939,7 +1942,7 @@ def do_display(self, line, output=sys.stdout): outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) else: - var = eval(args[1]) + var = eval(args[1], globals(), tmp) outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/file_writers.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/file_writers.py index 526756129f..74ba0d195c 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/file_writers.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/file_writers.py @@ -140,10 +140,6 @@ def preprocess_template(self, input_lines, context={}): else: raise self.FileWriterError("%s not string" % repr(input_lines)) - # Setup the contextual environment - for contextual_variable, value in context.items(): - exec('%s=%s'%(str(contextual_variable),repr(value))) - res = [] # The variable below tracks the conditional statements structure if_stack = [] @@ -166,7 +162,7 @@ def preprocess_template(self, input_lines, context={}): # Treat an if statement elif preproc_command.group('command')=='if': try: - if_stack.append(eval(preproc_command.group('body'))==True) + if_stack.append(eval(preproc_command.group('body'), globals(), context)==True) except Exception as e: raise self.FilePreProcessingError('Could not evaluate'+\ "python expression '%s' given the context %s provided."%\ diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/files.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/files.py index 551b71ddb6..3061b007e7 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/files.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/files.py @@ -147,9 +147,14 @@ def cp(path1, 
path2, log=True, error=False): path2 = format_path(path2) try: shutil.copy(path1, path2) + except shutil.Error as why: + logger.debug('no cp since identical: %s', why) + return except IOError as why: import madgraph.various.misc as misc try: + if 'same file' in str(why): + return if os.path.exists(path2): path2 = os.path.join(path2, os.path.split(path1)[1]) misc.copytree(path1, path2) @@ -157,12 +162,10 @@ def cp(path1, path2, log=True, error=False): if error: raise if log: - logger.warning(why) + logger.warning("fail to cp", path1, path2, why) else: - misc.sprint("fail to cp", why) - except shutil.Error: - # idetical file - pass + misc.sprint("fail to cp",path1,path2, why) + def rm(path, log=True): """removes path, that can be a single element or a list""" diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_cardhtml-pl b/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_cardhtml-pl index 1810c6c082..6e0e06533d 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_cardhtml-pl +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_cardhtml-pl @@ -137,7 +137,7 @@ until($listpos>$#incard){ print PAGE " Model: $model \n"; print PAGE " \n \n
\n"; print PAGE " \n"; - print PAGE "\"\" \n"; + print PAGE "\"\" \n"; print PAGE "
\n"; print PAGE " \n \n \n"; print PAGE " \n"; diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_crossxhtml.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_crossxhtml.py index 681bf9d09b..3114a4350c 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_crossxhtml.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_crossxhtml.py @@ -133,7 +133,7 @@ class AllResults(dict): web = False - _run_entries = ['cross', 'error','nb_event_pythia','run_mode','run_statistics', + _run_entries = ['cross', 'error','axsec','nb_event_pythia','run_mode','run_statistics', 'nb_event','cross_pythia','error_pythia', 'nb_event_pythia8','cross_pythia8','error_pythia8', 'shower_dir'] diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_jpeg-pl b/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_jpeg-pl index 87d03da394..31b7e9fe55 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_jpeg-pl +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_jpeg-pl @@ -1,16 +1,16 @@ #!/usr/bin/perl -w #--------------------------------------------------------------------- -# Run GS to create jpeg files defined as $gs +# Run GS to create PNG files defined as $gs #--------------------------------------------------------------------- -system("/bin/bash -c \"rm -f matrix*.jpg\" "); +system("/bin/bash -c \"rm -f matrix*.png\" "); $imatrix = ""; if (! -e "matrix.ps") {$imatrix = 1;} -$max_jpg = 2; -if ($imatrix eq "") {$max_jpg = 5;} -# add 1 to max_jpg, to get max_jpg pages -$max_jpg += 1; +$max_png = 2; +if ($imatrix eq "") {$max_png = 5;} +# add 1 to max_png, to get max_png pages +$max_png += 1; open(PAGE,"> diagrams.html") || die "Error creating diagrams.html"; print PAGE "\ \n"; print PAGE "\ \n"; @@ -21,22 +21,22 @@ while ( -e "matrix$imatrix.ps"){ open(IN, "< matrix$imatrix.ps") || die "No file matrix$imatrix.ps"; open(OUT, "> matrix-1.ps") || die "Could not open file matrix-1.ps"; while () { - if ($_ =~ m/^%%Page: $max_jpg $max_jpg/) {last;} + if ($_ =~ m/^%%Page: $max_png $max_png/) {last;} else {print OUT $_, "\n";} } close(OUT); close(IN); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=matrix$imatrix\%00d.jpg \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-r150 \-sOutputFile\=matrix$imatrix\%00d.png \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; system "rm -f matrix-1.ps"; -# Determine how many jpg files we have +# Determine how many png files we have $pages=1; - while(-e "matrix$imatrix$pages.jpg"){ + while(-e "matrix$imatrix$pages.png"){ $pages++; }#end of while #reduce it by one - if ($pages > $max_jpg){ + if ($pages > $max_png){ $pages -= 1; } # Find name of process @@ -45,24 +45,24 @@ while ( -e "matrix$imatrix.ps"){ if ($proc =~ /Process: (.+?)(\s\w+=\d+)*$/) { $proc = $1; } print PAGE "
<BR><BR> To save bandwidth not all diagrams were converted to jpeg."; + if (-e "matrix$imatrix$max_png.png" ) { + print PAGE "<BR><BR> To save bandwidth not all diagrams were converted to PNG."; print PAGE "<BR><BR>
To view all diagrams click on "; print PAGE "\ postscript. \<\/A\> \ \n"; # # Delete files which aren't included in diagrams.html # - system ("/bin/bash -c \"rm -f matrix$max_jpg.jpg\" "); + system ("/bin/bash -c \"rm -f matrix$max_png.png\" "); } # -# Now create jpeg file for card +# Now create PNG file for card # - if (! -e "../../HTML/card.jpg") { + if (! -e "../../HTML/card.png") { system ("/bin/bash -c \"head -352 matrix$imatrix.ps >& junk.ps\" "); open(JUNK,">> junk.ps") || die "Error opening junk.ps"; @@ -72,7 +72,7 @@ while ( -e "matrix$imatrix.ps"){ system ("/bin/bash -c \"cat matrix$imatrix.ps | sed 1,352d >> junk.ps\" "); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=card.jpg \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.jpg ../../HTML/card.jpg > /dev/null\" "; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-sOutputFile\=card.png \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.png ../../HTML/card.png > /dev/null\" "; } if ($imatrix eq "") {$imatrix = 0;} $imatrix = $imatrix + 1; @@ -82,3 +82,4 @@ print PAGE "\n"; print PAGE "\<\/BODY\> \n"; print PAGE "\<\/HTML\> \n"; close(PAGE); + diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_ximprove.py index 415ecc9de0..d5d7fc8faf 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/gen_ximprove.py @@ -30,6 +30,7 @@ import stat import sys import six +import time from six.moves import range from six.moves import zip @@ -304,6 +305,7 @@ def get_helicity(self, to_submit=True, clean=True): logger.debug('(%s) nb_hel: %s zero amp: %s bad_amps_hel: %s/%s', split_file[-1], len(good_hels),len(bad_amps),len(bad_amps_perhel), len(good_hels)*nb_amp ) if len(good_hels) == 1: files.cp(matrix_file, matrix_file.replace('orig','optim')) + files.cp(matrix_file.replace('.f','.o'), matrix_file.replace('orig','optim').replace('.f','.o')) continue # avoid optimization if onlye one helicity gauge = self.cmd.proc_characteristics['gauge'] @@ -1059,6 +1061,7 @@ def __init__(self, cmd, opt=None): # parameter for the gridpack run self.nreq = 2000 self.iseed = 4321 + self.maxevts = 2500 # placeholder for information self.results = 0 #updated in launch/update_html @@ -1200,6 +1203,10 @@ def reset_multijob(self): def write_multijob(self, Channel, nb_split): """ """ if nb_split <=1: + try: + os.remove(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat')) + except OSError: + pass return f = open(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat'), 'w') f.write('%i\n' % nb_split) @@ -1828,17 +1835,17 @@ class gen_ximprove_gridpack(gen_ximprove_v4): max_request_event = 1e12 # split jobs if a channel if it needs more than that max_event_in_iter = 4000 min_event_in_iter = 500 - combining_job = sys.maxsize gen_events_security = 1.00 - def __new__(cls, *args, **opts): + def __new__(cls, cmd, opts): cls.force_class = 'gridpack' - return super(gen_ximprove_gridpack, cls).__new__(cls, *args, **opts) + return super(gen_ximprove_gridpack, cls).__new__(cls, cmd, opts) - def __init__(self, *args, **opts): + def __init__(self, cmd, opts): self.ngran = -1 + self.nprocs = 1 self.gscalefact = {} self.readonly = False if 'ngran' in opts: @@ -1846,9 +1853,18 @@ def __init__(self, *args, **opts): # del opts['ngran'] if 'readonly' in opts: self.readonly = opts['readonly'] - super(gen_ximprove_gridpack,self).__init__(*args, **opts) + if 'nprocs' in opts: + 
self.nprocs = int(opts['nprocs']) + if 'maxevts' in opts and self.nprocs > 1: + self.max_request_event = int(opts['maxevts']) + super(gen_ximprove_gridpack,self).__init__(cmd, opts) if self.ngran == -1: self.ngran = 1 + + if self.nprocs > 1: + self.combining_job = 0 + else: + self.combining_job = sys.maxsize def find_job_for_event(self): """return the list of channel that need to be improved""" @@ -1876,8 +1892,8 @@ def find_job_for_event(self): continue # no event to generate events self.gscalefact[tag] = max(1, 1/(goal_lum * C.get('axsec')/ self.ngran)) #need to generate events - logger.debug('request events for ', C.get('name'), 'cross=', - C.get('axsec'), 'needed events = ', goal_lum * C.get('axsec')) + logger.debug('request events for %s cross=%d needed events = %d', + C.get('name'), C.get('axsec'), goal_lum * C.get('axsec')) to_refine.append(C) logger.info('need to improve %s channels' % len(to_refine)) @@ -1897,8 +1913,13 @@ def get_job_for_event(self): for C in to_refine: #1. Compute the number of points are needed to reach target needed_event = max(goal_lum*C.get('axsec'), self.ngran) - nb_split = 1 - + nb_split = int(max(1,((needed_event-1)// self.max_request_event) +1)) + if not self.split_channels: + nb_split = 1 + if nb_split > self.max_splitting: + nb_split = self.max_splitting + nb_split=max(1, nb_split) + #2. estimate how many points we need in each iteration if C.get('nunwgt') > 0: nevents = needed_event / nb_split * (C.get('nevents') / C.get('nunwgt')) @@ -1908,13 +1929,16 @@ def get_job_for_event(self): nevents = self.max_event_in_iter if nevents < self.min_event_in_iter: + nb_split = int(nb_split * nevents / self.min_event_in_iter) + 1 # sr dangerous? nevents = self.min_event_in_iter # # forbid too low/too large value nevents = max(self.min_event_in_iter, min(self.max_event_in_iter, nevents)) logger.debug("%s : need %s event. Need %s split job of %s points", C.name, needed_event, nb_split, nevents) - + # write the multi-job information + self.write_multijob(C, nb_split) + #create the info dict assume no splitting for the default info = {'name': self.cmd.results.current['run_name'], 'script_name': 'unknown', @@ -1925,7 +1949,7 @@ def get_job_for_event(self): 'nevents': nevents, #int(nevents*self.gen_events_security)+1, 'maxiter': self.max_iter, 'miniter': self.min_iter, - 'precision': -1*int(needed_event)/C.get('axsec'), + 'precision': -goal_lum/nb_split, # -1*int(needed_event)/C.get('axsec'), 'requested_event': needed_event, 'nhel': self.run_card['nhel'], 'channel': C.name.replace('G',''), @@ -1938,27 +1962,59 @@ def get_job_for_event(self): basedir = pjoin(os.path.dirname(__file__), '..','..','SubProcesses', info['P_dir'], info['directory']) info['base_directory'] = basedir - jobs.append(info) - + if nb_split == 1: + jobs.append(info) + else: + for i in range(nb_split): + new_info = dict(info) + new_info['offset'] = i+1 + new_info['directory'] += self.alphabet[i % 26] + str((i+1)//26) + new_info['base_directory'] = info['directory'] + jobs.append(new_info) write_dir = '.' 
if self.readonly else None self.create_ajob(pjoin(self.me_dir, 'SubProcesses', 'refine.sh'), jobs, write_dir) + if self.nprocs > 1: + nprocs_cluster = cluster.MultiCore(nb_core=self.nprocs) + gridpack_start = time.time() + def gridpack_wait_monitoring(Idle, Running, Done): + if Idle+Running+Done == 0: + return + logger.info("Gridpack event generation: %s Idle, %s Running, %s Done [%s]" + % (Idle, Running, Done, misc.format_time(time.time()-gridpack_start))) + done = [] for j in jobs: - if j['P_dir'] in done: - continue - done.append(j['P_dir']) + if self.nprocs == 1: + if j['P_dir'] in done: + continue + done.append(j['P_dir']) + # Give a little status. Sometimes these jobs run very long, and having hours without any + # console output can be a bit frightening and make users think we are looping. + if len(done)%5==0: + logger.info(f"Working on job {len(done)} of {len(jobs)}") + # set the working directory path. pwd = pjoin(os.getcwd(),j['P_dir']) if self.readonly else pjoin(self.me_dir, 'SubProcesses', j['P_dir']) - exe = pjoin(pwd, 'ajob1') + exe = pjoin(pwd, j['script_name']) st = os.stat(exe) os.chmod(exe, st.st_mode | stat.S_IEXEC) # run the code\ - cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + if self.nprocs == 1: + cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + else: + nprocs_cluster.cluster_submit(exe, cwd=pwd, packet_member=j['packet']) write_dir = '.' if self.readonly else pjoin(self.me_dir, 'SubProcesses') + if self.nprocs > 1: + nprocs_cluster.wait(self.me_dir, gridpack_wait_monitoring) + + if self.readonly: + combine_runs.CombineRuns(write_dir) + else: + combine_runs.CombineRuns(self.me_dir) self.check_events(goal_lum, to_refine, jobs, write_dir) def check_events(self, goal_lum, to_refine, jobs, Sdir): diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/hel_recycle.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/hel_recycle.py index 1471de4bcb..978ba6575e 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/hel_recycle.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/hel_recycle.py @@ -550,7 +550,7 @@ def get_jamp_lines(self, line): def get_amp2_lines(self, line): if line.startswith(' DO I = 1, NCOLOR'): self.in_amp2 = False - elif not line.isspace(): + elif not line.isspace() and 'DENOM' not in line: self.template_dict['amp2_lines'] += f'{line[0:6]} {self.add_indices(line[6:])}' def prepare_bools(self): diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/histograms.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/histograms.py index 51ae2914fc..6fbdd98100 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/histograms.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/histograms.py @@ -1149,11 +1149,8 @@ def parse_one_histo_from_stream(self, stream, all_weight_header, boundaries = [0.0,0.0] for j, weight in \ enumerate(HwU.histo_bin_weight_re.finditer(line_bin)): - if (j == len(weight_header)): - continue - if j == len(all_weight_header): - raise HwU.ParseError("There is more bin weights"+\ - " specified than expected (%i)"%len(weight_header)) + #if (j == len(weight_header)): + # continue if selected_central_weight == all_weight_header[j]: bin_weights['central'] = float(weight.group('weight')) if all_weight_header[j] == 'boundary_xmin': @@ -1858,6 +1855,8 @@ def parse_histos_from_PY8_XML_stream(self, stream, run_id=None, # If merging cut is negative, then pick only the one of the central scale # If not specified, then take them all but use the PDF and scale weight # of the central merging_scale for the variation. 
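For nprocs > 1, the gridpack refine loop above submits every job through cluster.MultiCore and then collects them with a wait() that takes the monitoring callback. A minimal self-contained sketch of that submit/monitor/wait shape, rebuilt on concurrent.futures since cluster.MultiCore is MG5aMC-internal; run_job, jobs and the sleep intervals are illustrative stand-ins, and the monitor signature (Idle, Running, Done) mirrors gridpack_wait_monitoring in the diff:

    import time
    from concurrent.futures import ThreadPoolExecutor

    def run_job(exe, cwd):
        # stand-in for launching one refine script (the real code execs 'ajob*' in cwd)
        time.sleep(0.1)
        return exe

    def wait_monitoring(idle, running, done):
        if idle + running + done == 0:
            return
        print("Gridpack event generation: %s Idle, %s Running, %s Done" % (idle, running, done))

    jobs = [("ajob1", "P1"), ("ajob2", "P2"), ("ajob3", "P3")]  # illustrative payloads
    nprocs = 2
    with ThreadPoolExecutor(max_workers=nprocs) as pool:
        futures = [pool.submit(run_job, exe, cwd) for exe, cwd in jobs]
        while True:
            done = sum(f.done() for f in futures)
            running = min(nprocs, len(futures) - done)  # rough estimate for display
            idle = len(futures) - done - running
            wait_monitoring(idle, running, done)
            if done == len(futures):
                break
            time.sleep(0.2)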
+ if not all_weights: + raise MadGraph5Error('No weights were found in the HwU XML source.') if merging_scale is None or merging_scale < 0.0: merging_scale_chosen = all_weights[2]['MERGING'] else: @@ -2480,14 +2479,14 @@ def get_main_central_plot_lines(HwU_name, block_position, color_index, # return [template_no_stat%rep_dic]+\ # ([template%rep_dic] if show_mc_uncertainties else []) - # The use of sqrt(-1) is just a trick to prevent the line to display + # The use of 1/0 is just a trick to prevent the line to display res = [] - rep_dic['data'] = '($3 < 0 ? sqrt(-1) : $3)' + rep_dic['data'] = '($3 < 0 ? 1/0 : $3)' res.append(template_no_stat%rep_dic) rep_dic['title'] = " title ''" if show_mc_uncertainties: res.append(template%rep_dic) - rep_dic['data'] = '($3 >= 0 ? sqrt(-1) : abs($3))' + rep_dic['data'] = '($3 >= 0 ? 1/0 : abs($3))' rep_dic['ls'] = ' ls %d'%(100+color_index) res.append(template_no_stat%rep_dic) if show_mc_uncertainties: @@ -2739,13 +2738,13 @@ def ratio_no_correlations(wgtsA, wgtsB): """#-- rendering subhistograms '%(subhistogram_type)s' %(unset label)s %(set_format_y)s +%(set_yscale)s set yrange [%(ymin).4e:%(ymax).4e] set origin %(origin_x).4e, %(origin_y).4e set size %(size_x).4e, %(size_y).4e set mytics %(mytics)d %(set_ytics)s %(set_format_x)s -%(set_yscale)s %(set_ylabel)s %(set_histo_label)s plot \\""" @@ -2878,7 +2877,7 @@ def ratio_no_correlations(wgtsA, wgtsB): # We decide to show uncertainties in the main plot only if they # are part of a monocolor band. Otherwise, they will only be - # shown in the first subplot. Notice that plotting 'sqrt(-1)' + # shown in the first subplot. Notice that plotting '1/0' # is just a trick so as to have only the key printed with no # line @@ -2890,7 +2889,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, scale variation'%title, band='scale' in use_band) else: uncertainty_plot_lines[-1]['scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] # And now PDF_variation if available if not PDF_var_pos is None and len(PDF_var_pos)>0: if 'pdf' in use_band: @@ -2899,7 +2898,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, PDF variation'%title, band='pdf' in use_band) else: uncertainty_plot_lines[-1]['pdf'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] # And now merging variation if available if not merging_var_pos is None and len(merging_var_pos)>0: if 'merging_scale' in use_band: @@ -2908,7 +2907,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, merging scale variation'%title, band='merging_scale' in use_band) else: uncertainty_plot_lines[-1]['merging_scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] # And now alpsfact variation if available if not alpsfact_var_pos is None and len(alpsfact_var_pos)>0: if 'alpsfact' in use_band: @@ -2917,7 +2916,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, alpsfact variation'%title, band='alpsfact' in use_band) else: uncertainty_plot_lines[-1]['alpsfact'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] # plot_lines.append( # "'%s' index %d using (($1+$2)/2):3 ls %d title '%s'"\ diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py 
b/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py index 0924927785..262d39a736 100644 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Aug 2023) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. import logging import os @@ -33,7 +33,7 @@ def compile(self, *args, **opts): if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') common_run_interface.CommonRunCmd.update_make_opts_full(path, - {'FPTYPE': self.run_card['floating_type'] }) + {'override FPTYPE': self.run_card['floating_type'] }) misc.sprint('FPTYPE checked') cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): @@ -76,7 +76,7 @@ def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + common_run_interface.CommonRunCmd.update_make_opts_full({'override FPTYPE': new_value}) else: raise Exception Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') @@ -133,7 +133,8 @@ def default_setup(self): super().default_setup() # change default value: self['cudacpp_backend'] = 'cuda' - self['vector_size'] = 16384 # already setup in default class (just change value) + self['vector_size'] = 32 # ZW: default to 32, might want to change to 64 to utilise AMD GPUs better as well # 16384 # already setup in default class (just change value) + self['nb_warp'] = 512 # number of warps per kernel call, for now setting to 16 384 / vector_size MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/lhe_parser.py index f6e47956cd..81d17f7cb1 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/lhe_parser.py @@ -1035,12 +1035,12 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): from_init = True if not from_init: - if group in grouped_cross: - grouped_cross[group] += self.allcross[i] - grouped_error[group] += self.error[i]**2 + if int(group) in grouped_cross: + grouped_cross[int(group)] += self.allcross[i] + grouped_error[int(group)] += self.error[i]**2 else: - grouped_cross[group] = self.allcross[i] - grouped_error[group] = self.error[i]**2 + grouped_cross[int(group)] = self.allcross[i] + grouped_error[int(group)] = self.error[i]**2 else: ban = banner_mod.Banner(ff.banner) for line in ban['init'].split('\n'): @@ -1048,11 +1048,11 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): if len(splitline)==4: cross, error, _, group = splitline if int(group) in grouped_cross: - grouped_cross[group] += float(cross) - grouped_error[group] += float(error)**2 + grouped_cross[int(group)] += float(cross) + grouped_error[int(group)] += float(error)**2 else: - grouped_cross[group] = float(cross) - grouped_error[group] = float(error)**2 + 
grouped_cross[int(group)] = float(cross) + grouped_error[int(group)] = float(error)**2 nb_group = len(grouped_cross) # compute the information for the first line @@ -1086,6 +1086,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): self.seek(0) if init_information["idbmup2"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup2"] = event[1].pdg self.seek(0) @@ -2166,10 +2168,13 @@ def check(self): abspz += abs(particle.pz) # check mass fourmass = FourMomentum(particle).mass - - if particle.mass and (abs(particle.mass) - fourmass)/ abs(particle.mass) > threshold: - raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) - + if particle.mass: + expected = (particle.E - math.sqrt(particle.E**2 -particle.mass**2))/particle.E + if expected > 1e-8: + mass_threshold = particle.E**2 - (particle.E-threshold)**2 + if (abs(particle.mass) - fourmass)/ mass_threshold > 5: + raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) + if E/absE > threshold: logger.critical(self) @@ -2953,8 +2958,8 @@ def pt(self): @property def pseudorapidity(self): - norm = math.sqrt(self.px**2 + self.py**2+self.pz**2) - return 0.5* math.log((norm - self.pz) / (norm + self.pz)) + norm = math.sqrt(self.px**2 + self.py**2 + self.pz**2) + return 0.5* math.log((norm + self.pz) / (norm - self.pz)) @property def rapidity(self): diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/madevent_interface.py index 85e5bcf5e3..4d5597c722 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/madevent_interface.py @@ -1171,10 +1171,10 @@ def check_survey(self, args, cmd='survey'): for opt,value in self._survey_options.items(): if arg.startswith('--%s=' % opt): exec('self.opts[\'%s\'] = %s(arg.split(\'=\')[-1])' % \ - (opt, value[0])) + (opt, value[0]), globals(), {'self':self, 'arg':arg}) arg = "" if arg != "": raise Exception - except Exception: + except Exception as error: self.help_survey() raise self.InvalidCmd('invalid %s argument'% arg) @@ -2827,10 +2827,10 @@ def print_results_in_shell(self, data): logger.info(" Nb of events after matching/merging : %d" % int(data['nb_event_pythia'])) if self.run_card['use_syst'] in self.true and \ (int(self.run_card['ickkw'])==1 or self.run_card['ktdurham']>0.0 - or self.run_card['ptlund']>0.0): + or self.run_card['ptlund']>0.0) and data['cross_pythia'] == -1: logger.info(" Notice that because Systematics computation is turned on, the merging did not veto events but modified their weights instead.\n"+\ " The resulting hepmc/stdhep file should therefore be use with those weights.") - else: + elif data['cross_pythia'] == -1: logger.info(" Nb of events after merging : %s" % data['nb_event_pythia']) logger.info(" " ) @@ -3055,6 +3055,7 @@ def do_multi_run(self, line): crossoversig = 0 inv_sq_err = 0 nb_event = 0 + madspin = False for i in range(nb_run): self.nb_refine = 0 self.exec_cmd('generate_events %s_%s -f' % (main_name, i), postcmd=False) @@ -3067,6 +3068,8 @@ def do_multi_run(self, line): inv_sq_err+=1.0/error**2 self.results[main_name][-1]['cross'] = crossoversig/inv_sq_err self.results[main_name][-1]['error'] = math.sqrt(1.0/inv_sq_err) + if 'decayed' in self.run_name: + madspin = True 
self.results.def_current(main_name) self.run_name = main_name self.update_status("Merging LHE files", level='parton') @@ -3074,9 +3077,12 @@ def do_multi_run(self, line): os.mkdir(pjoin(self.me_dir,'Events', self.run_name)) except Exception: pass - os.system('%(bin)s/merge.pl %(event)s/%(name)s_*/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' + + os.system('%(bin)s/merge.pl %(event)s/%(name)s_*%(madspin)s/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' % {'bin': self.dirbin, 'event': pjoin(self.me_dir,'Events'), - 'name': self.run_name}) + 'name': self.run_name, + 'madspin': '_decayed_*' if madspin else '' + }) eradir = self.options['exrootanalysis_path'] if eradir and misc.is_executable(pjoin(eradir,'ExRootLHEFConverter')): @@ -3656,9 +3662,11 @@ def do_refine(self, line): else: self.refine_mode = "new" - cross, error = self.make_make_all_html_results() + cross, error, across = self.make_make_all_html_results(get_attr=('xsec','xerru','axsec')) + self.results.add_detail('cross', cross) self.results.add_detail('error', error) + self.results.add_detail('axsec', across) self.results.add_detail('run_statistics', dict(self.results.get_detail('run_statistics'))) @@ -3757,6 +3765,8 @@ def split(a, n): k, m = divmod(len(a), n) return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + Gdirs = self.remove_empty_events(Gdirs) + partials_info = [] if len(Gdirs) >= max_G: start_unweight= time.perf_counter() @@ -3786,7 +3796,7 @@ def split(a, n): for i, local_G in enumerate(split(Gdirs, nb_chunk)): line = [pjoin(self.me_dir, "Events", self.run_name, "partials%d.lhe.gz" % i)] line.append(pjoin(self.me_dir, 'Events', self.run_name, '%s_%s_banner.txt' % (self.run_name, tag))) - line.append(str(self.results.current['cross'])) + line.append(str(self.results.current.get('axsec'))) line += local_G partials_info.append(self.do_combine_events_partial(' '.join(line), preprocess_only=True)) mycluster.submit(sys.executable, @@ -4223,7 +4233,7 @@ def mg5amc_py8_interface_consistency_warning(options): return None - def setup_Pythia8RunAndCard(self, PY8_Card, run_type): + def setup_Pythia8RunAndCard(self, PY8_Card, run_type, use_mg5amc_py8_interface): """ Setup the Pythia8 Run environment and card. In particular all the process and run specific parameters of the card are automatically set here. This function returns the path where HEPMC events will be output, if any.""" @@ -4338,10 +4348,10 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.systemSet('Beams:setProductionScalesFromLHEF',True) # Automatically set qWeed to xqcut if not defined by the user. - if PY8_Card['SysCalc:qWeed']==-1.0: + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qWeed']==-1.0: PY8_Card.MadGraphSet('SysCalc:qWeed',self.run_card['xqcut'], force=True) - if PY8_Card['SysCalc:qCutList']=='auto': + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qCutList']=='auto': if self.run_card['use_syst']: if self.run_card['sys_matchscale']=='auto': qcut = PY8_Card['JetMatching:qCut'] @@ -4368,7 +4378,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): # Specific MLM settings # PY8 should not implement the MLM veto since the driver should do it # if merging scale variation is turned on - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. 
PY8_Card.MadGraphSet('JetMatching:doVeto',False) @@ -4444,7 +4454,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.MadGraphSet('SpaceShower:pTmaxMatch',1) PY8_Card.MadGraphSet('SpaceShower:rapidityOrder',False) # PY8 should not implement the CKKW veto since the driver should do it. - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('Merging:applyVeto',False) @@ -4516,6 +4526,12 @@ def do_pythia8(self, line): else: no_default = False + if '--old_interface' in args: + use_mg5amc_py8_interface = True + args.remove('--old_interface') + else: + use_mg5amc_py8_interface = False + if not self.run_name: self.check_pythia8(args) self.configure_directory(html_opening =False) @@ -4545,20 +4561,27 @@ def do_pythia8(self, line): #"Please use 'event_norm = average' in the run_card to avoid this problem.") - - if not self.options['mg5amc_py8_interface_path'] or not \ - os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface')): - raise self.InvalidCmd( -"""The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. -Please install this tool with the following MG5_aMC command: - MG5_aMC> install mg5amc_py8_interface_path""") + if use_mg5amc_py8_interface: + if not self.options['mg5amc_py8_interface_path'] or not \ + os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface')): + raise self.InvalidCmd( + """The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. + Please install this tool with the following MG5_aMC command: + MG5_aMC> install mg5amc_py8_interface_path""") + else: + pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface') + warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) + if warnings: + logger.warning(warnings) else: - pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface') - warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) - if warnings: - logger.warning(warnings) + pythia_main = pjoin(self.options['pythia8_path'], 'share', 'Pythia8', 'examples', 'main164') + if not os.path.exists(pythia_main): + pythia_main = pjoin(self.options['pythia8_path'], 'examples', 'main164') + if not os.path.exists(pythia_main): + logger.warning('main164 not found (or not compiled). Will try the old interface instead.') + return self.do_pythia8(line + ' --old_interface') self.results.add_detail('run_mode', 'madevent') @@ -4583,14 +4606,19 @@ def do_pythia8(self, line): run_type = 'CKKW' # Edit the card and run environment according to the run specification - HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type) + HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type, use_mg5amc_py8_interface=use_mg5amc_py8_interface) + + if not use_mg5amc_py8_interface and self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + PY8_Card['Main:numberOfEvents']= self.run_card['nevents'] + # Now write the card. 
pythia_cmd_card = pjoin(self.me_dir, 'Events', self.run_name , '%s_pythia8.cmd' % tag) cmd_card = StringIO.StringIO() PY8_Card.write(cmd_card,pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Now setup the preamble to make sure that everything will use the locally # installed tools (if present) even if the user did not add it to its @@ -4632,7 +4660,7 @@ def do_pythia8(self, line): " command '/usr/bin/env %s' exists and returns a valid path."%shell) exe_cmd = "#!%s\n%s"%(shell_exe,' '.join( - [preamble+pythia_main, + [preamble+pythia_main, '' if use_mg5amc_py8_interface else '-c', os.path.basename(pythia_cmd_card)])) wrapper.write(exe_cmd) @@ -4699,6 +4727,7 @@ def do_pythia8(self, line): n_cores = max(min(min_n_core,n_cores),1) if self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + # No need for parallelization anymore self.cluster = None logger.info('Follow Pythia8 shower by running the '+ @@ -4744,20 +4773,22 @@ def do_pythia8(self, line): ParallelPY8Card.subruns[0].systemSet('Beams:LHEF','events.lhe.gz') ParallelPY8Card.write(pjoin(parallelization_dir,'PY8Card.dat'), pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Write the wrapper wrapper_path = pjoin(parallelization_dir,'run_PY8.sh') wrapper = open(wrapper_path,'w') if self.options['cluster_temp_path'] is None: exe_cmd = \ -"""#!%s -./%s PY8Card.dat >& PY8_log.txt -""" +"""#!%%s +./%%s %s PY8Card.dat >& PY8_log.txt +""" % ('' if use_mg5amc_py8_interface else '-c') + else: exe_cmd = \ -"""#!%s +"""#!%%s ln -s ./events_$1.lhe.gz ./events.lhe.gz -./%s PY8Card_$1.dat >& PY8_log.txt +./%%s %s PY8Card_$1.dat >& PY8_log.txt mkdir split_$1 if [ -f ./events.hepmc ]; then @@ -4776,7 +4807,7 @@ def do_pythia8(self, line): mv ./PY8_log.txt ./split_$1/ fi tar -czf split_$1.tar.gz split_$1 -""" +""" % ('' if use_mg5amc_py8_interface else '-c') exe_cmd = exe_cmd%(shell_exe,os.path.basename(pythia_main)) wrapper.write(exe_cmd) wrapper.close() @@ -4812,19 +4843,27 @@ def do_pythia8(self, line): pjoin(parallelization_dir,split_files[-1])) logger.info('Submitting Pythia8 jobs...') + for i, split_file in enumerate(split_files): # We must write a PY8Card tailored for each split so as to correct the normalization # HEPMCoutput:scaling of each weight since the lhe showered will not longer contain the # same original number of events - split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat')) + split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat'), setter='user') + assert split_PY8_Card['JetMatching:nJetMax'] == PY8_Card['JetMatching:nJetMax'] + + + # Make sure to sure the number of split_events determined during the splitting. - split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i]) + split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i], force=True) + assert split_PY8_Card['Main:numberOfEvents'] == partition_for_PY8[i] split_PY8_Card.systemSet('HEPMCoutput:scaling',split_PY8_Card['HEPMCoutput:scaling']* - (float(partition_for_PY8[i]))) + (float(partition_for_PY8[i])), force=True) # Add_missing set to False so as to be sure not to add any additional parameter w.r.t # the ones in the original PY8 param_card copied. 
split_PY8_Card.write(pjoin(parallelization_dir,'PY8Card_%d.dat'%i), - pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False) + pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False, + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) in_files = [pjoin(parallelization_dir,os.path.basename(pythia_main)), pjoin(parallelization_dir,'PY8Card_%d.dat'%i), pjoin(parallelization_dir,split_file)] @@ -5073,7 +5112,7 @@ def wait_monitoring(Idle, Running, Done): # works both for fixed number of generated events and fixed accepted events self.results.add_detail('error_pythia', error_m) - if self.run_card['use_syst']: + if self.run_card['use_syst'] and use_mg5amc_py8_interface: self.results.add_detail('cross_pythia', -1) self.results.add_detail('error_pythia', 0) @@ -6132,7 +6171,102 @@ def get_Gdir(self, Pdir=None, symfact=None): mfactors[pjoin(P, "G%s" % tag)] = mfactor self.Gdirs = (Gdirs, mfactors) return self.get_Gdir(Pdir, symfact=symfact) + + ############################################################################ + def remove_empty_events(self, Gdir): + """return Gdir strip from the one providing empty events.lhe files.""" + + reasons = collections.defaultdict(list) + Gdirs = Gdir[:] + for G in Gdirs[:]: + try: + size = os.path.getsize(pjoin(G, 'events.lhe')) + except Exception as error: + size = 0 + if size <10: + Gdirs.remove(G) + try: + log = misc.BackRead(pjoin(G, 'log.txt')) + except Exception as error: + log = misc.BackRead(pjoin(G, 'run1_app.log')) + found = -1 + for line in log: + if 'Deleting file events.lhe' in line: + found = 0 + elif "Impossible BW configuration" in line: + reasons['bwconfig'].append(G) + break + elif found < -150: + reasons['not found'].append(G) + Gdirs.append(G) + break + elif found < 0: + found -= 1 + elif 'Loosen cuts or increase max_events' in line: + reasons['cuts'].append(G) + break + elif 'all returned zero' in line: + reasons['zero'].append(G) + break + elif found > 5: + reasons['unknown'].append(G) + break + else: + found += 1 + + if len(reasons): + logger.debug('Reasons for empty events.lhe:') + if len(reasons['unknown']): + logger.debug(' - unknown: %s' % len(reasons['unknown'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['unknown'][:10]])) + if len(reasons['not found']): + logger.debug(' - not found in log: %s' % len(reasons['not found'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['not found'][:10]])) + if len(reasons['zero']): + logger.debug(' - zero amplitudes: %s' % len(reasons['zero'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit( os.sep)[-2:]) for G in reasons['zero'][:10]])) + if len(reasons['bwconfig']): + critical_bwconfig = set() + for G in reasons['bwconfig']: + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_bwconfig.add(os.sep.join(base.rsplit(os.sep)[-2:])) + for G in critical_bwconfig: + logger.warning('Gdirectory %s has no events.lhe file. (no BW config found %s times)' % G) + + logger.debug(' - impossible BW configuration: %s' % len(reasons['bwconfig'])) + logger.debug(' - channel with no possible BW configuration: %s' % len(critical_bwconfig)) + + if len(reasons['cuts']): + critical_nb_cuts = collections.defaultdict(int) + for G in reasons['cuts']: + if '.' 
in os.path.basename(G): + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_nb_cuts[os.sep.join(base.rsplit(os.sep)[-2:])] += 1 + else: + critical_nb_cuts[''] += 1 + logger.warning('Gdirectory %s has no events.lhe file. (no points passed cuts found)' % G) + for G, nb in critical_nb_cuts.items(): + if not G: + continue + else: + logger.warning('%s channel %s.XXX has no events.lhe file. (no points passed cuts). No %s with events detected' % (nb, G, G)) + logger.debug(' - no points passed cuts: %s' % len(reasons['cuts'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['cuts'][:10]])) + logger.debug(' - without any BW handling (critical): %s' % critical_nb_cuts['']) + logger.debug(' - with BW but all zero (critical): %s' % sum([nb for v, nb in critical_nb_cuts.items() if v!=''], 0)) + #logger.debug(' - cuts (with BW conflict where other channel contributes): %s' % (len(reasons['cuts'])- critical_nb_cuts)) + + + return Gdirs + + ############################################################################ def set_run_name(self, name, tag=None, level='parton', reload_card=False, allow_new_tag=True): @@ -6749,7 +6883,7 @@ def get_subP_ids(path): class GridPackCmd(MadEventCmd): """The command for the gridpack --Those are not suppose to be use interactively--""" - def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **stdin): + def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, nprocs=1, maxevts=2500, *completekey, **stdin): """Initialize the command and directly run""" # Initialize properly @@ -6759,6 +6893,8 @@ def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **s self.random = seed self.random_orig = self.random self.granularity = gran + self.nprocs = nprocs + self.maxevts = maxevts self.options['automatic_html_opening'] = False #write the grid_card.dat on disk @@ -6874,7 +7010,7 @@ def launch(self, nb_event, seed): #misc.call([pjoin(self.me_dir,'bin','refine4grid'), # str(nb_event), '0', 'Madevent','1','GridRun_%s' % seed], # cwd=self.me_dir) - self.refine4grid(nb_event) + self.gridpack_cross = self.refine4grid(nb_event) # 3) Combine the events/pythia/... self.exec_cmd('combine_events') @@ -6902,6 +7038,8 @@ def refine4grid(self, nb_event): precision = nb_event + across= self.make_make_all_html_results(get_attr='axsec') + self.opts = dict([(key,value[1]) for (key,value) in \ self._survey_options.items()]) @@ -6915,8 +7053,9 @@ def refine4grid(self, nb_event): self.update_status('Refine results to %s' % precision, level=None) logger.info("Using random number seed offset = %s" % self.random) - refine_opt = {'err_goal': nb_event, 'split_channels': False, - 'ngran':self.granularity, 'readonly': self.readonly} + refine_opt = {'err_goal': nb_event, 'split_channels': True, + 'ngran':self.granularity, 'readonly': self.readonly, + 'nprocs': self.nprocs, 'maxevts': self.maxevts} x_improve = gen_ximprove.gen_ximprove_gridpack(self, refine_opt) x_improve.launch() # create the ajob for the refinment and run those! self.gscalefact = x_improve.gscalefact #store jacobian associate to the gridpack @@ -6926,7 +7065,7 @@ def refine4grid(self, nb_event): #print 'run combine!!!' 
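The remove_empty_events helper above scans each suspect channel log backwards (via the MG5aMC-internal misc.BackRead) and buckets the failure into reasons such as 'cuts', 'zero' and 'bwconfig'. A condensed, self-contained sketch of that classifier using a plain reversed() read; the markers are the ones matched in the diff, and the found-counter bookkeeping of the real code is simplified into a fixed search window:

    def classify_empty_channel(log_lines, window=150):
        markers = {
            'Impossible BW configuration': 'bwconfig',
            'Loosen cuts or increase max_events': 'cuts',
            'all returned zero': 'zero',
        }
        for i, line in enumerate(reversed(log_lines)):
            for marker, reason in markers.items():
                if marker in line:
                    return reason
            if i > window:   # deletion notice never found near the end of the log
                return 'not found'
        return 'unknown'

    assert classify_empty_channel(['...', 'Loosen cuts or increase max_events']) == 'cuts'
    assert classify_empty_channel(['nothing useful here']) == 'unknown'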
#combine_runs.CombineRuns(self.me_dir) - return + return across #update html output Presults = sum_html.collect_result(self) cross, error = Presults.xsec, Presults.xerru @@ -7051,10 +7190,13 @@ def do_combine_events(self, line): sum_axsec += result.get('axsec')*gscalefact[Gdir] if len(AllEvent) >= 80: #perform a partial unweighting - if self.results.current['cross'] == 0 and self.run_card['gridpack']: - nb_event= self.nb_event + if not self.results.current.get('axsec'): + if self.run_card['gridpack'] and self.gridpack_cross: + nb_event = min(abs(1.05*self.nb_event*sum_axsec/self.gridpack_cross),self.nb_event) + else: + nb_event= self.nb_event else: - nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current['cross']),self.run_card['nevents']) + nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current.get('axsec')),self.run_card['nevents'], self.nb_event, self.gridpack_cross, sum_axsec) AllEvent.unweight(pjoin(outdir, self.run_name, "partials%s.lhe.gz" % partials), get_wgt, log_level=5, trunc_error=1e-2, event_target=nb_event) AllEvent = lhe_parser.MultiEventFile() @@ -7068,6 +7210,7 @@ def do_combine_events(self, line): for data in partials_info: AllEvent.add(*data) + sum_xsec += data[1] if not hasattr(self,'proc_characteristic'): self.proc_characteristic = self.get_characteristics() diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/restore_data b/epochX/cudacpp/gg_ttgg.mad/bin/internal/restore_data index 6205bb9567..407ed7aa91 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/restore_data +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/restore_data @@ -48,8 +48,17 @@ for i in `cat subproc.mg` ; do cd ../ done +# check if we are on a Mac, otherwise assume Linux +if [[ "$OSTYPE" == "darwin"* ]]; then + # no nproc on Mac, so use sysctl instead + # use -S1024 because there is a limit on the length of the command + xargs_opts="-P $(sysctl -n hw.ncpu) -S1024" +else + xargs_opts="-P $(nproc --all)" +fi + find . -mindepth 2 -maxdepth 2 -type d -name 'G*' -print0 \ - | xargs --null -P "$(nproc --all)" -I{} bash -c " + | xargs --null ${xargs_opts} -I{} bash -c " cd {} for j in $1_results.dat ; do if [[ -e \$j ]] ; then diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/sum_html.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/sum_html.py index 9dd5826f71..fb8dd3a74a 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/sum_html.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/sum_html.py @@ -770,7 +770,7 @@ def collect_result(cmd, folder_names=[], jobs=None, main_dir=None): return all -def make_all_html_results(cmd, folder_names = [], jobs=[]): +def make_all_html_results(cmd, folder_names = [], jobs=[], get_attr=None): """ folder_names and jobs have been added for the amcatnlo runs """ run = cmd.results.current['run_name'] if not os.path.exists(pjoin(cmd.me_dir, 'HTML', run)): @@ -794,7 +794,12 @@ def make_all_html_results(cmd, folder_names = [], jobs=[]): fsock.write('%s
<br>' % Presults.get_html(run, unit, cmd.me_dir)) fsock.write('%s <br>
' % P_text) - return Presults.xsec, Presults.xerru + if not get_attr: + return Presults.xsec, Presults.xerru + else: + if isinstance(get_attr, tuple): + return [getattr(Presults, _) for _ in get_attr] + return getattr(Presults, get_attr) diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/ufomodel/write_param_card.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/ufomodel/write_param_card.py index 57a85b0614..33a89259f8 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/ufomodel/write_param_card.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/ufomodel/write_param_card.py @@ -116,9 +116,10 @@ def write_param(self, param, lhablock): def write_dep_param_block(self, lhablock): import cmath from parameters import all_parameters + param_values = {'cmath':cmath} for parameter in all_parameters: try: - exec("%s = %s" % (parameter.name, parameter.value)) + exec("%s = %s" % (parameter.name, parameter.value), globals(), param_values) except Exception: pass text = "## Not dependent paramater.\n" @@ -134,7 +135,7 @@ def write_dep_param_block(self, lhablock): prefix = "DECAY " for part, param in data: if isinstance(param.value, str): - value = complex(eval(param.value)).real + value = complex(eval(param.value, globals(), param_values)).real else: value = param.value diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/madevent b/epochX/cudacpp/gg_ttgg.mad/bin/madevent index dff9711b73..9c5363e682 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/madevent +++ b/epochX/cudacpp/gg_ttgg.mad/bin/madevent @@ -178,6 +178,17 @@ force_run = False if (args and args[0] == 'treatcards'): force_run=True + +# check that madgraph is not in PYTHONPATH +try: + import madgraph +except ImportError: + pass +else: + logger.getLogger('madgraph').error('Looks like you do have madgraph in your PYTHONPATH (or you run this executable from the main MG5aMC directory). This executable will likely not work in such case.') + + + # Call the cmd interface main loop try: if '-h' in args or '--help' in args: diff --git a/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h index 53dd560ed6..c30f753dcb 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h @@ -2,13 +2,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Sep 2010) for the MG5aMC backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -860,7 +860,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6] - template + template __device__ INLINE void VVV1_0( const fptype allV1[], const fptype allV2[], @@ -872,7 +872,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ INLINE void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -885,7 +885,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ INLINE void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -897,7 +897,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ INLINE void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -910,7 +910,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ INLINE void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -923,7 +923,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ INLINE void FFV1P0_3( const fptype allF1[], const fptype allF2[], @@ -936,7 +936,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV1_0( const fptype allV1[], const fptype allV2[], @@ -949,7 +949,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -963,7 +963,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV3_0( const fptype allV1[], const fptype allV2[], @@ -976,7 +976,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV3P0_1( const fptype allV2[], const fptype allV3[], @@ -990,7 +990,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV4_0( const fptype allV1[], const fptype 
allV2[], @@ -1003,7 +1003,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV4P0_1( const fptype allV2[], const fptype allV3[], @@ -1017,7 +1017,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6] - template + template __device__ void VVV1_0( const fptype allV1[], const fptype allV2[], @@ -1030,7 +1030,7 @@ namespace mg5amcCpu const cxtype_sv* V1 = W_ACCESS::kernelAccessConst( allV1 ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P1[4] = { +cxreal( V1[0] ), +cxreal( V1[1] ), +cximag( V1[1] ), +cximag( V1[0] ) }; @@ -1053,7 +1053,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -1066,7 +1066,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P2[4] = { +cxreal( V2[0] ), +cxreal( V2[1] ), +cximag( V2[1] ), +cximag( V2[0] ) }; @@ -1091,7 +1091,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -1104,7 +1104,7 @@ namespace mg5amcCpu const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. 
); const cxtype_sv TMP9 = ( F1[2] * ( F2[4] * ( V3[2] + V3[5] ) + F2[5] * ( V3[3] + cI * V3[4] ) ) + ( F1[3] * ( F2[4] * ( V3[3] - cI * V3[4] ) + F2[5] * ( V3[2] - V3[5] ) ) + ( F1[4] * ( F2[2] * ( V3[2] - V3[5] ) - F2[3] * ( V3[3] + cI * V3[4] ) ) + F1[5] * ( F2[2] * ( -V3[3] + cI * V3[4] ) + F2[3] * ( V3[2] + V3[5] ) ) ) ) ); @@ -1116,7 +1116,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -1129,7 +1129,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F1 = W_ACCESS::kernelAccess( allF1 ); const cxtype cI = cxmake( 0., 1. ); F1[0] = +F2[0] + V3[0]; @@ -1148,7 +1148,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -1161,7 +1161,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F2 = W_ACCESS::kernelAccess( allF2 ); const cxtype cI = cxmake( 0., 1. ); F2[0] = +F1[0] + V3[0]; @@ -1180,7 +1180,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ void FFV1P0_3( const fptype allF1[], const fptype allF2[], @@ -1193,7 +1193,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V3 = W_ACCESS::kernelAccess( allV3 ); const cxtype cI = cxmake( 0., 1. ); V3[0] = +F1[0] + F2[0]; @@ -1211,7 +1211,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ void VVVV1_0( const fptype allV1[], const fptype allV2[], @@ -1226,7 +1226,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. 
); const cxtype_sv TMP10 = ( V1[2] * V4[2] - V1[3] * V4[3] - V1[4] * V4[4] - V1[5] * V4[5] ); @@ -1241,7 +1241,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ void VVVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -1256,7 +1256,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); V1[0] = +V2[0] + V3[0] + V4[0]; @@ -1276,7 +1276,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ void VVVV3_0( const fptype allV1[], const fptype allV2[], @@ -1291,7 +1291,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const cxtype_sv TMP1 = ( V2[2] * V1[2] - V2[3] * V1[3] - V2[4] * V1[4] - V2[5] * V1[5] ); @@ -1306,7 +1306,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ void VVVV3P0_1( const fptype allV2[], const fptype allV3[], @@ -1321,7 +1321,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); V1[0] = +V2[0] + V3[0] + V4[0]; @@ -1341,7 +1341,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ void VVVV4_0( const fptype allV1[], const fptype allV2[], @@ -1356,7 +1356,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. 
); const cxtype_sv TMP1 = ( V2[2] * V1[2] - V2[3] * V1[3] - V2[4] * V1[4] - V2[5] * V1[5] ); @@ -1371,7 +1371,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ void VVVV4P0_1( const fptype allV2[], const fptype allV3[], @@ -1386,7 +1386,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); V1[0] = +V2[0] + V3[0] + V4[0]; diff --git a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc index 47a3a011b8..fd5642f3e3 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h index 76066c7bb1..f4b086fc96 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -310,7 +310,7 @@ namespace mg5amcCpu #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #endif // Compute the output couplings (e.g. gc10 and gc11) from the input gs - template + template __device__ inline void G2COUP( const fptype gs[], fptype couplings[], @@ -320,12 +320,12 @@ namespace mg5amcCpu using namespace Parameters_sm_dependentCouplings; const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr ); - fptype* GC_10s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); - fptype* GC_11s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); - fptype* GC_12s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_12 ); - cxtype_sv_ref GC_10s_sv = C_ACCESS::kernelAccess( GC_10s ); - cxtype_sv_ref GC_11s_sv = C_ACCESS::kernelAccess( GC_11s ); - cxtype_sv_ref GC_12s_sv = C_ACCESS::kernelAccess( GC_12s ); + fptype* GC_10s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); + fptype* GC_11s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); + fptype* GC_12s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_12 ); + cxtype_sv_ref GC_10s_sv = CD_ACCESS::kernelAccess( GC_10s ); + cxtype_sv_ref GC_11s_sv = CD_ACCESS::kernelAccess( GC_11s ); + cxtype_sv_ref GC_12s_sv = CD_ACCESS::kernelAccess( GC_12s ); GC_10s_sv = couplings_sv.GC_10; GC_11s_sv = couplings_sv.GC_11; GC_12s_sv = couplings_sv.GC_12; diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h index 7c6a082392..ca859a602e 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for generating random numbers +// For both CUDA and HIP, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) 
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h index 92d74fd6db..e98e925f2a 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -717,12 +717,24 @@ namespace mg5amcCpu : m_preal( &r ), m_pimag( &i ) {} // copy (create from) const refs cxtype_ref& operator=( const cxtype_ref& ) = delete; //__host__ __device__ cxtype_ref& operator=( cxtype_ref&& c ) {...} // REMOVED! Should copy refs or copy values? 
No longer needed in cxternary - __host__ __device__ cxtype_ref& operator=( const cxtype& c ) + __host__ __device__ cxtype_ref& operator=( const cxtype& c ) // copy (assign) const values { *m_preal = cxreal( c ); *m_pimag = cximag( c ); return *this; - } // copy (assign) non-const values + } + __host__ __device__ cxtype_ref& operator+=( const cxtype& c ) + { + *m_preal += cxreal( c ); + *m_pimag += cximag( c ); + return *this; + } + __host__ __device__ cxtype_ref& operator-=( const cxtype& c ) + { + *m_preal -= cxreal( c ); + *m_pimag -= cximag( c ); + return *this; + } __host__ __device__ operator cxtype() const { return cxmake( *m_preal, *m_pimag ); } private: fptype* const m_preal; // const pointer to non-const fptype R diff --git a/epochX/cudacpp/gg_ttgg.mad/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttgg.mad/test/cudacpp_test.mk index f703a1ae7c..1f9f8bbc46 100644 --- a/epochX/cudacpp/gg_ttgg.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_ttgg.mad/test/cudacpp_test.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) @@ -19,7 +19,7 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt index 7e5a3007eb..a55f92c773 100644 --- a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.3 2025-06-12 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -49,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -58,7 +58,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0059185028076171875  +DEBUG: model prefixing takes 0.005396842956542969  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -151,33 +151,33 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.156 s +1 processes with 123 diagrams generated in 0.158 s Total: 1 processes with 123 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttgg Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 218]  -DEBUG: type(subproc_group)= [output.py at line 219]  -DEBUG: type(fortran_model)= [output.py at line 220]  -DEBUG: type(me)= me=0 [output.py at line 221]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 222]  -INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. 
-Generated helas calls for 1 subprocesses (123 diagrams) in 0.422 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  +DEBUG: type(subproc_group)= [output.py at line 223]  +DEBUG: type(fortran_model)= [output.py at line 224]  +DEBUG: type(me)= me=0 [output.py at line 225]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'diagram_boilerplate.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  +INFO: Creating files in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. +Generated helas calls for 1 subprocesses (123 diagrams) in 0.423 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.316 s +ALOHA: aloha creates 5 routines in 0.315 s VVV1 VVV1 FFV1 @@ -190,17 +190,17 @@ ALOHA: aloha creates 5 routines in 0.316 s VVVV3 VVVV4 VVVV4 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. quit -real 0m1.455s -user 0m1.362s +real 0m1.450s +user 0m1.371s sys 0m0.060s -Code generation completed in 1 seconds +Code generation completed in 2 seconds diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h index 87aa648dd2..a45024704a 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -255,11 +255,15 @@ namespace mg5amcCpu throw std::logic_error( "Bridge constructor: FIXME! cannot choose gputhreads" ); // this should never happen! m_gpublocks = m_nevt / m_gputhreads; } +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters @@ -290,8 +294,10 @@ namespace mg5amcCpu throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! 
Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -347,7 +353,9 @@ namespace mg5amcCpu if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> ) @@ -400,7 +408,9 @@ namespace mg5amcCpu } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include <cassert> //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc index f463977c1a..a68ae314eb 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
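
The new checkGpuBlas/assertGpuBlas helpers mirror the existing checkGpu/assertGpu idiom; a minimal usage sketch (the handle and stream names are illustrative, not taken from the patch):

  // Minimal sketch: wrap each cuBLAS/hipBLAS call in checkGpuBlas, just as
  // CUDA/HIP runtime calls are wrapped in checkGpu (asserts on failure).
  gpuStream_t stream;
  gpuStreamCreate( &stream ); // already wrapped in checkGpu by the macro itself
  gpuBlasHandle_t handle;
  checkGpuBlas( gpuBlasCreate( &handle ) );
  checkGpuBlas( gpuBlasSetStream( handle, stream ) ); // attach the stream to the handle
  // ... gpuBlasTgemm( handle, ... ) color-sum calls would go here ...
  checkGpuBlas( gpuBlasDestroy( handle ) );
  gpuStreamDestroy( stream );
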
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,28 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() + , m_pHelWfs() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_helBlasHandles() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +353,82 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); + // Create the "one-helicity" wavefunction buffer that will be used for helicity filtering + m_pHelWfs.reset( new DeviceBufferSimple( CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? 
+ std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { +#ifndef MGONGPU_HAS_NO_BLAS + if( m_helBlasHandles[ihel] ) gpuBlasDestroy( m_helBlasHandles[ihel] ); +#endif + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +445,61 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. 
Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity + // Attach a different stream to each cuBLAS/hipBLAS handle + if( m_blasColorSum ) + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + checkGpuBlas( gpuBlasCreate( &m_helBlasHandles[ighel] ) ); + checkGpuBlas( gpuBlasSetStream( m_helBlasHandles[ighel], m_helStreams[ighel] ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_helBlasHandles[ighel], CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelWfs.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +507,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* ghelBlasHandles = ( m_blasColorSum ? m_helBlasHandles : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* ghelBlasHandles = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +527,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? 
m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h index 7acff4b308..c901874333 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] - static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,24 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + + // The super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelWfs; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +220,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary 
buffers + std::unique_ptr m_pHelBlasTmp; + + // The array of cuBLAS/hipBLAS handles (one for each good helicity) + gpuBlasHandle_t m_helBlasHandles[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessAmplitudes.h index 0d92f69c43..a49f041e05 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessAmplitudes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessAmplitudes_H #define MemoryAccessAmplitudes_H 1 @@ -10,10 +10,6 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_AMPLITUDES 1 - // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -23,120 +19,11 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // A class describing the internal layout of memory buffers for amplitudes - // This implementation uses an AOSOA[npagA][nx2][neppA] where nevt=npagA*neppA - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessAmplitudesBase //_AOSOAv1 - { - public: - - // Number of Events Per Page in the amplitude AOSOA memory buffer layout - static constexpr int neppA = 1; // AOS (just a test...) 
- - private: - - friend class MemoryAccessHelper; - friend class KernelAccessHelper; - friend class KernelAccessHelper; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) - { - const int ipagA = ievt / neppA; // #event "A-page" - const int ieppA = ievt % neppA; // #event in the current event A-page - constexpr int ix2 = 0; - return &( buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA] ); // AOSOA[ipagA][ix2][ieppA] - } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... args" to "const int ix2" and rename "Field" as "Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int ix2 ) - { - constexpr int ipagA = 0; - constexpr int ieppA = 0; - return buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA]; // AOSOA[ipagA][ix2][ieppA] - } - }; - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessAmplitudes : public MemoryAccessAmplitudesBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2 = MemoryAccessHelper::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2Const = - MemoryAccessHelper::template decodeRecordConst; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt, 
- const int ix2 ) <===] - static constexpr auto ieventAccessIx2 = - MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessField<int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===] - static constexpr auto ieventAccessIx2Const = - MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessFieldConst<int>; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations + // A class providing trivial access to amplitude memory buffers template<bool onDevice> class KernelAccessAmplitudes { public: - -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2 = - KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessField<int>; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2Const = - KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessFieldConst<int>; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { @@ -148,8 +35,6 @@ namespace mg5amcCpu { return reinterpret_cast<const cxtype_sv*>( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES }; //---------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessCouplings.h index 55504a2b90..caee99a7fd 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessCouplings.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. 
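
With the AOSOA machinery removed above, amplitude access collapses to a type pun on the underlying buffer; a minimal usage sketch (the <bool onDevice> template parameter is reconstructed from the surrounding code, and the buffer sizing assumes the scalar non-SIMD case):

  // Minimal sketch: one complex amplitude stored as two consecutive fptypes
  // (in vectorized C++ builds the record would hold 2*neppV fptypes instead).
  fptype buffer[2] = { 0., 0. }; // { re, im }
  cxtype_sv* amp = KernelAccessAmplitudes<false>::kernelAccess( buffer );
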
#ifndef MemoryAccessCouplings_H #define MemoryAccessCouplings_H 1 @@ -235,7 +235,7 @@ namespace mg5amcCpu /* fptype_sv& real = kernelAccessIx2( buffer, 0 ); fptype_sv& imag = kernelAccessIx2( buffer, 1 ); - printf( "C_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv_ref( real, imag ); */ return cxtype_sv_ref( kernelAccessIx2( buffer, 0 ), @@ -250,7 +250,7 @@ namespace mg5amcCpu /* const fptype_sv& real = kernelAccessIx2Const( buffer, 0 ); const fptype_sv& imag = kernelAccessIx2Const( buffer, 1 ); - printf( "C_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv( real, imag ); */ return cxtype_sv( kernelAccessIx2Const( buffer, 0 ), diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessWavefunctions.h index 9f4c620bc7..bbffc1fb36 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessWavefunctions.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessWavefunctions_H #define MemoryAccessWavefunctions_H 1 @@ -10,9 +10,7 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 +#include "CPPProcess.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL @@ -23,147 +21,44 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // A class describing the internal layout of memory buffers for wavefunctions - // This implementation uses an AOSOA[npagW][nw6][nx2][neppW] where nevt=npagW*neppW - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessWavefunctionsBase //_AOSOAv1 +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessWavefunctions { public: - - // Number of Events Per Page in the wavefunction AOSOA memory buffer layout - static constexpr int neppW = 1; // AOS (just a test...) 
- - private: - - friend class MemoryAccessHelper<MemoryAccessWavefunctionsBase>; - friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, false>; - friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, true>; - - // The number of components of a (fermion or vector) wavefunction - static constexpr int nw6 = mgOnGpu::nw6; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) + static __host__ __device__ inline cxtype_sv* + kernelAccess( fptype* buffer ) { - const int ipagW = ievt / neppW; // #event "W-page" - const int ieppW = ievt % neppW; // #event in the current event W-page - constexpr int iw6 = 0; - constexpr int ix2 = 0; - return &( buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW] ); // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast<cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts...
args" to "const int iw6, const int ix2" and rename "Field" as "Iw6Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int iw6, - const int ix2 ) + static __host__ __device__ inline const cxtype_sv* + kernelAccessConst( const fptype* buffer ) { - constexpr int ipagW = 0; - constexpr int ieppW = 0; - return buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW]; // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } }; +#endif //---------------------------------------------------------------------------- - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessWavefunctions : public MemoryAccessWavefunctionsBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2 = MemoryAccessHelper::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2Const = - MemoryAccessHelper::template decodeRecordConst; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIw6Ix2( fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2 = - MemoryAccessHelper::template ieventAccessField; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIw6Ix2Const( const fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2Const = - MemoryAccessHelper::template ieventAccessFieldConst; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations - template - class KernelAccessWavefunctions + class HostAccessWavefunctions { public: - -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes 
(input) - // [Signature (non-const) ===> fptype& kernelAccessIw6Ix2( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2 = - KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessField<int, int>; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIw6Ix2Const( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2Const = - KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessFieldConst<int, int>; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { return reinterpret_cast<cxtype_sv*>( buffer ); } - static __host__ __device__ inline const cxtype_sv* kernelAccessConst( const fptype* buffer ) { return reinterpret_cast<const cxtype_sv*>( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS }; //---------------------------------------------------------------------------- - typedef KernelAccessWavefunctions<false> HostAccessWavefunctions; - typedef KernelAccessWavefunctions<true> DeviceAccessWavefunctions; - - //---------------------------------------------------------------------------- - } // end namespace mg5amcGpu/mg5amcCpu #endif // MemoryAccessWavefunctions_H diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryBuffers.h index 65a101888d..c8db607db6 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
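For reference, the net effect of the MemoryAccessWavefunctions.h rewrite just above is a split into a device accessor (one GPU thread per event, offset ievt*nw6*nx2 into a global buffer) and a trivial host accessor. Below is a minimal CUDA sketch of the device-side pattern only; nw6=6, nx2=2 and the kernel name fillFirstComponent are illustrative assumptions, and error checking is omitted.

#include <cstdio>

using fptype = double;
constexpr int nw6 = 6, nx2 = 2;

__global__ void fillFirstComponent( fptype* wfs )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one thread per event
  fptype* myWf = wfs + ievt * nw6 * nx2; // this event's wavefunction record
  myWf[0] = ievt; // write the real part of the first component
}

int main()
{
  const int nevt = 8;
  fptype* d_wfs = nullptr;
  cudaMalloc( &d_wfs, nevt * nw6 * nx2 * sizeof( fptype ) );
  fillFirstComponent<<<1, nevt>>>( d_wfs );
  fptype h_wfs[nevt * nw6 * nx2];
  cudaMemcpy( h_wfs, d_wfs, sizeof( h_wfs ), cudaMemcpyDeviceToHost );
  for( int ievt = 0; ievt < nevt; ievt++ )
    printf( "event %d: wf[0]=%f\n", ievt, h_wfs[ievt * nw6 * nx2] );
  cudaFree( d_wfs );
  return 0;
}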
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer 
PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc index 5956559974..5e1fba0c34 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,20 +98,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 24; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,57 +166,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the 
current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - 
using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -226,2469 +279,379 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 26; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
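The wavefunction-buffer switch below hinges on the fact that a local cxtype_sv w_sv[nwf][nw6] array is bitwise a contiguous fptype array (each cxtype_sv packs nx2=2 fptype values), so the C++ branch can hand it to the diagram routines as a flat fptype*, while the CUDA branch points into the global allWfs buffer instead. A standalone sketch of the cast (simplified scalar types assumed; not the generated code):

#include <cstdio>

using fptype = double;
struct cxtype_sv { fptype r, i; }; // stand-in for the scalar/SIMD complex type

int main()
{
  constexpr int nwf = 3, nw6 = 6, nx2 = 2;
  cxtype_sv w_sv[nwf][nw6] = {};
  w_sv[1][0] = { 7.0, -7.0 };
  fptype* wfs = reinterpret_cast<fptype*>( w_sv ); // as in the C++ branch below
  // component (iwf=1, iw6=0) starts at flat offset iwf*nw6*nx2
  printf( "re=%f im=%f\n", wfs[1 * nw6 * nx2 + 0], wfs[1 * nw6 * nx2 + 1] );
  return 0;
}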
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g.
<<warning #186-D: pointless comparison of unsigned integer with zero>> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 123 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - vxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); - - vxxxxx( momenta, 0., cHel[ihel][5], +1, w_fp[5], 5 ); - - VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[6] ); - FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[7] ); - - // Amplitude(s) for diagram number 1 - VVVV1_0( w_fp[6], w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, &amp_fp[0] ); -#ifdef
MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - VVVV3_0( w_fp[6], w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - VVVV4_0( w_fp[6], w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 2 OF 123 *** - - // Wavefunction(s) for diagram number 2 - VVV1P0_1( w_fp[6], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 2 - VVV1_0( w_fp[7], w_fp[5], w_fp[8], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 3 OF 123 *** - - // Wavefunction(s) for diagram number 3 - VVV1P0_1( w_fp[6], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[9] ); - - // Amplitude(s) for diagram number 3 - VVV1_0( w_fp[7], w_fp[4], w_fp[9], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 4 OF 123 *** - - // Wavefunction(s) for diagram number 4 - VVV1P0_1( w_fp[4], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 4 - VVV1_0( w_fp[6], w_fp[7], w_fp[10], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv 
and denominators_sv (#473) -#endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 5 OF 123 *** - - // Wavefunction(s) for diagram number 5 - FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); - FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); - - // Amplitude(s) for diagram number 5 - FFV1_0( w_fp[12], w_fp[11], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 6 OF 123 *** - - // Wavefunction(s) for diagram number 6 - // (none) - - // Amplitude(s) for diagram number 6 - FFV1_0( w_fp[3], w_fp[11], w_fp[9], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[12] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - - // *** DIAGRAM 7 OF 123 *** - - // Wavefunction(s) for diagram number 7 - FFV1_2( w_fp[3], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); - - // Amplitude(s) for diagram number 7 - FFV1_0( w_fp[13], w_fp[11], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 8 OF 123 *** - - // Wavefunction(s) for diagram number 8 - FFV1_1( w_fp[2], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] ); - - // Amplitude(s) for diagram number 8 - FFV1_0( w_fp[12], w_fp[14], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 9 OF 123 *** - - // Wavefunction(s) for diagram number 9 - // (none) - - // Amplitude(s) for diagram number 9 - FFV1_0( w_fp[3], w_fp[14], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[18] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - - // *** DIAGRAM 10 OF 123 *** - - // Wavefunction(s) for diagram number 10 - FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[15] ); - - // Amplitude(s) for diagram number 10 - FFV1_0( w_fp[15], w_fp[14], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 11 OF 123 *** - - // Wavefunction(s) for diagram number 11 - FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); - - // 
Amplitude(s) for diagram number 11 - FFV1_0( w_fp[15], w_fp[16], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 12 OF 123 *** - - // Wavefunction(s) for diagram number 12 - // (none) - - // Amplitude(s) for diagram number 12 - FFV1_0( w_fp[15], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[7] -= amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - - // *** DIAGRAM 13 OF 123 *** - - // Wavefunction(s) for diagram number 13 - // (none) - - // Amplitude(s) for diagram number 13 - FFV1_0( w_fp[13], w_fp[16], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 14 OF 123 *** - - // Wavefunction(s) for diagram number 14 - // (none) - - // Amplitude(s) for diagram number 14 - FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - - // *** DIAGRAM 15 OF 123 *** - - // Wavefunction(s) for diagram number 15 - // (none) - - // Amplitude(s) for diagram number 15 - FFV1_0( w_fp[3], w_fp[16], w_fp[10], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - - // *** DIAGRAM 16 OF 123 *** - - // Wavefunction(s) for diagram number 16 - // (none) - - // Amplitude(s) for diagram number 16 - FFV1_0( w_fp[12], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - - // *** DIAGRAM 17 OF 123 *** - - // Wavefunction(s) for diagram number 17 - FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); - FFV1_2( w_fp[3], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); - FFV1_1( w_fp[12], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); - - // Amplitude(s) for diagram number 17 - FFV1_0( w_fp[16], w_fp[8], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] -= amp_sv[0]; - - // *** DIAGRAM 18 OF 123 *** - - // Wavefunction(s) for diagram number 18 - FFV1_1( w_fp[12], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); - - // Amplitude(s) for diagram number 18 - FFV1_0( w_fp[16], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with 
multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[5] -= amp_sv[0]; - - // *** DIAGRAM 19 OF 123 *** - - // Wavefunction(s) for diagram number 19 - // (none) - - // Amplitude(s) for diagram number 19 - FFV1_0( w_fp[16], w_fp[12], w_fp[10], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 20 OF 123 *** - - // Wavefunction(s) for diagram number 20 - VVV1P0_1( w_fp[1], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[6] ); - FFV1P0_3( w_fp[3], w_fp[12], COUPs[1], 1.0, 0., 0., w_fp[17] ); - - // Amplitude(s) for diagram number 20 - VVV1_0( w_fp[6], w_fp[5], w_fp[17], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[2] -= amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - - // *** DIAGRAM 21 OF 123 *** - - // Wavefunction(s) for diagram number 21 - // (none) - - // Amplitude(s) for diagram number 21 - FFV1_0( w_fp[3], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 22 OF 123 *** - - // Wavefunction(s) for diagram number 22 - // (none) - - // Amplitude(s) for diagram number 22 - FFV1_0( w_fp[13], w_fp[12], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 23 OF 123 *** - - // Wavefunction(s) for diagram number 23 - VVV1P0_1( w_fp[1], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[18] ); - - // Amplitude(s) for diagram number 23 - VVV1_0( w_fp[18], w_fp[4], w_fp[17], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[2] -= amp_sv[0]; - jamp_sv[3] += amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - - // *** DIAGRAM 24 OF 123 *** - - // Wavefunction(s) for diagram number 24 - // (none) - - // Amplitude(s) for diagram number 24 - FFV1_0( w_fp[3], w_fp[8], w_fp[18], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 25 OF 123 *** - - // Wavefunction(s) for diagram number 25 - // (none) - - // Amplitude(s) for diagram number 25 - FFV1_0( w_fp[15], w_fp[12], w_fp[18], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 26 OF 123 *** - - // Wavefunction(s) for diagram number 26 - FFV1_1( w_fp[12], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[19] ); - - // 
Amplitude(s) for diagram number 26 - FFV1_0( w_fp[15], w_fp[19], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= amp_sv[0]; - - // *** DIAGRAM 27 OF 123 *** - - // Wavefunction(s) for diagram number 27 - // (none) - - // Amplitude(s) for diagram number 27 - FFV1_0( w_fp[15], w_fp[9], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] -= amp_sv[0]; - - // *** DIAGRAM 28 OF 123 *** - - // Wavefunction(s) for diagram number 28 - // (none) - - // Amplitude(s) for diagram number 28 - FFV1_0( w_fp[13], w_fp[19], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= amp_sv[0]; - - // *** DIAGRAM 29 OF 123 *** - - // Wavefunction(s) for diagram number 29 - // (none) - - // Amplitude(s) for diagram number 29 - FFV1_0( w_fp[13], w_fp[8], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] -= amp_sv[0]; - - // *** DIAGRAM 30 OF 123 *** - - // Wavefunction(s) for diagram number 30 - // (none) - - // Amplitude(s) for diagram number 30 - FFV1_0( w_fp[3], w_fp[19], w_fp[10], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 31 OF 123 *** - - // Wavefunction(s) for diagram number 31 - // (none) - - // Amplitude(s) for diagram number 31 - VVV1_0( w_fp[1], w_fp[10], w_fp[17], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - - // *** DIAGRAM 32 OF 123 *** - - // Wavefunction(s) for diagram number 32 - VVVV1P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[17] ); - VVVV3P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[19] ); - VVVV4P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 32 - FFV1_0( w_fp[3], w_fp[12], w_fp[17], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[12], w_fp[19], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[12], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= 
amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - - // *** DIAGRAM 33 OF 123 *** - - // Wavefunction(s) for diagram number 33 - FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); - FFV1_1( w_fp[2], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); - FFV1_2( w_fp[12], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[20] ); - - // Amplitude(s) for diagram number 33 - FFV1_0( w_fp[20], w_fp[9], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - jamp_sv[11] -= amp_sv[0]; - - // *** DIAGRAM 34 OF 123 *** - // Wavefunction(s) for diagram number 34 - FFV1_2( w_fp[12], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] ); - - // Amplitude(s) for diagram number 34 - FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] -= amp_sv[0]; - - // *** DIAGRAM 35 OF 123 *** - - // Wavefunction(s) for diagram number 35 - // (none) - - // Amplitude(s) for diagram number 35 - FFV1_0( w_fp[12], w_fp[9], w_fp[10], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 36 OF 123 *** - - // Wavefunction(s) for diagram number 36 - FFV1P0_3( w_fp[12], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[22] ); - - // Amplitude(s) for diagram number 36 - VVV1_0( w_fp[6], w_fp[5], w_fp[22], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - - // *** DIAGRAM 37 OF 123 *** - - // Wavefunction(s) for diagram number 37 - // (none) - - // Amplitude(s) for diagram number 37 - FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 38 OF 123 *** - - // Wavefunction(s) for diagram number 38 - // (none) - - // Amplitude(s) for diagram number 38 - FFV1_0( w_fp[12], w_fp[14], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[21] 
+= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 39 OF 123 *** - - // Wavefunction(s) for diagram number 39 - // (none) - - // Amplitude(s) for diagram number 39 - VVV1_0( w_fp[18], w_fp[4], w_fp[22], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[11] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - - // *** DIAGRAM 40 OF 123 *** - - // Wavefunction(s) for diagram number 40 - // (none) - - // Amplitude(s) for diagram number 40 - FFV1_0( w_fp[20], w_fp[2], w_fp[18], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 41 OF 123 *** - - // Wavefunction(s) for diagram number 41 - // (none) - - // Amplitude(s) for diagram number 41 - FFV1_0( w_fp[12], w_fp[11], w_fp[18], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 42 OF 123 *** - - // Wavefunction(s) for diagram number 42 - FFV1_2( w_fp[12], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] ); - - // Amplitude(s) for diagram number 42 - FFV1_0( w_fp[23], w_fp[11], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[17] -= amp_sv[0]; - - // *** DIAGRAM 43 OF 123 *** - - // Wavefunction(s) for diagram number 43 - // (none) - - // Amplitude(s) for diagram number 43 - FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[15] -= amp_sv[0]; - - // *** DIAGRAM 44 OF 123 *** - - // Wavefunction(s) for diagram number 44 - // (none) - - // Amplitude(s) for diagram number 44 - FFV1_0( w_fp[23], w_fp[14], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[23] -= amp_sv[0]; - - // *** DIAGRAM 45 OF 123 *** - - // Wavefunction(s) for diagram number 45 - // (none) - - // Amplitude(s) for diagram number 45 - FFV1_0( w_fp[20], w_fp[14], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[21] -= amp_sv[0]; - - // *** DIAGRAM 46 OF 123 *** - - // Wavefunction(s) for diagram number 46 - // (none) - - // Amplitude(s) for diagram number 46 - FFV1_0( w_fp[23], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 47 OF 123 *** - - // Wavefunction(s) for diagram number 47 - // (none) - - // 
Amplitude(s) for diagram number 47 - VVV1_0( w_fp[1], w_fp[10], w_fp[22], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - - // *** DIAGRAM 48 OF 123 *** - - // Wavefunction(s) for diagram number 48 - // (none) - - // Amplitude(s) for diagram number 48 - FFV1_0( w_fp[12], w_fp[2], w_fp[17], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - FFV1_0( w_fp[12], w_fp[2], w_fp[19], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[11] -= amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - FFV1_0( w_fp[12], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] -= amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - - // *** DIAGRAM 49 OF 123 *** - - // Wavefunction(s) for diagram number 49 - VVV1P0_1( w_fp[0], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[12] ); - FFV1_2( w_fp[3], w_fp[12], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[22] ); - - // Amplitude(s) for diagram number 49 - FFV1_0( w_fp[22], w_fp[9], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 50 OF 123 *** - - // Wavefunction(s) for diagram number 50 - VVV1P0_1( w_fp[12], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[23] ); - - // Amplitude(s) for diagram number 50 - FFV1_0( w_fp[3], w_fp[9], w_fp[23], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[6] += amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - - // *** DIAGRAM 51 OF 123 *** - - // Wavefunction(s) for diagram number 51 - // (none) - - // Amplitude(s) for diagram number 51 - FFV1_0( w_fp[13], w_fp[9], w_fp[12], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 52 OF 123 *** - - // Wavefunction(s) for diagram number 52 - FFV1_1( w_fp[2], w_fp[12], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[20] ); - - // Amplitude(s) for diagram number 52 - FFV1_0( w_fp[16], w_fp[20], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 53 OF 123 *** - - 
-    // *** DIAGRAM 53 OF 123 ***
-
-    // Wavefunction(s) for diagram number 53
-    // (none)
-
-    // Amplitude(s) for diagram number 53
-    FFV1_0( w_fp[16], w_fp[2], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] += amp_sv[0];
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[19] -= amp_sv[0];
-    jamp_sv[22] += amp_sv[0];
-
-    // *** DIAGRAM 54 OF 123 ***
-
-    // Wavefunction(s) for diagram number 54
-    // (none)
-
-    // Amplitude(s) for diagram number 54
-    FFV1_0( w_fp[16], w_fp[14], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 55 OF 123 ***
-
-    // Wavefunction(s) for diagram number 55
-    // (none)
-
-    // Amplitude(s) for diagram number 55
-    FFV1_0( w_fp[3], w_fp[20], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[2] += amp_sv[0];
-    jamp_sv[3] -= amp_sv[0];
-    jamp_sv[12] -= amp_sv[0];
-    jamp_sv[13] += amp_sv[0];
-
-    // *** DIAGRAM 56 OF 123 ***
-
-    // Wavefunction(s) for diagram number 56
-    // (none)
-
-    // Amplitude(s) for diagram number 56
-    FFV1_0( w_fp[22], w_fp[2], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[10] += amp_sv[0];
-    jamp_sv[11] -= amp_sv[0];
-    jamp_sv[20] -= amp_sv[0];
-    jamp_sv[21] += amp_sv[0];
-
-    // *** DIAGRAM 57 OF 123 ***
-
-    // Wavefunction(s) for diagram number 57
-    // (none)
-
-    // Amplitude(s) for diagram number 57
-    VVV1_0( w_fp[12], w_fp[18], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 58 OF 123 ***
-
-    // Wavefunction(s) for diagram number 58
-    // (none)
-
-    // Amplitude(s) for diagram number 58
-    VVVV1_0( w_fp[12], w_fp[1], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
-    VVVV3_0( w_fp[12], w_fp[1], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-    VVVV4_0( w_fp[12], w_fp[1], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 59 OF 123 ***
-
-    // Wavefunction(s) for diagram number 59
-    VVV1P0_1( w_fp[12], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[21] );
-
-    // Amplitude(s) for diagram number 59
-    VVV1_0( w_fp[7], w_fp[5], w_fp[21], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 60 OF 123 ***
-
-    // Wavefunction(s) for diagram number 60
-    // (none)
-
-    // Amplitude(s) for diagram number 60
-    VVV1_0( w_fp[1], w_fp[7], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 61 OF 123 ***
-
-    // Wavefunction(s) for diagram number 61
-    // (none)
-
-    // Amplitude(s) for diagram number 61
-    FFV1_0( w_fp[3], w_fp[14], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[19] += amp_sv[0];
-    jamp_sv[20] -= amp_sv[0];
-    jamp_sv[21] += amp_sv[0];
-    jamp_sv[22] -= amp_sv[0];
-
-    // *** DIAGRAM 62 OF 123 ***
-
-    // Wavefunction(s) for diagram number 62
-    // (none)
-
-    // Amplitude(s) for diagram number 62
-    FFV1_0( w_fp[22], w_fp[14], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
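The VVVV1_0 / VVVV3_0 / VVVV4_0 triples (as in diagram 58 above) evaluate the three color-ordered pieces of the four-gluon vertex separately, which is why each call feeds a different pattern of jamp_sv entries. In standard QCD conventions the vertex splits as

\begin{aligned}
\Gamma^{\mu_1\mu_2\mu_3\mu_4}_{a_1a_2a_3a_4} = -ig_s^2\big[\,
& f^{a_1a_2b}f^{ba_3a_4}\,(g^{\mu_1\mu_3}g^{\mu_2\mu_4}-g^{\mu_1\mu_4}g^{\mu_2\mu_3}) \\
+\, & f^{a_1a_3b}f^{ba_2a_4}\,(g^{\mu_1\mu_2}g^{\mu_3\mu_4}-g^{\mu_1\mu_4}g^{\mu_2\mu_3}) \\
+\, & f^{a_1a_4b}f^{ba_2a_3}\,(g^{\mu_1\mu_2}g^{\mu_3\mu_4}-g^{\mu_1\mu_3}g^{\mu_2\mu_4})\,\big],
\end{aligned}

with one color structure per term; the exact mapping of the three terms onto the ALOHA names VVVV1/VVVV3/VVVV4 is an assumption here, not something fixed by this diff. The VVVV1P0_1/VVVV3P0_1/VVVV4P0_1 calls further below are the corresponding internal-wavefunction variants.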
-
-    // *** DIAGRAM 63 OF 123 ***
-
-    // Wavefunction(s) for diagram number 63
-    // (none)
-
-    // Amplitude(s) for diagram number 63
-    FFV1_0( w_fp[13], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[2] += amp_sv[0];
-    jamp_sv[6] -= amp_sv[0];
-    jamp_sv[8] += amp_sv[0];
-    jamp_sv[12] -= amp_sv[0];
-
-    // *** DIAGRAM 64 OF 123 ***
-
-    // Wavefunction(s) for diagram number 64
-    // (none)
-
-    // Amplitude(s) for diagram number 64
-    FFV1_0( w_fp[13], w_fp[20], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 65 OF 123 ***
-
-    // Wavefunction(s) for diagram number 65
-    VVV1P0_1( w_fp[0], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[20] );
-    FFV1_2( w_fp[3], w_fp[20], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] );
-
-    // Amplitude(s) for diagram number 65
-    FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 66 OF 123 ***
-
-    // Wavefunction(s) for diagram number 66
-    VVV1P0_1( w_fp[20], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[22] );
-
-    // Amplitude(s) for diagram number 66
-    FFV1_0( w_fp[3], w_fp[9], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[7] += amp_sv[0];
-    jamp_sv[8] -= amp_sv[0];
-    jamp_sv[9] += amp_sv[0];
-    jamp_sv[10] -= amp_sv[0];
-
-    // *** DIAGRAM 67 OF 123 ***
-
-    // Wavefunction(s) for diagram number 67
-    // (none)
-
-    // Amplitude(s) for diagram number 67
-    FFV1_0( w_fp[15], w_fp[9], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 68 OF 123 ***
-
-    // Wavefunction(s) for diagram number 68
-    FFV1_1( w_fp[2], w_fp[20], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] );
-
-    // Amplitude(s) for diagram number 68
-    FFV1_0( w_fp[16], w_fp[23], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 69 OF 123 ***
-
-    // Wavefunction(s) for diagram number 69
-    // (none)
-
-    // Amplitude(s) for diagram number 69
-    FFV1_0( w_fp[16], w_fp[2], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[5] += amp_sv[0];
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[16] += amp_sv[0];
-    jamp_sv[19] -= amp_sv[0];
-
-    // *** DIAGRAM 70 OF 123 ***
-
-    // Wavefunction(s) for diagram number 70
-    // (none)
-
-    // Amplitude(s) for diagram number 70
-    FFV1_0( w_fp[16], w_fp[11], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 71 OF 123 ***
-
-    // Wavefunction(s) for diagram number 71
-    // (none)
-
-    // Amplitude(s) for diagram number 71
-    FFV1_0( w_fp[3], w_fp[23], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] += amp_sv[0];
-    jamp_sv[5] -= amp_sv[0];
-    jamp_sv[18] -= amp_sv[0];
-    jamp_sv[19] += amp_sv[0];
-
-    // *** DIAGRAM 72 OF 123 ***
-
-    // Wavefunction(s) for diagram number 72
-    // (none)
-
-    // Amplitude(s) for diagram number 72
-    FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[8] += amp_sv[0];
-    jamp_sv[9] -= amp_sv[0];
-    jamp_sv[14] -= amp_sv[0];
-    jamp_sv[15] += amp_sv[0];
-
-    // *** DIAGRAM 73 OF 123 ***
-
-    // Wavefunction(s) for diagram number 73
-    // (none)
-
-    // Amplitude(s) for diagram number 73
-    VVV1_0( w_fp[20], w_fp[6], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 74 OF 123 ***
-
-    // Wavefunction(s) for diagram number 74
-    // (none)
-
-    // Amplitude(s) for diagram number 74
-    VVVV1_0( w_fp[20], w_fp[1], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
-    VVVV3_0( w_fp[20], w_fp[1], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-    VVVV4_0( w_fp[20], w_fp[1], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 75 OF 123 ***
-
-    // Wavefunction(s) for diagram number 75
-    VVV1P0_1( w_fp[20], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[12] );
-
-    // Amplitude(s) for diagram number 75
-    VVV1_0( w_fp[7], w_fp[4], w_fp[12], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 76 OF 123 ***
-
-    // Wavefunction(s) for diagram number 76
-    // (none)
-
-    // Amplitude(s) for diagram number 76
-    VVV1_0( w_fp[1], w_fp[7], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 77 OF 123 ***
-
-    // Wavefunction(s) for diagram number 77
-    // (none)
-
-    // Amplitude(s) for diagram number 77
-    FFV1_0( w_fp[3], w_fp[11], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[13] += amp_sv[0];
-    jamp_sv[14] -= amp_sv[0];
-    jamp_sv[15] += amp_sv[0];
-    jamp_sv[16] -= amp_sv[0];
-
-    // *** DIAGRAM 78 OF 123 ***
-
-    // Wavefunction(s) for diagram number 78
-    // (none)
-
-    // Amplitude(s) for diagram number 78
-    FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 79 OF 123 ***
-
-    // Wavefunction(s) for diagram number 79
-    // (none)
-
-    // Amplitude(s) for diagram number 79
-    FFV1_0( w_fp[15], w_fp[2], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] += amp_sv[0];
-    jamp_sv[7] -= amp_sv[0];
-    jamp_sv[10] += amp_sv[0];
-    jamp_sv[18] -= amp_sv[0];
-
-    // *** DIAGRAM 80 OF 123 ***
-
-    // Wavefunction(s) for diagram number 80
-    // (none)
-
-    // Amplitude(s) for diagram number 80
-    FFV1_0( w_fp[15], w_fp[23], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 81 OF 123 ***
-
-    // Wavefunction(s) for diagram number 81
-    FFV1_1( w_fp[9], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] );
-
-    // Amplitude(s) for diagram number 81
-    FFV1_0( w_fp[15], w_fp[23], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[7] -= amp_sv[0];
-
-    // *** DIAGRAM 82 OF 123 ***
-
-    // Wavefunction(s) for diagram number 82
-    FFV1_2( w_fp[15], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] );
-
-    // Amplitude(s) for diagram number 82
-    FFV1_0( w_fp[12], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[10] -= amp_sv[0];
-
-    // *** DIAGRAM 83 OF 123 ***
-
-    // Wavefunction(s) for diagram number 83
-    // (none)
-
-    // Amplitude(s) for diagram number 83
-    FFV1_0( w_fp[13], w_fp[23], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[6] -= amp_sv[0];
-
-    // *** DIAGRAM 84 OF 123 ***
-
-    // Wavefunction(s) for diagram number 84
-    FFV1_2( w_fp[13], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] );
-
-    // Amplitude(s) for diagram number 84
-    FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[8] -= amp_sv[0];
-
-    // *** DIAGRAM 85 OF 123 ***
-
-    // Wavefunction(s) for diagram number 85
-    // (none)
-
-    // Amplitude(s) for diagram number 85
-    FFV1_0( w_fp[3], w_fp[23], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 86 OF 123 ***
-
-    // Wavefunction(s) for diagram number 86
-    VVV1P0_1( w_fp[0], w_fp[10], COUPs[0], 1.0, 0., 0., w_fp[23] );
-
-    // Amplitude(s) for diagram number 86
-    FFV1_0( w_fp[3], w_fp[9], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[6] += amp_sv[0];
-    jamp_sv[7] -= amp_sv[0];
-    jamp_sv[9] -= amp_sv[0];
-    jamp_sv[11] += amp_sv[0];
-
-    // *** DIAGRAM 87 OF 123 ***
-
-    // Wavefunction(s) for diagram number 87
-    FFV1_2( w_fp[16], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[22] );
-
-    // Amplitude(s) for diagram number 87
-    FFV1_0( w_fp[22], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[16] -= amp_sv[0];
-
-    // *** DIAGRAM 88 OF 123 ***
-
-    // Wavefunction(s) for diagram number 88
-    FFV1_1( w_fp[11], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[20] );
-
-    // Amplitude(s) for diagram number 88
-    FFV1_0( w_fp[16], w_fp[20], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[13] -= amp_sv[0];
-
-    // *** DIAGRAM 89 OF 123 ***
-
-    // Wavefunction(s) for diagram number 89
-    // (none)
-
-    // Amplitude(s) for diagram number 89
-    FFV1_0( w_fp[22], w_fp[14], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[22] -= amp_sv[0];
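The recurring "#473" placeholder stanza marks where code generated with multichannel support updates the single-diagram weight. An editorial sketch of what that update roughly looks like (standard C++ stand-in for the SIMD types; the function name and the scalar signature are hypothetical, only the channelId logic is taken from the comment):

#include <complex>
// Sketch: after each diagram amplitude, accumulate the multichannel weight.
void updateChannelWeights( double& numerator_sv, double& denominator_sv,
                           const std::complex<double>& amp,
                           unsigned int channelId, unsigned int thisDiagram )
{
  const double amp2 = std::norm( amp );                 // |amp|^2 (cf. cxabs2 below)
  if( channelId == thisDiagram ) numerator_sv += amp2;  // only the sampled channel
  if( channelId != 0 ) denominator_sv += amp2;          // every diagram contributes
}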
-
-    // *** DIAGRAM 90 OF 123 ***
-
-    // Wavefunction(s) for diagram number 90
-    FFV1_1( w_fp[14], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[24] );
-
-    // Amplitude(s) for diagram number 90
-    FFV1_0( w_fp[16], w_fp[24], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[19] -= amp_sv[0];
-
-    // *** DIAGRAM 91 OF 123 ***
-
-    // Wavefunction(s) for diagram number 91
-    // (none)
-
-    // Amplitude(s) for diagram number 91
-    FFV1_0( w_fp[22], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 92 OF 123 ***
-
-    // Wavefunction(s) for diagram number 92
-    // (none)
-
-    // Amplitude(s) for diagram number 92
-    FFV1_0( w_fp[16], w_fp[2], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] += amp_sv[0];
-    jamp_sv[5] -= amp_sv[0];
-    jamp_sv[16] -= amp_sv[0];
-    jamp_sv[22] += amp_sv[0];
-
-    // *** DIAGRAM 93 OF 123 ***
-
-    // Wavefunction(s) for diagram number 93
-    // (none)
-
-    // Amplitude(s) for diagram number 93
-    VVVV1_0( w_fp[0], w_fp[6], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-    VVVV3_0( w_fp[0], w_fp[6], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-    VVVV4_0( w_fp[0], w_fp[6], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 94 OF 123 ***
-
-    // Wavefunction(s) for diagram number 94
-    VVV1P0_1( w_fp[0], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[22] );
-
-    // Amplitude(s) for diagram number 94
-    VVV1_0( w_fp[7], w_fp[5], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 95 OF 123 ***
-
-    // Wavefunction(s) for diagram number 95
-    VVV1P0_1( w_fp[0], w_fp[7], COUPs[0], 1.0, 0., 0., w_fp[25] );
-
-    // Amplitude(s) for diagram number 95
-    VVV1_0( w_fp[6], w_fp[5], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 96 OF 123 ***
-
-    // Wavefunction(s) for diagram number 96
-    // (none)
-
-    // Amplitude(s) for diagram number 96
-    FFV1_0( w_fp[3], w_fp[14], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[18] += amp_sv[0];
-    jamp_sv[19] -= amp_sv[0];
-    jamp_sv[21] -= amp_sv[0];
-    jamp_sv[23] += amp_sv[0];
-
-    // *** DIAGRAM 97 OF 123 ***
-
-    // Wavefunction(s) for diagram number 97
-    // (none)
-
-    // Amplitude(s) for diagram number 97
-    FFV1_0( w_fp[3], w_fp[24], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 98 OF 123 ***
-
-    // Wavefunction(s) for diagram number 98
-    // (none)
-
-    // Amplitude(s) for diagram number 98
-    FFV1_0( w_fp[13], w_fp[2], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += amp_sv[0];
-    jamp_sv[2] -= amp_sv[0];
-    jamp_sv[8] -= amp_sv[0];
-    jamp_sv[14] += amp_sv[0];
-
-    // *** DIAGRAM 99 OF 123 ***
-
-    // Wavefunction(s) for diagram number 99
-    // (none)
-
-    // Amplitude(s) for diagram number 99
-    FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 100 OF 123 ***
-
-    // Wavefunction(s) for diagram number 100
-    // (none)
-
-    // Amplitude(s) for diagram number 100
-    VVVV1_0( w_fp[0], w_fp[18], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
-    VVVV3_0( w_fp[0], w_fp[18], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-    VVVV4_0( w_fp[0], w_fp[18], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 101 OF 123 ***
-
-    // Wavefunction(s) for diagram number 101
-    VVV1P0_1( w_fp[0], w_fp[18], COUPs[0], 1.0, 0., 0., w_fp[6] );
-
-    // Amplitude(s) for diagram number 101
-    VVV1_0( w_fp[7], w_fp[4], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 102 OF 123 ***
-
-    // Wavefunction(s) for diagram number 102
-    // (none)
-
-    // Amplitude(s) for diagram number 102
-    VVV1_0( w_fp[18], w_fp[4], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 103 OF 123 ***
-
-    // Wavefunction(s) for diagram number 103
-    // (none)
-
-    // Amplitude(s) for diagram number 103
-    FFV1_0( w_fp[3], w_fp[11], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[12] += amp_sv[0];
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[15] -= amp_sv[0];
-    jamp_sv[17] += amp_sv[0];
-
-    // *** DIAGRAM 104 OF 123 ***
-
-    // Wavefunction(s) for diagram number 104
-    // (none)
-
-    // Amplitude(s) for diagram number 104
-    FFV1_0( w_fp[3], w_fp[20], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 105 OF 123 ***
-
-    // Wavefunction(s) for diagram number 105
-    // (none)
-
-    // Amplitude(s) for diagram number 105
-    FFV1_0( w_fp[15], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] += amp_sv[0];
-    jamp_sv[4] -= amp_sv[0];
-    jamp_sv[10] -= amp_sv[0];
-    jamp_sv[20] += amp_sv[0];
-
-    // *** DIAGRAM 106 OF 123 ***
-
-    // Wavefunction(s) for diagram number 106
-    // (none)
-
-    // Amplitude(s) for diagram number 106
-    FFV1_0( w_fp[12], w_fp[2], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 107 OF 123 ***
-
-    // Wavefunction(s) for diagram number 107
-    // (none)
-
-    // Amplitude(s) for diagram number 107
-    VVVV1_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-    VVVV3_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-    VVVV4_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 108 OF 123 ***
-
-    // Wavefunction(s) for diagram number 108
-    // (none)
-
-    // Amplitude(s) for diagram number 108
-    VVV1_0( w_fp[1], w_fp[10], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 109 OF 123 ***
-
-    // Wavefunction(s) for diagram number 109
-    // (none)
-
-    // Amplitude(s) for diagram number 109
-    VVV1_0( w_fp[1], w_fp[7], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 110 OF 123 ***
-
-    // Wavefunction(s) for diagram number 110
-    // (none)
-
-    // Amplitude(s) for diagram number 110
-    FFV1_0( w_fp[13], w_fp[20], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[12] -= amp_sv[0];
-
-    // *** DIAGRAM 111 OF 123 ***
-
-    // Wavefunction(s) for diagram number 111
-    // (none)
-
-    // Amplitude(s) for diagram number 111
-    FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[14] -= amp_sv[0];
-
-    // *** DIAGRAM 112 OF 123 ***
-
-    // Wavefunction(s) for diagram number 112
-    // (none)
-
-    // Amplitude(s) for diagram number 112
-    FFV1_0( w_fp[15], w_fp[24], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[18] -= amp_sv[0];
-
-    // *** DIAGRAM 113 OF 123 ***
-
-    // Wavefunction(s) for diagram number 113
-    // (none)
-
-    // Amplitude(s) for diagram number 113
-    FFV1_0( w_fp[12], w_fp[14], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[20] -= amp_sv[0];
-
-    // *** DIAGRAM 114 OF 123 ***
-
-    // Wavefunction(s) for diagram number 114
-    VVVV1P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[12] );
-    VVVV3P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[24] );
-    VVVV4P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[21] );
-
-    // Amplitude(s) for diagram number 114
-    VVV1_0( w_fp[12], w_fp[7], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-    VVV1_0( w_fp[24], w_fp[7], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-    VVV1_0( w_fp[21], w_fp[7], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 115 OF 123 ***
-
-    // Wavefunction(s) for diagram number 115
-    // (none)
-
-    // Amplitude(s) for diagram number 115
-    FFV1_0( w_fp[3], w_fp[14], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[18] += amp_sv[0];
-    jamp_sv[19] -= amp_sv[0];
-    jamp_sv[21] -= amp_sv[0];
-    jamp_sv[23] += amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[14], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[19] -= amp_sv[0];
-    jamp_sv[20] += amp_sv[0];
-    jamp_sv[21] -= amp_sv[0];
-    jamp_sv[22] += amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[14], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[18] -= amp_sv[0];
-    jamp_sv[20] += amp_sv[0];
-    jamp_sv[22] += amp_sv[0];
-    jamp_sv[23] -= amp_sv[0];
-
-    // *** DIAGRAM 116 OF 123 ***
-
-    // Wavefunction(s) for diagram number 116
-    // (none)
-
-    // Amplitude(s) for diagram number 116
-    FFV1_0( w_fp[13], w_fp[2], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += amp_sv[0];
-    jamp_sv[2] -= amp_sv[0];
-    jamp_sv[8] -= amp_sv[0];
-    jamp_sv[14] += amp_sv[0];
-    FFV1_0( w_fp[13], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[2] -= amp_sv[0];
-    jamp_sv[6] += amp_sv[0];
-    jamp_sv[8] -= amp_sv[0];
-    jamp_sv[12] += amp_sv[0];
-    FFV1_0( w_fp[13], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] -= amp_sv[0];
-    jamp_sv[6] += amp_sv[0];
-    jamp_sv[12] += amp_sv[0];
-    jamp_sv[14] -= amp_sv[0];
-
-    // *** DIAGRAM 117 OF 123 ***
-
-    // Wavefunction(s) for diagram number 117
-    VVVV1P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[21] );
-    VVVV3P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[13] );
-    VVVV4P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[24] );
-
-    // Amplitude(s) for diagram number 117
-    VVV1_0( w_fp[21], w_fp[7], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
-    VVV1_0( w_fp[13], w_fp[7], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
-    VVV1_0( w_fp[24], w_fp[7], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 118 OF 123 ***
-
-    // Wavefunction(s) for diagram number 118
-    // (none)
-
-    // Amplitude(s) for diagram number 118
-    FFV1_0( w_fp[3], w_fp[11], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[12] += amp_sv[0];
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[15] -= amp_sv[0];
-    jamp_sv[17] += amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[11], w_fp[13], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[14] += amp_sv[0];
-    jamp_sv[15] -= amp_sv[0];
-    jamp_sv[16] += amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[11], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[12] -= amp_sv[0];
-    jamp_sv[14] += amp_sv[0];
-    jamp_sv[16] += amp_sv[0];
-    jamp_sv[17] -= amp_sv[0];
-
-    // *** DIAGRAM 119 OF 123 ***
-
-    // Wavefunction(s) for diagram number 119
-    // (none)
-
-    // Amplitude(s) for diagram number 119
-    FFV1_0( w_fp[15], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] += amp_sv[0];
-    jamp_sv[4] -= amp_sv[0];
-    jamp_sv[10] -= amp_sv[0];
-    jamp_sv[20] += amp_sv[0];
-    FFV1_0( w_fp[15], w_fp[2], w_fp[13], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] -= amp_sv[0];
-    jamp_sv[7] += amp_sv[0];
-    jamp_sv[10] -= amp_sv[0];
-    jamp_sv[18] += amp_sv[0];
-    FFV1_0( w_fp[15], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] -= amp_sv[0];
-    jamp_sv[7] += amp_sv[0];
-    jamp_sv[18] += amp_sv[0];
-    jamp_sv[20] -= amp_sv[0];
-
-    // *** DIAGRAM 120 OF 123 ***
-
-    // Wavefunction(s) for diagram number 120
-    VVVV1P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[24] );
-    VVVV3P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[15] );
-    VVVV4P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[13] );
-
-    // Amplitude(s) for diagram number 120
-    FFV1_0( w_fp[3], w_fp[9], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[6] += amp_sv[0];
-    jamp_sv[7] -= amp_sv[0];
-    jamp_sv[9] -= amp_sv[0];
-    jamp_sv[11] += amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[9], w_fp[15], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[7] -= amp_sv[0];
-    jamp_sv[8] += amp_sv[0];
-    jamp_sv[9] -= amp_sv[0];
-    jamp_sv[10] += amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[9], w_fp[13], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[6] -= amp_sv[0];
-    jamp_sv[8] += amp_sv[0];
-    jamp_sv[10] += amp_sv[0];
-    jamp_sv[11] -= amp_sv[0];
-
-    // *** DIAGRAM 121 OF 123 ***
-
-    // Wavefunction(s) for diagram number 121
-    // (none)
-
-    // Amplitude(s) for diagram number 121
-    FFV1_0( w_fp[16], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] += amp_sv[0];
-    jamp_sv[5] -= amp_sv[0];
-    jamp_sv[16] -= amp_sv[0];
-    jamp_sv[22] += amp_sv[0];
-    FFV1_0( w_fp[16], w_fp[2], w_fp[15], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[5] -= amp_sv[0];
-    jamp_sv[13] += amp_sv[0];
-    jamp_sv[16] -= amp_sv[0];
-    jamp_sv[19] += amp_sv[0];
-    FFV1_0( w_fp[16], w_fp[2], w_fp[13], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] -= amp_sv[0];
-    jamp_sv[13] += amp_sv[0];
-    jamp_sv[19] += amp_sv[0];
-    jamp_sv[22] -= amp_sv[0];
-
-    // *** DIAGRAM 122 OF 123 ***
-
-    // Wavefunction(s) for diagram number 122
-    // (none)
-
-    // Amplitude(s) for diagram number 122
-    VVV1_0( w_fp[24], w_fp[1], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-    VVV1_0( w_fp[15], w_fp[1], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-    VVV1_0( w_fp[13], w_fp[1], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 123 OF 123 ***
-
-    // Wavefunction(s) for diagram number 123
-    // (none)
-
-    // Amplitude(s) for diagram number 123
-    VVV1_0( w_fp[0], w_fp[17], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-    VVV1_0( w_fp[0], w_fp[19], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-    VVV1_0( w_fp[0], w_fp[8], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** COLOR CHOICE BELOW ***
-    // Store the leading color flows for choice of color
-    if( jamp2_sv ) // disable color choice if nullptr
-      for( int icol = 0; icol < ncolor; icol++ )
-        jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831
-
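The color-choice block accumulates the squared modulus of each color flow into jamp2_sv, which is later used to sample one leading-color flow per event. A sketch of the cxabs2 helper it relies on (assumed to mirror the definition in mgOnGpuCxtypes.h; re*re + im*im avoids the square root hidden in std::abs):

#include <complex>
// Sketch: squared modulus of a complex amplitude, without a sqrt.
inline double cxabs2( const std::complex<double>& c )
{
  return c.real() * c.real() + c.imag() * c.imag();
}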
-    // *** COLOR MATRIX BELOW ***
-    // (This method used to be called CPPProcess::matrix_1_gg_ttxgg()?)
-
-    // The color denominators (initialize all array elements, with ncolor=24)
-    // [NB do keep 'static' for these constexpr arrays, see issue #283]
-    static constexpr fptype2 denom[ncolor] = { 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54 }; // 1-D array[24]
-
-    // The color matrix (initialize all array elements, with ncolor=24)
-    // [NB do keep 'static' for these constexpr arrays, see issue #283]
-    static constexpr fptype2 cf[ncolor][ncolor] = {
-      { 512, -64, -64, 8, 8, 80, -64, 8, 8, -1, -1, -10, 8, -1, 80, -10, 71, 62, -1, -10, -10, 62, 62, -28 },
-      { -64, 512, 8, 80, -64, 8, 8, -64, -1, -10, 8, -1, -1, -10, -10, 62, 62, -28, 8, -1, 80, -10, 71, 62 },
-      { -64, 8, 512, -64, 80, 8, 8, -1, 80, -10, 71, 62, -64, 8, 8, -1, -1, -10, -10, -1, 62, -28, -10, 62 },
-      { 8, 80, -64, 512, 8, -64, -1, -10, -10, 62, 62, -28, 8, -64, -1, -10, 8, -1, -1, 8, 71, 62, 80, -10 },
-      { 8, -64, 80, 8, 512, -64, -1, 8, 71, 62, 80, -10, -10, -1, 62, -28, -10, 62, -64, 8, 8, -1, -1, -10 },
-      { 80, 8, 8, -64, -64, 512, -10, -1, 62, -28, -10, 62, -1, 8, 71, 62, 80, -10, 8, -64, -1, -10, 8, -1 },
-      { -64, 8, 8, -1, -1, -10, 512, -64, -64, 8, 8, 80, 80, -10, 8, -1, 62, 71, -10, 62, -1, -10, -28, 62 },
-      { 8, -64, -1, -10, 8, -1, -64, 512, 8, 80, -64, 8, -10, 62, -1, -10, -28, 62, 80, -10, 8, -1, 62, 71 },
-      { 8, -1, 80, -10, 71, 62, -64, 8, 512, -64, 80, 8, 8, -1, -64, 8, -10, -1, 62, -28, -10, -1, 62, -10 },
-      { -1, -10, -10, 62, 62, -28, 8, 80, -64, 512, 8, -64, -1, -10, 8, -64, -1, 8, 71, 62, -1, 8, -10, 80 },
-      { -1, 8, 71, 62, 80, -10, 8, -64, 80, 8, 512, -64, 62, -28, -10, -1, 62, -10, 8, -1, -64, 8, -10, -1 },
-      { -10, -1, 62, -28, -10, 62, 80, 8, 8, -64, -64, 512, 71, 62, -1, 8, -10, 80, -1, -10, 8, -64, -1, 8 },
-      { 8, -1, -64, 8, -10, -1, 80, -10, 8, -1, 62, 71, 512, -64, -64, 8, 8, 80, 62, -10, -28, 62, -1, -10 },
-      { -1, -10, 8, -64, -1, 8, -10, 62, -1, -10, -28, 62, -64, 512, 8, 80, -64, 8, -10, 80, 62, 71, 8, -1 },
-      { 80, -10, 8, -1, 62, 71, 8, -1, -64, 8, -10, -1, -64, 8, 512, -64, 80, 8, -28, 62, 62, -10, -10, -1 },
-      { -10, 62, -1, -10, -28, 62, -1, -10, 8, -64, -1, 8, 8, 80, -64, 512, 8, -64, 62, 71, -10, 80, -1, 8 },
-      { 71, 62, -1, 8, -10, 80, 62, -28, -10, -1, 62, -10, 8, -64, 80, 8, 512, -64, -1, 8, -10, -1, -64, 8 },
-      { 62, -28, -10, -1, 62, -10, 71, 62, -1, 8, -10, 80, 80, 8, 8, -64, -64, 512, -10, -1, -1, 8, 8, -64 },
-      { -1, 8, -10, -1, -64, 8, -10, 80, 62, 71, 8, -1, 62, -10, -28, 62, -1, -10, 512, -64, -64, 8, 8, 80 },
-      { -10, -1, -1, 8, 8, -64, 62, -10, -28, 62, -1, -10, -10, 80, 62, 71, 8, -1, -64, 512, 8, 80, -64, 8 },
-      { -10, 80, 62, 71, 8, -1, -1, 8, -10, -1, -64, 8, -28, 62, 62, -10, -10, -1, -64, 8, 512, -64, 80, 8 },
-      { 62, -10, -28, 62, -1, -10, -10, -1, -1, 8, 8, -64, 62, 71, -10, 80, -1, 8, 8, 80, -64, 512, 8, -64 },
-      { 62, 71, -10, 80, -1, 8, -28, 62, 62, -10, -10, -1, -1, 8, -10, -1, -64, 8, 8, -64, 80, 8, 512, -64 },
-      { -28, 62, 62, -10, -10, -1, 62, 71, -10, 80, -1, 8, -10, -1, -1, 8, 8, -64, 80, 8, 8, -64, -64, 512 } }; // 2-D array[24][24]
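The denom and cf arrays implement the color sum that turns the 24 color flows into a matrix element. Writing the flows as $J_j = A_j + i B_j$, the removed code computes

\[
|M|^2 \;=\; \sum_{i=0}^{23} \frac{1}{d_i}\, J_i^{*} \sum_{j=0}^{23} c_{ij}\, J_j ,
\]

and, because the matrix $c$ is real and symmetric, the quadratic form reduces to two real ones (this is what the "(A-iB)(M)(A+iB)" comment further below refers to; the cross terms $-iB^{T}cA + iA^{T}cB$ cancel by symmetry):

\[
(A - iB)^{T} c\, (A + iB) \;=\; A^{T} c\, A \;+\; B^{T} c\, B .
\]

The constexpr "triangular normalized" matrix below additionally folds the factor 2 for off-diagonal terms and the division by $d_i$ into the matrix entries at compile time, so the C++ loop only visits the upper triangle.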
-#ifndef MGONGPUCPP_GPUIMPL
-    // Pre-compute a constexpr triangular color matrix properly normalized #475
-    struct TriangularNormalizedColorMatrix
-    {
-      // See https://stackoverflow.com/a/34465458
-      __host__ __device__ constexpr TriangularNormalizedColorMatrix()
-        : value()
-      {
-        for( int icol = 0; icol < ncolor; icol++ )
-        {
-          // Diagonal terms
-          value[icol][icol] = cf[icol][icol] / denom[icol];
-          // Off-diagonal terms
-          for( int jcol = icol + 1; jcol < ncolor; jcol++ )
-            value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol];
-        }
-      }
-      fptype2 value[ncolor][ncolor];
-    };
-    static constexpr auto cf2 = TriangularNormalizedColorMatrix();
-#endif
-
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-    if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages
-    {
-      // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV
-      for( int icol = 0; icol < ncolor; icol++ )
-        jamp_sv_previous[icol] = jamp_sv[icol];
-      MEs_previous = MEs;
-      continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages
-    }
-    fptype_sv deltaMEs_previous = { 0 };
-#endif
-
-    // Sum and square the color flows to get the matrix element
-    // (compute |M|^2 by squaring |M|, taking into account colours)
-    fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes
-
-    // Use the property that M is a real matrix (see #475):
-    // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB
-    // In addition, on C++ use the property that M is symmetric (see #475),
-    // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time:
-    // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix.
-    // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-    fptype2_sv jampR_sv[ncolor] = { 0 };
-    fptype2_sv jampI_sv[ncolor] = { 0 };
-    for( int icol = 0; icol < ncolor; icol++ )
-    {
-      jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) );
-      jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) );
-    }
-#endif
-    for( int icol = 0; icol < ncolor; icol++ )
-    {
-      //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol );
-#ifndef MGONGPUCPP_GPUIMPL
-      // === C++ START ===
-      // Diagonal terms
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-      fptype2_sv& jampRi_sv = jampR_sv[icol];
-      fptype2_sv& jampIi_sv = jampI_sv[icol];
+    // ---------------
+    // --- MOMENTA ---
+    // ---------------
+#ifdef MGONGPUCPP_GPUIMPL
+    // CUDA diagram kernels take input/output buffers with momenta for all events
+    const fptype* momenta = allmomenta;
 #else
-      fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) );
-      fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) );
+    // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector
+    const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 );
 #endif
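The new M_ACCESS::ieventAccessRecordConst call above hides the AOSOA (array-of-structs-of-arrays) indexing of the cudacpp momenta buffer. An editorial sketch of the arithmetic it presumably performs, assuming the usual cudacpp layout momenta[ipagM][ipar][ip4][ieppM] with neppM events per SIMD-friendly "page" (the constants below are illustrative, not taken from this diff):

// Sketch: return the address of the record holding event ievt in an AOSOA buffer.
const double* ieventAccessRecordConst( const double* buffer, const int ievt )
{
  constexpr int np4 = 4;   // E, px, py, pz
  constexpr int npar = 6;  // gg -> ttxgg has 6 external particles (assumption)
  constexpr int neppM = 8; // events per page (assumption)
  const int ipagM = ievt / neppM; // page index
  const int ieppM = ievt % neppM; // event index within the page
  return buffer + ipagM * npar * np4 * neppM + ieppM; // [ipagM][0][0][ieppM]
}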
zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; #else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------ + // --- CHANNELIDS --- + // ------------------ +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - } - // *** STORE THE RESULTS *** - - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); +#endif #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif
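For review purposes, a minimal sketch of the uniform per-diagram interface that the launches below rely on. This is not the generated code: the body is a placeholder and the name diagramN is hypothetical (the real diagram1..diagram123 bodies are emitted from the Feynman rules); only the signature shape and the nullptr convention are taken from the diff above.

    #include <cassert>
    typedef double fptype; // assumption: the plugin's double-precision build

    // Hypothetical stand-in for one generated diagram kernel: every diagramN shares this
    // boilerplate signature, so the driver can launch all diagrams uniformly.
    void diagramN( fptype* wfs,                    // in/out: wavefunctions for this event (or SIMD vector)
                   fptype* jamps,                  // in/out: running color-ordered amplitudes
                   const unsigned int* channelIds, // input: SDE channels (nullptr when multichannel is off)
                   const fptype* couplings,        // input: couplings
                   fptype* numerators,             // in/out: multichannel numerators (nullptr when off)
                   fptype* denominators )          // in/out: multichannel denominators (nullptr when off)
    {
      // Without multichannel support, the boilerplate passes nullptr for all three and asserts it
      if( channelIds == nullptr ) assert( numerators == nullptr && denominators == nullptr );
      jamps[0] += couplings[0] * wfs[0]; // placeholder: a real diagram updates jamps from computed amplitudes
    }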
+ + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ + + // *** DIAGRAMS 1 TO 123 *** +#ifdef MGONGPUCPP_GPUIMPL + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram8, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram9, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram10, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram11, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram12, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram13, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram14, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram15, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram16, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram17, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram18, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram19, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram20, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds,
couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram21, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram22, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram23, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram24, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram25, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram26, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram27, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram28, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram29, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram30, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram31, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram32, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram33, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram34, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram35, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram36, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram37, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram38, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram39, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram40, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram41, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram42, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram43, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram44, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram45, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram46, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram47, gpublocks, 
gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram48, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram49, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram50, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram51, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram52, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram53, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram54, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram55, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram56, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram57, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram58, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram59, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram60, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram61, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram62, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram63, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram64, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram65, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram66, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram67, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram68, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram69, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram70, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram71, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram72, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram73, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + 
gpuLaunchKernelStream( diagram74, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram75, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram76, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram77, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram78, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram79, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram80, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram81, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram82, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram83, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram84, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram85, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram86, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram87, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram88, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram89, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram90, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram91, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram92, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram93, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram94, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram95, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram96, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram97, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram98, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram99, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram100, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, 
couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram101, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram102, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram103, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram104, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram105, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram106, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram107, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram108, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram109, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram110, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram111, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram112, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram113, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram114, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram115, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram116, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram117, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram118, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram119, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram120, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram121, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram122, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram123, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); +#else + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram6( wfs, jamps, channelIds, COUPs, numerators, 
denominators ); + diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram8( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram9( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram10( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram11( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram12( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram13( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram14( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram15( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram16( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram17( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram18( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram19( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram20( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram21( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram22( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram23( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram24( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram25( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram26( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram27( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram28( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram29( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram30( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram31( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram32( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram33( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram34( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram35( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram36( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram37( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram38( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram39( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram40( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram41( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram42( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram43( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram44( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram45( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram46( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram47( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram48( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram49( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram50( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram51( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram52( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram53( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram54( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram55( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram56( 
wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram57( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram58( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram59( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram60( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram61( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram62( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram63( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram64( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram65( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram66( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram67( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram68( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram69( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram70( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram71( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram72( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram73( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram74( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram75( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram76( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram77( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram78( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram79( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram80( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram81( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram82( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram83( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram84( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram85( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram86( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram87( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram88( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram89( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram90( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram91( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram92( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram93( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram94( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram95( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram96( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram97( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram98( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram99( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram100( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram101( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram102( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram103( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram104( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram105( wfs, jamps, 
channelIds, COUPs, numerators, denominators ); + diagram106( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram107( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram108( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram109( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram110( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram111( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram112( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram113( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram114( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram115( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram116( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram117( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram118( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram119( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram120( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram121( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram122( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram123( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -2775,7 +738,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -2810,6 +777,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -2852,6 +823,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -2954,26 +929,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings,
bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -2981,25 +956,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif
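A self-contained illustration of why the accumulation in update_jamp2s must use atomicAdd: with one stream per good helicity, several launches of this kernel may run concurrently and add into the same colAllJamp2s slot. The flat buffer layout and names below are simplifying assumptions, not the plugin's memory accessors.

    // Sketch: concurrent kernels (one per helicity stream) summing |jamp|^2 into a shared buffer.
    // NB: atomicAdd on double requires compute capability 6.0 or later.
    __global__ void accumulateJamp2( const double* jampRe, const double* jampIm, // [ncolor*nevt] for one helicity
                                     double* jamp2,                              // [ncolor*nevt] running sum over helicities
                                     int ncolor, int nevt )
    {
      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
      if( ievt >= nevt ) return;
      for( int icol = 0; icol < ncolor; icol++ )
      {
        const double re = jampRe[icol * nevt + ievt];
        const double im = jampIm[icol * nevt + ievt];
        atomicAdd( &jamp2[icol * nevt + ievt], re * re + im * im ); // a plain '+=' would race across streams
      }
    }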
+ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif
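The selection in select_col (and, analogously, the helicity choice in add_and_select_hel) is a standard inverse-CDF draw: build the running sum of the allowed per-color weights, then return the first bin whose normalized cumulative value exceeds the random number. A host-side sketch of the same logic, with hypothetical inputs:

    // Inverse-CDF draw over n bins; returns a 1-based index (Fortran convention, as above).
    // 'cumulative' is the running sum of weights, e.g. the targetamp array built above.
    int selectIndex( const double* cumulative, int n, double rnd ) // rnd uniform in [0,1)
    {
      for( int i = 0; i < n; i++ )
        if( rnd < cumulative[i] / cumulative[n - 1] ) return i + 1;
      return n; // guard against floating-point rounding on the last bin
    }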
+ + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -3144,13 +1323,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 512 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -3162,17 +1335,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -3198,93 +1374,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId
for the current event (CUDA) or for the whole SIMD event page (C++) - The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } - } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) Then, in multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR!
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); + } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); + } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) ***
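Taken together, PART 1a is now a fan-out/synchronize/reduce pipeline: per-helicity kernels on dedicated streams, one device-wide synchronization, then the selection kernels. A condensed sketch of that orchestration with plain CUDA runtime calls (the kernel bodies are trivial stand-ins, not the plugin's kernels, and the buffer layout is a simplifying assumption):

    #include <cuda_runtime.h>
    __global__ void helicityWork( float* buf ) { buf[blockIdx.x * blockDim.x + threadIdx.x] += 1.f; } // stand-in for calculate_jamps + color_sum_gpu
    __global__ void selection( const float* buf ) { (void)buf; }                                       // stand-in for add_and_select_hel / select_col

    void part1aPipeline( float* d_buf, cudaStream_t* streams, int nGoodHel, int nblocks, int nthreads )
    {
      const int nevt = nblocks * nthreads;
      for( int ighel = 0; ighel < nGoodHel; ighel++ ) // (1)+(2): fan out one stream per good helicity
        helicityWork<<<nblocks, nthreads, 0, streams[ighel]>>>( d_buf + ighel * nevt );
      cudaDeviceSynchronize();                        // (3): wait for all helicity streams
      selection<<<nblocks, nthreads>>>( d_buf );      // then select helicity/color on the default stream
    }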
@@ -3326,7 +1472,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -3349,7 +1495,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId!
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -3358,25 +1504,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) { @@ -3386,8 +1538,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -3403,11 +1557,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -3509,14 +1664,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV )
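The finalisation that used to run inline per event now happens in a normalise_output kernel, whose body is not shown in this diff. The sketch below reconstructs a plausible multichannel variant from the removed inline code and the launch arguments above; in particular, summing the per-helicity numerator/denominator super-buffers is an assumption, not the confirmed implementation.

    // Hypothetical reconstruction of normalise_output (multichannel variant).
    __global__ void normalise_output_sketch( double* allMEs,
                                             const double* ghelAllNumerators,   // [nGoodHel*nevt] per-helicity numerators
                                             const double* ghelAllDenominators, // [nGoodHel*nevt] per-helicity denominators
                                             const unsigned int* allChannelIds, // nullptr disables SDE reweighting (#892)
                                             double helcolDenominator, int nGoodHel, int nevt )
    {
      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
      if( ievt >= nevt ) return;
      allMEs[ievt] /= helcolDenominator; // average over helicities/colors as in the removed inline code
      if( allChannelIds != nullptr )
      {
        double num = 0, den = 0;
        for( int ighel = 0; ighel < nGoodHel; ighel++ ) // assumption: combine the running sums over helicities
        {
          num += ghelAllNumerators[ighel * nevt + ievt];
          den += ghelAllDenominators[ighel * nevt + ievt];
        }
        allMEs[ievt] *= num / den;
      }
    }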
MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include <vector> @@ -75,17 +76,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 123; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 24; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 26; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 6; // CPPProcess::npar (nexternal was nioparticles) //static const int nwavefuncs = 6; // (?!?!
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 159; //static const int ncomb = 64; // CPPProcess::ncomb @@ -122,23 +123,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -152,34 +156,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/color_sum.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/color_sum.cc new file mode 100644 index 0000000000..02db3d0204 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/color_sum.cc @@ -0,0 +1,405 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin.
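+
+// Overview: for one helicity, the color sum computes
+//   |M|^2 = sum_{icol,jcol} jamp[icol]^* * ( colorMatrix[icol][jcol] / colorDenom[icol] ) * jamp[jcol]
+// over the ncolor color-flow amplitudes jamp (colorMatrix and colorDenom are defined below).
+// A minimal scalar sketch of the same computation, using a hypothetical helper colorSumRef
+// and plain std::complex<double> jamps (illustration only, not part of the generated code):
+//   double colorSumRef( const std::complex<double>* jamp ) // jamp[ncolor]
+//   {
+//     double me2 = 0;
+//     for( int icol = 0; icol < ncolor; icol++ )
+//       for( int jcol = 0; jcol < ncolor; jcol++ )
+//         me2 += std::real( std::conj( jamp[icol] ) * colorMatrix[icol][jcol] * jamp[jcol] ) / colorDenom[icol];
+//     return me2;
+//   }
+// The implementations below compute the same quantity with SIMD on C++, with a plain GPU
+// kernel, or with cuBLAS/hipBLAS GEMMs, using event-parallel data layouts.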
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=24) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54 }; // 1-D array[24] + + // The color matrix (initialize all array elements, with ncolor=24) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 512, -64, -64, 8, 8, 80, -64, 8, 8, -1, -1, -10, 8, -1, 80, -10, 71, 62, -1, -10, -10, 62, 62, -28 }, + { -64, 512, 8, 80, -64, 8, 8, -64, -1, -10, 8, -1, -1, -10, -10, 62, 62, -28, 8, -1, 80, -10, 71, 62 }, + { -64, 8, 512, -64, 80, 8, 8, -1, 80, -10, 71, 62, -64, 8, 8, -1, -1, -10, -10, -1, 62, -28, -10, 62 }, + { 8, 80, -64, 512, 8, -64, -1, -10, -10, 62, 62, -28, 8, -64, -1, -10, 8, -1, -1, 8, 71, 62, 80, -10 }, + { 8, -64, 80, 8, 512, -64, -1, 8, 71, 62, 80, -10, -10, -1, 62, -28, -10, 62, -64, 8, 8, -1, -1, -10 }, + { 80, 8, 8, -64, -64, 512, -10, -1, 62, -28, -10, 62, -1, 8, 71, 62, 80, -10, 8, -64, -1, -10, 8, -1 }, + { -64, 8, 8, -1, -1, -10, 512, -64, -64, 8, 8, 80, 80, -10, 8, -1, 62, 71, -10, 62, -1, -10, -28, 62 }, + { 8, -64, -1, -10, 8, -1, -64, 512, 8, 80, -64, 8, -10, 62, -1, -10, -28, 62, 80, -10, 8, -1, 62, 71 }, + { 8, -1, 80, -10, 71, 62, -64, 8, 512, -64, 80, 8, 8, -1, -64, 8, -10, -1, 62, -28, -10, -1, 62, -10 }, + { -1, -10, -10, 62, 62, -28, 8, 80, -64, 512, 8, -64, -1, -10, 8, -64, -1, 8, 71, 62, -1, 8, -10, 80 }, + { -1, 8, 71, 62, 80, -10, 8, -64, 80, 8, 512, -64, 62, -28, -10, -1, 62, -10, 8, -1, -64, 8, -10, -1 }, + { -10, -1, 62, -28, -10, 62, 80, 8, 8, -64, -64, 512, 71, 62, -1, 8, -10, 80, -1, -10, 8, -64, -1, 8 }, + { 8, -1, -64, 8, -10, -1, 80, -10, 8, -1, 62, 71, 512, -64, -64, 8, 8, 80, 62, -10, -28, 62, -1, -10 }, + { -1, -10, 8, -64, -1, 8, -10, 62, -1, -10, -28, 62, -64, 512, 8, 80, -64, 8, -10, 80, 62, 71, 8, -1 }, + { 80, -10, 8, -1, 62, 71, 8, -1, -64, 8, -10, -1, -64, 8, 512, -64, 80, 8, -28, 62, 62, -10, -10, -1 }, + { -10, 62, -1, -10, -28, 62, -1, -10, 8, -64, -1, 8, 8, 80, -64, 512, 8, -64, 62, 71, -10, 80, -1, 8 }, + { 71, 62, -1, 8, -10, 80, 62, -28, -10, -1, 62, -10, 8, -64, 80, 8, 512, -64, -1, 8, -10, -1, -64, 8 }, + { 62, -28, -10, -1, 62, -10, 71, 62, -1, 8, -10, 80, 80, 8, 8, -64, -64, 512, -10, -1, -1, 8, 8, -64 }, + { -1, 8, -10, -1, -64, 8, -10, 80, 62, 71, 8, -1, 62, -10, -28, 62, -1, -10, 512, -64, -64, 8, 8, 80 }, + { -10, -1, -1, 8, 8, -64, 62, -10, -28, 62, -1, -10, -10, 80, 62, 71, 8, -1, -64, 512, 8, 80, -64, 8 }, + { -10, 80, 62, 71, 8, -1, -1, 8, -10, -1, -64, 8, -28, 62, 62, -10, -10, -1, -64, 8, 512, -64, 80, 8 }, + { 62, -10, -28, 62, -1, -10, -10, -1, -1, 8, 8, -64, 62, 71, -10, 80, -1, 8, 8, 80, -64, 512, 8, -64 }, + { 62, 71, -10, 80, -1, 8, -28, 62, 62, -10, -10, -1, -1, 8, -10, -1, -64, 8, 8, -64, 80, 8, 512, -64 }, + { -28, 62, 62, -10, -10, -1, 62, 71, -10, 80, -1, 8, -10, -1, -1, 8, 8, -64, 80, 8, 8, -64, -64, 512 } }; // 2-D array[24][24] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + 
template<typename T> + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA + iAMB - iBMA + BMB = AMA + BMB, where the cross terms cancel because M is also symmetric (AMB = BMA). + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain in speed (though not a full factor 2) as we only loop over the upper triangular part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
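+ // Worked 2x2 illustration of the identity above (with A = Re(jamp), B = Im(jamp) and M symmetric):
+ //   A M A = M00*A0*A0 + 2*M01*A0*A1 + M11*A1*A1 (and likewise B M B),
+ // i.e. the diagonal terms plus twice the upper-triangle terms, which is exactly what the
+ // icol/jcol loops below accumulate via the precomputed cf2 coefficients;
+ // the cross terms i(AMB - BMA) vanish because AMB == BMA for a symmetric M.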
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 
s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! However, the same striding as in compute_jamps and cuBLAS is used here, just in case it is better for performance + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = 
allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] with Ztemp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use gpuBlasTgemmStridedBatched (cublasSgemmStridedBatched or its equivalents) to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e.
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/color_sum.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/diagram_boilerplate.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/diagrams.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/diagrams.h new file mode 100644 index 0000000000..b857887951 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/diagrams.h @@ -0,0 +1,4177 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
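+
+// Calling convention (an assumed usage sketch, for illustration only): calculate_jamps is
+// expected to invoke these kernels once per good helicity ihel, in diagram order; diagram1
+// also computes the external wavefunctions, so it additionally takes momenta and ihel, e.g.
+//   gpuLaunchKernel( diagram1, gpublocks, gputhreads, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel );
+//   gpuLaunchKernel( diagram2, gpublocks, gputhreads, wfs, jamps, channelIds, couplings, numerators, denominators );
+//   // ... up to diagram123, each adding its amplitude(s) into the jamps color flows,
+// after which color_sum_gpu (or color_sum_cpu) folds jamps into allMEs for that helicity.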
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb-1) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 123 *** + // Wavefunction(s) for diagram number 1 + vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); + vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + vxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); + vxxxxx( momenta, 0., cHel[ihel][5], +1, w_fp[5], 5 ); + VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[6] ); + FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[7] ); + // Amplitude(s) for diagram number 1 + VVVV1_0( w_fp[6], w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVVV3_0( w_fp[6], w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) *
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVVV4_0( w_fp[6], w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 123 *** + // Wavefunction(s) for diagram number 2 + VVV1P0_1( w_fp[6], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 2 + VVV1_0( w_fp[7], w_fp[5], w_fp[8], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif +
fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 123 *** + // Wavefunction(s) for diagram number 3 + VVV1P0_1( w_fp[6], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[9] ); + // Amplitude(s) for diagram number 3 + VVV1_0( w_fp[7], w_fp[4], w_fp[9], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 123 *** + // Wavefunction(s) for diagram number 4 + VVV1P0_1( w_fp[4], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 4 + VVV1_0( w_fp[6], w_fp[7], w_fp[10], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } +
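+ // NB: in every diagramN, each amplitude amp_sv[0] enters a fixed subset of the ncolor=24 color flows
+ // with a coefficient +1, -1, +i or -i from the color decomposition, e.g.
+ //   J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; // add +i * amp into color flow 0
+ // so the jamps accumulated here are exactly the inputs later contracted with the color matrix by color_sum.
+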
+ //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 123 *** + // Wavefunction(s) for diagram number 5 + FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); + FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); + // Amplitude(s) for diagram number 5 + FFV1_0( w_fp[12], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 6 OF 123 *** + // Wavefunction(s) for diagram number 6 + // (none) + // Amplitude(s) for diagram number 6 + FFV1_0( w_fp[3], w_fp[11], w_fp[9], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram7( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output
jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 7 OF 123 *** + // Wavefunction(s) for diagram number 7 + FFV1_2( w_fp[3], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); + // Amplitude(s) for diagram number 7 + FFV1_0( w_fp[13], w_fp[11], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram8( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 8 OF 123 *** + // Wavefunction(s) for diagram number 8 + FFV1_1( w_fp[2], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] ); + // Amplitude(s) for diagram number 8 + FFV1_0( w_fp[12], w_fp[14], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram9( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const
fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 9 OF 123 *** + // Wavefunction(s) for diagram number 9 + // (none) + // Amplitude(s) for diagram number 9 + FFV1_0( w_fp[3], w_fp[14], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram10( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 10 OF 123 *** + // Wavefunction(s) for diagram number 10 + FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[15] ); + // Amplitude(s) for diagram number 10 + FFV1_0( w_fp[15], w_fp[14], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram11( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV],
add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 11 OF 123 *** + // Wavefunction(s) for diagram number 11 + FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); + // Amplitude(s) for diagram number 11 + FFV1_0( w_fp[15], w_fp[16], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram12( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 12 OF 123 *** + // Wavefunction(s) for diagram number 12 + // (none) + // Amplitude(s) for diagram number 12 + FFV1_0( w_fp[15], w_fp[2], w_fp[9], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram13( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity
check +#include "diagram_boilerplate.h" + // *** DIAGRAM 13 OF 123 *** + // Wavefunction(s) for diagram number 13 + // (none) + // Amplitude(s) for diagram number 13 + FFV1_0( w_fp[13], w_fp[16], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram14( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 14 OF 123 *** + // Wavefunction(s) for diagram number 14 + // (none) + // Amplitude(s) for diagram number 14 + FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram15( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 15 OF 123 *** + // Wavefunction(s) for diagram number 15 + // (none) + // Amplitude(s) for diagram number 15 + FFV1_0( w_fp[3], w_fp[16], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support
updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram16( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 16 OF 123 *** + // Wavefunction(s) for diagram number 16 + // (none) + // Amplitude(s) for diagram number 16 + FFV1_0( w_fp[12], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram17( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 17 OF 123 *** + // Wavefunction(s) for diagram number 17 + FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); + FFV1_2( w_fp[3], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); + FFV1_1( w_fp[12], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); + // Amplitude(s) for diagram number 17 + FFV1_0( w_fp[16], w_fp[8], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) 
+#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram18( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 18 OF 123 *** + // Wavefunction(s) for diagram number 18 + FFV1_1( w_fp[12], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); + // Amplitude(s) for diagram number 18 + FFV1_0( w_fp[16], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram19( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 19 OF 123 *** + // Wavefunction(s) for diagram number 19 + // (none) + // Amplitude(s) for diagram number 19 + FFV1_0( w_fp[16], w_fp[12], w_fp[10], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram20( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) 
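[Editor's illustration] Every diagram kernel above starts by including diagram_boilerplate.h, whose contents are not part of this hunk. A minimal sketch of what the surrounding comments describe, assuming the header only has to enforce the nullptr sanity check in non-multichannel builds (it presumably also declares the local amp_fp buffer and its amp_sv complex view, since both are used in every kernel), could look roughly like this; it is not the actual header from this patch:

    // Hypothetical sketch of diagram_boilerplate.h (assumptions, not this PR's code)
    #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      // Multichannel builds receive real channelIds/numerators/denominators buffers
      // and update numerators_sv and denominators_sv after each amplitude call (#473)
    #else
      // Builds without multichannel support must pass nullptr for all three pointers
      assert( channelIds == nullptr );
      assert( numerators == nullptr );
      assert( denominators == nullptr );
    #endif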
+  __global__ void
+  diagram20( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 20 OF 123 ***
+    // Wavefunction(s) for diagram number 20
+    VVV1P0_1( w_fp[1], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[6] );
+    FFV1P0_3( w_fp[3], w_fp[12], COUPs[1], 1.0, 0., 0., w_fp[17] );
+    // Amplitude(s) for diagram number 20
+    VVV1_0( w_fp[6], w_fp[5], w_fp[17], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram21( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 21 OF 123 ***
+    // Wavefunction(s) for diagram number 21
+    // (none)
+    // Amplitude(s) for diagram number 21
+    FFV1_0( w_fp[3], w_fp[9], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram22( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 22 OF 123 ***
+    // Wavefunction(s) for diagram number 22
+    // (none)
+    // Amplitude(s) for diagram number 22
+    FFV1_0( w_fp[13], w_fp[12], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram23( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 23 OF 123 ***
+    // Wavefunction(s) for diagram number 23
+    VVV1P0_1( w_fp[1], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[18] );
+    // Amplitude(s) for diagram number 23
+    VVV1_0( w_fp[18], w_fp[4], w_fp[17], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram24( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 24 OF 123 ***
+    // Wavefunction(s) for diagram number 24
+    // (none)
+    // Amplitude(s) for diagram number 24
+    FFV1_0( w_fp[3], w_fp[8], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram25( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 25 OF 123 ***
+    // Wavefunction(s) for diagram number 25
+    // (none)
+    // Amplitude(s) for diagram number 25
+    FFV1_0( w_fp[15], w_fp[12], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram26( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 26 OF 123 ***
+    // Wavefunction(s) for diagram number 26
+    FFV1_1( w_fp[12], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[19] );
+    // Amplitude(s) for diagram number 26
+    FFV1_0( w_fp[15], w_fp[19], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram27( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 27 OF 123 ***
+    // Wavefunction(s) for diagram number 27
+    // (none)
+    // Amplitude(s) for diagram number 27
+    FFV1_0( w_fp[15], w_fp[9], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram28( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 28 OF 123 ***
+    // Wavefunction(s) for diagram number 28
+    // (none)
+    // Amplitude(s) for diagram number 28
+    FFV1_0( w_fp[13], w_fp[19], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram29( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 29 OF 123 ***
+    // Wavefunction(s) for diagram number 29
+    // (none)
+    // Amplitude(s) for diagram number 29
+    FFV1_0( w_fp[13], w_fp[8], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram30( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 30 OF 123 ***
+    // Wavefunction(s) for diagram number 30
+    // (none)
+    // Amplitude(s) for diagram number 30
+    FFV1_0( w_fp[3], w_fp[19], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram31( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 31 OF 123 ***
+    // Wavefunction(s) for diagram number 31
+    // (none)
+    // Amplitude(s) for diagram number 31
+    VVV1_0( w_fp[1], w_fp[10], w_fp[17], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram32( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 32 OF 123 ***
+    // Wavefunction(s) for diagram number 32
+    VVVV1P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[17] );
+    VVVV3P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[19] );
+    VVVV4P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[8] );
+    // Amplitude(s) for diagram number 32
+    FFV1_0( w_fp[3], w_fp[12], w_fp[17], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[12], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[12], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
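[Editor's illustration] Each amplitude call above contributes to a handful of color amplitudes in jamps, with weight +/-1 or +/-i (the cxtype( 0, 1 ) factor). The J_ACCESS accessor itself is defined elsewhere in this patch; a minimal sketch of such an accessor, assuming one GPU thread per event and an [icol][real/imag][event] layout consistent with the jamps[ncolor*2*nevtORneppV] comment in the signatures (an assumption — the actual J_ACCESS class may differ):

    // Hypothetical accessor sketch (layout is an assumption, not this PR's J_ACCESS)
    struct JampAccessSketch
    {
      static __device__ cxtype_ref kernelAccessIcol( fptype* jamps, const int icol )
      {
        const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one thread per event
        const int nevt = gridDim.x * blockDim.x;                // total events in the grid
        fptype& re = jamps[( icol * 2 + 0 ) * nevt + ievt];     // real part of jamp[icol]
        fptype& im = jamps[( icol * 2 + 1 ) * nevt + ievt];     // imaginary part of jamp[icol]
        return cxtype_ref( re, im );                            // assignable complex reference
      }
    };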
+  __global__ void
+  diagram33( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 33 OF 123 ***
+    // Wavefunction(s) for diagram number 33
+    FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] );
+    FFV1_1( w_fp[2], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] );
+    FFV1_2( w_fp[12], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[20] );
+    // Amplitude(s) for diagram number 33
+    FFV1_0( w_fp[20], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram34( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 34 OF 123 ***
+    // Wavefunction(s) for diagram number 34
+    FFV1_2( w_fp[12], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] );
+    // Amplitude(s) for diagram number 34
+    FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram35( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 35 OF 123 ***
+    // Wavefunction(s) for diagram number 35
+    // (none)
+    // Amplitude(s) for diagram number 35
+    FFV1_0( w_fp[12], w_fp[9], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram36( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 36 OF 123 ***
+    // Wavefunction(s) for diagram number 36
+    FFV1P0_3( w_fp[12], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[22] );
+    // Amplitude(s) for diagram number 36
+    VVV1_0( w_fp[6], w_fp[5], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram37( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 37 OF 123 ***
+    // Wavefunction(s) for diagram number 37
+    // (none)
+    // Amplitude(s) for diagram number 37
+    FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram38( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 38 OF 123 ***
+    // Wavefunction(s) for diagram number 38
+    // (none)
+    // Amplitude(s) for diagram number 38
+    FFV1_0( w_fp[12], w_fp[14], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram39( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 39 OF 123 ***
+    // Wavefunction(s) for diagram number 39
+    // (none)
+    // Amplitude(s) for diagram number 39
+    VVV1_0( w_fp[18], w_fp[4], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram40( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 40 OF 123 ***
+    // Wavefunction(s) for diagram number 40
+    // (none)
+    // Amplitude(s) for diagram number 40
+    FFV1_0( w_fp[20], w_fp[2], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram41( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 41 OF 123 ***
+    // Wavefunction(s) for diagram number 41
+    // (none)
+    // Amplitude(s) for diagram number 41
+    FFV1_0( w_fp[12], w_fp[11], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram42( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 42 OF 123 ***
+    // Wavefunction(s) for diagram number 42
+    FFV1_2( w_fp[12], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] );
+    // Amplitude(s) for diagram number 42
+    FFV1_0( w_fp[23], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram43( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 43 OF 123 ***
+    // Wavefunction(s) for diagram number 43
+    // (none)
+    // Amplitude(s) for diagram number 43
+    FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram44( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 44 OF 123 ***
+    // Wavefunction(s) for diagram number 44
+    // (none)
+    // Amplitude(s) for diagram number 44
+    FFV1_0( w_fp[23], w_fp[14], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram45( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 45 OF 123 ***
+    // Wavefunction(s) for diagram number 45
+    // (none)
+    // Amplitude(s) for diagram number 45
+    FFV1_0( w_fp[20], w_fp[14], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram46( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 46 OF 123 ***
+    // Wavefunction(s) for diagram number 46
+    // (none)
+    // Amplitude(s) for diagram number 46
+    FFV1_0( w_fp[23], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram47( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 47 OF 123 ***
+    // Wavefunction(s) for diagram number 47
+    // (none)
+    // Amplitude(s) for diagram number 47
+    VVV1_0( w_fp[1], w_fp[10], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
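[Editor's illustration] Diagram 48 below, like diagram 32 above, accumulates three amplitudes built from the VVVV1/VVVV3/VVVV4 four-gluon-vertex wavefunctions into the same jamps buffer. Since every kernel shares one uniform signature, a GPU caller can simply chain the launches; a sketch in plain CUDA launch syntax (grid configuration, buffer names and the checkCuda helper are assumptions, not code from this patch):

    // Illustrative only: chaining per-diagram kernels with the uniform signature
    diagram47<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
    diagram48<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
    checkCuda( cudaPeekAtLastError() ); // hypothetical error-check helper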
Wavefunction(s) for diagram number 48 + // (none) + // Amplitude(s) for diagram number 48 + FFV1_0( w_fp[12], w_fp[2], w_fp[17], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + FFV1_0( w_fp[12], w_fp[2], w_fp[19], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + FFV1_0( w_fp[12], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram49( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 49 OF 123 *** + // Wavefunction(s) for diagram number 49 + VVV1P0_1( w_fp[0], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[12] ); + FFV1_2( w_fp[3], w_fp[12], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[22] ); + // Amplitude(s) for diagram number 49 + FFV1_0( w_fp[22], w_fp[9], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram50( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // 
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram50( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 50 OF 123 ***
+    // Wavefunction(s) for diagram number 50
+    VVV1P0_1( w_fp[12], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[23] );
+    // Amplitude(s) for diagram number 50
+    FFV1_0( w_fp[3], w_fp[9], w_fp[23], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram51( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 51 OF 123 ***
+    // Wavefunction(s) for diagram number 51
+    // (none)
+    // Amplitude(s) for diagram number 51
+    FFV1_0( w_fp[13], w_fp[9], w_fp[12], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram52( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 52 OF 123 ***
+    // Wavefunction(s) for diagram number 52
+    FFV1_1( w_fp[2], w_fp[12], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[20] );
+    // Amplitude(s) for diagram number 52
+    FFV1_0( w_fp[16], w_fp[20], w_fp[5], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram53( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 53 OF 123 ***
+    // Wavefunction(s) for diagram number 53
+    // (none)
+    // Amplitude(s) for diagram number 53
+    FFV1_0( w_fp[16], w_fp[2], w_fp[23], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram54( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 54 OF 123 ***
+    // Wavefunction(s) for diagram number 54
+    // (none)
+    // Amplitude(s) for diagram number 54
+    FFV1_0( w_fp[16], w_fp[14], w_fp[12], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram55( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 55 OF 123 ***
+    // Wavefunction(s) for diagram number 55
+    // (none)
+    // Amplitude(s) for diagram number 55
+    FFV1_0( w_fp[3], w_fp[20], w_fp[18], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram56( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 56 OF 123 ***
+    // Wavefunction(s) for diagram number 56
+    // (none)
+    // Amplitude(s) for diagram number 56
+    FFV1_0( w_fp[22], w_fp[2], w_fp[18], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram57( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 57 OF 123 ***
+    // Wavefunction(s) for diagram number 57
+    // (none)
+    // Amplitude(s) for diagram number 57
+    VVV1_0( w_fp[12], w_fp[18], w_fp[7], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram58( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 58 OF 123 ***
+    // Wavefunction(s) for diagram number 58
+    // (none)
+    // Amplitude(s) for diagram number 58
+    VVVV1_0( w_fp[12], w_fp[1], w_fp[7], w_fp[5], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVVV3_0( w_fp[12], w_fp[1], w_fp[7], w_fp[5], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVVV4_0( w_fp[12], w_fp[1], w_fp[7], w_fp[5], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
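The recurring "(#473)" placeholder marks where a multichannel-enabled build updates `numerators_sv` and `denominators_sv` after each amplitude call. As a hedged sketch of what that update plausibly looks like (shown for diagram 58 as an assumed example; `cxabs2` standing for |amp|², with the exact emitted code possibly differing):

```cpp
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
  // Assumed sketch of the single-diagram-enhancement bookkeeping (#473):
  // the numerator picks up |amp|^2 only for the SDE-selected channel, while
  // the denominator picks it up for every contributing diagram
  if( channelId == 58 ) numerators_sv += cxabs2( amp_sv[0] );
  if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
#endif
```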
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram59( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 59 OF 123 ***
+    // Wavefunction(s) for diagram number 59
+    VVV1P0_1( w_fp[12], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[21] );
+    // Amplitude(s) for diagram number 59
+    VVV1_0( w_fp[7], w_fp[5], w_fp[21], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram60( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 60 OF 123 ***
+    // Wavefunction(s) for diagram number 60
+    // (none)
+    // Amplitude(s) for diagram number 60
+    VVV1_0( w_fp[1], w_fp[7], w_fp[23], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram61( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 61 OF 123 ***
+    // Wavefunction(s) for diagram number 61
+    // (none)
+    // Amplitude(s) for diagram number 61
+    FFV1_0( w_fp[3], w_fp[14], w_fp[21], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram62( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 62 OF 123 ***
+    // Wavefunction(s) for diagram number 62
+    // (none)
+    // Amplitude(s) for diagram number 62
+    FFV1_0( w_fp[22], w_fp[14], w_fp[1], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram63( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 63 OF 123 ***
+    // Wavefunction(s) for diagram number 63
+    // (none)
+    // Amplitude(s) for diagram number 63
+    FFV1_0( w_fp[13], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram64( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 64 OF 123 ***
+    // Wavefunction(s) for diagram number 64
+    // (none)
+    // Amplitude(s) for diagram number 64
+    FFV1_0( w_fp[13], w_fp[20], w_fp[1], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram65( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 65 OF 123 ***
+    // Wavefunction(s) for diagram number 65
+    VVV1P0_1( w_fp[0], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[20] );
+    FFV1_2( w_fp[3], w_fp[20], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] );
+    // Amplitude(s) for diagram number 65
+    FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram66( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 66 OF 123 ***
+    // Wavefunction(s) for diagram number 66
+    VVV1P0_1( w_fp[20], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[22] );
+    // Amplitude(s) for diagram number 66
+    FFV1_0( w_fp[3], w_fp[9], w_fp[22], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram67( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 67 OF 123 ***
+    // Wavefunction(s) for diagram number 67
+    // (none)
+    // Amplitude(s) for diagram number 67
+    FFV1_0( w_fp[15], w_fp[9], w_fp[20], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram68( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 68 OF 123 ***
+    // Wavefunction(s) for diagram number 68
+    FFV1_1( w_fp[2], w_fp[20], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] );
+    // Amplitude(s) for diagram number 68
+    FFV1_0( w_fp[16], w_fp[23], w_fp[4], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram69( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 69 OF 123 ***
+    // Wavefunction(s) for diagram number 69
+    // (none)
+    // Amplitude(s) for diagram number 69
+    FFV1_0( w_fp[16], w_fp[2], w_fp[22], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+  }
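All of these kernels accumulate color amplitudes through `J_ACCESS::kernelAccessIcol( jamps, icol )`, indexing into the `jamps[ncolor*2*nevtORneppV]` buffer. A standalone toy illustrating how such an accessor can work over a split real/imaginary layout (the layout, the `neppV` value and all names below are assumptions for illustration, not the plugin's actual `J_ACCESS` implementation):

```cpp
#include <complex>
#include <cstdio>

constexpr int neppV = 4;   // events per SIMD page (illustrative value)
constexpr int ncolor = 24; // color amplitudes per event, as in this process

// Minimal reference proxy: acts like a complex<double>& over split re/im storage
struct cxref
{
  double& re;
  double& im;
  cxref& operator+=( const std::complex<double>& a ) { re += a.real(); im += a.imag(); return *this; }
  cxref& operator-=( const std::complex<double>& a ) { re -= a.real(); im -= a.imag(); return *this; }
};

// Toy kernelAccessIcol: color amplitude icol of event ievt in a jamps[ncolor*2*neppV] page
inline cxref kernelAccessIcol( double* jamps, int icol, int ievt )
{
  return { jamps[( icol * 2 + 0 ) * neppV + ievt],
           jamps[( icol * 2 + 1 ) * neppV + ievt] };
}

int main()
{
  double jamps[ncolor * 2 * neppV] = {};    // zeroed jamp buffer for one event page
  const std::complex<double> amp( 0., 1. ); // stands in for cxtype( 0, 1 ) * amp_sv[0]
  kernelAccessIcol( jamps, 5, 0 ) += amp;   // cf. the jamps[5] update in diagram68
  kernelAccessIcol( jamps, 19, 0 ) -= amp;  // cf. the jamps[19] update in diagram68
  std::printf( "jamp[5] = ( %g, %g )\n", jamps[5 * 2 * neppV], jamps[( 5 * 2 + 1 ) * neppV] );
  return 0;
}
```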
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram70( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 70 OF 123 ***
+    // Wavefunction(s) for diagram number 70
+    // (none)
+    // Amplitude(s) for diagram number 70
+    FFV1_0( w_fp[16], w_fp[11], w_fp[20], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram71( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 71 OF 123 ***
+    // Wavefunction(s) for diagram number 71
+    // (none)
+    // Amplitude(s) for diagram number 71
+    FFV1_0( w_fp[3], w_fp[23], w_fp[6], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram72( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 72 OF 123 ***
+    // Wavefunction(s) for diagram number 72
+    // (none)
+    // Amplitude(s) for diagram number 72
+    FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram73( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 73 OF 123 ***
+    // Wavefunction(s) for diagram number 73
+    // (none)
+    // Amplitude(s) for diagram number 73
+    VVV1_0( w_fp[20], w_fp[6], w_fp[7], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram74( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 74 OF 123 ***
+    // Wavefunction(s) for diagram number 74
+    // (none)
+    // Amplitude(s) for diagram number 74
+    VVVV1_0( w_fp[20], w_fp[1], w_fp[7], w_fp[4], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVVV3_0( w_fp[20], w_fp[1], w_fp[7], w_fp[4], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVVV4_0( w_fp[20], w_fp[1], w_fp[7], w_fp[4], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram75( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 75 OF 123 ***
+    // Wavefunction(s) for diagram number 75
+    VVV1P0_1( w_fp[20], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[12] );
+    // Amplitude(s) for diagram number 75
+    VVV1_0( w_fp[7], w_fp[4], w_fp[12], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram76( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 76 OF 123 ***
+    // Wavefunction(s) for diagram number 76
+    // (none)
+    // Amplitude(s) for diagram number 76
+    VVV1_0( w_fp[1], w_fp[7], w_fp[22], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram77( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 77 OF 123 ***
+    // Wavefunction(s) for diagram number 77
+    // (none)
+    // Amplitude(s) for diagram number 77
+    FFV1_0( w_fp[3], w_fp[11], w_fp[12], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram78( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 78 OF 123 ***
+    // Wavefunction(s) for diagram number 78
+    // (none)
+    // Amplitude(s) for diagram number 78
+    FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
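Because every kernel from diagram49 onwards exposes the identical parameter list, a driver can treat them uniformly. A simplified CPU-style sketch of that dispatch idea, with stand-in types and stub diagrams (the real kernels are `__global__` and are launched differently on GPU; nothing below is the generated driver itself):

```cpp
#include <cstdio>

using fptype = double;

// One signature for every diagram, mirroring the uniform interface above
using DiagramFn = void ( * )( fptype* wfs, fptype* jamps, const unsigned int* channelIds,
                              const fptype** COUPs, fptype* numerators, fptype* denominators );

static void diagramA( fptype*, fptype* jamps, const unsigned int*, const fptype**, fptype*, fptype* )
{
  jamps[0] += 1.; // stand-in for a real diagram's jamp updates
}

static void diagramB( fptype*, fptype* jamps, const unsigned int*, const fptype**, fptype*, fptype* )
{
  jamps[1] -= 1.;
}

int main()
{
  fptype jamps[2] = {};
  const DiagramFn diagrams[] = { diagramA, diagramB };
  // Without multichannel support the three SDE pointers stay nullptr,
  // matching the sanity check performed by the generated boilerplate
  for( DiagramFn d : diagrams )
    d( nullptr, jamps, nullptr, nullptr, nullptr, nullptr );
  std::printf( "jamps = { %g, %g }\n", jamps[0], jamps[1] );
  return 0;
}
```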
J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram79( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 79 OF 123 *** + // Wavefunction(s) for diagram number 79 + // (none) + // Amplitude(s) for diagram number 79 + FFV1_0( w_fp[15], w_fp[2], w_fp[12], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram80( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 80 OF 123 *** + // Wavefunction(s) for diagram number 80 + // (none) + // Amplitude(s) for diagram number 80 + FFV1_0( w_fp[15], w_fp[23], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram81( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* 
channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 81 OF 123 *** + // Wavefunction(s) for diagram number 81 + FFV1_1( w_fp[9], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] ); + // Amplitude(s) for diagram number 81 + FFV1_0( w_fp[15], w_fp[23], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram82( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 82 OF 123 *** + // Wavefunction(s) for diagram number 82 + FFV1_2( w_fp[15], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); + // Amplitude(s) for diagram number 82 + FFV1_0( w_fp[12], w_fp[9], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram83( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // 
input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 83 OF 123 *** + // Wavefunction(s) for diagram number 83 + // (none) + // Amplitude(s) for diagram number 83 + FFV1_0( w_fp[13], w_fp[23], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram84( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 84 OF 123 *** + // Wavefunction(s) for diagram number 84 + FFV1_2( w_fp[13], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] ); + // Amplitude(s) for diagram number 84 + FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram85( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 85 OF 123 *** + // Wavefunction(s) for diagram number 85 + // (none) + // Amplitude(s) for diagram number 85 + FFV1_0( w_fp[3], w_fp[23], w_fp[10], 
+ // *** DIAGRAM 83 OF 123 ***
+ // Wavefunction(s) for diagram number 83
+ // (none)
+ // Amplitude(s) for diagram number 83
+ FFV1_0( w_fp[13], w_fp[23], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram84( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 84 OF 123 ***
+ // Wavefunction(s) for diagram number 84
+ FFV1_2( w_fp[13], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] );
+ // Amplitude(s) for diagram number 84
+ FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram85( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 85 OF 123 ***
+ // Wavefunction(s) for diagram number 85
+ // (none)
+ // Amplitude(s) for diagram number 85
+ FFV1_0( w_fp[3], w_fp[23], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram86( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 86 OF 123 ***
+ // Wavefunction(s) for diagram number 86
+ VVV1P0_1( w_fp[0], w_fp[10], COUPs[0], 1.0, 0., 0., w_fp[23] );
+ // Amplitude(s) for diagram number 86
+ FFV1_0( w_fp[3], w_fp[9], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram87( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 87 OF 123 ***
+ // Wavefunction(s) for diagram number 87
+ FFV1_2( w_fp[16], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[22] );
+ // Amplitude(s) for diagram number 87
+ FFV1_0( w_fp[22], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram88( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 88 OF 123 ***
+ // Wavefunction(s) for diagram number 88
+ FFV1_1( w_fp[11], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[20] );
+ // Amplitude(s) for diagram number 88
+ FFV1_0( w_fp[16], w_fp[20], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram89( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 89 OF 123 ***
+ // Wavefunction(s) for diagram number 89
+ // (none)
+ // Amplitude(s) for diagram number 89
+ FFV1_0( w_fp[22], w_fp[14], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram90( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 90 OF 123 ***
+ // Wavefunction(s) for diagram number 90
+ FFV1_1( w_fp[14], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[24] );
+ // Amplitude(s) for diagram number 90
+ FFV1_0( w_fp[16], w_fp[24], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram91( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 91 OF 123 ***
+ // Wavefunction(s) for diagram number 91
+ // (none)
+ // Amplitude(s) for diagram number 91
+ FFV1_0( w_fp[22], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram92( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 92 OF 123 ***
+ // Wavefunction(s) for diagram number 92
+ // (none)
+ // Amplitude(s) for diagram number 92
+ FFV1_0( w_fp[16], w_fp[2], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram93( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 93 OF 123 ***
+ // Wavefunction(s) for diagram number 93
+ // (none)
+ // Amplitude(s) for diagram number 93
+ VVVV1_0( w_fp[0], w_fp[6], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ VVVV3_0( w_fp[0], w_fp[6], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ VVVV4_0( w_fp[0], w_fp[6], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram94( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 94 OF 123 ***
+ // Wavefunction(s) for diagram number 94
+ VVV1P0_1( w_fp[0], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[22] );
+ // Amplitude(s) for diagram number 94
+ VVV1_0( w_fp[7], w_fp[5], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram95( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 95 OF 123 ***
+ // Wavefunction(s) for diagram number 95
+ VVV1P0_1( w_fp[0], w_fp[7], COUPs[0], 1.0, 0., 0., w_fp[25] );
+ // Amplitude(s) for diagram number 95
+ VVV1_0( w_fp[6], w_fp[5], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram96( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 96 OF 123 ***
+ // Wavefunction(s) for diagram number 96
+ // (none)
+ // Amplitude(s) for diagram number 96
+ FFV1_0( w_fp[3], w_fp[14], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram97( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 97 OF 123 ***
+ // Wavefunction(s) for diagram number 97
+ // (none)
+ // Amplitude(s) for diagram number 97
+ FFV1_0( w_fp[3], w_fp[24], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram98( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 98 OF 123 ***
+ // Wavefunction(s) for diagram number 98
+ // (none)
+ // Amplitude(s) for diagram number 98
+ FFV1_0( w_fp[13], w_fp[2], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram99( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 99 OF 123 ***
+ // Wavefunction(s) for diagram number 99
+ // (none)
+ // Amplitude(s) for diagram number 99
+ FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram100( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 100 OF 123 ***
+ // Wavefunction(s) for diagram number 100
+ // (none)
+ // Amplitude(s) for diagram number 100
+ VVVV1_0( w_fp[0], w_fp[18], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+ VVVV3_0( w_fp[0], w_fp[18], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+ VVVV4_0( w_fp[0], w_fp[18], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram101( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 101 OF 123 ***
+ // Wavefunction(s) for diagram number 101
+ VVV1P0_1( w_fp[0], w_fp[18], COUPs[0], 1.0, 0., 0., w_fp[6] );
+ // Amplitude(s) for diagram number 101
+ VVV1_0( w_fp[7], w_fp[4], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram102( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 102 OF 123 ***
+ // Wavefunction(s) for diagram number 102
+ // (none)
+ // Amplitude(s) for diagram number 102
+ VVV1_0( w_fp[18], w_fp[4], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram103( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 103 OF 123 ***
+ // Wavefunction(s) for diagram number 103
+ // (none)
+ // Amplitude(s) for diagram number 103
+ FFV1_0( w_fp[3], w_fp[11], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram104( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 104 OF 123 ***
+ // Wavefunction(s) for diagram number 104
+ // (none)
+ // Amplitude(s) for diagram number 104
+ FFV1_0( w_fp[3], w_fp[20], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram105( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 105 OF 123 ***
+ // Wavefunction(s) for diagram number 105
+ // (none)
+ // Amplitude(s) for diagram number 105
+ FFV1_0( w_fp[15], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram106( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 106 OF 123 ***
+ // Wavefunction(s) for diagram number 106
+ // (none)
+ // Amplitude(s) for diagram number 106
+ FFV1_0( w_fp[12], w_fp[2], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram107( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 107 OF 123 ***
+ // Wavefunction(s) for diagram number 107
+ // (none)
+ // Amplitude(s) for diagram number 107
+ VVVV1_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ VVVV3_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ VVVV4_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram108( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 108 OF 123 ***
+ // Wavefunction(s) for diagram number 108
+ // (none)
+ // Amplitude(s) for diagram number 108
+ VVV1_0( w_fp[1], w_fp[10], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram109( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 109 OF 123 ***
+ // Wavefunction(s) for diagram number 109
+ // (none)
+ // Amplitude(s) for diagram number 109
+ VVV1_0( w_fp[1], w_fp[7], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram110( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 110 OF 123 ***
+ // Wavefunction(s) for diagram number 110
+ // (none)
+ // Amplitude(s) for diagram number 110
+ FFV1_0( w_fp[13], w_fp[20], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
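+ // A hedged usage sketch (the generated caller is outside this hunk): each diagramXXX kernel is
+ // expected to be invoked in sequence on the same buffers, e.g.
+ //   diagram110( wfs, jamps, channelIds, couplings, numerators, denominators );
+ //   diagram111( wfs, jamps, channelIds, couplings, numerators, denominators );
+ // (the C++ build passes COUPs instead of couplings), so that jamps accumulates the colour
+ // flows of all 123 diagrams for this helicity before they are squared.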
uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 111 OF 123 *** + // Wavefunction(s) for diagram number 111 + // (none) + // Amplitude(s) for diagram number 111 + FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram112( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 112 OF 123 *** + // Wavefunction(s) for diagram number 112 + // (none) + // Amplitude(s) for diagram number 112 + FFV1_0( w_fp[15], w_fp[24], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram113( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 113 OF 123 *** + // Wavefunction(s) for diagram number 113 + // (none) + // Amplitude(s) for diagram number 113 + FFV1_0( w_fp[12], w_fp[14], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates 
numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram114( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 114 OF 123 *** + // Wavefunction(s) for diagram number 114 + VVVV1P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[12] ); + VVVV3P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[24] ); + VVVV4P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[21] ); + // Amplitude(s) for diagram number 114 + VVV1_0( w_fp[12], w_fp[7], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[24], w_fp[7], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[21], w_fp[7], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram115( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 115 OF 123 *** + // Wavefunction(s) for diagram number 115 + // (none) + // Amplitude(s) for diagram number 115 + FFV1_0( w_fp[3], w_fp[14], w_fp[12], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[14], w_fp[24], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[14], w_fp[21], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram116( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // 
+ __global__ void
+ diagram116( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 116 OF 123 ***
+ // Wavefunction(s) for diagram number 116
+ // (none)
+ // Amplitude(s) for diagram number 116
+ FFV1_0( w_fp[13], w_fp[2], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+ FFV1_0( w_fp[13], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+ FFV1_0( w_fp[13], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram117( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 117 OF 123 ***
+ // Wavefunction(s) for diagram number 117
+ VVVV1P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[21] );
+ VVVV3P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[13] );
+ VVVV4P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[24] );
+ // Amplitude(s) for diagram number 117
+ VVV1_0( w_fp[21], w_fp[7], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+ VVV1_0( w_fp[13], w_fp[7], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+ VVV1_0( w_fp[24], w_fp[7], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram118( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 118 OF 123 ***
+ // Wavefunction(s) for diagram number 118
+ // (none)
+ // Amplitude(s) for diagram number 118
+ FFV1_0( w_fp[3], w_fp[11], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+ FFV1_0( w_fp[3], w_fp[11], w_fp[13], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+ FFV1_0( w_fp[3], w_fp[11], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram119( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 119 OF 123 ***
+ // Wavefunction(s) for diagram number 119
+ // (none)
+ // Amplitude(s) for diagram number 119
+ FFV1_0( w_fp[15], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+ FFV1_0( w_fp[15], w_fp[2], w_fp[13], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+ FFV1_0( w_fp[15], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram120( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 120 OF 123 ***
+ // Wavefunction(s) for diagram number 120
+ VVVV1P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[24] );
+ VVVV3P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[15] );
+ VVVV4P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[13] );
+ // Amplitude(s) for diagram number 120
+ FFV1_0( w_fp[3], w_fp[9], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+ FFV1_0( w_fp[3], w_fp[9], w_fp[15], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+ FFV1_0( w_fp[3], w_fp[9], w_fp[13], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram121( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 121 OF 123 ***
+ // Wavefunction(s) for diagram number 121
+ // (none)
+ // Amplitude(s) for diagram number 121
+ FFV1_0( w_fp[16], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+ FFV1_0( w_fp[16], w_fp[2], w_fp[15], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+ FFV1_0( w_fp[16], w_fp[2], w_fp[13], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram122( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 122 OF 123 ***
+ // Wavefunction(s) for diagram number 122
+ // (none)
+ // Amplitude(s) for diagram number 122
+ VVV1_0( w_fp[24], w_fp[1], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ VVV1_0( w_fp[15], w_fp[1], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ VVV1_0( w_fp[13], w_fp[1], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram123( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 123 OF 123 ***
+ // Wavefunction(s) for diagram number 123
+ // (none)
+ // Amplitude(s) for diagram number 123
+ VVV1_0( w_fp[0], w_fp[17], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0];
+ VVV1_0( w_fp[0], w_fp[19], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+ VVV1_0( w_fp[0], w_fp[8], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+/* clang-format on */
diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/color_sum.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/color_sum.h
new file mode 100644
index 0000000000..71b5b1eaad
--- /dev/null
+++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/color_sum.h
@@ -0,0 +1,105 @@
+// Copyright (C) 2020-2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin.
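For orientation on what this new header computes: the color sum turns the ncolor jamp color amplitudes of one event into a matrix-element contribution via the quadratic form deltaME = sum_{i,j} Re( conj(jamp[i]) * cf[i][j] * jamp[j] ), with cf the real color matrix. A per-event sketch under those assumptions (cf, jamp, deltaME and the cx helpers as used elsewhere in the plugin; this is not the batched implementation declared below):

    fptype deltaME = 0;
    for( int icol = 0; icol < ncolor; icol++ )
    {
      cxtype ztemp = cxmake( 0, 0 );
      for( int jcol = 0; jcol < ncolor; jcol++ ) ztemp += cf[icol][jcol] * jamp[jcol];
      deltaME += cxreal( ztemp * cxconj( jamp[icol] ) );
    }

Because cf is real, the real and imaginary planes of jamp enter through two independent real matrix products over all events, which is what makes the cuBLAS/hipBLAS GEMM path declared in this header applicable.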
+
+#ifndef COLOR_SUM_H
+#define COLOR_SUM_H 1
+
+#include "mgOnGpuConfig.h"
+
+#include "mgOnGpuVectors.h"
+
+#include "CPPProcess.h"
+#include "GpuAbstraction.h"
+
+#ifdef MGONGPUCPP_GPUIMPL
+namespace mg5amcGpu
+#else
+namespace mg5amcCpu
+#endif
+{
+ //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+ class DeviceAccessJamp
+ {
+ public:
+ static __device__ inline cxtype_ref
+ kernelAccessIcol( fptype* buffer, const int icol )
+ {
+ const int ncolor = CPPProcess::ncolor; // the number of leading colors
+ const int nevt = gridDim.x * blockDim.x;
+ const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+ // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last)
+ //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old"
+ // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last)
+ // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS
+ return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1"
+ // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last)
+ //return cxtype_ref( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2"
+ }
+ static __device__ inline const cxtype
+ kernelAccessIcolConst( const fptype* buffer, const int icol )
+ {
+ const int ncolor = CPPProcess::ncolor; // the number of leading colors
+ const int nevt = gridDim.x * blockDim.x;
+ const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+ // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last)
+ //return cxtype( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old"
+ // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last)
+ // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS
+ return cxtype( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1"
+ // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last)
+ //return cxtype( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2"
+ }
+ };
+#else
+ class HostAccessJamp
+ {
+ public:
+ static inline cxtype_sv&
+ kernelAccessIcol( cxtype_sv* buffer, const int icol )
+ {
+ return buffer[icol];
+ }
+ static inline cxtype_sv&
+ kernelAccessIcol( fptype* buffer, const int icol )
+ {
+ return reinterpret_cast<cxtype_sv*>( buffer )[icol];
+ }
+ };
+#endif
+
+ //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+ void createNormalizedColorMatrix();
+#endif
+
+ //--------------------------------------------------------------------------
+
+#ifndef MGONGPUCPP_GPUIMPL
+ void
+ color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity
+ const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity
+ const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid)
+#endif
+
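To make the "new1" striding above concrete: the jamps buffer holds two contiguous ncolor x nevt real matrices, first all real parts, then all imaginary parts, with the event index fastest. Event-index-last means consecutive CUDA threads touch consecutive addresses (coalesced access), and each color row is a contiguous, BLAS-friendly vector of length nevt. Hypothetical index helpers (not part of the header), mirroring the arithmetic in DeviceAccessJamp:

    // real part of color icol for event ievt: first ncolor*nevt plane
    inline int jampIndexReal( int icol, int ievt, int ncolor, int nevt ) { return 0 * ncolor * nevt + icol * nevt + ievt; }
    // imaginary part: same offset within the second ncolor*nevt plane
    inline int jampIndexImag( int icol, int ievt, int ncolor, int nevt ) { return 1 * ncolor * nevt + icol * nevt + ievt; }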
+ //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+ void
+ color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity
+ const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity
+ fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity
+ gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream)
+ gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+ const int gpublocks, // input: cuda gpublocks
+ const int gputhreads ); // input: cuda gputhreads
+#endif
+
+ //--------------------------------------------------------------------------
+}
+
+#endif // COLOR_SUM_H
diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk
index 20d8ded718..e7360b29e2 100644
--- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk
@@ -1,7 +1,7 @@
-# Copyright (C) 2020-2024 CERN and UCLouvain.
+# Copyright (C) 2020-2025 CERN and UCLouvain.
 # Licensed under the GNU Lesser General Public License (version 3 or later).
 # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin.
-# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.
 
 #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html)
 #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts
@@ -114,7 +114,7 @@ export CXXFLAGS
 override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null))
 
 # Set HIP_HOME from the path to hipcc, if it exists
-override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null))
+override HIP_HOME = $(shell hipconfig --rocmpath)
 
 # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965)
 ifeq ($(CUDA_HOME),)
@@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda)
 
 else ifeq ($(BACKEND),hip)
 
+ # example architecture values MI200:gfx90a, MI300X:gfx942
+ MADGRAPH_HIP_ARCHITECTURE ?= gfx942
 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists)
 GPUCC = $(HIP_HOME)/bin/hipcc
 XCOMPILERFLAG =
@@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip)
 ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY
 
 # AMD HIP architecture flags
- GPUARCHFLAGS = --offload-arch=gfx90a
+ GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE}
 GPUFLAGS += $(GPUARCHFLAGS)
 
 # Other AMD-specific flags
@@ -477,6 +479,34 @@ endif
 
 #-------------------------------------------------------------------------------
 
+#=== Configure defaults and check if user-defined choices exist for HASBLAS
+
+# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS
+
+ifeq ($(HASBLAS),)
+ ifeq ($(GPUCC),) # CPU-only build
+ override HASBLAS = hasNoBlas
+ else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+ ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),)
+ # cuBLAS headers do not exist??
+ override HASBLAS = hasNoBlas
+ else
+ override HASBLAS = hasBlas
+ endif
+ else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build
+ ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),)
+ # hipBLAS headers do not exist??
+ override HASBLAS = hasNoBlas
+ else
+ override HASBLAS = hasBlas
+ endif
+ else
+ override HASBLAS = hasNoBlas
+ endif
+endif
+
+#-------------------------------------------------------------------------------
+
 #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD
 
 # Set the build flags appropriate to OMPFLAGS
@@ -597,6 +627,30 @@ endif
 #$(info RNDCXXFLAGS=$(RNDCXXFLAGS))
 #$(info RNDLIBFLAGS=$(RNDLIBFLAGS))
 
+#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS
+
+$(info HASBLAS=$(HASBLAS))
+override BLASCXXFLAGS=
+override BLASLIBFLAGS=
+
+# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas")
+ifeq ($(HASBLAS),hasNoBlas)
+ override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS
+else ifeq ($(HASBLAS),hasBlas)
+ ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+ override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas
+ else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build
+ override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas
+ endif
+else
+ $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported)
+endif
+CXXFLAGS += $(BLASCXXFLAGS)
+GPUFLAGS += $(BLASCXXFLAGS)
+
+#$(info BLASCXXFLAGS=$(BLASCXXFLAGS))
+#$(info BLASLIBFLAGS=$(BLASLIBFLAGS))
+
 #-------------------------------------------------------------------------------
 
 #=== Configure Position-Independent Code
@@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
 MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX)
-gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o
+gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o
 gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o
 endif
 
@@ -799,7 +853,7 @@ ifneq ($(GPUCC),)
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib)
-	$(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB)
+	$(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS)
 # Bypass std::filesystem completely to ease portability on LUMI #803
 #ifneq ($(findstring hipcc,$(GPUCC)),)
 #	$(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs
@@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/diagram_boilerplate.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/diagram_boilerplate.h new file mode 100644 index 0000000000..96a34fb1bf --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/diagram_boilerplate.h @@ -0,0 +1,103 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin. 
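A note on how this new header is consumed: diagram_boilerplate.h is not a conventional include but a code fragment textually pasted inside every diagramXXX function body, so each generated kernel effectively expands to the pattern below (schematic only, with the boilerplate contents elided and the parameter list abbreviated):

    __global__ void
    diagramNNN( fptype* wfs, fptype* jamps, const unsigned int* channelIds, /* couplings or COUPs */ fptype* numerators, fptype* denominators )
    {
    #include "diagram_boilerplate.h" // binds w_fp, COUPs, amp_sv/amp_fp, channelId, numerators_sv/denominators_sv
      // ... wavefunction and amplitude calls for diagram NNN, accumulating into jamps via J_ACCESS ...
    }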
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-local-typedefs" // for CI_ACCESS and CD_ACCESS
+
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+
+ //-------------
+ // GPU only
+ //-------------
+
+ //using namespace mg5amcGpu;
+ using W_ACCESS = DeviceAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events
+ using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event
+ using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events
+ using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event
+ using J_ACCESS = DeviceAccessJamp;
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events
+ using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events
+ // SCALAR channelId for the current event (CUDA)
+ unsigned int channelId = gpu_channelId( channelIds );
+#endif
+
+ // Wavefunctions
+ // Buffer wfs for one helicity and nevt events is a DeviceBufferSimple with ( nwf * nevt * nw6 * nx2 ) fptypes
+ // The striding between the nwf wavefunction buffers is ( nevt * nw6 * nx2 ) fptypes
+ // Internally diagramXXX methods pass a w_fp[iwf] to ixx/FFV methods (as argument 'fptype wavefunctions[]')
+ // Internally ixx/FFV methods call 'cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions )' and then use fi[iw6]
+ // This means that the fi pointer must point to a [RIRIRIRIRIRI] contiguous buffer of size nw6*nx2=12
+ // The striding between events is nw6*nx2=12 and this is what W_ACCESS::kernelAccess must respect
+ // (En passant, note that this means that events cannot be contiguous in the present code, memory is not coalesced)
+ const int nevt = gridDim.x * blockDim.x;
+ fptype* w_fp[nwf];
+ for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = wfs + iwf * nevt * nw6 * mgOnGpu::nx2;
+
+ // Couplings
+ constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup)
+ const fptype* allCOUPs[nxcoup];
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
+#pragma nv_diagnostic push
+#pragma nv_diag_suppress 186 // e.g. <<warning #186-D: pointless comparison of unsigned integer with zero>>
+#endif
+ // Dependent couplings, vary event-by-event
+ for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
+ allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup );
+ // Independent couplings, fixed for all events
+ for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup)
+ allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup );
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
+#pragma nv_diagnostic pop
+#endif
+ const fptype* COUPs[nxcoup];
+ for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
+
+#else
+
+ //-------------
+ // C++ only
+ //-------------
+
+ //using namespace mg5amcCpu;
+ using W_ACCESS = HostAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events
+ using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event
+ using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events
+ using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event
+ using J_ACCESS = HostAccessJamp;
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events
+ using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events
+ // SCALAR channelId for the current SIMD event page (C++)
+ unsigned int channelId = *channelIds;
+#endif
+
+ // Wavefunctions
+ // Reinterpret wfs as "cxtype_sv w_sv[nwf][nw6]" and build "fptype* w_fp[nwf]" where "w_fp[iwf] = (fptype*)( w_sv[iwf] )"
+ fptype (*w_fp)[nw6 * neppV * mgOnGpu::nx2] = (fptype (*)[nw6 * neppV * mgOnGpu::nx2])(wfs);
+
+#endif
+
+ //-------------
+ // GPU or C++
+ //-------------
+
+ // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV)
+ cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram
+ fptype* amp_fp; // proof of concept for using fptype* in the interface
+ amp_fp = reinterpret_cast<fptype*>( amp_sv );
+
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
+ fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+ fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
+#else
+ assert( channelIds == nullptr );
+ assert( numerators == nullptr );
+ assert( denominators == nullptr );
+#endif /* clang-format on */
+
+#pragma GCC diagnostic pop
diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc
index 4eec5db13c..216a90a302 100644
--- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc
+++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h index 53dd560ed6..c30f753dcb 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h @@ -2,13 +2,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Sep 2010) for the MG5aMC backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -860,7 +860,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6] - template + template __device__ INLINE void VVV1_0( const fptype allV1[], const fptype allV2[], @@ -872,7 +872,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ INLINE void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -885,7 +885,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ INLINE void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -897,7 +897,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ INLINE void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -910,7 +910,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ INLINE void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -923,7 +923,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ INLINE void FFV1P0_3( const fptype allF1[], const fptype allF2[], @@ -936,7 +936,7 @@ namespace mg5amcCpu 
//-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV1_0( const fptype allV1[], const fptype allV2[], @@ -949,7 +949,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -963,7 +963,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV3_0( const fptype allV1[], const fptype allV2[], @@ -976,7 +976,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV3P0_1( const fptype allV2[], const fptype allV3[], @@ -990,7 +990,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV4_0( const fptype allV1[], const fptype allV2[], @@ -1003,7 +1003,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV4P0_1( const fptype allV2[], const fptype allV3[], @@ -1017,7 +1017,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6] - template + template __device__ void VVV1_0( const fptype allV1[], const fptype allV2[], @@ -1030,7 +1030,7 @@ namespace mg5amcCpu const cxtype_sv* V1 = W_ACCESS::kernelAccessConst( allV1 ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P1[4] = { +cxreal( V1[0] ), +cxreal( V1[1] ), +cximag( V1[1] ), +cximag( V1[0] ) }; @@ -1053,7 +1053,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -1066,7 +1066,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. 
); const fptype_sv P2[4] = { +cxreal( V2[0] ), +cxreal( V2[1] ), +cximag( V2[1] ), +cximag( V2[0] ) }; @@ -1091,7 +1091,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -1104,7 +1104,7 @@ namespace mg5amcCpu const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const cxtype_sv TMP9 = ( F1[2] * ( F2[4] * ( V3[2] + V3[5] ) + F2[5] * ( V3[3] + cI * V3[4] ) ) + ( F1[3] * ( F2[4] * ( V3[3] - cI * V3[4] ) + F2[5] * ( V3[2] - V3[5] ) ) + ( F1[4] * ( F2[2] * ( V3[2] - V3[5] ) - F2[3] * ( V3[3] + cI * V3[4] ) ) + F1[5] * ( F2[2] * ( -V3[3] + cI * V3[4] ) + F2[3] * ( V3[2] + V3[5] ) ) ) ) ); @@ -1116,7 +1116,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -1129,7 +1129,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F1 = W_ACCESS::kernelAccess( allF1 ); const cxtype cI = cxmake( 0., 1. ); F1[0] = +F2[0] + V3[0]; @@ -1148,7 +1148,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -1161,7 +1161,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F2 = W_ACCESS::kernelAccess( allF2 ); const cxtype cI = cxmake( 0., 1. ); F2[0] = +F1[0] + V3[0]; @@ -1180,7 +1180,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ void FFV1P0_3( const fptype allF1[], const fptype allF2[], @@ -1193,7 +1193,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V3 = W_ACCESS::kernelAccess( allV3 ); const cxtype cI = cxmake( 0., 1. 
); V3[0] = +F1[0] + F2[0]; @@ -1211,7 +1211,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ void VVVV1_0( const fptype allV1[], const fptype allV2[], @@ -1226,7 +1226,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const cxtype_sv TMP10 = ( V1[2] * V4[2] - V1[3] * V4[3] - V1[4] * V4[4] - V1[5] * V4[5] ); @@ -1241,7 +1241,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ void VVVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -1256,7 +1256,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); V1[0] = +V2[0] + V3[0] + V4[0]; @@ -1276,7 +1276,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ void VVVV3_0( const fptype allV1[], const fptype allV2[], @@ -1291,7 +1291,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const cxtype_sv TMP1 = ( V2[2] * V1[2] - V2[3] * V1[3] - V2[4] * V1[4] - V2[5] * V1[5] ); @@ -1306,7 +1306,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ void VVVV3P0_1( const fptype allV2[], const fptype allV3[], @@ -1321,7 +1321,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. 
     V1[0] = +V2[0] + V3[0] + V4[0];
@@ -1341,7 +1341,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------

   // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6]
-  template<class W_ACCESS, class A_ACCESS, class C_ACCESS>
+  template<class W_ACCESS, class A_ACCESS, class CD_ACCESS>
   __device__ void
   VVVV4_0( const fptype allV1[],
            const fptype allV2[],
@@ -1356,7 +1356,7 @@ namespace mg5amcCpu
     const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 );
     const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 );
     const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 );
-    const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP );
+    const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP );
     cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes );
     const cxtype cI = cxmake( 0., 1. );
     const cxtype_sv TMP1 = ( V2[2] * V1[2] - V2[3] * V1[3] - V2[4] * V1[4] - V2[5] * V1[5] );
@@ -1371,7 +1371,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------

   // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6]
-  template<class W_ACCESS, class C_ACCESS>
+  template<class W_ACCESS, class CD_ACCESS>
   __device__ void
   VVVV4P0_1( const fptype allV2[],
              const fptype allV3[],
@@ -1386,7 +1386,7 @@ namespace mg5amcCpu
     const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 );
     const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 );
     const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 );
-    const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP );
+    const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP );
     cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 );
     const cxtype cI = cxmake( 0., 1. );
     V1[0] = +V2[0] + V3[0] + V4[0];
diff --git a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc
index 47a3a011b8..fd5642f3e3 100644
--- a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc
+++ b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc
@@ -1,13 +1,13 @@
 // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors.
 // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend.
 //==========================================================================
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h
index 76066c7bb1..f4b086fc96 100644
--- a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h
+++ b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h
@@ -1,13 +1,13 @@
 // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors.
 // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend.
 //==========================================================================
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
@@ -310,7 +310,7 @@ namespace mg5amcCpu
 #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <>
 #endif
   // Compute the output couplings (e.g. gc10 and gc11) from the input gs
-  template<class G_ACCESS, class C_ACCESS>
+  template<class G_ACCESS, class CD_ACCESS>
   __device__ inline void
   G2COUP( const fptype gs[],
           fptype couplings[],
@@ -320,12 +320,12 @@ namespace mg5amcCpu
     using namespace Parameters_sm_dependentCouplings;
     const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs );
     DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr );
-    fptype* GC_10s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 );
-    fptype* GC_11s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 );
-    fptype* GC_12s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_12 );
-    cxtype_sv_ref GC_10s_sv = C_ACCESS::kernelAccess( GC_10s );
-    cxtype_sv_ref GC_11s_sv = C_ACCESS::kernelAccess( GC_11s );
-    cxtype_sv_ref GC_12s_sv = C_ACCESS::kernelAccess( GC_12s );
+    fptype* GC_10s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 );
+    fptype* GC_11s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 );
+    fptype* GC_12s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_12 );
+    cxtype_sv_ref GC_10s_sv = CD_ACCESS::kernelAccess( GC_10s );
+    cxtype_sv_ref GC_11s_sv = CD_ACCESS::kernelAccess( GC_11s );
+    cxtype_sv_ref GC_12s_sv = CD_ACCESS::kernelAccess( GC_12s );
     GC_10s_sv = couplings_sv.GC_10;
     GC_11s_sv = couplings_sv.GC_11;
     GC_12s_sv = couplings_sv.GC_12;
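The only functional change in the HelAmps and G2COUP hunks above is the rename of the coupling accessor template parameter from C_ACCESS to CD_ACCESS, presumably to mark it as the accessor for the event-dependent couplings (the G2COUP body works with Parameters_sm_dependentCouplings). The standalone C++ sketch below illustrates the accessor-class idiom behind that parameter; every name in it is an illustrative stand-in, not one of the plugin's real classes.

// Illustrative sketch only (simplified, assumed types): a static accessor passed
// as a template parameter decides how a kernel fetches a coupling from a raw
// buffer, so one kernel body can serve any memory layout without runtime dispatch.
#include <complex>
#include <iostream>

typedef double fptype;
typedef std::complex<double> cxtype;

struct ScalarCouplingAccess // hypothetical accessor for a trivial re/im layout
{
  static cxtype kernelAccessConst( const fptype* buf ) { return cxtype( buf[0], buf[1] ); }
};

template<class CD_ACCESS> // same role as the renamed parameter in the hunks above
cxtype scaleAmplitude( const fptype allCOUP[], const cxtype& amp )
{
  const cxtype COUP = CD_ACCESS::kernelAccessConst( allCOUP );
  return COUP * amp; // a real HelAmps kernel would build TMP9 etc. instead
}

int main()
{
  const fptype coup[2] = { 0., 1. }; // the coupling "i"
  std::cout << scaleAmplitude<ScalarCouplingAccess>( coup, cxtype( 2., 0. ) ) << std::endl; // prints (0,2)
  return 0;
}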
diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h
index d3c4ca5695..8a2804aa04 100644
--- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.

 #ifndef MGONGPUCONFIG_H
 #define MGONGPUCONFIG_H 1
@@ -74,6 +74,7 @@
 #define MGONGPU_FPTYPE2_DOUBLE 1 // default
 //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster
 #endif
+
 // Choose whether to inline all HelAmps functions
 // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229)
 // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS
@@ -108,10 +109,23 @@
 #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float)
 #endif

+// Choose whether cuBLAS and hipBLAS are supported for generating random numbers
+// For both CUDA and HIP, by default, do not disable BLAS, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS
+// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?)
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
+//#undef MGONGPU_HAS_NO_BLAS // default
+////#define MGONGPU_HAS_NO_BLAS 1
+#elif defined __HIPCC__
+//#undef MGONGPU_HAS_NO_BLAS // default
+////#define MGONGPU_HAS_NO_BLAS 1
+#else
+#define MGONGPU_HAS_NO_BLAS 1
+#endif
+
 // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
 #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
 #undef MGONGPU_NSIGHT_DEBUG // default in CUDA
-//#define MGONGPU_NSIGHT_DEBUG 1
+//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED!
 #else
 #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++
 #endif /* clang-format on */
@@ -232,19 +246,19 @@
 using mgOnGpu::fptype2;
 #endif /* clang-format off */

-// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
+// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!]
 // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end)
-#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
-#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX];
-#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; }
-#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; }
-#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); }
-#else
+//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
+//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX];
+//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; }
+//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; }
+//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); }
+//#else
 #define mgDebugDeclare() /*noop*/
-#define mgDebugInitialise() { /*noop*/ }
-#define mgDebug( code, text ) { /*noop*/ }
-#define mgDebugFinalise() { /*noop*/ }
-#endif /* clang-format on */
+#define mgDebugInitialise() /*noop*/
+#define mgDebug( code, text ) /*noop*/
+#define mgDebugFinalise() /*noop*/
+//#endif /* clang-format on */

 // Define empty CUDA/HIP declaration specifiers for C++
 #ifndef MGONGPUCPP_GPUIMPL
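The new MGONGPU_HAS_NO_BLAS block above defaults to assuming that cuBLAS/hipBLAS are available in CUDA and HIP builds, always disables BLAS in plain C++ builds, and can be forced off externally with -DMGONGPU_HAS_NO_BLAS. A minimal standalone sketch of the same compile-time selection follows; the printed messages are illustrative only, not plugin output.

// Sketch of the default logic in the hunk above: plain C++ builds always define
// MGONGPU_HAS_NO_BLAS, while CUDA/HIP builds leave it unset unless the user
// compiles with -DMGONGPU_HAS_NO_BLAS on the command line.
#include <iostream>

#if defined __CUDACC__ || defined __HIPCC__
// GPU build: cuBLAS/hipBLAS assumed available unless disabled from outside
#else
#define MGONGPU_HAS_NO_BLAS 1 // C++ build: no BLAS, as in the new default above
#endif

int main()
{
#ifdef MGONGPU_HAS_NO_BLAS
  std::cout << "building without cuBLAS/hipBLAS" << std::endl;
#else
  std::cout << "building with cuBLAS/hipBLAS support" << std::endl;
#endif
  return 0;
}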
diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h
index 92d74fd6db..e98e925f2a 100644
--- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h
+++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin.

 #ifndef MGONGPUCXTYPES_H
 #define MGONGPUCXTYPES_H 1
@@ -717,12 +717,24 @@ namespace mg5amcCpu
     : m_preal( &r ), m_pimag( &i ) {} // copy (create from) const refs
     cxtype_ref& operator=( const cxtype_ref& ) = delete;
     //__host__ __device__ cxtype_ref& operator=( cxtype_ref&& c ) {...} // REMOVED! Should copy refs or copy values? No longer needed in cxternary
-    __host__ __device__ cxtype_ref& operator=( const cxtype& c )
+    __host__ __device__ cxtype_ref& operator=( const cxtype& c ) // copy (assign) const values
     {
       *m_preal = cxreal( c );
       *m_pimag = cximag( c );
       return *this;
-    } // copy (assign) non-const values
+    }
+    __host__ __device__ cxtype_ref& operator+=( const cxtype& c )
+    {
+      *m_preal += cxreal( c );
+      *m_pimag += cximag( c );
+      return *this;
+    }
+    __host__ __device__ cxtype_ref& operator-=( const cxtype& c )
+    {
+      *m_preal -= cxreal( c );
+      *m_pimag -= cximag( c );
+      return *this;
+    }
     __host__ __device__ operator cxtype() const { return cxmake( *m_preal, *m_pimag ); }
   private:
     fptype* const m_preal; // const pointer to non-const fptype R
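cxtype_ref, extended above, is a proxy that presents a (real, imaginary) pair stored in two separate fptype slots as a single complex lvalue; the new operator+= and operator-= allow in-place accumulation through the proxy rather than a full read-build-assign round trip via operator=. Below is a standalone, host-only miniature with simplified types (the real class lives in mgOnGpuCxtypes.h and also carries __host__ __device__ qualifiers).

// Miniature of the cxtype_ref proxy, mirroring only the newly added operator+=.
#include <complex>
#include <iostream>

typedef double fptype;
typedef std::complex<double> cxtype;

class cxtype_ref_sketch
{
public:
  cxtype_ref_sketch( fptype& r, fptype& i ) : m_preal( &r ), m_pimag( &i ) {}
  cxtype_ref_sketch& operator+=( const cxtype& c ) // mirrors the added operator
  {
    *m_preal += c.real();
    *m_pimag += c.imag();
    return *this;
  }
  operator cxtype() const { return cxtype( *m_preal, *m_pimag ); }
private:
  fptype* const m_preal; // const pointer to non-const fptype (real part)
  fptype* const m_pimag; // const pointer to non-const fptype (imaginary part)
};

int main()
{
  fptype re = 1., im = 2.; // split storage, as in the SIMD/GPU buffers
  cxtype_ref_sketch ref( re, im );
  ref += cxtype( 0.5, -1. ); // accumulate straight into the two slots
  std::cout << cxtype( ref ) << std::endl; // prints (1.5,1)
  return 0;
}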
diff --git a/epochX/cudacpp/gg_ttgg.sa/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttgg.sa/test/cudacpp_test.mk
index f703a1ae7c..1f9f8bbc46 100644
--- a/epochX/cudacpp/gg_ttgg.sa/test/cudacpp_test.mk
+++ b/epochX/cudacpp/gg_ttgg.sa/test/cudacpp_test.mk
@@ -1,7 +1,7 @@
-# Copyright (C) 2020-2024 CERN and UCLouvain.
+# Copyright (C) 2020-2025 CERN and UCLouvain.
 # Licensed under the GNU Lesser General Public License (version 3 or later).
 # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin.
-# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin.
+# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin.

 THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST))))

@@ -19,7 +19,7 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11
 all: googletest/$(INSTALLDIR)/lib64/libgtest.a

 googletest/CMakeLists.txt:
-	git clone https://github.com/google/googletest.git -b release-1.11.0 googletest
+	git clone https://github.com/google/googletest.git -b v1.17.0 googletest

 googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt
 	mkdir -p googletest/$(BUILDDIR)
diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt
index 1afa1ab2a5..50bf1bb4e8 100644
--- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt
+++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt
@@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.3 2025-06-12 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -49,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox".
Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -58,7 +58,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005990028381347656 +DEBUG: model prefixing takes 0.005486965179443359 INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions DEBUG: remove interactions: u s w+ at order: QED=1 @@ -151,27 +151,27 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.890 s +1 processes with 1240 diagrams generated in 1.870 s Total: 1 processes with 1240 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: opt['output_options']['vector_size'] = 32 [export_v4.py at line 4334] +DEBUG: opt['output_options']['vector_size'] = 32 [export_v4.py at line 4166] Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171] +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175] INFO: initialize a new directory: CODEGEN_mad_gg_ttggg INFO: remove old information in CODEGEN_mad_gg_ttggg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176] -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/SubProcesses +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180] +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/SubProcesses INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Processing color information for process: g g > t t~ g g g @1 INFO: Creating files in directory P1_gg_ttxggg INFO: Computing Color-Flow optimization [15120 term] -INFO: Color-Flow passed to 1630 term in 8s. Introduce 3030 contraction +INFO: Color-Flow passed to 1630 term in 7s. Introduce 3030 contraction DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156] INFO: Creating files in directory .
FileWriter for ././CPPProcess.h @@ -179,25 +179,25 @@ FileWriter t t~ g g g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxggg -DEBUG: len(subproc_diagrams_for_config) =  945 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 4, 4: 5, 5: 7, 6: 8, 7: 14, 8: 15, 9: 16, 10: 18, 11: 19, 12: 20, 13: 22, 14: 23, 15: 24, 16: 26, 17: 27, 18: 28, 19: 29, 20: 30, 21: 31, 22: 33, 23: 34, 24: 35, 25: 36, 26: 37, 27: 38, 28: 39, 29: 40, 30: 41, 31: 42, 32: 43, 33: 44, 34: 45, 35: 46, 36: 47, 37: 49, 38: 50, 39: 51, 40: 52, 41: 53, 42: 54, 43: 55, 44: 56, 45: 57, 46: 58, 47: 59, 48: 60, 49: 61, 50: 62, 51: 63, 52: 65, 53: 66, 54: 67, 55: 68, 56: 69, 57: 70, 58: 71, 59: 72, 60: 73, 61: 74, 62: 75, 63: 76, 64: 77, 65: 78, 66: 79, 67: 81, 68: 82, 69: 83, 70: 84, 71: 85, 72: 86, 73: 87, 74: 88, 75: 89, 76: 91, 77: 92, 78: 93, 79: 94, 80: 95, 81: 96, 82: 97, 83: 98, 84: 99, 85: 101, 86: 102, 87: 103, 88: 104, 89: 105, 90: 106, 91: 107, 92: 108, 93: 109, 94: 110, 95: 111, 96: 112, 97: 113, 98: 114, 99: 115, 100: 116, 101: 117, 102: 118, 103: 119, 104: 120, 105: 121, 106: 124, 107: 125, 108: 126, 109: 127, 110: 128, 111: 129, 112: 130, 113: 131, 114: 132, 115: 133, 116: 134, 117: 135, 118: 136, 119: 137, 120: 138, 121: 140, 122: 141, 123: 143, 124: 144, 125: 145, 126: 146, 127: 147, 128: 148, 129: 149, 130: 150, 131: 151, 132: 152, 133: 153, 134: 154, 135: 155, 136: 156, 137: 157, 138: 159, 139: 160, 140: 161, 141: 162, 142: 163, 143: 164, 144: 165, 145: 166, 146: 167, 147: 168, 148: 169, 149: 170, 150: 171, 151: 172, 152: 173, 153: 175, 154: 176, 155: 177, 156: 178, 157: 179, 158: 180, 159: 181, 160: 182, 161: 183, 162: 184, 163: 185, 164: 186, 165: 187, 166: 188, 167: 189, 168: 190, 169: 191, 170: 192, 171: 193, 172: 194, 173: 195, 174: 196, 175: 197, 176: 198, 177: 199, 178: 200, 179: 201, 180: 202, 181: 203, 182: 204, 183: 205, 184: 206, 185: 207, 186: 208, 187: 209, 188: 210, 189: 211, 190: 212, 191: 213, 192: 214, 193: 215, 194: 216, 195: 217, 196: 218, 197: 220, 198: 221, 199: 222, 200: 223, 201: 224, 202: 225, 203: 227, 204: 228, 205: 229, 206: 230, 207: 231, 208: 232, 209: 234, 210: 235, 211: 247, 212: 248, 213: 249, 214: 250, 215: 251, 216: 252, 217: 253, 218: 254, 219: 255, 220: 256, 221: 257, 222: 258, 223: 259, 224: 260, 225: 261, 226: 263, 227: 264, 228: 266, 229: 267, 230: 268, 231: 269, 232: 270, 233: 271, 234: 272, 235: 273, 236: 274, 237: 275, 238: 276, 239: 277, 240: 278, 241: 279, 242: 280, 243: 282, 244: 283, 245: 284, 246: 285, 247: 286, 248: 287, 249: 288, 250: 289, 251: 290, 252: 291, 253: 292, 254: 293, 255: 294, 256: 295, 257: 296, 258: 298, 259: 299, 260: 300, 261: 301, 262: 302, 263: 303, 264: 304, 265: 305, 266: 306, 267: 307, 268: 308, 269: 309, 270: 310, 271: 311, 272: 312, 273: 313, 274: 314, 275: 315, 276: 316, 277: 317, 278: 318, 279: 319, 280: 320, 281: 321, 282: 322, 283: 323, 284: 324, 285: 325, 286: 326, 287: 327, 288: 328, 289: 329, 290: 330, 291: 331, 292: 332, 293: 333, 294: 334, 295: 335, 296: 336, 297: 337, 298: 338, 299: 339, 300: 340, 301: 341, 302: 343, 303: 344, 304: 345, 305: 346, 306: 347, 307: 348, 308: 350, 309: 351, 310: 352, 311: 353, 312: 354, 313: 355, 314: 357, 315: 358, 316: 370, 317: 371, 318: 372, 319: 373, 320: 374, 321: 375, 322: 377, 323: 378, 324: 379, 325: 380, 326: 381, 327: 382, 328: 383, 329: 384, 330: 385, 331: 386, 332: 387, 333: 388, 334: 389, 335: 390, 336: 391, 337: 393, 338: 394, 339: 395, 340: 396, 341: 397, 342: 398, 343: 399, 344: 400, 345: 401, 346: 402, 347: 403, 348: 404, 349: 
405, 350: 406, 351: 407, 352: 409, 353: 410, 354: 411, 355: 412, 356: 413, 357: 414, 358: 415, 359: 416, 360: 417, 361: 418, 362: 419, 363: 420, 364: 421, 365: 422, 366: 423, 367: 425, 368: 426, 369: 427, 370: 428, 371: 429, 372: 430, 373: 431, 374: 432, 375: 433, 376: 434, 377: 435, 378: 437, 379: 438, 380: 440, 381: 441, 382: 447, 383: 448, 384: 449, 385: 450, 386: 451, 387: 452, 388: 453, 389: 454, 390: 455, 391: 457, 392: 458, 393: 459, 394: 460, 395: 461, 396: 462, 397: 463, 398: 464, 399: 465, 400: 467, 401: 468, 402: 469, 403: 470, 404: 471, 405: 472, 406: 473, 407: 474, 408: 475, 409: 477, 410: 478, 411: 479, 412: 480, 413: 481, 414: 482, 415: 484, 416: 485, 417: 486, 418: 487, 419: 488, 420: 489, 421: 493, 422: 494, 423: 495, 424: 496, 425: 497, 426: 498, 427: 500, 428: 501, 429: 502, 430: 503, 431: 504, 432: 505, 433: 506, 434: 507, 435: 508, 436: 509, 437: 510, 438: 511, 439: 512, 440: 513, 441: 514, 442: 516, 443: 517, 444: 518, 445: 519, 446: 520, 447: 521, 448: 522, 449: 523, 450: 524, 451: 525, 452: 526, 453: 527, 454: 528, 455: 529, 456: 530, 457: 532, 458: 533, 459: 534, 460: 535, 461: 536, 462: 537, 463: 538, 464: 539, 465: 540, 466: 541, 467: 542, 468: 543, 469: 544, 470: 545, 471: 546, 472: 548, 473: 549, 474: 550, 475: 551, 476: 552, 477: 553, 478: 554, 479: 555, 480: 556, 481: 557, 482: 558, 483: 560, 484: 561, 485: 563, 486: 564, 487: 570, 488: 571, 489: 572, 490: 573, 491: 574, 492: 575, 493: 576, 494: 577, 495: 578, 496: 580, 497: 581, 498: 582, 499: 583, 500: 584, 501: 585, 502: 586, 503: 587, 504: 588, 505: 590, 506: 591, 507: 592, 508: 593, 509: 594, 510: 595, 511: 596, 512: 597, 513: 598, 514: 600, 515: 601, 516: 602, 517: 603, 518: 604, 519: 605, 520: 607, 521: 608, 522: 609, 523: 610, 524: 611, 525: 612, 526: 616, 527: 617, 528: 618, 529: 619, 530: 620, 531: 621, 532: 623, 533: 624, 534: 625, 535: 626, 536: 627, 537: 628, 538: 629, 539: 630, 540: 631, 541: 632, 542: 633, 543: 634, 544: 635, 545: 636, 546: 637, 547: 639, 548: 640, 549: 641, 550: 642, 551: 643, 552: 644, 553: 645, 554: 646, 555: 647, 556: 648, 557: 649, 558: 650, 559: 651, 560: 652, 561: 653, 562: 655, 563: 656, 564: 657, 565: 658, 566: 659, 567: 660, 568: 661, 569: 662, 570: 663, 571: 664, 572: 665, 573: 666, 574: 667, 575: 668, 576: 669, 577: 671, 578: 672, 579: 673, 580: 674, 581: 675, 582: 676, 583: 677, 584: 678, 585: 679, 586: 680, 587: 681, 588: 683, 589: 684, 590: 686, 591: 687, 592: 693, 593: 694, 594: 695, 595: 696, 596: 697, 597: 698, 598: 699, 599: 700, 600: 701, 601: 703, 602: 704, 603: 705, 604: 706, 605: 707, 606: 708, 607: 709, 608: 710, 609: 711, 610: 713, 611: 714, 612: 715, 613: 716, 614: 717, 615: 718, 616: 719, 617: 720, 618: 721, 619: 723, 620: 724, 621: 725, 622: 726, 623: 727, 624: 728, 625: 730, 626: 731, 627: 732, 628: 733, 629: 734, 630: 735, 631: 739, 632: 740, 633: 741, 634: 742, 635: 743, 636: 744, 637: 745, 638: 746, 639: 747, 640: 748, 641: 749, 642: 750, 643: 751, 644: 752, 645: 753, 646: 754, 647: 755, 648: 756, 649: 757, 650: 758, 651: 759, 652: 760, 653: 761, 654: 762, 655: 763, 656: 764, 657: 765, 658: 766, 659: 767, 660: 768, 661: 769, 662: 770, 663: 771, 664: 773, 665: 774, 666: 775, 667: 776, 668: 777, 669: 778, 670: 780, 671: 781, 672: 782, 673: 783, 674: 784, 675: 785, 676: 789, 677: 790, 678: 791, 679: 792, 680: 793, 681: 794, 682: 795, 683: 796, 684: 797, 685: 798, 686: 799, 687: 800, 688: 801, 689: 802, 690: 803, 691: 804, 692: 805, 693: 806, 694: 807, 695: 808, 696: 809, 697: 810, 698: 811, 699: 812, 700: 813, 701: 814, 702: 815, 703: 816, 704: 817, 
705: 818, 706: 819, 707: 820, 708: 821, 709: 823, 710: 824, 711: 825, 712: 826, 713: 827, 714: 828, 715: 830, 716: 831, 717: 832, 718: 833, 719: 834, 720: 835, 721: 839, 722: 840, 723: 842, 724: 843, 725: 845, 726: 846, 727: 852, 728: 853, 729: 854, 730: 855, 731: 856, 732: 857, 733: 858, 734: 859, 735: 860, 736: 862, 737: 863, 738: 864, 739: 865, 740: 866, 741: 867, 742: 868, 743: 869, 744: 870, 745: 872, 746: 873, 747: 874, 748: 875, 749: 876, 750: 877, 751: 878, 752: 879, 753: 880, 754: 882, 755: 883, 756: 884, 757: 885, 758: 886, 759: 887, 760: 889, 761: 890, 762: 891, 763: 892, 764: 893, 765: 894, 766: 895, 767: 896, 768: 898, 769: 899, 770: 901, 771: 902, 772: 908, 773: 909, 774: 910, 775: 911, 776: 912, 777: 913, 778: 914, 779: 915, 780: 916, 781: 918, 782: 919, 783: 920, 784: 921, 785: 922, 786: 923, 787: 924, 788: 925, 789: 926, 790: 928, 791: 929, 792: 930, 793: 931, 794: 932, 795: 933, 796: 934, 797: 935, 798: 936, 799: 938, 800: 939, 801: 940, 802: 941, 803: 942, 804: 943, 805: 945, 806: 946, 807: 947, 808: 948, 809: 949, 810: 950, 811: 951, 812: 952, 813: 954, 814: 955, 815: 957, 816: 958, 817: 964, 818: 965, 819: 966, 820: 967, 821: 968, 822: 969, 823: 970, 824: 971, 825: 972, 826: 974, 827: 975, 828: 976, 829: 977, 830: 978, 831: 979, 832: 980, 833: 981, 834: 982, 835: 984, 836: 985, 837: 986, 838: 987, 839: 988, 840: 989, 841: 990, 842: 991, 843: 992, 844: 994, 845: 995, 846: 996, 847: 997, 848: 998, 849: 999, 850: 1001, 851: 1002, 852: 1003, 853: 1004, 854: 1005, 855: 1006, 856: 1007, 857: 1008, 858: 1010, 859: 1011, 860: 1013, 861: 1014, 862: 1019, 863: 1020, 864: 1022, 865: 1023, 866: 1025, 867: 1026, 868: 1031, 869: 1032, 870: 1034, 871: 1035, 872: 1037, 873: 1038, 874: 1046, 875: 1047, 876: 1048, 877: 1049, 878: 1050, 879: 1051, 880: 1052, 881: 1053, 882: 1054, 883: 1055, 884: 1056, 885: 1057, 886: 1058, 887: 1059, 888: 1060, 889: 1061, 890: 1062, 891: 1063, 892: 1065, 893: 1066, 894: 1067, 895: 1068, 896: 1069, 897: 1070, 898: 1071, 899: 1072, 900: 1073, 901: 1074, 902: 1075, 903: 1076, 904: 1077, 905: 1078, 906: 1079, 907: 1080, 908: 1081, 909: 1082, 910: 1084, 911: 1085, 912: 1086, 913: 1087, 914: 1088, 915: 1089, 916: 1090, 917: 1091, 918: 1092, 919: 1093, 920: 1094, 921: 1095, 922: 1096, 923: 1097, 924: 1098, 925: 1099, 926: 1100, 927: 1101, 928: 1103, 929: 1104, 930: 1105, 931: 1106, 932: 1107, 933: 1108, 934: 1110, 935: 1111, 936: 1112, 937: 1113, 938: 1114, 939: 1115, 940: 1117, 941: 1118, 942: 1119, 943: 1120, 944: 1121, 945: 1122} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 4: 3, 5: 4, 7: 5, 8: 6, 14: 7, 15: 8, 16: 9, 18: 10, 19: 11, 20: 12, 22: 13, 23: 14, 24: 15, 26: 16, 27: 17, 28: 18, 29: 19, 30: 20, 31: 21, 33: 22, 34: 23, 35: 24, 36: 25, 37: 26, 38: 27, 39: 28, 40: 29, 41: 30, 42: 31, 43: 32, 44: 33, 45: 34, 46: 35, 47: 36, 49: 37, 50: 38, 51: 39, 52: 40, 53: 41, 54: 42, 55: 43, 56: 44, 57: 45, 58: 46, 59: 47, 60: 48, 61: 49, 62: 50, 63: 51, 65: 52, 66: 53, 67: 54, 68: 55, 69: 56, 70: 57, 71: 58, 72: 59, 73: 60, 74: 61, 75: 62, 76: 63, 77: 64, 78: 65, 79: 66, 81: 67, 82: 68, 83: 69, 84: 70, 85: 71, 86: 72, 87: 73, 88: 74, 89: 75, 91: 76, 92: 77, 93: 78, 94: 79, 95: 80, 96: 81, 97: 82, 98: 83, 99: 84, 101: 85, 102: 86, 103: 87, 104: 88, 105: 89, 106: 90, 107: 91, 108: 92, 109: 93, 110: 94, 111: 95, 112: 96, 113: 97, 114: 98, 115: 99, 116: 100, 117: 101, 118: 102, 119: 103, 120: 104, 121: 105, 124: 106, 125: 107, 126: 108, 127: 109, 128: 110, 129: 111, 130: 112, 131: 113, 132: 114, 133: 115, 134: 116, 135: 117, 136: 118, 137: 
119, 138: 120, 140: 121, 141: 122, 143: 123, 144: 124, 145: 125, 146: 126, 147: 127, 148: 128, 149: 129, 150: 130, 151: 131, 152: 132, 153: 133, 154: 134, 155: 135, 156: 136, 157: 137, 159: 138, 160: 139, 161: 140, 162: 141, 163: 142, 164: 143, 165: 144, 166: 145, 167: 146, 168: 147, 169: 148, 170: 149, 171: 150, 172: 151, 173: 152, 175: 153, 176: 154, 177: 155, 178: 156, 179: 157, 180: 158, 181: 159, 182: 160, 183: 161, 184: 162, 185: 163, 186: 164, 187: 165, 188: 166, 189: 167, 190: 168, 191: 169, 192: 170, 193: 171, 194: 172, 195: 173, 196: 174, 197: 175, 198: 176, 199: 177, 200: 178, 201: 179, 202: 180, 203: 181, 204: 182, 205: 183, 206: 184, 207: 185, 208: 186, 209: 187, 210: 188, 211: 189, 212: 190, 213: 191, 214: 192, 215: 193, 216: 194, 217: 195, 218: 196, 220: 197, 221: 198, 222: 199, 223: 200, 224: 201, 225: 202, 227: 203, 228: 204, 229: 205, 230: 206, 231: 207, 232: 208, 234: 209, 235: 210, 247: 211, 248: 212, 249: 213, 250: 214, 251: 215, 252: 216, 253: 217, 254: 218, 255: 219, 256: 220, 257: 221, 258: 222, 259: 223, 260: 224, 261: 225, 263: 226, 264: 227, 266: 228, 267: 229, 268: 230, 269: 231, 270: 232, 271: 233, 272: 234, 273: 235, 274: 236, 275: 237, 276: 238, 277: 239, 278: 240, 279: 241, 280: 242, 282: 243, 283: 244, 284: 245, 285: 246, 286: 247, 287: 248, 288: 249, 289: 250, 290: 251, 291: 252, 292: 253, 293: 254, 294: 255, 295: 256, 296: 257, 298: 258, 299: 259, 300: 260, 301: 261, 302: 262, 303: 263, 304: 264, 305: 265, 306: 266, 307: 267, 308: 268, 309: 269, 310: 270, 311: 271, 312: 272, 313: 273, 314: 274, 315: 275, 316: 276, 317: 277, 318: 278, 319: 279, 320: 280, 321: 281, 322: 282, 323: 283, 324: 284, 325: 285, 326: 286, 327: 287, 328: 288, 329: 289, 330: 290, 331: 291, 332: 292, 333: 293, 334: 294, 335: 295, 336: 296, 337: 297, 338: 298, 339: 299, 340: 300, 341: 301, 343: 302, 344: 303, 345: 304, 346: 305, 347: 306, 348: 307, 350: 308, 351: 309, 352: 310, 353: 311, 354: 312, 355: 313, 357: 314, 358: 315, 370: 316, 371: 317, 372: 318, 373: 319, 374: 320, 375: 321, 377: 322, 378: 323, 379: 324, 380: 325, 381: 326, 382: 327, 383: 328, 384: 329, 385: 330, 386: 331, 387: 332, 388: 333, 389: 334, 390: 335, 391: 336, 393: 337, 394: 338, 395: 339, 396: 340, 397: 341, 398: 342, 399: 343, 400: 344, 401: 345, 402: 346, 403: 347, 404: 348, 405: 349, 406: 350, 407: 351, 409: 352, 410: 353, 411: 354, 412: 355, 413: 356, 414: 357, 415: 358, 416: 359, 417: 360, 418: 361, 419: 362, 420: 363, 421: 364, 422: 365, 423: 366, 425: 367, 426: 368, 427: 369, 428: 370, 429: 371, 430: 372, 431: 373, 432: 374, 433: 375, 434: 376, 435: 377, 437: 378, 438: 379, 440: 380, 441: 381, 447: 382, 448: 383, 449: 384, 450: 385, 451: 386, 452: 387, 453: 388, 454: 389, 455: 390, 457: 391, 458: 392, 459: 393, 460: 394, 461: 395, 462: 396, 463: 397, 464: 398, 465: 399, 467: 400, 468: 401, 469: 402, 470: 403, 471: 404, 472: 405, 473: 406, 474: 407, 475: 408, 477: 409, 478: 410, 479: 411, 480: 412, 481: 413, 482: 414, 484: 415, 485: 416, 486: 417, 487: 418, 488: 419, 489: 420, 493: 421, 494: 422, 495: 423, 496: 424, 497: 425, 498: 426, 500: 427, 501: 428, 502: 429, 503: 430, 504: 431, 505: 432, 506: 433, 507: 434, 508: 435, 509: 436, 510: 437, 511: 438, 512: 439, 513: 440, 514: 441, 516: 442, 517: 443, 518: 444, 519: 445, 520: 446, 521: 447, 522: 448, 523: 449, 524: 450, 525: 451, 526: 452, 527: 453, 528: 454, 529: 455, 530: 456, 532: 457, 533: 458, 534: 459, 535: 460, 536: 461, 537: 462, 538: 463, 539: 464, 540: 465, 541: 466, 542: 467, 543: 468, 544: 469, 545: 470, 546: 471, 548: 472, 549: 473, 550: 474, 
551: 475, 552: 476, 553: 477, 554: 478, 555: 479, 556: 480, 557: 481, 558: 482, 560: 483, 561: 484, 563: 485, 564: 486, 570: 487, 571: 488, 572: 489, 573: 490, 574: 491, 575: 492, 576: 493, 577: 494, 578: 495, 580: 496, 581: 497, 582: 498, 583: 499, 584: 500, 585: 501, 586: 502, 587: 503, 588: 504, 590: 505, 591: 506, 592: 507, 593: 508, 594: 509, 595: 510, 596: 511, 597: 512, 598: 513, 600: 514, 601: 515, 602: 516, 603: 517, 604: 518, 605: 519, 607: 520, 608: 521, 609: 522, 610: 523, 611: 524, 612: 525, 616: 526, 617: 527, 618: 528, 619: 529, 620: 530, 621: 531, 623: 532, 624: 533, 625: 534, 626: 535, 627: 536, 628: 537, 629: 538, 630: 539, 631: 540, 632: 541, 633: 542, 634: 543, 635: 544, 636: 545, 637: 546, 639: 547, 640: 548, 641: 549, 642: 550, 643: 551, 644: 552, 645: 553, 646: 554, 647: 555, 648: 556, 649: 557, 650: 558, 651: 559, 652: 560, 653: 561, 655: 562, 656: 563, 657: 564, 658: 565, 659: 566, 660: 567, 661: 568, 662: 569, 663: 570, 664: 571, 665: 572, 666: 573, 667: 574, 668: 575, 669: 576, 671: 577, 672: 578, 673: 579, 674: 580, 675: 581, 676: 582, 677: 583, 678: 584, 679: 585, 680: 586, 681: 587, 683: 588, 684: 589, 686: 590, 687: 591, 693: 592, 694: 593, 695: 594, 696: 595, 697: 596, 698: 597, 699: 598, 700: 599, 701: 600, 703: 601, 704: 602, 705: 603, 706: 604, 707: 605, 708: 606, 709: 607, 710: 608, 711: 609, 713: 610, 714: 611, 715: 612, 716: 613, 717: 614, 718: 615, 719: 616, 720: 617, 721: 618, 723: 619, 724: 620, 725: 621, 726: 622, 727: 623, 728: 624, 730: 625, 731: 626, 732: 627, 733: 628, 734: 629, 735: 630, 739: 631, 740: 632, 741: 633, 742: 634, 743: 635, 744: 636, 745: 637, 746: 638, 747: 639, 748: 640, 749: 641, 750: 642, 751: 643, 752: 644, 753: 645, 754: 646, 755: 647, 756: 648, 757: 649, 758: 650, 759: 651, 760: 652, 761: 653, 762: 654, 763: 655, 764: 656, 765: 657, 766: 658, 767: 659, 768: 660, 769: 661, 770: 662, 771: 663, 773: 664, 774: 665, 775: 666, 776: 667, 777: 668, 778: 669, 780: 670, 781: 671, 782: 672, 783: 673, 784: 674, 785: 675, 789: 676, 790: 677, 791: 678, 792: 679, 793: 680, 794: 681, 795: 682, 796: 683, 797: 684, 798: 685, 799: 686, 800: 687, 801: 688, 802: 689, 803: 690, 804: 691, 805: 692, 806: 693, 807: 694, 808: 695, 809: 696, 810: 697, 811: 698, 812: 699, 813: 700, 814: 701, 815: 702, 816: 703, 817: 704, 818: 705, 819: 706, 820: 707, 821: 708, 823: 709, 824: 710, 825: 711, 826: 712, 827: 713, 828: 714, 830: 715, 831: 716, 832: 717, 833: 718, 834: 719, 835: 720, 839: 721, 840: 722, 842: 723, 843: 724, 845: 725, 846: 726, 852: 727, 853: 728, 854: 729, 855: 730, 856: 731, 857: 732, 858: 733, 859: 734, 860: 735, 862: 736, 863: 737, 864: 738, 865: 739, 866: 740, 867: 741, 868: 742, 869: 743, 870: 744, 872: 745, 873: 746, 874: 747, 875: 748, 876: 749, 877: 750, 878: 751, 879: 752, 880: 753, 882: 754, 883: 755, 884: 756, 885: 757, 886: 758, 887: 759, 889: 760, 890: 761, 891: 762, 892: 763, 893: 764, 894: 765, 895: 766, 896: 767, 898: 768, 899: 769, 901: 770, 902: 771, 908: 772, 909: 773, 910: 774, 911: 775, 912: 776, 913: 777, 914: 778, 915: 779, 916: 780, 918: 781, 919: 782, 920: 783, 921: 784, 922: 785, 923: 786, 924: 787, 925: 788, 926: 789, 928: 790, 929: 791, 930: 792, 931: 793, 932: 794, 933: 795, 934: 796, 935: 797, 936: 798, 938: 799, 939: 800, 940: 801, 941: 802, 942: 803, 943: 804, 945: 805, 946: 806, 947: 807, 948: 808, 949: 809, 950: 810, 951: 811, 952: 812, 954: 813, 955: 814, 957: 815, 958: 816, 964: 817, 965: 818, 966: 819, 967: 820, 968: 821, 969: 822, 970: 823, 971: 824, 972: 825, 974: 826, 975: 827, 976: 828, 977: 829, 978: 
830, 979: 831, 980: 832, 981: 833, 982: 834, 984: 835, 985: 836, 986: 837, 987: 838, 988: 839, 989: 840, 990: 841, 991: 842, 992: 843, 994: 844, 995: 845, 996: 846, 997: 847, 998: 848, 999: 849, 1001: 850, 1002: 851, 1003: 852, 1004: 853, 1005: 854, 1006: 855, 1007: 856, 1008: 857, 1010: 858, 1011: 859, 1013: 860, 1014: 861, 1019: 862, 1020: 863, 1022: 864, 1023: 865, 1025: 866, 1026: 867, 1031: 868, 1032: 869, 1034: 870, 1035: 871, 1037: 872, 1038: 873, 1046: 874, 1047: 875, 1048: 876, 1049: 877, 1050: 878, 1051: 879, 1052: 880, 1053: 881, 1054: 882, 1055: 883, 1056: 884, 1057: 885, 1058: 886, 1059: 887, 1060: 888, 1061: 889, 1062: 890, 1063: 891, 1065: 892, 1066: 893, 1067: 894, 1068: 895, 1069: 896, 1070: 897, 1071: 898, 1072: 899, 1073: 900, 1074: 901, 1075: 902, 1076: 903, 1077: 904, 1078: 905, 1079: 906, 1080: 907, 1081: 908, 1082: 909, 1084: 910, 1085: 911, 1086: 912, 1087: 913, 1088: 914, 1089: 915, 1090: 916, 1091: 917, 1092: 918, 1093: 919, 1094: 920, 1095: 921, 1096: 922, 1097: 923, 1098: 924, 1099: 925, 1100: 926, 1101: 927, 1103: 928, 1104: 929, 1105: 930, 1106: 931, 1107: 932, 1108: 933, 1110: 934, 1111: 935, 1112: 936, 1113: 937, 1114: 938, 1115: 939, 1117: 940, 1118: 941, 1119: 942, 1120: 943, 1121: 944, 1122: 945} [model_handling.py at line 1552]  -Generated helas calls for 1 subprocesses (1240 diagrams) in 6.565 s -Wrote files for 2281 helas calls in 18.614 s +DEBUG: len(subproc_diagrams_for_config) =  945 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 4, 4: 5, 5: 7, 6: 8, 7: 14, 8: 15, 9: 16, 10: 18, 11: 19, 12: 20, 13: 22, 14: 23, 15: 24, 16: 26, 17: 27, 18: 28, 19: 29, 20: 30, 21: 31, 22: 33, 23: 34, 24: 35, 25: 36, 26: 37, 27: 38, 28: 39, 29: 40, 30: 41, 31: 42, 32: 43, 33: 44, 34: 45, 35: 46, 36: 47, 37: 49, 38: 50, 39: 51, 40: 52, 41: 53, 42: 54, 43: 55, 44: 56, 45: 57, 46: 58, 47: 59, 48: 60, 49: 61, 50: 62, 51: 63, 52: 65, 53: 66, 54: 67, 55: 68, 56: 69, 57: 70, 58: 71, 59: 72, 60: 73, 61: 74, 62: 75, 63: 76, 64: 77, 65: 78, 66: 79, 67: 81, 68: 82, 69: 83, 70: 84, 71: 85, 72: 86, 73: 87, 74: 88, 75: 89, 76: 91, 77: 92, 78: 93, 79: 94, 80: 95, 81: 96, 82: 97, 83: 98, 84: 99, 85: 101, 86: 102, 87: 103, 88: 104, 89: 105, 90: 106, 91: 107, 92: 108, 93: 109, 94: 110, 95: 111, 96: 112, 97: 113, 98: 114, 99: 115, 100: 116, 101: 117, 102: 118, 103: 119, 104: 120, 105: 121, 106: 124, 107: 125, 108: 126, 109: 127, 110: 128, 111: 129, 112: 130, 113: 131, 114: 132, 115: 133, 116: 134, 117: 135, 118: 136, 119: 137, 120: 138, 121: 140, 122: 141, 123: 143, 124: 144, 125: 145, 126: 146, 127: 147, 128: 148, 129: 149, 130: 150, 131: 151, 132: 152, 133: 153, 134: 154, 135: 155, 136: 156, 137: 157, 138: 159, 139: 160, 140: 161, 141: 162, 142: 163, 143: 164, 144: 165, 145: 166, 146: 167, 147: 168, 148: 169, 149: 170, 150: 171, 151: 172, 152: 173, 153: 175, 154: 176, 155: 177, 156: 178, 157: 179, 158: 180, 159: 181, 160: 182, 161: 183, 162: 184, 163: 185, 164: 186, 165: 187, 166: 188, 167: 189, 168: 190, 169: 191, 170: 192, 171: 193, 172: 194, 173: 195, 174: 196, 175: 197, 176: 198, 177: 199, 178: 200, 179: 201, 180: 202, 181: 203, 182: 204, 183: 205, 184: 206, 185: 207, 186: 208, 187: 209, 188: 210, 189: 211, 190: 212, 191: 213, 192: 214, 193: 215, 194: 216, 195: 217, 196: 218, 197: 220, 198: 221, 199: 222, 200: 223, 201: 224, 202: 225, 203: 227, 204: 228, 205: 229, 206: 230, 207: 231, 208: 232, 209: 234, 210: 235, 211: 247, 212: 248, 213: 249, 214: 250, 215: 251, 216: 252, 217: 253, 218: 254, 219: 255, 220: 256, 221: 257, 222: 258, 223: 259, 224: 260, 
225: 261, 226: 263, 227: 264, 228: 266, 229: 267, 230: 268, 231: 269, 232: 270, 233: 271, 234: 272, 235: 273, 236: 274, 237: 275, 238: 276, 239: 277, 240: 278, 241: 279, 242: 280, 243: 282, 244: 283, 245: 284, 246: 285, 247: 286, 248: 287, 249: 288, 250: 289, 251: 290, 252: 291, 253: 292, 254: 293, 255: 294, 256: 295, 257: 296, 258: 298, 259: 299, 260: 300, 261: 301, 262: 302, 263: 303, 264: 304, 265: 305, 266: 306, 267: 307, 268: 308, 269: 309, 270: 310, 271: 311, 272: 312, 273: 313, 274: 314, 275: 315, 276: 316, 277: 317, 278: 318, 279: 319, 280: 320, 281: 321, 282: 322, 283: 323, 284: 324, 285: 325, 286: 326, 287: 327, 288: 328, 289: 329, 290: 330, 291: 331, 292: 332, 293: 333, 294: 334, 295: 335, 296: 336, 297: 337, 298: 338, 299: 339, 300: 340, 301: 341, 302: 343, 303: 344, 304: 345, 305: 346, 306: 347, 307: 348, 308: 350, 309: 351, 310: 352, 311: 353, 312: 354, 313: 355, 314: 357, 315: 358, 316: 370, 317: 371, 318: 372, 319: 373, 320: 374, 321: 375, 322: 377, 323: 378, 324: 379, 325: 380, 326: 381, 327: 382, 328: 383, 329: 384, 330: 385, 331: 386, 332: 387, 333: 388, 334: 389, 335: 390, 336: 391, 337: 393, 338: 394, 339: 395, 340: 396, 341: 397, 342: 398, 343: 399, 344: 400, 345: 401, 346: 402, 347: 403, 348: 404, 349: 405, 350: 406, 351: 407, 352: 409, 353: 410, 354: 411, 355: 412, 356: 413, 357: 414, 358: 415, 359: 416, 360: 417, 361: 418, 362: 419, 363: 420, 364: 421, 365: 422, 366: 423, 367: 425, 368: 426, 369: 427, 370: 428, 371: 429, 372: 430, 373: 431, 374: 432, 375: 433, 376: 434, 377: 435, 378: 437, 379: 438, 380: 440, 381: 441, 382: 447, 383: 448, 384: 449, 385: 450, 386: 451, 387: 452, 388: 453, 389: 454, 390: 455, 391: 457, 392: 458, 393: 459, 394: 460, 395: 461, 396: 462, 397: 463, 398: 464, 399: 465, 400: 467, 401: 468, 402: 469, 403: 470, 404: 471, 405: 472, 406: 473, 407: 474, 408: 475, 409: 477, 410: 478, 411: 479, 412: 480, 413: 481, 414: 482, 415: 484, 416: 485, 417: 486, 418: 487, 419: 488, 420: 489, 421: 493, 422: 494, 423: 495, 424: 496, 425: 497, 426: 498, 427: 500, 428: 501, 429: 502, 430: 503, 431: 504, 432: 505, 433: 506, 434: 507, 435: 508, 436: 509, 437: 510, 438: 511, 439: 512, 440: 513, 441: 514, 442: 516, 443: 517, 444: 518, 445: 519, 446: 520, 447: 521, 448: 522, 449: 523, 450: 524, 451: 525, 452: 526, 453: 527, 454: 528, 455: 529, 456: 530, 457: 532, 458: 533, 459: 534, 460: 535, 461: 536, 462: 537, 463: 538, 464: 539, 465: 540, 466: 541, 467: 542, 468: 543, 469: 544, 470: 545, 471: 546, 472: 548, 473: 549, 474: 550, 475: 551, 476: 552, 477: 553, 478: 554, 479: 555, 480: 556, 481: 557, 482: 558, 483: 560, 484: 561, 485: 563, 486: 564, 487: 570, 488: 571, 489: 572, 490: 573, 491: 574, 492: 575, 493: 576, 494: 577, 495: 578, 496: 580, 497: 581, 498: 582, 499: 583, 500: 584, 501: 585, 502: 586, 503: 587, 504: 588, 505: 590, 506: 591, 507: 592, 508: 593, 509: 594, 510: 595, 511: 596, 512: 597, 513: 598, 514: 600, 515: 601, 516: 602, 517: 603, 518: 604, 519: 605, 520: 607, 521: 608, 522: 609, 523: 610, 524: 611, 525: 612, 526: 616, 527: 617, 528: 618, 529: 619, 530: 620, 531: 621, 532: 623, 533: 624, 534: 625, 535: 626, 536: 627, 537: 628, 538: 629, 539: 630, 540: 631, 541: 632, 542: 633, 543: 634, 544: 635, 545: 636, 546: 637, 547: 639, 548: 640, 549: 641, 550: 642, 551: 643, 552: 644, 553: 645, 554: 646, 555: 647, 556: 648, 557: 649, 558: 650, 559: 651, 560: 652, 561: 653, 562: 655, 563: 656, 564: 657, 565: 658, 566: 659, 567: 660, 568: 661, 569: 662, 570: 663, 571: 664, 572: 665, 573: 666, 574: 667, 575: 668, 576: 669, 577: 671, 578: 672, 579: 673, 580: 
674, 581: 675, 582: 676, 583: 677, 584: 678, 585: 679, 586: 680, 587: 681, 588: 683, 589: 684, 590: 686, 591: 687, 592: 693, 593: 694, 594: 695, 595: 696, 596: 697, 597: 698, 598: 699, 599: 700, 600: 701, 601: 703, 602: 704, 603: 705, 604: 706, 605: 707, 606: 708, 607: 709, 608: 710, 609: 711, 610: 713, 611: 714, 612: 715, 613: 716, 614: 717, 615: 718, 616: 719, 617: 720, 618: 721, 619: 723, 620: 724, 621: 725, 622: 726, 623: 727, 624: 728, 625: 730, 626: 731, 627: 732, 628: 733, 629: 734, 630: 735, 631: 739, 632: 740, 633: 741, 634: 742, 635: 743, 636: 744, 637: 745, 638: 746, 639: 747, 640: 748, 641: 749, 642: 750, 643: 751, 644: 752, 645: 753, 646: 754, 647: 755, 648: 756, 649: 757, 650: 758, 651: 759, 652: 760, 653: 761, 654: 762, 655: 763, 656: 764, 657: 765, 658: 766, 659: 767, 660: 768, 661: 769, 662: 770, 663: 771, 664: 773, 665: 774, 666: 775, 667: 776, 668: 777, 669: 778, 670: 780, 671: 781, 672: 782, 673: 783, 674: 784, 675: 785, 676: 789, 677: 790, 678: 791, 679: 792, 680: 793, 681: 794, 682: 795, 683: 796, 684: 797, 685: 798, 686: 799, 687: 800, 688: 801, 689: 802, 690: 803, 691: 804, 692: 805, 693: 806, 694: 807, 695: 808, 696: 809, 697: 810, 698: 811, 699: 812, 700: 813, 701: 814, 702: 815, 703: 816, 704: 817, 705: 818, 706: 819, 707: 820, 708: 821, 709: 823, 710: 824, 711: 825, 712: 826, 713: 827, 714: 828, 715: 830, 716: 831, 717: 832, 718: 833, 719: 834, 720: 835, 721: 839, 722: 840, 723: 842, 724: 843, 725: 845, 726: 846, 727: 852, 728: 853, 729: 854, 730: 855, 731: 856, 732: 857, 733: 858, 734: 859, 735: 860, 736: 862, 737: 863, 738: 864, 739: 865, 740: 866, 741: 867, 742: 868, 743: 869, 744: 870, 745: 872, 746: 873, 747: 874, 748: 875, 749: 876, 750: 877, 751: 878, 752: 879, 753: 880, 754: 882, 755: 883, 756: 884, 757: 885, 758: 886, 759: 887, 760: 889, 761: 890, 762: 891, 763: 892, 764: 893, 765: 894, 766: 895, 767: 896, 768: 898, 769: 899, 770: 901, 771: 902, 772: 908, 773: 909, 774: 910, 775: 911, 776: 912, 777: 913, 778: 914, 779: 915, 780: 916, 781: 918, 782: 919, 783: 920, 784: 921, 785: 922, 786: 923, 787: 924, 788: 925, 789: 926, 790: 928, 791: 929, 792: 930, 793: 931, 794: 932, 795: 933, 796: 934, 797: 935, 798: 936, 799: 938, 800: 939, 801: 940, 802: 941, 803: 942, 804: 943, 805: 945, 806: 946, 807: 947, 808: 948, 809: 949, 810: 950, 811: 951, 812: 952, 813: 954, 814: 955, 815: 957, 816: 958, 817: 964, 818: 965, 819: 966, 820: 967, 821: 968, 822: 969, 823: 970, 824: 971, 825: 972, 826: 974, 827: 975, 828: 976, 829: 977, 830: 978, 831: 979, 832: 980, 833: 981, 834: 982, 835: 984, 836: 985, 837: 986, 838: 987, 839: 988, 840: 989, 841: 990, 842: 991, 843: 992, 844: 994, 845: 995, 846: 996, 847: 997, 848: 998, 849: 999, 850: 1001, 851: 1002, 852: 1003, 853: 1004, 854: 1005, 855: 1006, 856: 1007, 857: 1008, 858: 1010, 859: 1011, 860: 1013, 861: 1014, 862: 1019, 863: 1020, 864: 1022, 865: 1023, 866: 1025, 867: 1026, 868: 1031, 869: 1032, 870: 1034, 871: 1035, 872: 1037, 873: 1038, 874: 1046, 875: 1047, 876: 1048, 877: 1049, 878: 1050, 879: 1051, 880: 1052, 881: 1053, 882: 1054, 883: 1055, 884: 1056, 885: 1057, 886: 1058, 887: 1059, 888: 1060, 889: 1061, 890: 1062, 891: 1063, 892: 1065, 893: 1066, 894: 1067, 895: 1068, 896: 1069, 897: 1070, 898: 1071, 899: 1072, 900: 1073, 901: 1074, 902: 1075, 903: 1076, 904: 1077, 905: 1078, 906: 1079, 907: 1080, 908: 1081, 909: 1082, 910: 1084, 911: 1085, 912: 1086, 913: 1087, 914: 1088, 915: 1089, 916: 1090, 917: 1091, 918: 1092, 919: 1093, 920: 1094, 921: 1095, 922: 1096, 923: 1097, 924: 1098, 925: 1099, 926: 1100, 927: 1101, 
928: 1103, 929: 1104, 930: 1105, 931: 1106, 932: 1107, 933: 1108, 934: 1110, 935: 1111, 936: 1112, 937: 1113, 938: 1114, 939: 1115, 940: 1117, 941: 1118, 942: 1119, 943: 1120, 944: 1121, 945: 1122} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 4: 3, 5: 4, 7: 5, 8: 6, 14: 7, 15: 8, 16: 9, 18: 10, 19: 11, 20: 12, 22: 13, 23: 14, 24: 15, 26: 16, 27: 17, 28: 18, 29: 19, 30: 20, 31: 21, 33: 22, 34: 23, 35: 24, 36: 25, 37: 26, 38: 27, 39: 28, 40: 29, 41: 30, 42: 31, 43: 32, 44: 33, 45: 34, 46: 35, 47: 36, 49: 37, 50: 38, 51: 39, 52: 40, 53: 41, 54: 42, 55: 43, 56: 44, 57: 45, 58: 46, 59: 47, 60: 48, 61: 49, 62: 50, 63: 51, 65: 52, 66: 53, 67: 54, 68: 55, 69: 56, 70: 57, 71: 58, 72: 59, 73: 60, 74: 61, 75: 62, 76: 63, 77: 64, 78: 65, 79: 66, 81: 67, 82: 68, 83: 69, 84: 70, 85: 71, 86: 72, 87: 73, 88: 74, 89: 75, 91: 76, 92: 77, 93: 78, 94: 79, 95: 80, 96: 81, 97: 82, 98: 83, 99: 84, 101: 85, 102: 86, 103: 87, 104: 88, 105: 89, 106: 90, 107: 91, 108: 92, 109: 93, 110: 94, 111: 95, 112: 96, 113: 97, 114: 98, 115: 99, 116: 100, 117: 101, 118: 102, 119: 103, 120: 104, 121: 105, 124: 106, 125: 107, 126: 108, 127: 109, 128: 110, 129: 111, 130: 112, 131: 113, 132: 114, 133: 115, 134: 116, 135: 117, 136: 118, 137: 119, 138: 120, 140: 121, 141: 122, 143: 123, 144: 124, 145: 125, 146: 126, 147: 127, 148: 128, 149: 129, 150: 130, 151: 131, 152: 132, 153: 133, 154: 134, 155: 135, 156: 136, 157: 137, 159: 138, 160: 139, 161: 140, 162: 141, 163: 142, 164: 143, 165: 144, 166: 145, 167: 146, 168: 147, 169: 148, 170: 149, 171: 150, 172: 151, 173: 152, 175: 153, 176: 154, 177: 155, 178: 156, 179: 157, 180: 158, 181: 159, 182: 160, 183: 161, 184: 162, 185: 163, 186: 164, 187: 165, 188: 166, 189: 167, 190: 168, 191: 169, 192: 170, 193: 171, 194: 172, 195: 173, 196: 174, 197: 175, 198: 176, 199: 177, 200: 178, 201: 179, 202: 180, 203: 181, 204: 182, 205: 183, 206: 184, 207: 185, 208: 186, 209: 187, 210: 188, 211: 189, 212: 190, 213: 191, 214: 192, 215: 193, 216: 194, 217: 195, 218: 196, 220: 197, 221: 198, 222: 199, 223: 200, 224: 201, 225: 202, 227: 203, 228: 204, 229: 205, 230: 206, 231: 207, 232: 208, 234: 209, 235: 210, 247: 211, 248: 212, 249: 213, 250: 214, 251: 215, 252: 216, 253: 217, 254: 218, 255: 219, 256: 220, 257: 221, 258: 222, 259: 223, 260: 224, 261: 225, 263: 226, 264: 227, 266: 228, 267: 229, 268: 230, 269: 231, 270: 232, 271: 233, 272: 234, 273: 235, 274: 236, 275: 237, 276: 238, 277: 239, 278: 240, 279: 241, 280: 242, 282: 243, 283: 244, 284: 245, 285: 246, 286: 247, 287: 248, 288: 249, 289: 250, 290: 251, 291: 252, 292: 253, 293: 254, 294: 255, 295: 256, 296: 257, 298: 258, 299: 259, 300: 260, 301: 261, 302: 262, 303: 263, 304: 264, 305: 265, 306: 266, 307: 267, 308: 268, 309: 269, 310: 270, 311: 271, 312: 272, 313: 273, 314: 274, 315: 275, 316: 276, 317: 277, 318: 278, 319: 279, 320: 280, 321: 281, 322: 282, 323: 283, 324: 284, 325: 285, 326: 286, 327: 287, 328: 288, 329: 289, 330: 290, 331: 291, 332: 292, 333: 293, 334: 294, 335: 295, 336: 296, 337: 297, 338: 298, 339: 299, 340: 300, 341: 301, 343: 302, 344: 303, 345: 304, 346: 305, 347: 306, 348: 307, 350: 308, 351: 309, 352: 310, 353: 311, 354: 312, 355: 313, 357: 314, 358: 315, 370: 316, 371: 317, 372: 318, 373: 319, 374: 320, 375: 321, 377: 322, 378: 323, 379: 324, 380: 325, 381: 326, 382: 327, 383: 328, 384: 329, 385: 330, 386: 331, 387: 332, 388: 333, 389: 334, 390: 335, 391: 336, 393: 337, 394: 338, 395: 339, 396: 340, 397: 341, 398: 342, 399: 343, 400: 344, 401: 345, 402: 346, 403: 347, 404: 348, 405: 349, 
406: 350, 407: 351, 409: 352, 410: 353, 411: 354, 412: 355, 413: 356, 414: 357, 415: 358, 416: 359, 417: 360, 418: 361, 419: 362, 420: 363, 421: 364, 422: 365, 423: 366, 425: 367, 426: 368, 427: 369, 428: 370, 429: 371, 430: 372, 431: 373, 432: 374, 433: 375, 434: 376, 435: 377, 437: 378, 438: 379, 440: 380, 441: 381, 447: 382, 448: 383, 449: 384, 450: 385, 451: 386, 452: 387, 453: 388, 454: 389, 455: 390, 457: 391, 458: 392, 459: 393, 460: 394, 461: 395, 462: 396, 463: 397, 464: 398, 465: 399, 467: 400, 468: 401, 469: 402, 470: 403, 471: 404, 472: 405, 473: 406, 474: 407, 475: 408, 477: 409, 478: 410, 479: 411, 480: 412, 481: 413, 482: 414, 484: 415, 485: 416, 486: 417, 487: 418, 488: 419, 489: 420, 493: 421, 494: 422, 495: 423, 496: 424, 497: 425, 498: 426, 500: 427, 501: 428, 502: 429, 503: 430, 504: 431, 505: 432, 506: 433, 507: 434, 508: 435, 509: 436, 510: 437, 511: 438, 512: 439, 513: 440, 514: 441, 516: 442, 517: 443, 518: 444, 519: 445, 520: 446, 521: 447, 522: 448, 523: 449, 524: 450, 525: 451, 526: 452, 527: 453, 528: 454, 529: 455, 530: 456, 532: 457, 533: 458, 534: 459, 535: 460, 536: 461, 537: 462, 538: 463, 539: 464, 540: 465, 541: 466, 542: 467, 543: 468, 544: 469, 545: 470, 546: 471, 548: 472, 549: 473, 550: 474, 551: 475, 552: 476, 553: 477, 554: 478, 555: 479, 556: 480, 557: 481, 558: 482, 560: 483, 561: 484, 563: 485, 564: 486, 570: 487, 571: 488, 572: 489, 573: 490, 574: 491, 575: 492, 576: 493, 577: 494, 578: 495, 580: 496, 581: 497, 582: 498, 583: 499, 584: 500, 585: 501, 586: 502, 587: 503, 588: 504, 590: 505, 591: 506, 592: 507, 593: 508, 594: 509, 595: 510, 596: 511, 597: 512, 598: 513, 600: 514, 601: 515, 602: 516, 603: 517, 604: 518, 605: 519, 607: 520, 608: 521, 609: 522, 610: 523, 611: 524, 612: 525, 616: 526, 617: 527, 618: 528, 619: 529, 620: 530, 621: 531, 623: 532, 624: 533, 625: 534, 626: 535, 627: 536, 628: 537, 629: 538, 630: 539, 631: 540, 632: 541, 633: 542, 634: 543, 635: 544, 636: 545, 637: 546, 639: 547, 640: 548, 641: 549, 642: 550, 643: 551, 644: 552, 645: 553, 646: 554, 647: 555, 648: 556, 649: 557, 650: 558, 651: 559, 652: 560, 653: 561, 655: 562, 656: 563, 657: 564, 658: 565, 659: 566, 660: 567, 661: 568, 662: 569, 663: 570, 664: 571, 665: 572, 666: 573, 667: 574, 668: 575, 669: 576, 671: 577, 672: 578, 673: 579, 674: 580, 675: 581, 676: 582, 677: 583, 678: 584, 679: 585, 680: 586, 681: 587, 683: 588, 684: 589, 686: 590, 687: 591, 693: 592, 694: 593, 695: 594, 696: 595, 697: 596, 698: 597, 699: 598, 700: 599, 701: 600, 703: 601, 704: 602, 705: 603, 706: 604, 707: 605, 708: 606, 709: 607, 710: 608, 711: 609, 713: 610, 714: 611, 715: 612, 716: 613, 717: 614, 718: 615, 719: 616, 720: 617, 721: 618, 723: 619, 724: 620, 725: 621, 726: 622, 727: 623, 728: 624, 730: 625, 731: 626, 732: 627, 733: 628, 734: 629, 735: 630, 739: 631, 740: 632, 741: 633, 742: 634, 743: 635, 744: 636, 745: 637, 746: 638, 747: 639, 748: 640, 749: 641, 750: 642, 751: 643, 752: 644, 753: 645, 754: 646, 755: 647, 756: 648, 757: 649, 758: 650, 759: 651, 760: 652, 761: 653, 762: 654, 763: 655, 764: 656, 765: 657, 766: 658, 767: 659, 768: 660, 769: 661, 770: 662, 771: 663, 773: 664, 774: 665, 775: 666, 776: 667, 777: 668, 778: 669, 780: 670, 781: 671, 782: 672, 783: 673, 784: 674, 785: 675, 789: 676, 790: 677, 791: 678, 792: 679, 793: 680, 794: 681, 795: 682, 796: 683, 797: 684, 798: 685, 799: 686, 800: 687, 801: 688, 802: 689, 803: 690, 804: 691, 805: 692, 806: 693, 807: 694, 808: 695, 809: 696, 810: 697, 811: 698, 812: 699, 813: 700, 814: 701, 815: 702, 816: 703, 817: 704, 818: 
705, 819: 706, 820: 707, 821: 708, 823: 709, 824: 710, 825: 711, 826: 712, 827: 713, 828: 714, 830: 715, 831: 716, 832: 717, 833: 718, 834: 719, 835: 720, 839: 721, 840: 722, 842: 723, 843: 724, 845: 725, 846: 726, 852: 727, 853: 728, 854: 729, 855: 730, 856: 731, 857: 732, 858: 733, 859: 734, 860: 735, 862: 736, 863: 737, 864: 738, 865: 739, 866: 740, 867: 741, 868: 742, 869: 743, 870: 744, 872: 745, 873: 746, 874: 747, 875: 748, 876: 749, 877: 750, 878: 751, 879: 752, 880: 753, 882: 754, 883: 755, 884: 756, 885: 757, 886: 758, 887: 759, 889: 760, 890: 761, 891: 762, 892: 763, 893: 764, 894: 765, 895: 766, 896: 767, 898: 768, 899: 769, 901: 770, 902: 771, 908: 772, 909: 773, 910: 774, 911: 775, 912: 776, 913: 777, 914: 778, 915: 779, 916: 780, 918: 781, 919: 782, 920: 783, 921: 784, 922: 785, 923: 786, 924: 787, 925: 788, 926: 789, 928: 790, 929: 791, 930: 792, 931: 793, 932: 794, 933: 795, 934: 796, 935: 797, 936: 798, 938: 799, 939: 800, 940: 801, 941: 802, 942: 803, 943: 804, 945: 805, 946: 806, 947: 807, 948: 808, 949: 809, 950: 810, 951: 811, 952: 812, 954: 813, 955: 814, 957: 815, 958: 816, 964: 817, 965: 818, 966: 819, 967: 820, 968: 821, 969: 822, 970: 823, 971: 824, 972: 825, 974: 826, 975: 827, 976: 828, 977: 829, 978: 830, 979: 831, 980: 832, 981: 833, 982: 834, 984: 835, 985: 836, 986: 837, 987: 838, 988: 839, 989: 840, 990: 841, 991: 842, 992: 843, 994: 844, 995: 845, 996: 846, 997: 847, 998: 848, 999: 849, 1001: 850, 1002: 851, 1003: 852, 1004: 853, 1005: 854, 1006: 855, 1007: 856, 1008: 857, 1010: 858, 1011: 859, 1013: 860, 1014: 861, 1019: 862, 1020: 863, 1022: 864, 1023: 865, 1025: 866, 1026: 867, 1031: 868, 1032: 869, 1034: 870, 1035: 871, 1037: 872, 1038: 873, 1046: 874, 1047: 875, 1048: 876, 1049: 877, 1050: 878, 1051: 879, 1052: 880, 1053: 881, 1054: 882, 1055: 883, 1056: 884, 1057: 885, 1058: 886, 1059: 887, 1060: 888, 1061: 889, 1062: 890, 1063: 891, 1065: 892, 1066: 893, 1067: 894, 1068: 895, 1069: 896, 1070: 897, 1071: 898, 1072: 899, 1073: 900, 1074: 901, 1075: 902, 1076: 903, 1077: 904, 1078: 905, 1079: 906, 1080: 907, 1081: 908, 1082: 909, 1084: 910, 1085: 911, 1086: 912, 1087: 913, 1088: 914, 1089: 915, 1090: 916, 1091: 917, 1092: 918, 1093: 919, 1094: 920, 1095: 921, 1096: 922, 1097: 923, 1098: 924, 1099: 925, 1100: 926, 1101: 927, 1103: 928, 1104: 929, 1105: 930, 1106: 931, 1107: 932, 1108: 933, 1110: 934, 1111: 935, 1112: 936, 1113: 937, 1114: 938, 1115: 939, 1117: 940, 1118: 941, 1119: 942, 1120: 943, 1121: 944, 1122: 945} [model_handling.py at line 1665]  +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.505 s +Wrote files for 2281 helas calls in 17.781 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.373 s +ALOHA: aloha creates 5 routines in 0.313 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.313 s +ALOHA: aloha creates 10 routines in 0.356 s VVV1 VVV1 FFV1 @@ -210,38 +210,38 @@ ALOHA: aloha creates 10 routines in 0.313 s VVVV3 VVVV4 VVVV4 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory 
/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg; patch -p4 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file SubProcesses/makefile -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/SubProcesses/P1_gg_ttxggg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/SubProcesses/P1_gg_ttxggg; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #2 succeeded at 339 (offset 112 lines). -DEBUG: p.returncode =  0 [output.py at line 263]  -Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg done. +DEBUG: p.returncode =  0 [output.py at line 268]  +Output to directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg done. Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/README +/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/README Run "open index.html" to see more information about this process. 
quit -real 0m33.065s -user 0m32.263s +real 0m32.103s +user 0m31.529s sys 0m0.459s -Code generation completed in 33 seconds +Code generation completed in 32 seconds ************************************************************ * * * W E L C O M E to * @@ -254,7 +254,7 @@ Code generation completed in 33 seconds * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.3 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -262,9 +262,9 @@ Code generation completed in 33 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt @@ -284,7 +284,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.3 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -292,9 +292,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt diff --git a/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt index 68b4c46295..07d8d59d1b 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat index cdd9d43b05..e6a9fc4dae 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.6.0 2024-09-30 * +#* VERSION 3.6.3 2025-06-12 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/gg_ttggg.mad/Cards/run_card.dat b/epochX/cudacpp/gg_ttggg.mad/Cards/run_card.dat index a08f93d92b..596243d42e 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Cards/run_card.dat +++ b/epochX/cudacpp/gg_ttggg.mad/Cards/run_card.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/gg_ttggg.mad/Cards/run_card_default.dat b/epochX/cudacpp/gg_ttggg.mad/Cards/run_card_default.dat index 48050a5fd7..377d5bc1c7 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/gg_ttggg.mad/Cards/run_card_default.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! 
maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/gg_ttggg.mad/MGMEVersion.txt b/epochX/cudacpp/gg_ttggg.mad/MGMEVersion.txt index 084e244cea..1ac53bb4bd 100644 --- a/epochX/cudacpp/gg_ttggg.mad/MGMEVersion.txt +++ b/epochX/cudacpp/gg_ttggg.mad/MGMEVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.3 \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.mad/Source/.make_opts b/epochX/cudacpp/gg_ttggg.mad/Source/.make_opts index de3864242b..56ba259c56 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Source/.make_opts +++ b/epochX/cudacpp/gg_ttggg.mad/Source/.make_opts @@ -102,6 +102,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf + alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -113,10 +114,11 @@ ifneq ($(lhapdf),) endif else alfas_functions=alfas_functions + alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif # Helper function to check MG5 version define CHECK_MG5AMC_VERSION python -c 'import re; from distutils.version import StrictVersion; print StrictVersion("$(MG5AMC_VERSION)") >= StrictVersion("$(1)") if re.match("^[\d\.]+$$","$(MG5AMC_VERSION)") else True;' -endef \ No newline at end of file +endef diff --git a/epochX/cudacpp/gg_ttggg.mad/Source/alfas_functions.f b/epochX/cudacpp/gg_ttggg.mad/Source/alfas_functions.f index bb69a6384e..84aeff369c 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Source/alfas_functions.f +++ b/epochX/cudacpp/gg_ttggg.mad/Source/alfas_functions.f @@ -188,6 +188,10 @@ SUBROUTINE NEWTON1(T,A_IN,A_OUT,NLOOP,NF) A_OUT=A_IN/(1D0+A_IN*B0(NF)*T) IF (NLOOP .EQ. 1) RETURN + if (1D0+A_IN*B0(NF)*T.le.0d0)THEN + A_OUT = 9d98 + RETURN + ENDIF A_OUT=A_IN/(1D0+B0(NF)*A_IN*T+C1(NF)*A_IN*LOG(1D0+A_IN*B0(NF)*T)) IF (A_OUT .LT. 
0D0) AS=0.3D0 30 AS=A_OUT diff --git a/epochX/cudacpp/gg_ttggg.mad/Source/cuts.inc b/epochX/cudacpp/gg_ttggg.mad/Source/cuts.inc index 23d099e5f7..a8ccc7420d 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Source/cuts.inc +++ b/epochX/cudacpp/gg_ttggg.mad/Source/cuts.inc @@ -37,7 +37,7 @@ C REAL*8 misset,missetmax,ptheavy REAL*8 ptllmin,ptllmax integer maxjetflavor - REAl*8 dsqrt_shat + REAl*8 dsqrt_shat,dsqrt_shatmax COMMON /to_min_max_cuts/ & PTJmax,PTBmax,PTAmax,PTLmax, @@ -60,7 +60,7 @@ C & ht2max,ht3max,ht4max, & htjmin,htjmax,ihtmin,ihtmax, & misset,missetmax,ptheavy, - & ptllmin,ptllmax,dsqrt_shat, + & ptllmin,ptllmax,dsqrt_shat,dsqrt_shatmax, & maxjetflavor C diff --git a/epochX/cudacpp/gg_ttggg.mad/Source/make_opts b/epochX/cudacpp/gg_ttggg.mad/Source/make_opts index e4b87ee6ad..f10336e42e 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Source/make_opts +++ b/epochX/cudacpp/gg_ttggg.mad/Source/make_opts @@ -103,6 +103,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf +alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -114,6 +115,7 @@ endif endif else alfas_functions=alfas_functions +alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif diff --git a/epochX/cudacpp/gg_ttggg.mad/Source/makefile b/epochX/cudacpp/gg_ttggg.mad/Source/makefile index 291ca907ee..87a9e61723 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Source/makefile +++ b/epochX/cudacpp/gg_ttggg.mad/Source/makefile @@ -37,10 +37,12 @@ all: $(LIBRARIES) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDI $(LIBDIR)libdsample.$(libext): $(DSAMPLE) $(call CREATELIB, $@, $^) $(LIBDIR)libgeneric.$(libext): $(GENERIC) + rm -f $@ 2>/dev/null $(call CREATELIB, $@, $^) + rm -f $(alfas_to_clean) 2>/dev/null $(LIBDIR)libdhelas.$(libext): DHELAS cd DHELAS; make; cd .. -$(LIBDIR)libpdf.$(libext): PDF make_opts +$(LIBDIR)libpdf.$(libext): PDF $(alfas_functions).o cd PDF; make; cd .. ifneq (,$(filter edff chff, $(pdlabel1) $(pdlabel2))) $(LIBDIR)libgammaUPC.$(libext): PDF/gammaUPC @@ -73,6 +75,7 @@ $(BINDIR)gensudgrid: $(GENSUDGRID) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUP # Dependencies dsample.o: DiscreteSampler.o dsample.f genps.inc StringCast.o vector.inc +pawgraph.o: vector.inc DiscreteSampler.o: StringCast.o invarients.o: invarients.f genps.inc gen_ximprove.o: gen_ximprove.f run_config.inc run_card.inc diff --git a/epochX/cudacpp/gg_ttggg.mad/Source/run_card.inc b/epochX/cudacpp/gg_ttggg.mad/Source/run_card.inc index 1a1bc782bd..8bd5f73840 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Source/run_card.inc +++ b/epochX/cudacpp/gg_ttggg.mad/Source/run_card.inc @@ -88,6 +88,8 @@ DSQRT_SHAT = 0.000000000000000D+00 + DSQRT_SHATMAX = -1 + LIMHEL = 0.000000000000000D+00 PTJ = 2.000000000000000D+01 diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h index 87aa648dd2..a45024704a 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. 
Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -255,11 +255,15 @@ namespace mg5amcCpu throw std::logic_error( "Bridge constructor: FIXME! cannot choose gputhreads" ); // this should never happen! m_gpublocks = m_nevt / m_gputhreads; } +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters @@ -290,8 +294,10 @@ namespace mg5amcCpu throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -347,7 +353,9 @@ namespace mg5amcCpu if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -400,7 +408,9 @@ namespace mg5amcCpu } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
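The Bridge.h hunks above gate the "Instantiate device/host Bridge" printouts and the flagAbnormalMEs calls behind an MGONGPUCPP_VERBOSE compile-time switch, so default builds stay silent and skip the associated work. A minimal self-contained sketch of that gating pattern; the VERBOSE_LOG macro name is hypothetical and introduced purely for illustration:

```cpp
#include <iostream>

// Compile with -DMGONGPUCPP_VERBOSE to enable diagnostic printouts;
// in default (quiet) builds the statement expands to nothing at all.
#ifdef MGONGPUCPP_VERBOSE
#define VERBOSE_LOG( msg ) std::cout << msg << std::endl
#else
#define VERBOSE_LOG( msg )
#endif

int main()
{
  const int nevt = 16384; // illustrative event count
  VERBOSE_LOG( "WARNING! Instantiate host Bridge (nevt=" << nevt << ")" );
  return 0;
}
```

Because the guard is resolved by the preprocessor rather than by a runtime flag, quiet builds pay neither the I/O cost nor, in the flagAbnormalMEs case, the cost of scanning the ME buffer.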
#ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, 
type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. 
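The GpuAbstraction.h hunks above introduce a single gpuBlas* vocabulary that maps onto cuBLAS under __CUDACC__ and onto hipBLAS under __HIPCC__, plus a second layer of gpuBlasT* aliases that selects the single- or double-precision entry points according to MGONGPU_FPTYPE2_FLOAT. A compilable toy sketch of that two-level, compile-time dispatch; fakeSgemm, fakeDgemm and blasTgemm are hypothetical stand-ins, not the real BLAS signatures:

```cpp
#include <cstdio>

// Stand-ins for the vendor gemm entry points hidden behind the macros.
inline void fakeSgemm() { std::printf( "single-precision gemm\n" ); }
inline void fakeDgemm() { std::printf( "double-precision gemm\n" ); }

// One neutral name per operation, resolved at compile time by the active
// precision macro, mirroring the gpuBlasTgemm -> gpuBlasSgemm/Dgemm aliases.
#ifdef MGONGPU_FPTYPE2_FLOAT
#define blasTgemm fakeSgemm
#else
#define blasTgemm fakeDgemm
#endif

int main()
{
  blasTgemm(); // the S or D variant is chosen with no runtime branch
  return 0;
}
```

The same pattern lets MGONGPU_HAS_NO_BLAS builds keep the API shape (gpuBlasHandle_t becomes void) while every actual BLAS call is compiled out.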
#ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MGVersion.txt index 084e244cea..1ac53bb4bd 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.3 \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc index f463977c1a..a68ae314eb 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
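The GpuRuntime.h hunks above add a checkGpuBlas/assertGpuBlas pair that follows the same report-then-assert idiom as the existing checkGpu wrapper: capture the status code together with the __FILE__/__LINE__ call site, print on failure, then abort via assert. The sketch below reproduces that idiom with a plain int status so it compiles without CUDA or BLAS headers; statusCode_t and STATUS_SUCCESS are illustrative stand-ins for gpuBlasStatus_t and GPUBLAS_STATUS_SUCCESS:

```cpp
#include <cassert>
#include <cstdio>

typedef int statusCode_t;                  // stand-in for gpuBlasStatus_t
constexpr statusCode_t STATUS_SUCCESS = 0; // stand-in for GPUBLAS_STATUS_SUCCESS

// The macro records the call site; the function reports and aborts.
#define checkStatus( code ) { assertStatus( code, __FILE__, __LINE__ ); }
inline void assertStatus( statusCode_t code, const char* file, int line, bool abort = true )
{
  if( code != STATUS_SUCCESS )
  {
    std::printf( "ERROR! assertStatus: '%d' in %s:%d\n", code, file, line );
    if( abort ) assert( code == STATUS_SUCCESS );
  }
}

int main()
{
  checkStatus( STATUS_SUCCESS ); // a nonzero status would print and assert
  return 0;
}
```

Note also that setUp and tearDown in the same file now default to debug=false, so the "__GpuRuntime: calling GpuDeviceReset()" style messages appear only when explicitly requested.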
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,28 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() + , m_pHelWfs() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_helBlasHandles() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +353,82 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); + // Create the "one-helicity" wavefunction buffer that will be used for helicity filtering + m_pHelWfs.reset( new DeviceBufferSimple( CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? 
+ std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { +#ifndef MGONGPU_HAS_NO_BLAS + if( m_helBlasHandles[ihel] ) gpuBlasDestroy( m_helBlasHandles[ihel] ); +#endif + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +445,61 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. 
Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity + // Attach a different stream to each cuBLAS/hipBLAS handle + if( m_blasColorSum ) + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + checkGpuBlas( gpuBlasCreate( &m_helBlasHandles[ighel] ) ); + checkGpuBlas( gpuBlasSetStream( m_helBlasHandles[ighel], m_helStreams[ighel] ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_helBlasHandles[ighel], CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelWfs.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +507,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* ghelBlasHandles = ( m_blasColorSum ? m_helBlasHandles : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* ghelBlasHandles = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +527,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? 
m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h index 7acff4b308..c901874333 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] - static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,24 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + + // The super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelWfs; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +220,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS 
temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The array of cuBLAS/hipBLAS handles (one for each good helicity) + gpuBlasHandle_t m_helBlasHandles[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessAmplitudes.h index 0d92f69c43..a49f041e05 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessAmplitudes_H #define MemoryAccessAmplitudes_H 1 @@ -10,10 +10,6 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_AMPLITUDES 1 - // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -23,120 +19,11 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // A class describing the internal layout of memory buffers for amplitudes - // This implementation uses an AOSOA[npagA][nx2][neppA] where nevt=npagA*neppA - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessAmplitudesBase //_AOSOAv1 - { - public: - - // Number of Events Per Page in the amplitude AOSOA memory buffer layout - static constexpr int neppA = 1; // AOS (just a test...) 
- - private: - - friend class MemoryAccessHelper; - friend class KernelAccessHelper; - friend class KernelAccessHelper; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) - { - const int ipagA = ievt / neppA; // #event "A-page" - const int ieppA = ievt % neppA; // #event in the current event A-page - constexpr int ix2 = 0; - return &( buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA] ); // AOSOA[ipagA][ix2][ieppA] - } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... args" to "const int ix2" and rename "Field" as "Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int ix2 ) - { - constexpr int ipagA = 0; - constexpr int ieppA = 0; - return buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA]; // AOSOA[ipagA][ix2][ieppA] - } - }; - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessAmplitudes : public MemoryAccessAmplitudesBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2 = MemoryAccessHelper::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2Const = - MemoryAccessHelper::template decodeRecordConst; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt, 
const int ix2 ) <===] - static constexpr auto ieventAccessIx2 = - MemoryAccessHelper::template ieventAccessField; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===] - static constexpr auto ieventAccessIx2Const = - MemoryAccessHelper::template ieventAccessFieldConst; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations + // A class providing trivial access to amplitude memory buffers template class KernelAccessAmplitudes { public: - -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2 = - KernelAccessHelper::template kernelAccessField; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2Const = - KernelAccessHelper::template kernelAccessFieldConst; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { @@ -148,8 +35,6 @@ namespace mg5amcCpu { return reinterpret_cast( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES }; //---------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessCouplings.h index 55504a2b90..caee99a7fd 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessCouplings.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. 
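The MemoryAccessAmplitudes.h hunks above make the formerly optional MGONGPU_TRIVIAL_AMPLITUDES path the only one: the AOSOA helper classes and the MemoryAccessHelpers machinery are removed, and amplitude access reduces to reinterpreting the flat fptype buffer as an array of (vectorized) complex numbers. A minimal host-only sketch of that surviving trivial access, with a two-double struct standing in for cxtype_sv:

```cpp
#include <cstdio>

// Minimal stand-in for cxtype_sv: one complex number as two doubles.
struct cx { double r, i; };

// The surviving "trivial" access: the buffer is already laid out as
// [re,im][re,im]..., so access is a single reinterpret_cast.
inline cx* kernelAccess( double* buffer )
{
  return reinterpret_cast<cx*>( buffer );
}

int main()
{
  double buf[4] = { 1.0, 2.0, 3.0, 4.0 };
  cx* amps = kernelAccess( buf );
  std::printf( "amp0 = (%g, %g)\n", amps[0].r, amps[0].i ); // prints (1, 2)
  return 0;
}
```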
#ifndef MemoryAccessCouplings_H #define MemoryAccessCouplings_H 1 @@ -235,7 +235,7 @@ namespace mg5amcCpu /* fptype_sv& real = kernelAccessIx2( buffer, 0 ); fptype_sv& imag = kernelAccessIx2( buffer, 1 ); - printf( "C_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv_ref( real, imag ); */ return cxtype_sv_ref( kernelAccessIx2( buffer, 0 ), @@ -250,7 +250,7 @@ namespace mg5amcCpu /* const fptype_sv& real = kernelAccessIx2Const( buffer, 0 ); const fptype_sv& imag = kernelAccessIx2Const( buffer, 1 ); - printf( "C_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv( real, imag ); */ return cxtype_sv( kernelAccessIx2Const( buffer, 0 ), diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessWavefunctions.h index 9f4c620bc7..bbffc1fb36 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessWavefunctions_H #define MemoryAccessWavefunctions_H 1 @@ -10,9 +10,7 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 +#include "CPPProcess.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL @@ -23,147 +21,44 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // A class describing the internal layout of memory buffers for wavefunctions - // This implementation uses an AOSOA[npagW][nw6][nx2][neppW] where nevt=npagW*neppW - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessWavefunctionsBase //_AOSOAv1 +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessWavefunctions { public: - - // Number of Events Per Page in the wavefunction AOSOA memory buffer layout - static constexpr int neppW = 1; // AOS (just a test...) 
- - private: - - friend class MemoryAccessHelper<MemoryAccessWavefunctionsBase>; - friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, true>; - friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, false>; - - // The number of components of a (fermion or vector) wavefunction - static constexpr int nw6 = mgOnGpu::nw6; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) + static __host__ __device__ inline cxtype_sv* + kernelAccess( fptype* buffer ) { - const int ipagW = ievt / neppW; // #event "W-page" - const int ieppW = ievt % neppW; // #event in the current event W-page - constexpr int iw6 = 0; - constexpr int ix2 = 0; - return &( buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW] ); // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast<cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... args" to "const int iw6, const int ix2" and rename "Field" as "Iw6Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int iw6, - const int ix2 ) + static __host__ __device__ inline const cxtype_sv* + kernelAccessConst( const fptype* buffer ) { - constexpr int ipagW = 0; - constexpr int ieppW = 0; - return buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW]; // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast<const cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } }; +#endif //---------------------------------------------------------------------------- - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessWavefunctions : public MemoryAccessWavefunctionsBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2 = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2Const = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template decodeRecordConst<int, int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIw6Ix2( fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2 = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessField<int, int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIw6Ix2Const( const fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2Const = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessFieldConst<int, int>; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations - template<bool onDevice> - class KernelAccessWavefunctions + class HostAccessWavefunctions { public: - -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const) ===> fptype& kernelAccessIw6Ix2( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2 = - KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessField<int, int>; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIw6Ix2Const( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2Const = - KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessFieldConst<int, int>; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { return reinterpret_cast<cxtype_sv*>( buffer ); } - static __host__ __device__ inline const cxtype_sv* kernelAccessConst( const fptype* buffer ) { return reinterpret_cast<const cxtype_sv*>( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS }; //---------------------------------------------------------------------------- - typedef KernelAccessWavefunctions<false> HostAccessWavefunctions; - typedef KernelAccessWavefunctions<true> DeviceAccessWavefunctions; - - //---------------------------------------------------------------------------- - } // end namespace mg5amcGpu/mg5amcCpu #endif // MemoryAccessWavefunctions_H diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h index 65a101888d..c8db607db6 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
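The MemoryAccessWavefunctions change above replaces the AOSOA record/decode machinery with a single per-event pointer: after this patch each event owns a contiguous [nw6][nx2] slice of the wavefunction buffer, so locating an event's wavefunctions takes one multiplication and one cast. A minimal standalone sketch of the two indexing schemes (illustration only, not part of the patch; the double fptype and the values nw6=6, nx2=2 are assumptions for this example):

    // Illustration only: wavefunction indexing before/after this patch
    using fptype = double; // assumed: double-precision build
    constexpr int nw6 = 6; // components of a (fermion or vector) wavefunction
    constexpr int nx2 = 2; // real and imaginary parts of a complex number

    // Old AOSOA access: events are grouped in "W-pages" of neppW events,
    // laid out as AOSOA[ipagW][iw6][ix2][ieppW]
    inline fptype& ieventAccessIw6Ix2( fptype* buffer, int ievt, int iw6, int ix2, int neppW )
    {
      const int ipagW = ievt / neppW; // event "W-page"
      const int ieppW = ievt % neppW; // event within the current W-page
      return buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW];
    }

    // New per-event access (as in kernelAccess above): one contiguous
    // [nw6][nx2] slice per event, located with a single offset
    inline fptype* eventSlice( fptype* buffer, int ievt )
    {
      return buffer + ievt * nw6 * nx2; // the real code casts this to cxtype_sv*
    }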
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase<T>( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase<T>( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template<typename T, size_t sizePerEvent> - class DeviceBuffer : public DeviceBufferBase<T>, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase<T>, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase<T>( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase<T>( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase<fptype> BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer<fptype, sizePerEventNumerators> HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer<fptype, sizePerEventNumerators> PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer<fptype, sizePerEventNumerators> DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer<fptype, sizePerEventDenominators> HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer<fptype, sizePerEventDenominators>
PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer<fptype, sizePerEventDenominators> DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer<fptype, sizePerEventCouplings> HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer<fptype, sizePerEventCouplings> PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer<fptype, sizePerEventCouplings> DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer<fptype, sizePerEventJamps> DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template<class Tdst, class Tsrc> void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc index ba06f6ff44..6b4b8dc8ce 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,20 +98,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 120; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,57 +166,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the 
current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - 
using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -226,30013 +279,2613 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 121; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
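As the calculate_jamps comments above explain, this function now only produces the ncolor QCD partial amplitudes (jamps) per helicity; the color sum that turns them into matrix elements has moved to a separate kernel (see the new color_sum.h include). For reference, the quadratic form being factored out is the standard MG5aMC one; a minimal sketch with plain std::complex stand-ins (illustration only; the names cf and denom stand for the generated color matrix and its denominators, and this flat layout is an assumption of the example, not the actual data structures):

    #include <complex>

    // Illustration only: ME contribution of one helicity from its ncolor jamps,
    // ME += sum_i Re( ( sum_j cf[i][j] * jamp[j] ) * conj( jamp[i] ) ) / denom[i]
    inline double colorSum( int ncolor, const std::complex<double>* jamp,
                            const double* cf /* row-major [ncolor*ncolor] */,
                            const double* denom )
    {
      double me = 0;
      for( int icol = 0; icol < ncolor; icol++ )
      {
        std::complex<double> ztemp = 0;
        for( int jcol = 0; jcol < ncolor; jcol++ )
          ztemp += cf[icol * ncolor + jcol] * jamp[jcol];
        me += ( ztemp * std::conj( jamp[icol] ) ).real() / denom[icol];
      }
      return me;
    }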
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g.
<<warning #186-D: pointless comparison of unsigned integer with zero>> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 1240 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - vxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); - - vxxxxx( momenta, 0., cHel[ihel][5], +1, w_fp[5], 5 ); - - vxxxxx( momenta, 0., cHel[ihel][6], +1, w_fp[6], 6 ); - - VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[7] ); - FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[8] ); - VVV1P0_1( w_fp[7], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[9] ); - VVV1P0_1(
w_fp[8], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 1 - VVV1_0( w_fp[9], w_fp[10], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - - // *** DIAGRAM 2 OF 1240 *** - - // Wavefunction(s) for diagram number 2 - VVV1P0_1( w_fp[8], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[11] ); - - // Amplitude(s) for diagram number 2 - VVV1_0( w_fp[9], w_fp[11], w_fp[5], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - - // *** DIAGRAM 3 OF 1240 *** - - // Wavefunction(s) for diagram number 3 - // (none) - - // Amplitude(s) for diagram number 3 - VVVV1_0( w_fp[8], w_fp[5], w_fp[6], w_fp[9], COUPs[2], 1.0, &amp_fp[0] ); - jamp_sv[1] -= amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - VVVV3_0( w_fp[8], w_fp[5], w_fp[6], w_fp[9], COUPs[2], 1.0, &amp_fp[0] ); - jamp_sv[0] -= amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - VVVV4_0( w_fp[8], w_fp[5], w_fp[6], w_fp[9], COUPs[2], 1.0, &amp_fp[0] ); - jamp_sv[0] -= amp_sv[0]; - jamp_sv[1] += amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[49] -= amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[55] += amp_sv[0]; - jamp_sv[90] += amp_sv[0]; - jamp_sv[92] -= amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - - // *** DIAGRAM 4 OF 1240 *** - - // Wavefunction(s) for diagram number 4 - VVV1P0_1( w_fp[7], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[12] ); - VVV1P0_1( w_fp[8], w_fp[4], COUPs[0], 1.0, 0., 0.,
w_fp[13] ); - - // Amplitude(s) for diagram number 4 - VVV1_0( w_fp[12], w_fp[13], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - - // *** DIAGRAM 5 OF 1240 *** - - // Wavefunction(s) for diagram number 5 - // (none) - - // Amplitude(s) for diagram number 5 - VVV1_0( w_fp[12], w_fp[11], w_fp[4], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - - // *** DIAGRAM 6 OF 1240 *** - - // Wavefunction(s) for diagram number 6 - // (none) - - // Amplitude(s) for diagram number 6 - VVVV1_0( w_fp[8], w_fp[4], w_fp[6], w_fp[12], COUPs[2], 1.0, &amp_fp[0] ); - jamp_sv[3] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - VVVV3_0( w_fp[8], w_fp[4], w_fp[6], w_fp[12], COUPs[2], 1.0, &amp_fp[0] ); - jamp_sv[2] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - VVVV4_0( w_fp[8], w_fp[4], w_fp[6], w_fp[12], COUPs[2], 1.0, &amp_fp[0] ); - jamp_sv[2] -= amp_sv[0]; - jamp_sv[3] += amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[66] += amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[73] -= amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[79] += amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - - // *** DIAGRAM 7 OF 1240 *** - - // Wavefunction(s) for diagram number 7 - VVV1P0_1( w_fp[7], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[14] ); - - // Amplitude(s) for diagram number 7 - VVV1_0( w_fp[14], w_fp[13], w_fp[5], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if(
channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[5] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - - // *** DIAGRAM 8 OF 1240 *** - - // Wavefunction(s) for diagram number 8 - // (none) - - // Amplitude(s) for diagram number 8 - VVV1_0( w_fp[14], w_fp[10], w_fp[4], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - - // *** DIAGRAM 9 OF 1240 *** - - // Wavefunction(s) for diagram number 9 - // (none) - - // Amplitude(s) for diagram number 9 - VVVV1_0( w_fp[8], w_fp[4], w_fp[5], w_fp[14], COUPs[2], 1.0, &amp_fp[0] ); - jamp_sv[5] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - VVVV3_0( w_fp[8], w_fp[4], w_fp[5], w_fp[14], COUPs[2], 1.0, &amp_fp[0] ); - jamp_sv[4] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - VVVV4_0( w_fp[8], w_fp[4], w_fp[5], w_fp[14], COUPs[2], 1.0, &amp_fp[0] ); - jamp_sv[4] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[60] += amp_sv[0]; - jamp_sv[62] -= amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[65] += amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[97] -= amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[103] += amp_sv[0]; - - // *** DIAGRAM 10 OF 1240 *** - - // Wavefunction(s) for diagram number 10 - VVVV1P0_1( w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[15] ); - VVVV3P0_1( w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[16] ); - VVVV4P0_1( w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[17] ); - - // Amplitude(s) for diagram number 10 - VVV1_0( w_fp[8], w_fp[6], w_fp[15], COUPs[0], 1.0, &amp_fp[0] ); - jamp_sv[0] += amp_sv[0]; - jamp_sv[2] -= amp_sv[0]; -
jamp_sv[24] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[84] += amp_sv[0]; - jamp_sv[86] -= amp_sv[0]; - jamp_sv[96] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[102] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[16], COUPs[0], 1.0, &amp_fp[0] ); - jamp_sv[2] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[17], COUPs[0], 1.0, &amp_fp[0] ); - jamp_sv[0] -= amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - - // *** DIAGRAM 11 OF 1240 *** - - // Wavefunction(s) for diagram number 11 - VVVV1P0_1( w_fp[7], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[18] ); - VVVV3P0_1( w_fp[7], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[19] ); - VVVV4P0_1( w_fp[7], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[20] ); - - // Amplitude(s) for diagram number 11 - VVV1_0( w_fp[8], w_fp[5], w_fp[18], COUPs[0], 1.0, &amp_fp[0] ); - jamp_sv[1] += amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[78] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[108] += amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[19], COUPs[0], 1.0, &amp_fp[0] ); - jamp_sv[4] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[20], COUPs[0], 1.0, &amp_fp[0] ); - jamp_sv[1] -= amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - - // *** DIAGRAM 12 OF 1240 *** - - // Wavefunction(s) for diagram number 12 - VVVV1P0_1( w_fp[7], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[21] ); - VVVV3P0_1( w_fp[7], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[22] ); -
VVVV4P0_1( w_fp[7], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[23] ); - - // Amplitude(s) for diagram number 12 - VVV1_0( w_fp[8], w_fp[4], w_fp[21], COUPs[0], 1.0, &amp_fp[0] ); - jamp_sv[3] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[48] -= amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[54] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[114] += amp_sv[0]; - jamp_sv[116] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[22], COUPs[0], 1.0, &amp_fp[0] ); - jamp_sv[5] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[23], COUPs[0], 1.0, &amp_fp[0] ); - jamp_sv[3] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - - // *** DIAGRAM 13 OF 1240 *** - - // Wavefunction(s) for diagram number 13 - VVV1P0_1( w_fp[4], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[24] ); - - // Amplitude(s) for diagram number 13 - VVVV1_0( w_fp[7], w_fp[8], w_fp[24], w_fp[6], COUPs[2], 1.0, &amp_fp[0] ); - jamp_sv[0] += amp_sv[0]; - jamp_sv[2] -= amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[65] += amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVVV3_0( w_fp[7], w_fp[8], w_fp[24], w_fp[6], COUPs[2], 1.0, &amp_fp[0] ); - jamp_sv[0] += amp_sv[0]; - jamp_sv[2] -= amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[84] += amp_sv[0]; - jamp_sv[86] -= amp_sv[0]; - jamp_sv[96] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[102] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVVV4_0( w_fp[7], w_fp[8], w_fp[24], w_fp[6], COUPs[2], 1.0, &amp_fp[0] ); - jamp_sv[4] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[84] += amp_sv[0]; - jamp_sv[86] -= amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[96] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[102] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - - // *** DIAGRAM 14 OF 1240 *** - - // Wavefunction(s) for diagram number 14 - VVV1P0_1( w_fp[7], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[25] ); -
- // Amplitude(s) for diagram number 14 - VVV1_0( w_fp[24], w_fp[6], w_fp[25], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[2] -= amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[65] += amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 15 OF 1240 *** - - // Wavefunction(s) for diagram number 15 - VVV1P0_1( w_fp[7], w_fp[24], COUPs[0], 1.0, 0., 0., w_fp[26] ); - - // Amplitude(s) for diagram number 15 - VVV1_0( w_fp[8], w_fp[6], w_fp[26], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[2] -= amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[84] += amp_sv[0]; - jamp_sv[86] -= amp_sv[0]; - jamp_sv[96] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[102] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 16 OF 1240 *** - - // Wavefunction(s) for diagram number 16 - // (none) - - // Amplitude(s) for diagram number 16 - VVV1_0( w_fp[8], w_fp[24], w_fp[14], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[84] += amp_sv[0]; - jamp_sv[86] -= amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[96] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[102] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - - // *** DIAGRAM 17 OF 1240 *** - - // Wavefunction(s) for diagram number 17 - VVV1P0_1( w_fp[4], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[27] ); - - // Amplitude(s) for diagram number 17 - VVVV1_0( w_fp[7], w_fp[8], w_fp[27], w_fp[5], COUPs[2], 1.0, &amp_fp[0] ); - jamp_sv[1] += amp_sv[0]; - jamp_sv[2] -= amp_sv[0]; - jamp_sv[3] += amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - VVVV3_0( w_fp[7], w_fp[8], w_fp[27], w_fp[5], COUPs[2], 1.0, &amp_fp[0] ); - jamp_sv[1] += amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[78] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[94] -=
amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[108] += amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - VVVV4_0( w_fp[7], w_fp[8], w_fp[27], w_fp[5], COUPs[2], 1.0, &amp_fp[0] ); - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[78] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[108] += amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - - // *** DIAGRAM 18 OF 1240 *** - - // Wavefunction(s) for diagram number 18 - // (none) - - // Amplitude(s) for diagram number 18 - VVV1_0( w_fp[27], w_fp[5], w_fp[25], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[2] -= amp_sv[0]; - jamp_sv[3] += amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - - // *** DIAGRAM 19 OF 1240 *** - - // Wavefunction(s) for diagram number 19 - VVV1P0_1( w_fp[7], w_fp[27], COUPs[0], 1.0, 0., 0., w_fp[28] ); - - // Amplitude(s) for diagram number 19 - VVV1_0( w_fp[8], w_fp[5], w_fp[28], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[78] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[108] += amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - - // *** DIAGRAM 20 OF 1240 *** - - // Wavefunction(s) for diagram number 20 - // (none) - - // Amplitude(s) for diagram number 20 - VVV1_0( w_fp[8], w_fp[27], w_fp[12], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[78] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[108] += amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - - // *** DIAGRAM 21 OF 1240 *** - - // Wavefunction(s) for diagram number 21 - VVV1P0_1( w_fp[5], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[29] ); - - // Amplitude(s) for diagram number 21 - VVVV1_0( w_fp[7], w_fp[8], w_fp[4], w_fp[29], COUPs[2], 1.0, &amp_fp[0] ); - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[25] += amp_sv[0]; -
jamp_sv[27] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[65] += amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVVV3_0( w_fp[7], w_fp[8], w_fp[4], w_fp[29], COUPs[2], 1.0, &amp_fp[0] ); - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[48] -= amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[54] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[114] += amp_sv[0]; - jamp_sv[116] -= amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVVV4_0( w_fp[7], w_fp[8], w_fp[4], w_fp[29], COUPs[2], 1.0, &amp_fp[0] ); - jamp_sv[3] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[48] -= amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[54] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[114] += amp_sv[0]; - jamp_sv[116] -= amp_sv[0]; - - // *** DIAGRAM 22 OF 1240 *** - - // Wavefunction(s) for diagram number 22 - // (none) - - // Amplitude(s) for diagram number 22 - VVV1_0( w_fp[4], w_fp[29], w_fp[25], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[65] += amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 23 OF 1240 *** - - // Wavefunction(s) for diagram number 23 - // (none) - - // Amplitude(s) for diagram number 23 - VVV1_0( w_fp[8], w_fp[29], w_fp[9], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[48] -= amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[54] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[114] += amp_sv[0]; - jamp_sv[116] -= amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 24 OF 1240 *** - - // Wavefunction(s) for diagram number 24 - VVV1P0_1( w_fp[7], w_fp[29], COUPs[0], 1.0, 0., 0., w_fp[25] ); - - // Amplitude(s) for diagram number 24 - VVV1_0( w_fp[8], w_fp[4], w_fp[25], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[48] -= amp_sv[0]; - jamp_sv[49] += amp_sv[0]; -
jamp_sv[54] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[114] += amp_sv[0]; - jamp_sv[116] -= amp_sv[0]; - - // *** DIAGRAM 25 OF 1240 *** - - // Wavefunction(s) for diagram number 25 - VVVV1P0_1( w_fp[4], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[30] ); - VVVV3P0_1( w_fp[4], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[31] ); - VVVV4P0_1( w_fp[4], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[32] ); - - // Amplitude(s) for diagram number 25 - VVV1_0( w_fp[7], w_fp[8], w_fp[30], COUPs[0], 1.0, &amp_fp[0] ); - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[65] += amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVV1_0( w_fp[7], w_fp[8], w_fp[31], COUPs[0], 1.0, &amp_fp[0] ); - jamp_sv[1] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - VVV1_0( w_fp[7], w_fp[8], w_fp[32], COUPs[0], 1.0, &amp_fp[0] ); - jamp_sv[0] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - - // *** DIAGRAM 26 OF 1240 *** - - // Wavefunction(s) for diagram number 26 - FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[33] ); - FFV1_2( w_fp[3], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[34] ); - FFV1_1( w_fp[33], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[35] ); - - // Amplitude(s) for diagram number 26 - FFV1_0( w_fp[34], w_fp[35], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 27 OF 1240 *** - - // Wavefunction(s) for diagram number 27 - FFV1_1( w_fp[33], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[36] ); - - // Amplitude(s) for diagram number 27 - FFV1_0( w_fp[34], w_fp[36], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 28 OF 1240 *** - - // Wavefunction(s) for diagram number 28 - FFV1P0_3( w_fp[3], w_fp[33], COUPs[1], 1.0, 0., 0., w_fp[37] ); - - // Amplitude(s) for diagram number 28 - VVV1_0( w_fp[12],
-
-    // *** DIAGRAM 28 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 28
-    FFV1P0_3( w_fp[3], w_fp[33], COUPs[1], 1.0, 0., 0., w_fp[37] );
-
-    // Amplitude(s) for diagram number 28
-    VVV1_0( w_fp[12], w_fp[37], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 29 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 29
-    // (none)
-
-    // Amplitude(s) for diagram number 29
-    FFV1_0( w_fp[3], w_fp[36], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[66] += amp_sv[0];
-    jamp_sv[68] -= amp_sv[0];
-    jamp_sv[70] -= amp_sv[0];
-    jamp_sv[71] += amp_sv[0];
-
-    // *** DIAGRAM 30 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 30
-    // (none)
-
-    // Amplitude(s) for diagram number 30
-    VVV1_0( w_fp[14], w_fp[37], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 31 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 31
-    // (none)
-
-    // Amplitude(s) for diagram number 31
-    FFV1_0( w_fp[3], w_fp[35], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[60] += amp_sv[0];
-    jamp_sv[62] -= amp_sv[0];
-    jamp_sv[64] -= amp_sv[0];
-    jamp_sv[65] += amp_sv[0];
-
-    // *** DIAGRAM 32 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 32
-    // (none)
-
-    // Amplitude(s) for diagram number 32
-    FFV1_0( w_fp[3], w_fp[33], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[33], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[33], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 33 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 33
-    FFV1_2( w_fp[3], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[38] );
-    FFV1_1( w_fp[33], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[39] );
-
-    // Amplitude(s) for diagram number 33
-    FFV1_0( w_fp[38], w_fp[39], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 34 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 34
-    FFV1_2( w_fp[38], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[40] );
-
-    // Amplitude(s) for diagram number 34
-    FFV1_0( w_fp[40], w_fp[33], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 34 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 35 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 35
-    // (none)
-
-    // Amplitude(s) for diagram number 35
-    FFV1_0( w_fp[38], w_fp[33], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[55] -= amp_sv[0];
-    jamp_sv[66] -= amp_sv[0];
-    jamp_sv[68] += amp_sv[0];
-
-    // *** DIAGRAM 36 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 36
-    FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[41] );
-
-    // Amplitude(s) for diagram number 36
-    FFV1_0( w_fp[41], w_fp[39], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 37 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 37
-    FFV1_2( w_fp[41], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[42] );
-
-    // Amplitude(s) for diagram number 37
-    FFV1_0( w_fp[42], w_fp[33], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 37 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 38 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 38
-    // (none)
-
-    // Amplitude(s) for diagram number 38
-    FFV1_0( w_fp[41], w_fp[33], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 38 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup );
+    // Independent couplings, fixed for all events
+    for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup)
+      allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup );
+    // Dependent couplings, vary event-by-event
+    for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
+      COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 );
+    // Independent couplings, fixed for all events
+    for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup)
+      COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup];
 #endif
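
The added lines above carry the actual fix recorded in this hunk (FIX #823): both independent-couplings loops are bounded by nIPC, the number of independent couplings stored in cIPC, where the diff comment says they previously used a different constant nicoup. The dependent couplings fill the first ndcoup slots of the same pointer array and are offset to the current event record, while the independent ones are shared by all events. A self-contained sketch of that layout, with invented plain-array buffers standing in for the real CD_ACCESS and CI_ACCESS classes:

    #include <cstddef>
    #include <cstdio>

    constexpr size_t ndcoup = 1; // dependent couplings, vary event-by-event
    constexpr size_t nIPC = 2;   // independent couplings, fixed for all events
    constexpr size_t nevt = 4;

    int main()
    {
      double dependent[ndcoup][nevt] = { { 1.1, 1.2, 1.3, 1.4 } }; // one value per event
      double independent[nIPC] = { 0.5, 0.7 };                     // one value in total

      // One pointer array covering both kinds of couplings
      const double* allCOUPs[ndcoup + nIPC];
      for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
        allCOUPs[idcoup] = dependent[idcoup];               // per-event buffers
      for( size_t iicoup = 0; iicoup < nIPC; iicoup++ )     // bound must match the stored couplings
        allCOUPs[ndcoup + iicoup] = &independent[iicoup];   // fixed for all events

      // Per-event view: dependent couplings are offset to the event record,
      // independent couplings are passed through unchanged
      const size_t ievt0 = 2;
      const double* COUPs[ndcoup + nIPC];
      for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
        COUPs[idcoup] = &allCOUPs[idcoup][ievt0];
      for( size_t iicoup = 0; iicoup < nIPC; iicoup++ )
        COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup];

      std::printf( "dependent coupling for event %zu: %f\n", ievt0, *COUPs[0] );
      std::printf( "independent couplings: %f %f\n", *COUPs[1], *COUPs[2] );
      return 0;
    }

If the second loop ran past nIPC, the pointer array would be filled beyond the couplings actually stored, which is the kind of off-by-a-constant mismatch the FIX #823 comment guards against.
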
-    jamp_sv[48] += amp_sv[0];
-    jamp_sv[54] -= amp_sv[0];
-    jamp_sv[60] -= amp_sv[0];
-    jamp_sv[62] += amp_sv[0];
-
-    // *** DIAGRAM 39 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 39
-    // (none)
-
-    // Amplitude(s) for diagram number 39
-    FFV1_0( w_fp[3], w_fp[39], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 39 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[48] += amp_sv[0];
-    jamp_sv[49] -= amp_sv[0];
-    jamp_sv[54] -= amp_sv[0];
-    jamp_sv[55] += amp_sv[0];
-
-    // *** DIAGRAM 40 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 40
-    // (none)
-
-    // Amplitude(s) for diagram number 40
-    FFV1_0( w_fp[34], w_fp[33], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 40 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[64] += amp_sv[0];
-    jamp_sv[65] -= amp_sv[0];
-    jamp_sv[70] -= amp_sv[0];
-    jamp_sv[71] += amp_sv[0];
-
-    // *** DIAGRAM 41 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 41
-    // (none)
-
-    // Amplitude(s) for diagram number 41
-    FFV1_0( w_fp[3], w_fp[33], w_fp[25], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 41 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 42 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 42
-    FFV1_1( w_fp[2], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[39] );
-    FFV1_1( w_fp[39], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[43] );
-
-    // Amplitude(s) for diagram number 42
-    FFV1_0( w_fp[34], w_fp[43], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 42 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 43 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 43
-    FFV1_1( w_fp[39], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[44] );
-
-    // Amplitude(s) for diagram number 43
-    FFV1_0( w_fp[34], w_fp[44], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 43 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 44 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 44
-    FFV1P0_3( w_fp[3], w_fp[39], COUPs[1], 1.0, 0., 0., w_fp[45] );
-
-    // Amplitude(s) for diagram number 44
-    VVV1_0( w_fp[9], w_fp[45], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 44 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 45 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 45
-    // (none)
-
-    // Amplitude(s) for diagram number 45
-    FFV1_0( w_fp[3], w_fp[44], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 45 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[90] += amp_sv[0];
-    jamp_sv[92] -= amp_sv[0];
-    jamp_sv[94] -= amp_sv[0];
-    jamp_sv[95] += amp_sv[0];
-
-    // *** DIAGRAM 46 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 46
-    // (none)
-
-    // Amplitude(s) for diagram number 46
-    VVV1_0( w_fp[14], w_fp[45], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 46 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 47 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 47
-    // (none)
-
-    // Amplitude(s) for diagram number 47
-    FFV1_0( w_fp[3], w_fp[43], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 47 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[84] += amp_sv[0];
-    jamp_sv[86] -= amp_sv[0];
-    jamp_sv[88] -= amp_sv[0];
-    jamp_sv[89] += amp_sv[0];
-
-    // *** DIAGRAM 48 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 48
-    // (none)
-
-    // Amplitude(s) for diagram number 48
-    FFV1_0( w_fp[3], w_fp[39], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[39], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[39], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 49 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 49
-    FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[46] );
-    FFV1_1( w_fp[39], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[47] );
-
-    // Amplitude(s) for diagram number 49
-    FFV1_0( w_fp[46], w_fp[47], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 49 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 50 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 50
-    FFV1_2( w_fp[46], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[48] );
-
-    // Amplitude(s) for diagram number 50
-    FFV1_0( w_fp[48], w_fp[39], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 50 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 51 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 51
-    // (none)
-
-    // Amplitude(s) for diagram number 51
-    FFV1_0( w_fp[46], w_fp[39], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 51 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[73] += amp_sv[0];
-    jamp_sv[79] -= amp_sv[0];
-    jamp_sv[90] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-
-    // *** DIAGRAM 52 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 52
-    // (none)
-
-    // Amplitude(s) for diagram number 52
-    FFV1_0( w_fp[41], w_fp[47], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 52 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 53 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 53
-    // (none)
-
-    // Amplitude(s) for diagram number 53
-    FFV1_0( w_fp[42], w_fp[39], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 53 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 54 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 54
-    // (none)
-
-    // Amplitude(s) for diagram number 54
-    FFV1_0( w_fp[41], w_fp[39], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 54 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[72] += amp_sv[0];
-    jamp_sv[78] -= amp_sv[0];
-    jamp_sv[84] -= amp_sv[0];
-    jamp_sv[86] += amp_sv[0];
-
-    // *** DIAGRAM 55 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 55
-    // (none)
-
-    // Amplitude(s) for diagram number 55
-    FFV1_0( w_fp[3], w_fp[47], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 55 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[72] += amp_sv[0];
-    jamp_sv[73] -= amp_sv[0];
-    jamp_sv[78] -= amp_sv[0];
-    jamp_sv[79] += amp_sv[0];
-
-    // *** DIAGRAM 56 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 56
-    // (none)
-
-    // Amplitude(s) for diagram number 56
-    FFV1_0( w_fp[34], w_fp[39], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 56 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[88] += amp_sv[0];
-    jamp_sv[89] -= amp_sv[0];
-    jamp_sv[94] -= amp_sv[0];
-    jamp_sv[95] += amp_sv[0];
-
-    // *** DIAGRAM 57 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 57
-    // (none)
-
-    // Amplitude(s) for diagram number 57
-    FFV1_0( w_fp[3], w_fp[39], w_fp[28], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 57 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 58 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 58
-    FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[47] );
-    FFV1_1( w_fp[47], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[49] );
-
-    // Amplitude(s) for diagram number 58
-    FFV1_0( w_fp[34], w_fp[49], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 58 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 59 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 59
-    FFV1_1( w_fp[47], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[50] );
-
-    // Amplitude(s) for diagram number 59
-    FFV1_0( w_fp[34], w_fp[50], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 59 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 60 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 60
-    FFV1P0_3( w_fp[3], w_fp[47], COUPs[1], 1.0, 0., 0., w_fp[51] );
-
-    // Amplitude(s) for diagram number 60
-    VVV1_0( w_fp[9], w_fp[51], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 60 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 61 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 61
-    // (none)
-
-    // Amplitude(s) for diagram number 61
-    FFV1_0( w_fp[3], w_fp[50], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 61 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[114] += amp_sv[0];
-    jamp_sv[116] -= amp_sv[0];
-    jamp_sv[118] -= amp_sv[0];
-    jamp_sv[119] += amp_sv[0];
-
-    // *** DIAGRAM 62 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 62
-    // (none)
-
-    // Amplitude(s) for diagram number 62
-    VVV1_0( w_fp[12], w_fp[51], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 62 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 63 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 63
-    // (none)
-
-    // Amplitude(s) for diagram number 63
-    FFV1_0( w_fp[3], w_fp[49], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 63 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[108] += amp_sv[0];
-    jamp_sv[110] -= amp_sv[0];
-    jamp_sv[112] -= amp_sv[0];
-    jamp_sv[113] += amp_sv[0];
-
-    // *** DIAGRAM 64 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 64
-    // (none)
-
-    // Amplitude(s) for diagram number 64
-    FFV1_0( w_fp[3], w_fp[47], w_fp[15], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[47], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[47], w_fp[17], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 65 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 65
-    FFV1_1( w_fp[47], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[52] );
-
-    // Amplitude(s) for diagram number 65
-    FFV1_0( w_fp[46], w_fp[52], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 65 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 66 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 66
-    // (none)
-
-    // Amplitude(s) for diagram number 66
-    FFV1_0( w_fp[48], w_fp[47], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 66 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 67 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 67
-    // (none)
-
-    // Amplitude(s) for diagram number 67
-    FFV1_0( w_fp[46], w_fp[47], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 67 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[97] += amp_sv[0];
-    jamp_sv[103] -= amp_sv[0];
-    jamp_sv[114] -= amp_sv[0];
-    jamp_sv[116] += amp_sv[0];
-
-    // *** DIAGRAM 68 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 68
-    // (none)
-
-    // Amplitude(s) for diagram number 68
-    FFV1_0( w_fp[38], w_fp[52], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 68 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 69 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 69
-    // (none)
-
-    // Amplitude(s) for diagram number 69
-    FFV1_0( w_fp[40], w_fp[47], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 69 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 70 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 70
-    // (none)
-
-    // Amplitude(s) for diagram number 70
-    FFV1_0( w_fp[38], w_fp[47], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 70 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[96] += amp_sv[0];
-    jamp_sv[102] -= amp_sv[0];
-    jamp_sv[108] -= amp_sv[0];
-    jamp_sv[110] += amp_sv[0];
-
-    // *** DIAGRAM 71 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 71
-    // (none)
-
-    // Amplitude(s) for diagram number 71
-    FFV1_0( w_fp[3], w_fp[52], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 71 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[96] += amp_sv[0];
-    jamp_sv[97] -= amp_sv[0];
-    jamp_sv[102] -= amp_sv[0];
-    jamp_sv[103] += amp_sv[0];
-
-    // *** DIAGRAM 72 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 72
-    // (none)
-
-    // Amplitude(s) for diagram number 72
-    FFV1_0( w_fp[34], w_fp[47], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 72 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[112] += amp_sv[0];
-    jamp_sv[113] -= amp_sv[0];
-    jamp_sv[118] -= amp_sv[0];
-    jamp_sv[119] += amp_sv[0];
-
-    // *** DIAGRAM 73 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 73
-    // (none)
-
-    // Amplitude(s) for diagram number 73
-    FFV1_0( w_fp[3], w_fp[47], w_fp[26], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 73 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 74 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 74
-    FFV1_1( w_fp[2], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[52] );
-    FFV1_2( w_fp[46], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[7] );
-
-    // Amplitude(s) for diagram number 74
-    FFV1_0( w_fp[7], w_fp[52], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 74 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 75 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 75
-    FFV1_2( w_fp[46], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[53] );
-
-    // Amplitude(s) for diagram number 75
-    FFV1_0( w_fp[53], w_fp[52], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 75 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 76 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 76
-    FFV1P0_3( w_fp[46], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[54] );
-
-    // Amplitude(s) for diagram number 76
-    VVV1_0( w_fp[12], w_fp[54], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 76 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 77 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 77
-    // (none)
-
-    // Amplitude(s) for diagram number 77
-    FFV1_0( w_fp[53], w_fp[2], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 77 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[3] += amp_sv[0];
-    jamp_sv[27] -= amp_sv[0];
-    jamp_sv[73] -= amp_sv[0];
-    jamp_sv[79] += amp_sv[0];
-
-    // *** DIAGRAM 78 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 78
-    // (none)
-
-    // Amplitude(s) for diagram number 78
-    VVV1_0( w_fp[14], w_fp[54], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 78 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 79 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 79
-    // (none)
-
-    // Amplitude(s) for diagram number 79
-    FFV1_0( w_fp[7], w_fp[2], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 79 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[5] += amp_sv[0];
-    jamp_sv[29] -= amp_sv[0];
-    jamp_sv[97] -= amp_sv[0];
-    jamp_sv[103] += amp_sv[0];
-
-    // *** DIAGRAM 80 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 80
-    // (none)
-
-    // Amplitude(s) for diagram number 80
-    FFV1_0( w_fp[46], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[46], w_fp[2], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[46], w_fp[2], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 81 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 81
-    // (none)
-
-    // Amplitude(s) for diagram number 81
-    FFV1_0( w_fp[46], w_fp[52], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 81 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[3] += amp_sv[0];
-    jamp_sv[5] -= amp_sv[0];
-    jamp_sv[27] -= amp_sv[0];
-    jamp_sv[29] += amp_sv[0];
-
-    // *** DIAGRAM 82 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 82
-    // (none)
-
-    // Amplitude(s) for diagram number 82
-    FFV1_0( w_fp[48], w_fp[2], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 82 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[90] += amp_sv[0];
-    jamp_sv[92] -= amp_sv[0];
-    jamp_sv[114] -= amp_sv[0];
-    jamp_sv[116] += amp_sv[0];
-
-    // *** DIAGRAM 83 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 83
-    // (none)
-
-    // Amplitude(s) for diagram number 83
-    FFV1_0( w_fp[46], w_fp[2], w_fp[25], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 83 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 84 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 84
-    FFV1_2( w_fp[38], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[25] );
-
-    // Amplitude(s) for diagram number 84
-    FFV1_0( w_fp[25], w_fp[52], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 84 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 85 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 85
-    FFV1_2( w_fp[38], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[48] );
-
-    // Amplitude(s) for diagram number 85
-    FFV1_0( w_fp[48], w_fp[52], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 85 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 86 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 86
-    FFV1P0_3( w_fp[38], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[23] );
-
-    // Amplitude(s) for diagram number 86
-    VVV1_0( w_fp[9], w_fp[23], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 86 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 87 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 87
-    // (none)
-
-    // Amplitude(s) for diagram number 87
-    FFV1_0( w_fp[48], w_fp[2], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 87 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[1] += amp_sv[0];
-    jamp_sv[25] -= amp_sv[0];
-    jamp_sv[49] -= amp_sv[0];
-    jamp_sv[55] += amp_sv[0];
-
-    // *** DIAGRAM 88 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 88
-    // (none)
-
-    // Amplitude(s) for diagram number 88
-    VVV1_0( w_fp[14], w_fp[23], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 88 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 89 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 89
-    // (none)
-
-    // Amplitude(s) for diagram number 89
-    FFV1_0( w_fp[25], w_fp[2], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 89 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[4] += amp_sv[0];
-    jamp_sv[28] -= amp_sv[0];
-    jamp_sv[96] -= amp_sv[0];
-    jamp_sv[102] += amp_sv[0];
-
-    // *** DIAGRAM 90 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 90
-    // (none)
-
-    // Amplitude(s) for diagram number 90
-    FFV1_0( w_fp[38], w_fp[2], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[38], w_fp[2], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[38], w_fp[2], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 91 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 91
-    // (none)
-
-    // Amplitude(s) for diagram number 91
-    FFV1_0( w_fp[38], w_fp[52], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 91 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[1] += amp_sv[0];
-    jamp_sv[4] -= amp_sv[0];
-    jamp_sv[25] -= amp_sv[0];
-    jamp_sv[28] += amp_sv[0];
-
-    // *** DIAGRAM 92 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 92
-    // (none)
-
-    // Amplitude(s) for diagram number 92
-    FFV1_0( w_fp[40], w_fp[2], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 92 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[66] += amp_sv[0];
-    jamp_sv[68] -= amp_sv[0];
-    jamp_sv[108] -= amp_sv[0];
-    jamp_sv[110] += amp_sv[0];
-
-    // *** DIAGRAM 93 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 93
-    // (none)
-
-    // Amplitude(s) for diagram number 93
-    FFV1_0( w_fp[38], w_fp[2], w_fp[28], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 93 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 94 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 94
-    FFV1_2( w_fp[41], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[28] );
-
-    // Amplitude(s) for diagram number 94
-    FFV1_0( w_fp[28], w_fp[52], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 94 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 95 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 95
-    FFV1_2( w_fp[41], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[40] );
-
-    // Amplitude(s) for diagram number 95
-    FFV1_0( w_fp[40], w_fp[52], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 95 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 96 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 96
-    FFV1P0_3( w_fp[41], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[20] );
-
-    // Amplitude(s) for diagram number 96
-    VVV1_0( w_fp[9], w_fp[20], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 96 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 97 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 97
-    // (none)
-
-    // Amplitude(s) for diagram number 97
-    FFV1_0( w_fp[40], w_fp[2], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 97 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[0] += amp_sv[0];
-    jamp_sv[24] -= amp_sv[0];
-    jamp_sv[48] -= amp_sv[0];
-    jamp_sv[54] += amp_sv[0];
-
-    // *** DIAGRAM 98 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 98
-    // (none)
-
-    // Amplitude(s) for diagram number 98
-    VVV1_0( w_fp[12], w_fp[20], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 98 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 99 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 99
-    // (none)
-
-    // Amplitude(s) for diagram number 99
-    FFV1_0( w_fp[28], w_fp[2], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 99 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[2] += amp_sv[0];
-    jamp_sv[26] -= amp_sv[0];
-    jamp_sv[72] -= amp_sv[0];
-    jamp_sv[78] += amp_sv[0];
amp_sv[0]; - jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[17], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 101 OF 1240 *** - - // Wavefunction(s) for diagram number 101 - // (none) - - // Amplitude(s) for diagram number 101 - FFV1_0( w_fp[41], w_fp[52], w_fp[24], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 101 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[2] -= amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - - // *** DIAGRAM 102 OF 1240 *** - - // Wavefunction(s) for diagram number 102 - // (none) - - // Amplitude(s) for diagram number 102 - FFV1_0( w_fp[42], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 102 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[60] += amp_sv[0]; - jamp_sv[62] -= amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - - // *** DIAGRAM 103 OF 1240 *** - - // Wavefunction(s) for diagram number 103 - // (none) - - // Amplitude(s) for diagram number 103 - FFV1_0( w_fp[41], w_fp[2], w_fp[26], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 103 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 104 OF 1240 *** - - // Wavefunction(s) for diagram number 104 - FFV1_2( w_fp[3], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[26] ); - - // Amplitude(s) for diagram number 104 - FFV1_0( w_fp[26], w_fp[52], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 104 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - - // *** DIAGRAM 105 OF 1240 *** - - // Wavefunction(s) for diagram number 105 - VVV1P0_1( w_fp[24], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[42] ); - - // Amplitude(s) for diagram number 105 - FFV1_0( w_fp[3], w_fp[52], w_fp[42], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 105 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0]; - 
- // *** DIAGRAM 106 OF 1240 *** - - // Wavefunction(s) for diagram number 106 - FFV1_1( w_fp[2], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[17] ); - - // Amplitude(s) for diagram number 106 - FFV1_0( w_fp[34], w_fp[17], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 106 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[64] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - - // *** DIAGRAM 107 OF 1240 *** - - // Wavefunction(s) for diagram number 107 - // (none) - - // Amplitude(s) for diagram number 107 - FFV1_0( w_fp[34], w_fp[2], w_fp[42], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 107 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 108 OF 1240 *** - - // Wavefunction(s) for diagram number 108 - // (none) - - // Amplitude(s) for diagram number 108 - FFV1_0( w_fp[3], w_fp[17], w_fp[14], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 108 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 109 OF 1240 *** - - // Wavefunction(s) for diagram number 109 - // (none) - - // Amplitude(s) for diagram number 109 - FFV1_0( w_fp[26], w_fp[2], w_fp[14], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 109 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 110 OF 1240 *** - - // Wavefunction(s) for diagram number 110 - FFV1_2( w_fp[3], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] ); - - // Amplitude(s) for diagram number 110 - FFV1_0( w_fp[14], w_fp[52], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 110 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - - // *** DIAGRAM 111 OF 1240 *** - - // Wavefunction(s) for diagram number 111 - VVV1P0_1( w_fp[27], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[16] ); - - // Amplitude(s) for diagram number 111 - FFV1_0( w_fp[3], w_fp[52], w_fp[16], COUPs[1], 1.0, &_fp[0] ); 
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 111 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 112 OF 1240 *** - - // Wavefunction(s) for diagram number 112 - FFV1_1( w_fp[2], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[15] ); - - // Amplitude(s) for diagram number 112 - FFV1_0( w_fp[34], w_fp[15], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 112 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[70] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - - // *** DIAGRAM 113 OF 1240 *** - - // Wavefunction(s) for diagram number 113 - // (none) - - // Amplitude(s) for diagram number 113 - FFV1_0( w_fp[34], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 113 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 114 OF 1240 *** - - // Wavefunction(s) for diagram number 114 - // (none) - - // Amplitude(s) for diagram number 114 - FFV1_0( w_fp[3], w_fp[15], w_fp[12], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 114 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 115 OF 1240 *** - - // Wavefunction(s) for diagram number 115 - // (none) - - // Amplitude(s) for diagram number 115 - FFV1_0( w_fp[14], w_fp[2], w_fp[12], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 115 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 116 OF 1240 *** - - // Wavefunction(s) for diagram number 116 - FFV1_2( w_fp[3], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); - - // Amplitude(s) for diagram number 116 - FFV1_0( w_fp[12], w_fp[52], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 116 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - - // *** DIAGRAM 117 OF 1240 *** - - // Wavefunction(s) for diagram number 117 - VVV1P0_1( w_fp[4], w_fp[29], COUPs[0], 1.0, 0., 0., w_fp[19] ); - - // Amplitude(s) for diagram number 117 - FFV1_0( w_fp[3], w_fp[52], w_fp[19], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 117 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 118 OF 1240 *** - - // Wavefunction(s) for diagram number 118 - FFV1_1( w_fp[2], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[18] ); - - // Amplitude(s) for diagram number 118 - FFV1_0( w_fp[34], w_fp[18], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 118 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[94] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 119 OF 1240 *** - - // Wavefunction(s) for diagram number 119 - // (none) - - // Amplitude(s) for diagram number 119 - FFV1_0( w_fp[34], w_fp[2], w_fp[19], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 119 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 120 OF 1240 *** - - // Wavefunction(s) for diagram number 120 - // (none) - - // Amplitude(s) for diagram number 120 - FFV1_0( w_fp[3], w_fp[18], w_fp[9], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 120 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 121 OF 1240 *** - - // Wavefunction(s) for diagram number 121 - // (none) - - // Amplitude(s) for diagram number 121 - FFV1_0( w_fp[12], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 121 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[24] += 
- jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 122 OF 1240 ***
-
- // Wavefunction(s) for diagram number 122
- // (none)
-
- // Amplitude(s) for diagram number 122
- FFV1_0( w_fp[3], w_fp[52], w_fp[30], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[52], w_fp[31], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[52], w_fp[32], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 123 OF 1240 ***
-
- // Wavefunction(s) for diagram number 123
- // (none)
-
- // Amplitude(s) for diagram number 123
- FFV1_0( w_fp[34], w_fp[2], w_fp[30], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[34], w_fp[2], w_fp[31], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[34], w_fp[2], w_fp[32], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 124 OF 1240 ***
-
- // Wavefunction(s) for diagram number 124
- FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[34] );
- FFV1_2( w_fp[3], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[52] );
- FFV1_1( w_fp[34], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] );
- FFV1_2( w_fp[52], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[22] );
-
- // Amplitude(s) for diagram number 124
- FFV1_0( w_fp[22], w_fp[9], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 124 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[11] -= amp_sv[0];
-
- // *** DIAGRAM 125 OF 1240 ***
-
- // Wavefunction(s) for diagram number 125
- FFV1_2( w_fp[52], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] );
-
- // Amplitude(s) for diagram number 125
- FFV1_0( w_fp[21], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 125 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[9] -= amp_sv[0];
-
- // *** DIAGRAM 126 OF 1240 ***
-
- // Wavefunction(s) for diagram number 126
- FFV1_1( w_fp[34], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[55] );
- FFV1_2( w_fp[52], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[56] );
-
- // Amplitude(s) for diagram number 126
- FFV1_0( w_fp[56], w_fp[55], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 126 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[17] -= amp_sv[0];
-
- // *** DIAGRAM 127 OF 1240 ***
-
- // Wavefunction(s) for diagram number 127
- // (none)
-
- // Amplitude(s) for diagram number 127
- FFV1_0( w_fp[21], w_fp[55], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 127 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[15] -= amp_sv[0];
-
- // *** DIAGRAM 128 OF 1240 ***
-
- // Wavefunction(s) for diagram number 128
- FFV1_1( w_fp[34], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[57] );
-
- // Amplitude(s) for diagram number 128
- FFV1_0( w_fp[56], w_fp[57], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 128 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[23] -= amp_sv[0];
-
- // *** DIAGRAM 129 OF 1240 ***
-
- // Wavefunction(s) for diagram number 129
- // (none)
-
- // Amplitude(s) for diagram number 129
- FFV1_0( w_fp[22], w_fp[57], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 129 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[21] -= amp_sv[0];
-
- // *** DIAGRAM 130 OF 1240 ***
-
- // Wavefunction(s) for diagram number 130
- FFV1P0_3( w_fp[52], w_fp[34], COUPs[1], 1.0, 0., 0., w_fp[58] );
-
- // Amplitude(s) for diagram number 130
- VVV1_0( w_fp[24], w_fp[6], w_fp[58], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 130 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[9] += amp_sv[0];
- jamp_sv[15] -= amp_sv[0];
- jamp_sv[21] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
-
- // *** DIAGRAM 131 OF 1240 ***
-
- // Wavefunction(s) for diagram number 131
- FFV1_1( w_fp[34], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[59] );
-
- // Amplitude(s) for diagram number 131
- FFV1_0( w_fp[52], w_fp[59], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 131 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 132 OF 1240 ***
-
- // Wavefunction(s) for diagram number 132
- // (none)
-
- // Amplitude(s) for diagram number 132
- FFV1_0( w_fp[52], w_fp[57], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 132 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 133 OF 1240 ***
-
- // Wavefunction(s) for diagram number 133
- // (none)
-
- // Amplitude(s) for diagram number 133
- VVV1_0( w_fp[27], w_fp[5], w_fp[58], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 133 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[11] += amp_sv[0];
- jamp_sv[15] -= amp_sv[0];
- jamp_sv[17] += amp_sv[0];
- jamp_sv[21] -= amp_sv[0];
-
- // *** DIAGRAM 134 OF 1240 ***
-
- // Wavefunction(s) for diagram number 134
- FFV1_1( w_fp[34], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[60] );
-
- // Amplitude(s) for diagram number 134
- FFV1_0( w_fp[52], w_fp[60], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 134 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 135 OF 1240 ***
-
- // Wavefunction(s) for diagram number 135
- // (none)
-
- // Amplitude(s) for diagram number 135
- FFV1_0( w_fp[52], w_fp[55], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 135 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 136 OF 1240 ***
-
- // Wavefunction(s) for diagram number 136
- // (none)
-
- // Amplitude(s) for diagram number 136
- VVV1_0( w_fp[4], w_fp[29], w_fp[58], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 136 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[9] += amp_sv[0];
- jamp_sv[11] -= amp_sv[0];
- jamp_sv[17] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
-
- // *** DIAGRAM 137 OF 1240 ***
-
- // Wavefunction(s) for diagram number 137
- // (none)
-
- // Amplitude(s) for diagram number 137
- FFV1_0( w_fp[52], w_fp[9], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 137 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 138 OF 1240 ***
-
- // Wavefunction(s) for diagram number 138
- FFV1_1( w_fp[34], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[58] );
-
- // Amplitude(s) for diagram number 138
- FFV1_0( w_fp[52], w_fp[58], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 138 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 139 OF 1240 ***
-
- // Wavefunction(s) for diagram number 139
- // (none)
-
- // Amplitude(s) for diagram number 139
- FFV1_0( w_fp[52], w_fp[34], w_fp[30], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[9] += amp_sv[0];
- jamp_sv[11] -= amp_sv[0];
- jamp_sv[17] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
- FFV1_0( w_fp[52], w_fp[34], w_fp[31], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[11] -= amp_sv[0];
- jamp_sv[15] += amp_sv[0];
- jamp_sv[17] -= amp_sv[0];
- jamp_sv[21] += amp_sv[0];
- FFV1_0( w_fp[52], w_fp[34], w_fp[32], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[9] -= amp_sv[0];
- jamp_sv[15] += amp_sv[0];
- jamp_sv[21] += amp_sv[0];
- jamp_sv[23] -= amp_sv[0];
-
- // *** DIAGRAM 140 OF 1240 ***
-
- // Wavefunction(s) for diagram number 140
- VVV1P0_1( w_fp[1], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[61] );
- FFV1P0_3( w_fp[3], w_fp[34], COUPs[1], 1.0, 0., 0., w_fp[62] );
- VVV1P0_1( w_fp[61], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[63] );
-
- // Amplitude(s) for diagram number 140
- VVV1_0( w_fp[62], w_fp[63], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 140 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 141 OF 1240 ***
-
- // Wavefunction(s) for diagram number 141
- VVV1P0_1( w_fp[61], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[64] );
-
- // Amplitude(s) for diagram number 141
- VVV1_0( w_fp[62], w_fp[64], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 141 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 142 OF 1240 ***
-
- // Wavefunction(s) for diagram number 142
- // (none)
-
- // Amplitude(s) for diagram number 142
- VVVV1_0( w_fp[61], w_fp[5], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
- VVVV3_0( w_fp[61], w_fp[5], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
- VVVV4_0( w_fp[61], w_fp[5], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 143 OF 1240 ***
-
- // Wavefunction(s) for diagram number 143
- FFV1_2( w_fp[3], w_fp[61], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[65] );
-
- // Amplitude(s) for diagram number 143
- FFV1_0( w_fp[65], w_fp[55], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 143 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 144 OF 1240 ***
-
- // Wavefunction(s) for diagram number 144
- // (none)
-
- // Amplitude(s) for diagram number 144
- FFV1_0( w_fp[3], w_fp[55], w_fp[64], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 144 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[12] += amp_sv[0];
- jamp_sv[14] -= amp_sv[0];
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[17] += amp_sv[0];
-
- // *** DIAGRAM 145 OF 1240 ***
-
- // Wavefunction(s) for diagram number 145
- // (none)
-
- // Amplitude(s) for diagram number 145
- FFV1_0( w_fp[65], w_fp[57], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 145 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 146 OF 1240 ***
-
- // Wavefunction(s) for diagram number 146
- // (none)
-
- // Amplitude(s) for diagram number 146
- FFV1_0( w_fp[3], w_fp[57], w_fp[63], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 146 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[18] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
-
- // *** DIAGRAM 147 OF 1240 ***
-
- // Wavefunction(s) for diagram number 147
- FFV1_1( w_fp[34], w_fp[61], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[66] );
-
- // Amplitude(s) for diagram number 147
- FFV1_0( w_fp[38], w_fp[66], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 147 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 148 OF 1240 ***
-
- // Wavefunction(s) for diagram number 148
- FFV1P0_3( w_fp[38], w_fp[34], COUPs[1], 1.0, 0., 0., w_fp[67] );
-
- // Amplitude(s) for diagram number 148
- VVV1_0( w_fp[61], w_fp[6], w_fp[67], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 148 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] += amp_sv[0];
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[20] += amp_sv[0];
-
- // *** DIAGRAM 149 OF 1240 ***
-
- // Wavefunction(s) for diagram number 149
- // (none)
-
- // Amplitude(s) for diagram number 149
- FFV1_0( w_fp[38], w_fp[57], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 149 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 150 OF 1240 ***
-
- // Wavefunction(s) for diagram number 150
- // (none)
-
- // Amplitude(s) for diagram number 150
- FFV1_0( w_fp[41], w_fp[66], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 150 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 151 OF 1240 ***
-
- // Wavefunction(s) for diagram number 151
- FFV1P0_3( w_fp[41], w_fp[34], COUPs[1], 1.0, 0., 0., w_fp[68] );
-
- // Amplitude(s) for diagram number 151
- VVV1_0( w_fp[61], w_fp[5], w_fp[68], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 151 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += amp_sv[0];
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[12] -= amp_sv[0];
- jamp_sv[14] += amp_sv[0];
-
- // *** DIAGRAM 152 OF 1240 ***
-
- // Wavefunction(s) for diagram number 152
- // (none)
-
- // Amplitude(s) for diagram number 152
- FFV1_0( w_fp[41], w_fp[55], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 152 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 153 OF 1240 ***
-
- // Wavefunction(s) for diagram number 153
- // (none)
-
- // Amplitude(s) for diagram number 153
- FFV1_0( w_fp[3], w_fp[66], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 153 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += amp_sv[0];
- jamp_sv[1] -= amp_sv[0];
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
-
- // *** DIAGRAM 154 OF 1240 ***
-
- // Wavefunction(s) for diagram number 154
- // (none)
-
- // Amplitude(s) for diagram number 154
- VVV1_0( w_fp[61], w_fp[29], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 154 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 155 OF 1240 ***
-
- // Wavefunction(s) for diagram number 155
- // (none)
-
- // Amplitude(s) for diagram number 155
- FFV1_0( w_fp[3], w_fp[58], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 155 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[16] += amp_sv[0];
- jamp_sv[17] -= amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
-
- // *** DIAGRAM 156 OF 1240 ***
-
- // Wavefunction(s) for diagram number 156
- VVV1P0_1( w_fp[1], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[66] );
- VVV1P0_1( w_fp[66], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[69] );
-
- // Amplitude(s) for diagram number 156
- VVV1_0( w_fp[62], w_fp[69], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 156 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 157 OF 1240 ***
-
- // Wavefunction(s) for diagram number 157
- VVV1P0_1( w_fp[66], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[70] );
-
- // Amplitude(s) for diagram number 157
- VVV1_0( w_fp[62], w_fp[70], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 157 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 158 OF 1240 ***
-
- // Wavefunction(s) for diagram number 158
- // (none)
-
- // Amplitude(s) for diagram number 158
- VVVV1_0( w_fp[66], w_fp[4], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
- VVVV3_0( w_fp[66], w_fp[4], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
- VVVV4_0( w_fp[66], w_fp[4], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 159 OF 1240 ***
-
- // Wavefunction(s) for diagram number 159
- FFV1_2( w_fp[3], w_fp[66], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[71] );
-
- // Amplitude(s) for diagram number 159
- FFV1_0( w_fp[71], w_fp[9], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 159 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 160 OF 1240 ***
-
- // Wavefunction(s) for diagram number 160
- // (none)
-
- // Amplitude(s) for diagram number 160
- FFV1_0( w_fp[3], w_fp[9], w_fp[70], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 160 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[6] += amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[11] += amp_sv[0];
-
- // *** DIAGRAM 161 OF 1240 ***
-
- // Wavefunction(s) for diagram number 161
- // (none)
-
- // Amplitude(s) for diagram number 161
- FFV1_0( w_fp[71], w_fp[57], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 161 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 162 OF 1240 ***
-
- // Wavefunction(s) for diagram number 162
- // (none)
-
- // Amplitude(s) for diagram number 162
- FFV1_0( w_fp[3], w_fp[57], w_fp[69], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 162 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[19] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[21] += amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
-
- // *** DIAGRAM 163 OF 1240 ***
-
- // Wavefunction(s) for diagram number 163
- FFV1_1( w_fp[34], w_fp[66], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[72] );
-
- // Amplitude(s) for diagram number 163
- FFV1_0( w_fp[46], w_fp[72], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 163 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 164 OF 1240 ***
-
- // Wavefunction(s) for diagram number 164
- FFV1P0_3( w_fp[46], w_fp[34], COUPs[1], 1.0, 0., 0., w_fp[73] );
-
- // Amplitude(s) for diagram number 164
- VVV1_0( w_fp[66], w_fp[6], w_fp[73], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 164 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[3] += amp_sv[0];
- jamp_sv[13] -= amp_sv[0];
- jamp_sv[19] -= amp_sv[0];
- jamp_sv[22] += amp_sv[0];
-
- // *** DIAGRAM 165 OF 1240 ***
-
- // Wavefunction(s) for diagram number 165
- // (none)
-
- // Amplitude(s) for diagram number 165
- FFV1_0( w_fp[46], w_fp[57], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 165 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 166 OF 1240 ***
-
- // Wavefunction(s) for diagram number 166
- // (none)
-
- // Amplitude(s) for diagram number 166
- FFV1_0( w_fp[41], w_fp[72], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 166 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 167 OF 1240 ***
-
- // Wavefunction(s) for diagram number 167
- // (none)
-
- // Amplitude(s) for diagram number 167
- VVV1_0( w_fp[66], w_fp[4], w_fp[68], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 167 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[2] += amp_sv[0];
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[8] += amp_sv[0];
- jamp_sv[12] -= amp_sv[0];
-
- // *** DIAGRAM 168 OF 1240 ***
-
- // Wavefunction(s) for diagram number 168
- // (none)
-
- // Amplitude(s) for diagram number 168
- FFV1_0( w_fp[41], w_fp[9], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 168 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 169 OF 1240 ***
-
- // Wavefunction(s) for diagram number 169
- // (none)
-
- // Amplitude(s) for diagram number 169
- FFV1_0( w_fp[3], w_fp[72], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 169 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[2] += amp_sv[0];
- jamp_sv[3] -= amp_sv[0];
- jamp_sv[12] -= amp_sv[0];
- jamp_sv[13] += amp_sv[0];
-
- // *** DIAGRAM 170 OF 1240 ***
-
- // Wavefunction(s) for diagram number 170
- // (none)
-
- // Amplitude(s) for diagram number 170
- VVV1_0( w_fp[66], w_fp[27], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 170 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 171 OF 1240 ***
-
- // Wavefunction(s) for diagram number 171
- // (none)
-
- // Amplitude(s) for diagram number 171
- FFV1_0( w_fp[3], w_fp[60], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 171 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[10] += amp_sv[0];
- jamp_sv[11] -= amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[21] += amp_sv[0];
-
- // *** DIAGRAM 172 OF 1240 ***
-
- // Wavefunction(s) for diagram number 172
- VVV1P0_1( w_fp[1], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[72] );
- VVV1P0_1( w_fp[72], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[74] );
-
- // Amplitude(s) for diagram number 172
- VVV1_0( w_fp[62], w_fp[74], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 172 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 173 OF 1240 ***
-
- // Wavefunction(s) for diagram number 173
- VVV1P0_1( w_fp[72], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[75] );
-
- // Amplitude(s) for diagram number 173
- VVV1_0( w_fp[62], w_fp[75], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 173 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 174 OF 1240 ***
-
- // Wavefunction(s) for diagram number 174
- // (none)
-
- // Amplitude(s) for diagram number 174
- VVVV1_0( w_fp[72], w_fp[4], w_fp[5], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- VVVV3_0( w_fp[72], w_fp[4], w_fp[5], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- VVVV4_0( w_fp[72], w_fp[4], w_fp[5], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 175 OF 1240 ***
-
- // Wavefunction(s) for diagram number 175
- FFV1_2( w_fp[3], w_fp[72], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[76] );
-
- // Amplitude(s) for diagram number 175
- FFV1_0( w_fp[76], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 175 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 176 OF 1240 ***
-
- // Wavefunction(s) for diagram number 176
- // (none)
-
- // Amplitude(s) for diagram number 176
- FFV1_0( w_fp[3], w_fp[9], w_fp[75], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 176 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[7] += amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[9] += amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
-
- // *** DIAGRAM 177 OF 1240 ***
-
- // Wavefunction(s) for diagram number 177
- // (none)
-
- // Amplitude(s) for diagram number 177
- FFV1_0( w_fp[76], w_fp[55], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 177 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 178 OF 1240 ***
-
- // Wavefunction(s) for diagram number 178
- // (none)
-
- // Amplitude(s) for diagram number 178
- FFV1_0( w_fp[3], w_fp[55], w_fp[74], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 178 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[13] += amp_sv[0];
- jamp_sv[14] -= amp_sv[0];
- jamp_sv[15] += amp_sv[0];
- jamp_sv[16] -= amp_sv[0];
-
- // *** DIAGRAM 179 OF 1240 ***
-
- // Wavefunction(s) for diagram number 179
- FFV1_1( w_fp[34], w_fp[72], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[77] );
-
- // Amplitude(s) for diagram number 179
- FFV1_0( w_fp[46], w_fp[77], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 179 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 180 OF 1240 ***
-
- // Wavefunction(s) for diagram number 180
- // (none)
-
- // Amplitude(s) for diagram number 180
- VVV1_0( w_fp[72], w_fp[5], w_fp[73], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 180 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[5] += amp_sv[0];
- jamp_sv[13] -= amp_sv[0];
- jamp_sv[16] += amp_sv[0];
- jamp_sv[19] -= amp_sv[0];
-
- // *** DIAGRAM 181 OF 1240 ***
-
- // Wavefunction(s) for diagram number 181
- // (none)
-
- // Amplitude(s) for diagram number 181
- FFV1_0( w_fp[46], w_fp[55], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 181 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 182 OF 1240 ***
-
- // Wavefunction(s) for diagram number 182
- // (none)
-
- // Amplitude(s) for diagram number 182
- FFV1_0( w_fp[38], w_fp[77], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 182 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 183 OF 1240 ***
-
- // Wavefunction(s) for diagram number 183
- // (none)
-
- // Amplitude(s) for diagram number 183
- VVV1_0( w_fp[72], w_fp[4], w_fp[67], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 183 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[4] += amp_sv[0];
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[10] += amp_sv[0];
- jamp_sv[18] -= amp_sv[0];
-
- // *** DIAGRAM 184 OF 1240 ***
-
- // Wavefunction(s) for diagram number 184
- // (none)
-
- // Amplitude(s) for diagram number 184
- FFV1_0( w_fp[38], w_fp[9], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 184 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 185 OF 1240 ***
-
- // Wavefunction(s) for diagram number 185
- // (none)
-
- // Amplitude(s) for diagram number 185
- FFV1_0( w_fp[3], w_fp[77], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 185 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[4] += amp_sv[0];
- jamp_sv[5] -= amp_sv[0];
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[19] += amp_sv[0];
-
- // *** DIAGRAM 186 OF 1240 ***
-
- // Wavefunction(s) for diagram number 186
- // (none)
-
- // Amplitude(s) for diagram number 186
- VVV1_0( w_fp[72], w_fp[24], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 186 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 187 OF 1240 ***
-
- // Wavefunction(s) for diagram number 187
- // (none)
-
- // Amplitude(s) for diagram number 187
- FFV1_0( w_fp[3], w_fp[59], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 187 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[8] += amp_sv[0];
- jamp_sv[9] -= amp_sv[0];
- jamp_sv[14] -= amp_sv[0];
- jamp_sv[15] += amp_sv[0];
-
- // *** DIAGRAM 188 OF 1240 ***
-
- // Wavefunction(s) for diagram number 188
- FFV1_1( w_fp[34], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[77] );
-
- // Amplitude(s) for diagram number 188
- FFV1_0( w_fp[7], w_fp[77], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 188 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[5] -= amp_sv[0];
-
- // *** DIAGRAM 189 OF 1240 ***
-
- // Wavefunction(s) for diagram number 189
- // (none)
-
- // Amplitude(s) for diagram number 189
- FFV1_0( w_fp[53], w_fp[77], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 189 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[3] -= amp_sv[0];
-
- // *** DIAGRAM 190 OF 1240 ***
-
- // Wavefunction(s) for diagram number 190
- FFV1_2( w_fp[46], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[78] );
-
- // Amplitude(s) for diagram number 190
- FFV1_0( w_fp[78], w_fp[55], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 190 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[16] -= amp_sv[0];
-
- // *** DIAGRAM 191 OF 1240 ***
-
- // Wavefunction(s) for diagram number 191
- // (none)
-
- // Amplitude(s) for diagram number 191
- FFV1_0( w_fp[53], w_fp[55], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 191 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[13] -= amp_sv[0];
-
- // *** DIAGRAM 192 OF 1240 ***
-
- // Wavefunction(s) for diagram number 192
- // (none)
-
- // Amplitude(s) for diagram number 192
- FFV1_0( w_fp[78], w_fp[57], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 192 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[22] -= amp_sv[0];
-
- // *** DIAGRAM 193 OF 1240 ***
-
- // Wavefunction(s) for diagram number 193
- // (none)
-
- // Amplitude(s) for diagram number 193
- FFV1_0( w_fp[7], w_fp[57], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 193 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[19] -= amp_sv[0];
-
- // *** DIAGRAM 194 OF 1240 ***
-
- // Wavefunction(s) for diagram number 194
- // (none)
-
- // Amplitude(s) for diagram number 194
- FFV1_0( w_fp[46], w_fp[77], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 194 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 195 OF 1240 ***
-
- // Wavefunction(s) for diagram number 195
- // (none)
-
- // Amplitude(s) for diagram number 195
- VVV1_0( w_fp[1], w_fp[29], w_fp[73], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 195 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[3] += amp_sv[0];
- jamp_sv[5] -= amp_sv[0];
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[22] += amp_sv[0];
-
- // *** DIAGRAM 196 OF 1240 ***
-
- // Wavefunction(s) for diagram number 196
- // (none)
-
- // Amplitude(s) for diagram number 196
- FFV1_0( w_fp[46], w_fp[58], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 196 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 197 OF 1240 ***
-
- // Wavefunction(s) for diagram number 197
- // (none)
-
- // Amplitude(s) for diagram number 197
- FFV1_0( w_fp[25], w_fp[77], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 197 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[4] -= amp_sv[0];
-
- // *** DIAGRAM 198 OF 1240 ***
-
- // Wavefunction(s) for diagram number 198
- // (none)
-
- // Amplitude(s) for diagram number 198
- FFV1_0( w_fp[48], w_fp[77], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 198 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] -= amp_sv[0];
-
- // *** DIAGRAM 199 OF 1240 ***
-
- // Wavefunction(s) for diagram number 199
- FFV1_2( w_fp[38], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[58] );
-
- // Amplitude(s) for diagram number 199
- FFV1_0( w_fp[58], w_fp[9], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 199 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[10] -= amp_sv[0];
-
- // *** DIAGRAM 200 OF 1240 ***
-
- // Wavefunction(s) for diagram number 200
- // (none)
-
- // Amplitude(s) for diagram number 200
- FFV1_0( w_fp[48], w_fp[9], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 200 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[7] -= amp_sv[0];
-
- // *** DIAGRAM 201 OF 1240 ***
-
- // Wavefunction(s) for diagram number 201
- // (none)
-
- // Amplitude(s) for diagram number 201
- FFV1_0( w_fp[58], w_fp[57], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 201 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[20] -= amp_sv[0];
-
- // *** DIAGRAM 202 OF 1240 ***
-
- // Wavefunction(s) for diagram number 202
- // (none)
-
- // Amplitude(s) for diagram number 202
- FFV1_0( w_fp[25], w_fp[57], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 202 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[18] -= amp_sv[0];
-
- // *** DIAGRAM 203 OF 1240 ***
-
- // Wavefunction(s) for diagram number 203
- // (none)
-
- // Amplitude(s) for diagram number 203
- FFV1_0( w_fp[38], w_fp[77], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 203 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 204 OF 1240 ***
-
- // Wavefunction(s) for diagram number 204
- // (none)
-
- // Amplitude(s) for diagram number 204
- VVV1_0( w_fp[1], w_fp[27], w_fp[67], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 204 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] += amp_sv[0];
- jamp_sv[4] -= amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[20] += amp_sv[0];
-
- // *** DIAGRAM 205 OF 1240 ***
-
- // Wavefunction(s) for diagram number 205
- // (none)
-
- // Amplitude(s) for diagram number 205
- FFV1_0( w_fp[38], w_fp[60], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 205 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 206 OF 1240 ***
-
- // Wavefunction(s) for diagram number 206
- // (none)
-
- // Amplitude(s) for diagram number 206
- FFV1_0( w_fp[28], w_fp[77], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 206 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[2] -= amp_sv[0];
-
- // *** DIAGRAM 207 OF 1240 ***
-
- // Wavefunction(s) for diagram number 207
- // (none)
-
- // Amplitude(s) for diagram number 207
- FFV1_0( w_fp[40], w_fp[77], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 207 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] -= amp_sv[0];
-
- // *** DIAGRAM 208 OF 1240 ***
-
- // Wavefunction(s) for diagram number 208
- FFV1_2( w_fp[41], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[60] );
-
- // Amplitude(s) for diagram number 208
- FFV1_0( w_fp[60], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 208 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[8] -= amp_sv[0];
-
- // *** DIAGRAM 209 OF 1240 ***
-
- // Wavefunction(s) for diagram number 209
- // (none)
-
- // Amplitude(s) for diagram number 209
- FFV1_0( w_fp[40], w_fp[9], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 209 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[6] -= amp_sv[0];
-
- // *** DIAGRAM 210 OF 1240 ***
-
- // Wavefunction(s) for diagram number 210
- // (none)
-
- // Amplitude(s) for diagram number 210
- FFV1_0( w_fp[60], w_fp[55], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 210 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[14] -= amp_sv[0];
-
- // *** DIAGRAM 211 OF 1240 ***
-
- // Wavefunction(s) for diagram number 211
- // (none)
-
- // Amplitude(s) for diagram number 211
- FFV1_0( w_fp[28], w_fp[55], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 211 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[12] -= amp_sv[0];
-
- // *** DIAGRAM 212 OF 1240 ***
-
- // Wavefunction(s) for diagram number 212
- // (none)
-
- // Amplitude(s) for diagram number 212
- FFV1_0( w_fp[41], w_fp[77], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 212 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 213 OF 1240 ***
-
- // Wavefunction(s) for diagram number 213
- // (none)
-
- // Amplitude(s) for diagram number 213
- VVV1_0( w_fp[1], w_fp[24], w_fp[68], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 213 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += amp_sv[0];
- jamp_sv[2] -= amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[14] += amp_sv[0];
-
- // *** DIAGRAM 214 OF 1240 ***
-
- // Wavefunction(s) for diagram number 214
- // (none)
-
- // Amplitude(s) for diagram number 214
- FFV1_0( w_fp[41], w_fp[59], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 214 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 215 OF 1240 ***
-
- // Wavefunction(s) for diagram number 215
- // (none)
-
- // Amplitude(s) for diagram number 215
- FFV1_0( w_fp[26], w_fp[77], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 215 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 216 OF 1240 ***
-
- // Wavefunction(s) for diagram number 216
- // (none)
-
- // Amplitude(s) for diagram number 216
- FFV1_0( w_fp[3], w_fp[77], w_fp[42], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 216 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += amp_sv[0];
- jamp_sv[2] -= amp_sv[0];
- jamp_sv[4] -= amp_sv[0];
- jamp_sv[5] += amp_sv[0];
-
- // *** DIAGRAM 217 OF 1240 ***
-
- // Wavefunction(s) for diagram number 217
- VVV1P0_1( w_fp[1], w_fp[24], COUPs[0], 1.0, 0., 0., w_fp[59] );
-
- // Amplitude(s) for diagram number 217
- VVV1_0( w_fp[62], w_fp[59], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 217 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 218 OF 1240 ***
-
- // Wavefunction(s) for diagram number 218
- // (none)
-
- // Amplitude(s) for diagram number 218
- VVV1_0( w_fp[62], w_fp[1], w_fp[42], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 218 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 219 OF 1240 ***
-
- // Wavefunction(s) for diagram number 219
- // (none)
-
- // Amplitude(s) for diagram number 219
- VVVV1_0( w_fp[1], w_fp[24], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
- VVVV3_0( w_fp[1], w_fp[24], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
- VVVV4_0( w_fp[1], w_fp[24], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 220 OF 1240 ***
-
- // Wavefunction(s) for diagram number 220
- // (none)
-
- // Amplitude(s) for diagram number 220
- FFV1_0( w_fp[3], w_fp[57], w_fp[59], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 220 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[18] += amp_sv[0];
- jamp_sv[19] -= amp_sv[0];
- jamp_sv[21] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
-
- // *** DIAGRAM 221 OF 1240 ***
-
- // Wavefunction(s) for diagram number 221
- // (none)
-
- // Amplitude(s) for diagram number 221
- FFV1_0( w_fp[26], w_fp[57], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 221 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 222 OF 1240 ***
-
- // Wavefunction(s) for diagram number 222
- // (none)
-
- // Amplitude(s) for diagram number 222
- FFV1_0( w_fp[14], w_fp[77], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 222 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 223 OF 1240 ***
-
- // Wavefunction(s) for diagram number 223
- // (none)
-
- // Amplitude(s) for diagram number 223
- FFV1_0( w_fp[3], w_fp[77], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 223 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] += amp_sv[0];
- jamp_sv[2] -= amp_sv[0];
- jamp_sv[3] += amp_sv[0];
- jamp_sv[4] -= amp_sv[0];
-
- // *** DIAGRAM 224 OF 1240 ***
-
- // Wavefunction(s) for diagram number 224
- VVV1P0_1( w_fp[1], w_fp[27], COUPs[0], 1.0, 0., 0., w_fp[68] );
-
- // Amplitude(s) for diagram number 224
- VVV1_0( w_fp[62], w_fp[68], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 224 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 225 OF 1240 ***
-
- // Wavefunction(s) for diagram number 225
- // (none)
-
- // Amplitude(s) for diagram number 225
- VVV1_0( w_fp[62], w_fp[1], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 225 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 226 OF 1240 ***
-
- // Wavefunction(s) for diagram number 226
- // (none)
-
- // Amplitude(s) for diagram number 226
- VVVV1_0( w_fp[1], w_fp[27], w_fp[5], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
- VVVV3_0( w_fp[1], w_fp[27], w_fp[5], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- VVVV4_0( w_fp[1], w_fp[27], w_fp[5], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
1 ) * amp_sv[0]; - jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 227 OF 1240 *** - - // Wavefunction(s) for diagram number 227 - // (none) - - // Amplitude(s) for diagram number 227 - FFV1_0( w_fp[3], w_fp[55], w_fp[68], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 227 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[12] += amp_sv[0]; - jamp_sv[13] -= amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - - // *** DIAGRAM 228 OF 1240 *** - - // Wavefunction(s) for diagram number 228 - // (none) - - // Amplitude(s) for diagram number 228 - FFV1_0( w_fp[14], w_fp[55], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 228 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 229 OF 1240 *** - - // Wavefunction(s) for diagram number 229 - // (none) - - // Amplitude(s) for diagram number 229 - FFV1_0( w_fp[12], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 229 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 230 OF 1240 *** - - // Wavefunction(s) for diagram number 230 - // (none) - - // Amplitude(s) for diagram number 230 - FFV1_0( w_fp[3], w_fp[77], w_fp[19], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 230 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - - // *** DIAGRAM 231 OF 1240 *** - - // Wavefunction(s) for diagram number 231 - VVV1P0_1( w_fp[1], w_fp[29], COUPs[0], 1.0, 0., 0., w_fp[67] ); - - // Amplitude(s) for diagram number 231 - VVV1_0( w_fp[62], w_fp[67], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 231 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 232 OF 1240 *** - - // Wavefunction(s) for diagram number 232 - // (none) - - // Amplitude(s) for diagram number 232 - VVV1_0( w_fp[62], w_fp[1], w_fp[19], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 232 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; - 
jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 233 OF 1240 *** - - // Wavefunction(s) for diagram number 233 - // (none) - - // Amplitude(s) for diagram number 233 - VVVV1_0( w_fp[1], w_fp[4], w_fp[29], w_fp[62], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0]; - VVVV3_0( w_fp[1], w_fp[4], w_fp[29], w_fp[62], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; - VVVV4_0( w_fp[1], w_fp[4], w_fp[29], w_fp[62], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 234 OF 1240 *** - - // Wavefunction(s) for diagram number 234 - // (none) - - // Amplitude(s) for diagram number 234 - FFV1_0( w_fp[3], w_fp[9], w_fp[67], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 234 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[6] += amp_sv[0]; - jamp_sv[7] -= amp_sv[0]; - jamp_sv[9] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - - // *** DIAGRAM 235 OF 1240 *** - - // Wavefunction(s) for diagram number 235 - // (none) - - // Amplitude(s) for diagram number 235 - FFV1_0( w_fp[12], w_fp[9], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 235 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 236 OF 1240 *** - - // Wavefunction(s) for diagram number 236 - VVVV1P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[73] ); - VVVV3P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[79] ); - VVVV4P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[80] ); - - // Amplitude(s) for diagram number 236 - VVV1_0( w_fp[73], w_fp[6], w_fp[62], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[79], w_fp[6], w_fp[62], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[21] -= cxtype( 0, 1 ) * 
amp_sv[0]; - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[80], w_fp[6], w_fp[62], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 237 OF 1240 *** - - // Wavefunction(s) for diagram number 237 - // (none) - - // Amplitude(s) for diagram number 237 - FFV1_0( w_fp[3], w_fp[57], w_fp[73], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[18] += amp_sv[0]; - jamp_sv[19] -= amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[57], w_fp[79], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[19] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[57], w_fp[80], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[18] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - - // *** DIAGRAM 238 OF 1240 *** - - // Wavefunction(s) for diagram number 238 - // (none) - - // Amplitude(s) for diagram number 238 - FFV1_0( w_fp[41], w_fp[34], w_fp[73], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[0] += amp_sv[0]; - jamp_sv[2] -= amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - FFV1_0( w_fp[41], w_fp[34], w_fp[79], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[2] -= amp_sv[0]; - jamp_sv[6] += amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[12] += amp_sv[0]; - FFV1_0( w_fp[41], w_fp[34], w_fp[80], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[0] -= amp_sv[0]; - jamp_sv[6] += amp_sv[0]; - jamp_sv[12] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - - // *** DIAGRAM 239 OF 1240 *** - - // Wavefunction(s) for diagram number 239 - VVVV1P0_1( w_fp[1], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[57] ); - VVVV3P0_1( w_fp[1], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[81] ); - VVVV4P0_1( w_fp[1], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[82] ); - - // Amplitude(s) for diagram number 239 - VVV1_0( w_fp[57], w_fp[5], w_fp[62], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[81], w_fp[5], w_fp[62], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[82], w_fp[5], w_fp[62], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 240 OF 1240 *** - - // Wavefunction(s) for diagram number 240 - // (none) - - // Amplitude(s) for diagram 
number 240 - FFV1_0( w_fp[3], w_fp[55], w_fp[57], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[12] += amp_sv[0]; - jamp_sv[13] -= amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[55], w_fp[81], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[13] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[16] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[55], w_fp[82], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[12] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[16] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - - // *** DIAGRAM 241 OF 1240 *** - - // Wavefunction(s) for diagram number 241 - // (none) - - // Amplitude(s) for diagram number 241 - FFV1_0( w_fp[38], w_fp[34], w_fp[57], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[1] += amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - FFV1_0( w_fp[38], w_fp[34], w_fp[81], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[4] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[18] += amp_sv[0]; - FFV1_0( w_fp[38], w_fp[34], w_fp[82], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[1] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[18] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - - // *** DIAGRAM 242 OF 1240 *** - - // Wavefunction(s) for diagram number 242 - VVVV1P0_1( w_fp[1], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[55] ); - VVVV3P0_1( w_fp[1], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[83] ); - VVVV4P0_1( w_fp[1], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[84] ); - - // Amplitude(s) for diagram number 242 - VVV1_0( w_fp[55], w_fp[4], w_fp[62], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[83], w_fp[4], w_fp[62], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[84], w_fp[4], w_fp[62], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 243 OF 1240 *** - - // Wavefunction(s) for diagram number 243 - // (none) - - // Amplitude(s) for diagram number 243 - FFV1_0( w_fp[3], w_fp[9], w_fp[55], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[6] += amp_sv[0]; - jamp_sv[7] -= amp_sv[0]; - jamp_sv[9] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[9], w_fp[83], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[7] -= amp_sv[0]; - jamp_sv[8] += amp_sv[0]; - jamp_sv[9] -= amp_sv[0]; - jamp_sv[10] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[9], w_fp[84], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[6] -= amp_sv[0]; - jamp_sv[8] += amp_sv[0]; - jamp_sv[10] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - - // *** DIAGRAM 244 OF 1240 *** - - // Wavefunction(s) for diagram number 244 
- // (none) - - // Amplitude(s) for diagram number 244 - FFV1_0( w_fp[46], w_fp[34], w_fp[55], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[3] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - FFV1_0( w_fp[46], w_fp[34], w_fp[83], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[5] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[19] += amp_sv[0]; - FFV1_0( w_fp[46], w_fp[34], w_fp[84], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[3] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[19] += amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - - // *** DIAGRAM 245 OF 1240 *** - - // Wavefunction(s) for diagram number 245 - // (none) - - // Amplitude(s) for diagram number 245 - FFV1_0( w_fp[3], w_fp[77], w_fp[30], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[77], w_fp[31], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[1] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[77], w_fp[32], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[0] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - - // *** DIAGRAM 246 OF 1240 *** - - // Wavefunction(s) for diagram number 246 - // (none) - - // Amplitude(s) for diagram number 246 - VVV1_0( w_fp[1], w_fp[30], w_fp[62], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[1], w_fp[31], w_fp[62], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[1], w_fp[32], w_fp[62], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 247 OF 1240 *** - - // Wavefunction(s) for diagram number 247 - FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[62] ); - FFV1_1( w_fp[2], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[77] ); - FFV1_2( w_fp[62], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[34] ); - FFV1_1( w_fp[77], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); - - // Amplitude(s) for diagram number 247 - FFV1_0( w_fp[34], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 247 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[41] -= amp_sv[0]; - - // *** DIAGRAM 248 OF 1240 *** - - // Wavefunction(s) for diagram number 248 - FFV1_1( w_fp[77], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[85] ); - - // Amplitude(s) for diagram number 248 - FFV1_0( w_fp[34], w_fp[85], w_fp[5], 
COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 248 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[47] -= amp_sv[0]; - - // *** DIAGRAM 249 OF 1240 *** - - // Wavefunction(s) for diagram number 249 - FFV1_2( w_fp[62], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[86] ); - FFV1_1( w_fp[77], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[87] ); - - // Amplitude(s) for diagram number 249 - FFV1_0( w_fp[86], w_fp[87], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 249 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[35] -= amp_sv[0]; - - // *** DIAGRAM 250 OF 1240 *** - - // Wavefunction(s) for diagram number 250 - // (none) - - // Amplitude(s) for diagram number 250 - FFV1_0( w_fp[86], w_fp[85], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 250 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[45] -= amp_sv[0]; - - // *** DIAGRAM 251 OF 1240 *** - - // Wavefunction(s) for diagram number 251 - FFV1_2( w_fp[62], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[88] ); - - // Amplitude(s) for diagram number 251 - FFV1_0( w_fp[88], w_fp[87], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 251 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[33] -= amp_sv[0]; - - // *** DIAGRAM 252 OF 1240 *** - - // Wavefunction(s) for diagram number 252 - // (none) - - // Amplitude(s) for diagram number 252 - FFV1_0( w_fp[88], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 252 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[39] -= amp_sv[0]; - - // *** DIAGRAM 253 OF 1240 *** - - // Wavefunction(s) for diagram number 253 - FFV1P0_3( w_fp[62], w_fp[77], COUPs[1], 1.0, 0., 0., w_fp[89] ); - - // Amplitude(s) for diagram number 253 - VVV1_0( w_fp[24], w_fp[6], w_fp[89], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 253 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[33] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - - // *** DIAGRAM 254 OF 1240 *** - - // Wavefunction(s) for diagram number 254 - FFV1_2( w_fp[62], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[90] ); - - // Amplitude(s) for diagram number 254 - FFV1_0( w_fp[90], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 254 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 255 OF 1240 *** - - // Wavefunction(s) for diagram number 255 - // (none) - - // Amplitude(s) for diagram number 255 - FFV1_0( w_fp[88], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 255 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[39] -= cxtype( 0, 1 ) * 
amp_sv[0]; - - // *** DIAGRAM 256 OF 1240 *** - - // Wavefunction(s) for diagram number 256 - // (none) - - // Amplitude(s) for diagram number 256 - VVV1_0( w_fp[27], w_fp[5], w_fp[89], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 256 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[35] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - - // *** DIAGRAM 257 OF 1240 *** - - // Wavefunction(s) for diagram number 257 - FFV1_2( w_fp[62], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[91] ); - - // Amplitude(s) for diagram number 257 - FFV1_0( w_fp[91], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 257 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 258 OF 1240 *** - - // Wavefunction(s) for diagram number 258 - // (none) - - // Amplitude(s) for diagram number 258 - FFV1_0( w_fp[86], w_fp[77], w_fp[27], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 258 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 259 OF 1240 *** - - // Wavefunction(s) for diagram number 259 - // (none) - - // Amplitude(s) for diagram number 259 - VVV1_0( w_fp[4], w_fp[29], w_fp[89], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 259 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[33] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - - // *** DIAGRAM 260 OF 1240 *** - - // Wavefunction(s) for diagram number 260 - // (none) - - // Amplitude(s) for diagram number 260 - FFV1_0( w_fp[34], w_fp[77], w_fp[29], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 260 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 261 OF 1240 *** - - // Wavefunction(s) for diagram number 261 - FFV1_2( w_fp[62], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[89] ); - - // Amplitude(s) for diagram number 261 - FFV1_0( w_fp[89], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 261 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 262 OF 1240 *** - - // Wavefunction(s) for diagram number 262 - // (none) - - // Amplitude(s) for diagram number 262 - FFV1_0( w_fp[62], w_fp[77], w_fp[30], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[33] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - FFV1_0( w_fp[62], w_fp[77], w_fp[31], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[35] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - FFV1_0( w_fp[62], w_fp[77], w_fp[32], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[33] -= amp_sv[0]; - 
jamp_sv[39] += amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[47] -= amp_sv[0]; - - // *** DIAGRAM 263 OF 1240 *** - - // Wavefunction(s) for diagram number 263 - FFV1P0_3( w_fp[62], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[92] ); - - // Amplitude(s) for diagram number 263 - VVV1_0( w_fp[92], w_fp[63], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 263 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 264 OF 1240 *** - - // Wavefunction(s) for diagram number 264 - // (none) - - // Amplitude(s) for diagram number 264 - VVV1_0( w_fp[92], w_fp[64], w_fp[5], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 264 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 265 OF 1240 *** - - // Wavefunction(s) for diagram number 265 - // (none) - - // Amplitude(s) for diagram number 265 - VVVV1_0( w_fp[61], w_fp[5], w_fp[6], w_fp[92], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - VVVV3_0( w_fp[61], w_fp[5], w_fp[6], w_fp[92], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - VVVV4_0( w_fp[61], w_fp[5], w_fp[6], w_fp[92], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 266 OF 1240 *** - - // Wavefunction(s) for diagram number 266 - FFV1_1( w_fp[2], w_fp[61], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[93] ); - - // Amplitude(s) for diagram number 266 - FFV1_0( w_fp[86], w_fp[93], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 266 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] -= 
cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 267 OF 1240 *** - - // Wavefunction(s) for diagram number 267 - // (none) - - // Amplitude(s) for diagram number 267 - FFV1_0( w_fp[86], w_fp[2], w_fp[64], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 267 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[35] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - - // *** DIAGRAM 268 OF 1240 *** - - // Wavefunction(s) for diagram number 268 - // (none) - - // Amplitude(s) for diagram number 268 - FFV1_0( w_fp[88], w_fp[93], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 268 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 269 OF 1240 *** - - // Wavefunction(s) for diagram number 269 - // (none) - - // Amplitude(s) for diagram number 269 - FFV1_0( w_fp[88], w_fp[2], w_fp[63], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 269 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[33] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[87] += amp_sv[0]; - - // *** DIAGRAM 270 OF 1240 *** - - // Wavefunction(s) for diagram number 270 - FFV1_2( w_fp[62], w_fp[61], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[94] ); - - // Amplitude(s) for diagram number 270 - FFV1_0( w_fp[94], w_fp[39], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 270 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 271 OF 1240 *** - - // Wavefunction(s) for diagram number 271 - FFV1P0_3( w_fp[62], w_fp[39], COUPs[1], 1.0, 0., 0., w_fp[95] ); - - // Amplitude(s) for diagram number 271 - VVV1_0( w_fp[61], w_fp[6], w_fp[95], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 271 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[81] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[93] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - - // *** DIAGRAM 272 OF 1240 *** - - // Wavefunction(s) for diagram number 272 - // (none) - - // Amplitude(s) for diagram number 272 - FFV1_0( w_fp[88], w_fp[39], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 272 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 273 OF 1240 *** - - // Wavefunction(s) for diagram number 273 - // (none) - - // Amplitude(s) for diagram number 273 - FFV1_0( w_fp[94], w_fp[47], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 273 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 274 OF 1240 *** - - // Wavefunction(s) for diagram number 274 - FFV1P0_3( 
w_fp[62], w_fp[47], COUPs[1], 1.0, 0., 0., w_fp[96] ); - - // Amplitude(s) for diagram number 274 - VVV1_0( w_fp[61], w_fp[5], w_fp[96], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 274 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[105] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 275 OF 1240 *** - - // Wavefunction(s) for diagram number 275 - // (none) - - // Amplitude(s) for diagram number 275 - FFV1_0( w_fp[86], w_fp[47], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 275 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 276 OF 1240 *** - - // Wavefunction(s) for diagram number 276 - // (none) - - // Amplitude(s) for diagram number 276 - FFV1_0( w_fp[94], w_fp[2], w_fp[29], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 276 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[93] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 277 OF 1240 *** - - // Wavefunction(s) for diagram number 277 - // (none) - - // Amplitude(s) for diagram number 277 - VVV1_0( w_fp[61], w_fp[29], w_fp[92], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 277 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 278 OF 1240 *** - - // Wavefunction(s) for diagram number 278 - // (none) - - // Amplitude(s) for diagram number 278 - FFV1_0( w_fp[89], w_fp[2], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 278 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[33] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[59] += amp_sv[0]; - - // *** DIAGRAM 279 OF 1240 *** - - // Wavefunction(s) for diagram number 279 - // (none) - - // Amplitude(s) for diagram number 279 - VVV1_0( w_fp[92], w_fp[69], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 279 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 280 OF 1240 *** - - // Wavefunction(s) for diagram number 280 - // (none) - - // Amplitude(s) for diagram number 280 - VVV1_0( w_fp[92], w_fp[70], w_fp[4], COUPs[0], 1.0, 
&_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 280 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 281 OF 1240 *** - - // Wavefunction(s) for diagram number 281 - // (none) - - // Amplitude(s) for diagram number 281 - VVVV1_0( w_fp[66], w_fp[4], w_fp[6], w_fp[92], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - VVVV3_0( w_fp[66], w_fp[4], w_fp[6], w_fp[92], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - VVVV4_0( w_fp[66], w_fp[4], w_fp[6], w_fp[92], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 282 OF 1240 *** - - // Wavefunction(s) for diagram number 282 - FFV1_1( w_fp[2], w_fp[66], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[94] ); - - // Amplitude(s) for diagram number 282 - FFV1_0( w_fp[34], w_fp[94], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 282 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 283 OF 1240 *** - - // Wavefunction(s) for diagram number 283 - // (none) - - // Amplitude(s) for diagram number 283 - FFV1_0( w_fp[34], w_fp[2], w_fp[70], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 283 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[41] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - - // *** DIAGRAM 284 OF 1240 *** - - // Wavefunction(s) for diagram number 284 - // (none) - - // Amplitude(s) for diagram number 284 - FFV1_0( w_fp[88], w_fp[94], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 284 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 285 OF 1240 *** - - // Wavefunction(s) for diagram number 285 - // 
(none) - - // Amplitude(s) for diagram number 285 - FFV1_0( w_fp[88], w_fp[2], w_fp[69], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 285 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[39] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - - // *** DIAGRAM 286 OF 1240 *** - - // Wavefunction(s) for diagram number 286 - FFV1_2( w_fp[62], w_fp[66], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[97] ); - - // Amplitude(s) for diagram number 286 - FFV1_0( w_fp[97], w_fp[33], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 286 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 287 OF 1240 *** - - // Wavefunction(s) for diagram number 287 - FFV1P0_3( w_fp[62], w_fp[33], COUPs[1], 1.0, 0., 0., w_fp[98] ); - - // Amplitude(s) for diagram number 287 - VVV1_0( w_fp[66], w_fp[6], w_fp[98], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 287 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[57] += amp_sv[0]; - jamp_sv[63] -= amp_sv[0]; - jamp_sv[69] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - - // *** DIAGRAM 288 OF 1240 *** - - // Wavefunction(s) for diagram number 288 - // (none) - - // Amplitude(s) for diagram number 288 - FFV1_0( w_fp[88], w_fp[33], w_fp[66], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 288 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 289 OF 1240 *** - - // Wavefunction(s) for diagram number 289 - // (none) - - // Amplitude(s) for diagram number 289 - FFV1_0( w_fp[97], w_fp[47], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 289 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 290 OF 1240 *** - - // Wavefunction(s) for diagram number 290 - // (none) - - // Amplitude(s) for diagram number 290 - VVV1_0( w_fp[66], w_fp[4], w_fp[96], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 290 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[107] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - - // *** DIAGRAM 291 OF 1240 *** - - // Wavefunction(s) for diagram number 291 - // (none) - - // Amplitude(s) for diagram number 291 - FFV1_0( w_fp[34], w_fp[47], w_fp[66], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 291 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 292 OF 1240 *** - - // Wavefunction(s) for diagram number 292 - // (none) - - // Amplitude(s) for diagram number 292 - FFV1_0( w_fp[97], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] ); 
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 292 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[69] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - - // *** DIAGRAM 293 OF 1240 *** - - // Wavefunction(s) for diagram number 293 - // (none) - - // Amplitude(s) for diagram number 293 - VVV1_0( w_fp[66], w_fp[27], w_fp[92], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 293 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 294 OF 1240 *** - - // Wavefunction(s) for diagram number 294 - // (none) - - // Amplitude(s) for diagram number 294 - FFV1_0( w_fp[91], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 294 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[39] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[83] += amp_sv[0]; - - // *** DIAGRAM 295 OF 1240 *** - - // Wavefunction(s) for diagram number 295 - // (none) - - // Amplitude(s) for diagram number 295 - VVV1_0( w_fp[92], w_fp[74], w_fp[5], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 295 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 296 OF 1240 *** - - // Wavefunction(s) for diagram number 296 - // (none) - - // Amplitude(s) for diagram number 296 - VVV1_0( w_fp[92], w_fp[75], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 296 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 297 OF 1240 *** - - // Wavefunction(s) for diagram number 297 - // (none) - - // Amplitude(s) for diagram number 297 - VVVV1_0( w_fp[72], w_fp[4], w_fp[5], w_fp[92], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; - 
VVVV3_0( w_fp[72], w_fp[4], w_fp[5], w_fp[92], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0]; - VVVV4_0( w_fp[72], w_fp[4], w_fp[5], w_fp[92], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 298 OF 1240 *** - - // Wavefunction(s) for diagram number 298 - FFV1_1( w_fp[2], w_fp[72], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[97] ); - - // Amplitude(s) for diagram number 298 - FFV1_0( w_fp[34], w_fp[97], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 298 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 299 OF 1240 *** - - // Wavefunction(s) for diagram number 299 - // (none) - - // Amplitude(s) for diagram number 299 - FFV1_0( w_fp[34], w_fp[2], w_fp[75], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 299 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[47] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - - // *** DIAGRAM 300 OF 1240 *** - - // Wavefunction(s) for diagram number 300 - // (none) - - // Amplitude(s) for diagram number 300 - FFV1_0( w_fp[86], w_fp[97], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 300 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 301 OF 1240 *** - - // Wavefunction(s) for diagram number 301 - // (none) - - // Amplitude(s) for diagram number 301 - FFV1_0( w_fp[86], w_fp[2], w_fp[74], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 301 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[45] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - - // *** DIAGRAM 302 OF 1240 *** - - // Wavefunction(s) for diagram number 302 - FFV1_2( w_fp[62], w_fp[72], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] ); - - // Amplitude(s) for diagram number 302 - FFV1_0( w_fp[99], w_fp[33], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 302 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 303 OF 1240 *** - - // Wavefunction(s) for diagram number 303 - // (none) - - // Amplitude(s) for diagram number 303 - VVV1_0( w_fp[72], w_fp[5], w_fp[98], COUPs[0], 1.0, 
&_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 303 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[59] += amp_sv[0]; - jamp_sv[63] -= amp_sv[0]; - jamp_sv[65] += amp_sv[0]; - jamp_sv[69] -= amp_sv[0]; - - // *** DIAGRAM 304 OF 1240 *** - - // Wavefunction(s) for diagram number 304 - // (none) - - // Amplitude(s) for diagram number 304 - FFV1_0( w_fp[86], w_fp[33], w_fp[72], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 304 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 305 OF 1240 *** - - // Wavefunction(s) for diagram number 305 - // (none) - - // Amplitude(s) for diagram number 305 - FFV1_0( w_fp[99], w_fp[39], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 305 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 306 OF 1240 *** - - // Wavefunction(s) for diagram number 306 - // (none) - - // Amplitude(s) for diagram number 306 - VVV1_0( w_fp[72], w_fp[4], w_fp[95], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 306 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[83] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[93] -= amp_sv[0]; - - // *** DIAGRAM 307 OF 1240 *** - - // Wavefunction(s) for diagram number 307 - // (none) - - // Amplitude(s) for diagram number 307 - FFV1_0( w_fp[34], w_fp[39], w_fp[72], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 307 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 308 OF 1240 *** - - // Wavefunction(s) for diagram number 308 - // (none) - - // Amplitude(s) for diagram number 308 - FFV1_0( w_fp[99], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 308 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[63] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - - // *** DIAGRAM 309 OF 1240 *** - - // Wavefunction(s) for diagram number 309 - // (none) - - // Amplitude(s) for diagram number 309 - VVV1_0( w_fp[72], w_fp[24], w_fp[92], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 309 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 310 OF 1240 *** - - // Wavefunction(s) for diagram number 310 - // (none) - - // Amplitude(s) for diagram number 310 - FFV1_0( w_fp[90], 
w_fp[2], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 310 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[45] += amp_sv[0];
-      jamp_sv[47] -= amp_sv[0];
-      jamp_sv[105] -= amp_sv[0];
-      jamp_sv[107] += amp_sv[0];
-
-      // *** DIAGRAM 311 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 311
-      FFV1_2( w_fp[62], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] );
-
-      // Amplitude(s) for diagram number 311
-      FFV1_0( w_fp[99], w_fp[35], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 311 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[65] -= amp_sv[0];
-
-      // *** DIAGRAM 312 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 312
-      // (none)
-
-      // Amplitude(s) for diagram number 312
-      FFV1_0( w_fp[99], w_fp[36], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 312 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[71] -= amp_sv[0];
-
-      // *** DIAGRAM 313 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 313
-      FFV1_1( w_fp[33], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[100] );
-
-      // Amplitude(s) for diagram number 313
-      FFV1_0( w_fp[86], w_fp[100], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 313 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[59] -= amp_sv[0];
-
-      // *** DIAGRAM 314 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 314
-      // (none)
-
-      // Amplitude(s) for diagram number 314
-      FFV1_0( w_fp[86], w_fp[36], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 314 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[69] -= amp_sv[0];
-
-      // *** DIAGRAM 315 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 315
-      // (none)
-
-      // Amplitude(s) for diagram number 315
-      FFV1_0( w_fp[88], w_fp[100], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 315 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[57] -= amp_sv[0];
-
-      // *** DIAGRAM 316 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 316
-      // (none)
-
-      // Amplitude(s) for diagram number 316
-      FFV1_0( w_fp[88], w_fp[35], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 316 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[63] -= amp_sv[0];
-
-      // *** DIAGRAM 317 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 317
-      // (none)
-
-      // Amplitude(s) for diagram number 317
-      FFV1_0( w_fp[99], w_fp[33], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 317 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 318 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 318
-      // (none)
-
-      // Amplitude(s) for diagram number 318
-      VVV1_0( w_fp[1], w_fp[29], w_fp[98], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 318 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[57] += amp_sv[0];
-      jamp_sv[59] -= amp_sv[0];
-      jamp_sv[65] -= amp_sv[0];
-      jamp_sv[71] += amp_sv[0];
-
-      // *** DIAGRAM 319 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 319
-      // (none)
-
-      // Amplitude(s) for diagram number 319
-      FFV1_0( w_fp[89], w_fp[33], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 319 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 320 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 320
-      // (none)
-
-      // Amplitude(s) for diagram number 320
-      FFV1_0( w_fp[99], w_fp[43], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 320 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[89] -= amp_sv[0];
-
-      // *** DIAGRAM 321 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 321
-      // (none)
-
-      // Amplitude(s) for diagram number 321
-      FFV1_0( w_fp[99], w_fp[44], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 321 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[95] -= amp_sv[0];
-
-      // *** DIAGRAM 322 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 322
-      FFV1_1( w_fp[39], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[89] );
-
-      // Amplitude(s) for diagram number 322
-      FFV1_0( w_fp[34], w_fp[89], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 322 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[83] -= amp_sv[0];
-
-      // *** DIAGRAM 323 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 323
-      // (none)
-
-      // Amplitude(s) for diagram number 323
-      FFV1_0( w_fp[34], w_fp[44], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 323 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[93] -= amp_sv[0];
-
-      // *** DIAGRAM 324 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 324
-      // (none)
-
-      // Amplitude(s) for diagram number 324
-      FFV1_0( w_fp[88], w_fp[89], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 324 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[81] -= amp_sv[0];
-
-      // *** DIAGRAM 325 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 325
-      // (none)
-
-      // Amplitude(s) for diagram number 325
-      FFV1_0( w_fp[88], w_fp[43], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 325 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[87] -= amp_sv[0];
-
-      // *** DIAGRAM 326 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 326
-      // (none)
-
-      // Amplitude(s) for diagram number 326
-      FFV1_0( w_fp[99], w_fp[39], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 326 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 327 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 327
-      // (none)
-
-      // Amplitude(s) for diagram number 327
-      VVV1_0( w_fp[1], w_fp[27], w_fp[95], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 327 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[81] += amp_sv[0];
-      jamp_sv[83] -= amp_sv[0];
-      jamp_sv[89] -= amp_sv[0];
-      jamp_sv[95] += amp_sv[0];
-
-      // *** DIAGRAM 328 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 328
-      // (none)
-
-      // Amplitude(s) for diagram number 328
-      FFV1_0( w_fp[91], w_fp[39], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 328 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 329 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 329
-      // (none)
-
-      // Amplitude(s) for diagram number 329
-      FFV1_0( w_fp[99], w_fp[49], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 329 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[113] -= amp_sv[0];
-
-      // *** DIAGRAM 330 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 330
-      // (none)
-
-      // Amplitude(s) for diagram number 330
-      FFV1_0( w_fp[99], w_fp[50], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 330 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[119] -= amp_sv[0];
-
-      // *** DIAGRAM 331 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 331
-      FFV1_1( w_fp[47], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[91] );
-
-      // Amplitude(s) for diagram number 331
-      FFV1_0( w_fp[34], w_fp[91], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 331 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[107] -= amp_sv[0];
-
-      // *** DIAGRAM 332 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 332
-      // (none)
-
-      // Amplitude(s) for diagram number 332
-      FFV1_0( w_fp[34], w_fp[50], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 332 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[117] -= amp_sv[0];
-
-      // *** DIAGRAM 333 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 333
-      // (none)
-
-      // Amplitude(s) for diagram number 333
-      FFV1_0( w_fp[86], w_fp[91], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 333 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[105] -= amp_sv[0];
-
-      // *** DIAGRAM 334 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 334
-      // (none)
-
-      // Amplitude(s) for diagram number 334
-      FFV1_0( w_fp[86], w_fp[49], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 334 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[111] -= amp_sv[0];
-
-      // *** DIAGRAM 335 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 335
-      // (none)
-
-      // Amplitude(s) for diagram number 335
-      FFV1_0( w_fp[99], w_fp[47], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 335 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 336 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 336
-      // (none)
-
-      // Amplitude(s) for diagram number 336
-      VVV1_0( w_fp[1], w_fp[24], w_fp[96], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 336 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[105] += amp_sv[0];
-      jamp_sv[107] -= amp_sv[0];
-      jamp_sv[113] -= amp_sv[0];
-      jamp_sv[119] += amp_sv[0];
-
-      // *** DIAGRAM 337 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 337
-      // (none)
-
-      // Amplitude(s) for diagram number 337
-      FFV1_0( w_fp[90], w_fp[47], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 337 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 338 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 338
-      // (none)
-
-      // Amplitude(s) for diagram number 338
-      FFV1_0( w_fp[99], w_fp[17], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 338 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 339 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 339
-      // (none)
-
-      // Amplitude(s) for diagram number 339
-      FFV1_0( w_fp[99], w_fp[2], w_fp[42], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 339 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[65] += amp_sv[0];
-      jamp_sv[89] -= amp_sv[0];
-      jamp_sv[113] -= amp_sv[0];
-      jamp_sv[119] += amp_sv[0];
-
-      // *** DIAGRAM 340 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 340
-      // (none)
-
-      // Amplitude(s) for diagram number 340
-      VVV1_0( w_fp[92], w_fp[59], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 340 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 341 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 341
-      // (none)
-
-      // Amplitude(s) for diagram number 341
-      VVV1_0( w_fp[92], w_fp[1], w_fp[42], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 341 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 342 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 342
-      // (none)
-
-      // Amplitude(s) for diagram number 342
-      VVVV1_0( w_fp[1], w_fp[24], w_fp[6], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
-      VVVV3_0( w_fp[1], w_fp[24], w_fp[6], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
-      VVVV4_0( w_fp[1], w_fp[24], w_fp[6], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 343 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 343
-      // (none)
-
-      // Amplitude(s) for diagram number 343
-      FFV1_0( w_fp[88], w_fp[2], w_fp[59], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 343 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[33] += amp_sv[0];
-      jamp_sv[39] -= amp_sv[0];
-      jamp_sv[63] -= amp_sv[0];
-      jamp_sv[87] += amp_sv[0];
-
-      // *** DIAGRAM 344 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 344
-      // (none)
-
-      // Amplitude(s) for diagram number 344
-      FFV1_0( w_fp[88], w_fp[17], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 344 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 345 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 345
-      // (none)
-
-      // Amplitude(s) for diagram number 345
-      FFV1_0( w_fp[99], w_fp[15], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 345 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 346 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 346
-      // (none)
-
-      // Amplitude(s) for diagram number 346
-      FFV1_0( w_fp[99], w_fp[2], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 346 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[71] += amp_sv[0];
-      jamp_sv[89] -= amp_sv[0];
-      jamp_sv[95] += amp_sv[0];
-      jamp_sv[113] -= amp_sv[0];
-
-      // *** DIAGRAM 347 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 347
-      // (none)
-
-      // Amplitude(s) for diagram number 347
-      VVV1_0( w_fp[92], w_fp[68], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 347 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 348 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 348
-      // (none)
-
-      // Amplitude(s) for diagram number 348
-      VVV1_0( w_fp[92], w_fp[1], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 348 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 349 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 349
-      // (none)
-
-      // Amplitude(s) for diagram number 349
-      VVVV1_0( w_fp[1], w_fp[27], w_fp[5], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
-      VVVV3_0( w_fp[1], w_fp[27], w_fp[5], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0];
-      VVVV4_0( w_fp[1], w_fp[27], w_fp[5], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 350 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 350
-      // (none)
-
-      // Amplitude(s) for diagram number 350
-      FFV1_0( w_fp[86], w_fp[2], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 350 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[35] += amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[69] -= amp_sv[0];
-      jamp_sv[111] += amp_sv[0];
-
-      // *** DIAGRAM 351 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 351
-      // (none)
-
-      // Amplitude(s) for diagram number 351
-      FFV1_0( w_fp[86], w_fp[15], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 351 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 352 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 352
-      // (none)
-
-      // Amplitude(s) for diagram number 352
-      FFV1_0( w_fp[99], w_fp[18], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 352 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 353 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 353
-      // (none)
-
-      // Amplitude(s) for diagram number 353
-      FFV1_0( w_fp[99], w_fp[2], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 353 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[65] += amp_sv[0];
-      jamp_sv[71] -= amp_sv[0];
-      jamp_sv[95] -= amp_sv[0];
-      jamp_sv[119] += amp_sv[0];
-
-      // *** DIAGRAM 354 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 354
-      // (none)
-
-      // Amplitude(s) for diagram number 354
-      VVV1_0( w_fp[92], w_fp[67], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 354 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 355 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 355
-      // (none)
-
-      // Amplitude(s) for diagram number 355
-      VVV1_0( w_fp[92], w_fp[1], w_fp[19], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 355 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 356 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 356
-      // (none)
-
-      // Amplitude(s) for diagram number 356
-      VVVV1_0( w_fp[1], w_fp[4], w_fp[29], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
-      VVVV3_0( w_fp[1], w_fp[4], w_fp[29], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
-      VVVV4_0( w_fp[1], w_fp[4], w_fp[29], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 357 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 357
-      // (none)
-
-      // Amplitude(s) for diagram number 357
-      FFV1_0( w_fp[34], w_fp[2], w_fp[67], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 357 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[41] += amp_sv[0];
-      jamp_sv[47] -= amp_sv[0];
-      jamp_sv[93] -= amp_sv[0];
-      jamp_sv[117] += amp_sv[0];
-
-      // *** DIAGRAM 358 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 358
-      // (none)
-
-      // Amplitude(s) for diagram number 358
-      FFV1_0( w_fp[34], w_fp[18], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 358 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 359 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 359
-      // (none)
-
-      // Amplitude(s) for diagram number 359
-      VVV1_0( w_fp[73], w_fp[6], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
-      jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
-      VVV1_0( w_fp[79], w_fp[6], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
-      jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0];
-      VVV1_0( w_fp[80], w_fp[6], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
-      jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 360 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 360
-      // (none)
-
-      // Amplitude(s) for diagram number 360
-      FFV1_0( w_fp[88], w_fp[2], w_fp[73], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[33] += amp_sv[0];
-      jamp_sv[39] -= amp_sv[0];
-      jamp_sv[63] -= amp_sv[0];
-      jamp_sv[87] += amp_sv[0];
-      FFV1_0( w_fp[88], w_fp[2], w_fp[79], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[39] -= amp_sv[0];
-      jamp_sv[57] += amp_sv[0];
-      jamp_sv[63] -= amp_sv[0];
-      jamp_sv[81] += amp_sv[0];
-      FFV1_0( w_fp[88], w_fp[2], w_fp[80], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[33] -= amp_sv[0];
-      jamp_sv[57] += amp_sv[0];
-      jamp_sv[81] += amp_sv[0];
-      jamp_sv[87] -= amp_sv[0];
-
-      // *** DIAGRAM 361 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 361
-      // (none)
-
-      // Amplitude(s) for diagram number 361
-      FFV1_0( w_fp[62], w_fp[47], w_fp[73], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[105] += amp_sv[0];
-      jamp_sv[107] -= amp_sv[0];
-      jamp_sv[113] -= amp_sv[0];
-      jamp_sv[119] += amp_sv[0];
-      FFV1_0( w_fp[62], w_fp[47], w_fp[79], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[107] -= amp_sv[0];
-      jamp_sv[111] += amp_sv[0];
-      jamp_sv[113] -= amp_sv[0];
-      jamp_sv[117] += amp_sv[0];
-      FFV1_0( w_fp[62], w_fp[47], w_fp[80], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[105] -= amp_sv[0];
-      jamp_sv[111] += amp_sv[0];
-      jamp_sv[117] += amp_sv[0];
-      jamp_sv[119] -= amp_sv[0];
-
-      // *** DIAGRAM 362 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 362
-      // (none)
-
-      // Amplitude(s) for diagram number 362
-      VVV1_0( w_fp[57], w_fp[5], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
-      jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
-      VVV1_0( w_fp[81], w_fp[5], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
-      jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0];
-      VVV1_0( w_fp[82], w_fp[5], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
-      jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 363 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 363
-      // (none)
-
-      // Amplitude(s) for diagram number 363
-      FFV1_0( w_fp[86], w_fp[2], w_fp[57], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[35] += amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[69] -= amp_sv[0];
-      jamp_sv[111] += amp_sv[0];
-      FFV1_0( w_fp[86], w_fp[2], w_fp[81], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[59] += amp_sv[0];
-      jamp_sv[69] -= amp_sv[0];
-      jamp_sv[105] += amp_sv[0];
-      FFV1_0( w_fp[86], w_fp[2], w_fp[82], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[35] -= amp_sv[0];
-      jamp_sv[59] += amp_sv[0];
-      jamp_sv[105] += amp_sv[0];
-      jamp_sv[111] -= amp_sv[0];
-
-      // *** DIAGRAM 364 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 364
-      // (none)
-
-      // Amplitude(s) for diagram number 364
-      FFV1_0( w_fp[62], w_fp[39], w_fp[57], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[81] += amp_sv[0];
-      jamp_sv[83] -= amp_sv[0];
-      jamp_sv[89] -= amp_sv[0];
-      jamp_sv[95] += amp_sv[0];
-      FFV1_0( w_fp[62], w_fp[39], w_fp[81], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[83] -= amp_sv[0];
-      jamp_sv[87] += amp_sv[0];
-      jamp_sv[89] -= amp_sv[0];
-      jamp_sv[93] += amp_sv[0];
-      FFV1_0( w_fp[62], w_fp[39], w_fp[82], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[81] -= amp_sv[0];
-      jamp_sv[87] += amp_sv[0];
-      jamp_sv[93] += amp_sv[0];
-      jamp_sv[95] -= amp_sv[0];
-
-      // *** DIAGRAM 365 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 365
-      // (none)
-
-      // Amplitude(s) for diagram number 365
-      VVV1_0( w_fp[55], w_fp[4], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
-      jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
-      VVV1_0( w_fp[83], w_fp[4], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
-      jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
-      VVV1_0( w_fp[84], w_fp[4], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
-      jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 366 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 366
-      // (none)
-
-      // Amplitude(s) for diagram number 366
-      FFV1_0( w_fp[34], w_fp[2], w_fp[55], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[41] += amp_sv[0];
-      jamp_sv[47] -= amp_sv[0];
-      jamp_sv[93] -= amp_sv[0];
-      jamp_sv[117] += amp_sv[0];
-      FFV1_0( w_fp[34], w_fp[2], w_fp[83], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[47] -= amp_sv[0];
-      jamp_sv[83] += amp_sv[0];
-      jamp_sv[93] -= amp_sv[0];
-      jamp_sv[107] += amp_sv[0];
-      FFV1_0( w_fp[34], w_fp[2], w_fp[84], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[41] -= amp_sv[0];
-      jamp_sv[83] += amp_sv[0];
-      jamp_sv[107] += amp_sv[0];
-      jamp_sv[117] -= amp_sv[0];
-
-      // *** DIAGRAM 367 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 367
-      // (none)
-
-      // Amplitude(s) for diagram number 367
-      FFV1_0( w_fp[62], w_fp[33], w_fp[55], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[57] += amp_sv[0];
-      jamp_sv[59] -= amp_sv[0];
-      jamp_sv[65] -= amp_sv[0];
-      jamp_sv[71] += amp_sv[0];
-      FFV1_0( w_fp[62], w_fp[33], w_fp[83], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[59] -= amp_sv[0];
-      jamp_sv[63] += amp_sv[0];
-      jamp_sv[65] -= amp_sv[0];
-      jamp_sv[69] += amp_sv[0];
-      FFV1_0( w_fp[62], w_fp[33], w_fp[84], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[57] -= amp_sv[0];
-      jamp_sv[63] += amp_sv[0];
-      jamp_sv[69] += amp_sv[0];
-      jamp_sv[71] -= amp_sv[0];
-
-      // *** DIAGRAM 368 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 368
-      // (none)
-
-      // Amplitude(s) for diagram number 368
-      FFV1_0( w_fp[99], w_fp[2], w_fp[30], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[65] += amp_sv[0];
-      jamp_sv[71] -= amp_sv[0];
-      jamp_sv[95] -= amp_sv[0];
-      jamp_sv[119] += amp_sv[0];
-      FFV1_0( w_fp[99], w_fp[2], w_fp[31], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[71] -= amp_sv[0];
-      jamp_sv[89] += amp_sv[0];
-      jamp_sv[95] -= amp_sv[0];
-      jamp_sv[113] += amp_sv[0];
-      FFV1_0( w_fp[99], w_fp[2], w_fp[32], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[65] -= amp_sv[0];
-      jamp_sv[89] += amp_sv[0];
-      jamp_sv[113] += amp_sv[0];
-      jamp_sv[119] -= amp_sv[0];
-
-      // *** DIAGRAM 369 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 369
-      // (none)
-
-      // Amplitude(s) for diagram number 369
-      VVV1_0( w_fp[1], w_fp[30], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
-      jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
-      VVV1_0( w_fp[1], w_fp[31], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
-      jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0];
-      VVV1_0( w_fp[1], w_fp[32], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
-      jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 370 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 370
-      VVV1P0_1( w_fp[0], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[92] );
-      FFV1_2( w_fp[3], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] );
-
-      // Amplitude(s) for diagram number 370
-      FFV1_0( w_fp[99], w_fp[9], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 370 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 371 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 371
-      // (none)
-
-      // Amplitude(s) for diagram number 371
-      FFV1_0( w_fp[99], w_fp[85], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 371 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 372 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 372
-      VVV1P0_1( w_fp[92], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[62] );
-      FFV1P0_3( w_fp[3], w_fp[77], COUPs[1], 1.0, 0., 0., w_fp[34] );
-
-      // Amplitude(s) for diagram number 372
-      VVV1_0( w_fp[62], w_fp[34], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 372 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 373 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 373
-      // (none)
-
-      // Amplitude(s) for diagram number 373
-      FFV1_0( w_fp[3], w_fp[85], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 373 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[42] += amp_sv[0];
-      jamp_sv[44] -= amp_sv[0];
-      jamp_sv[46] -= amp_sv[0];
-      jamp_sv[47] += amp_sv[0];
-
-      // *** DIAGRAM 374 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 374
-      VVV1P0_1( w_fp[92], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[86] );
-
-      // Amplitude(s) for diagram number 374
-      VVV1_0( w_fp[86], w_fp[34], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 374 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 375 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 375
-      // (none)
-
-      // Amplitude(s) for diagram number 375
-      FFV1_0( w_fp[3], w_fp[9], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 375 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[36] += amp_sv[0];
-      jamp_sv[38] -= amp_sv[0];
-      jamp_sv[40] -= amp_sv[0];
-      jamp_sv[41] += amp_sv[0];
-
-      // *** DIAGRAM 376 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 376
-      VVVV1P0_1( w_fp[92], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[88] );
-      VVVV3P0_1( w_fp[92], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[90] );
-      VVVV4P0_1( w_fp[92], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[96] );
-
-      // Amplitude(s) for diagram number 376
-      FFV1_0( w_fp[3], w_fp[77], w_fp[88], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[3], w_fp[77], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[3], w_fp[77], w_fp[96], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 377 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 377
-      FFV1_1( w_fp[77], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[95] );
-
-      // Amplitude(s) for diagram number 377
-      FFV1_0( w_fp[38], w_fp[95], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 377 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 378 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 378
-      FFV1_2( w_fp[38], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[98] );
-
-      // Amplitude(s) for diagram number 378
-      FFV1_0( w_fp[98], w_fp[77], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 378 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 379 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 379
-      // (none)
-
-      // Amplitude(s) for diagram number 379
-      FFV1_0( w_fp[38], w_fp[77], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 379 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[25] += amp_sv[0];
-      jamp_sv[31] -= amp_sv[0];
-      jamp_sv[42] -= amp_sv[0];
-      jamp_sv[44] += amp_sv[0];
-
-      // *** DIAGRAM 380 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 380
-      // (none)
-
-      // Amplitude(s) for diagram number 380
-      FFV1_0( w_fp[41], w_fp[95], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 380 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 381 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 381
-      FFV1_2( w_fp[41], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[101] );
-
-      // Amplitude(s) for diagram number 381
-      FFV1_0( w_fp[101], w_fp[77], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 381 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 382 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 382
-      // (none)
-
-      // Amplitude(s) for diagram number 382
-      FFV1_0( w_fp[41], w_fp[77], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 382 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[24] += amp_sv[0];
-      jamp_sv[30] -= amp_sv[0];
-      jamp_sv[36] -= amp_sv[0];
-      jamp_sv[38] += amp_sv[0];
-
-      // *** DIAGRAM 383 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 383
-      // (none)
-
-      // Amplitude(s) for diagram number 383
-      FFV1_0( w_fp[3], w_fp[95], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 383 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[24] += amp_sv[0];
-      jamp_sv[25] -= amp_sv[0];
-      jamp_sv[30] -= amp_sv[0];
-      jamp_sv[31] += amp_sv[0];
-
-      // *** DIAGRAM 384 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 384
-      // (none)
-
-      // Amplitude(s) for diagram number 384
-      FFV1_0( w_fp[99], w_fp[77], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 384 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[40] += amp_sv[0];
-      jamp_sv[41] -= amp_sv[0];
-      jamp_sv[46] -= amp_sv[0];
-      jamp_sv[47] += amp_sv[0];
-
-      // *** DIAGRAM 385 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 385
-      VVV1P0_1( w_fp[92], w_fp[29], COUPs[0], 1.0, 0., 0., w_fp[95] );
-
-      // Amplitude(s) for diagram number 385
-      FFV1_0( w_fp[3], w_fp[77], w_fp[95], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 385 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 386 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 386
-      FFV1_1( w_fp[2], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[102] );
-
-      // Amplitude(s) for diagram number 386
-      FFV1_0( w_fp[22], w_fp[102], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 386 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 387 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 387
-      // (none)
-
-      // Amplitude(s) for diagram number 387
-      FFV1_0( w_fp[21], w_fp[102], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 387 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 388 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 388
-      FFV1P0_3( w_fp[52], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[103] );
-
-      // Amplitude(s) for diagram number 388
-      VVV1_0( w_fp[62], w_fp[103], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 388 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 389 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 389
-      // (none)
-
-      // Amplitude(s) for diagram number 389
-      FFV1_0( w_fp[21], w_fp[2], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 389 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[9] += amp_sv[0];
-      jamp_sv[51] -= amp_sv[0];
-      jamp_sv[75] -= amp_sv[0];
-      jamp_sv[85] += amp_sv[0];
-
-      // *** DIAGRAM 390 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 390
-      // (none)
-
-      // Amplitude(s) for diagram number 390
-      VVV1_0( w_fp[86], w_fp[103], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 390 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 391 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 391
-      // (none)
-
-      // Amplitude(s) for diagram number 391
-      FFV1_0( w_fp[22], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 391 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[11] += amp_sv[0];
-      jamp_sv[53] -= amp_sv[0];
-      jamp_sv[99] -= amp_sv[0];
-      jamp_sv[109] += amp_sv[0];
-
-      // *** DIAGRAM 392 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 392
-      // (none)
-
-      // Amplitude(s) for diagram number 392
-      FFV1_0( w_fp[52], w_fp[2], w_fp[88], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[52], w_fp[2], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[52], w_fp[2], w_fp[96], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 393 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 393
-      FFV1_2( w_fp[52], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[104] );
-
-      // Amplitude(s) for diagram number 393
-      FFV1_0( w_fp[104], w_fp[39], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 393 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 394 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 394
-      FFV1_1( w_fp[39], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[105] );
-
-      // Amplitude(s) for diagram number 394
-      FFV1_0( w_fp[52], w_fp[105], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 394 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 395 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 395
-      // (none)
-
-      // Amplitude(s) for diagram number 395
-      FFV1_0( w_fp[52], w_fp[39], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 395 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[75] += amp_sv[0];
-      jamp_sv[85] -= amp_sv[0];
-      jamp_sv[91] -= amp_sv[0];
-      jamp_sv[94] += amp_sv[0];
-
-      // *** DIAGRAM 396 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 396
-      // (none)
-
-      // Amplitude(s) for diagram number 396
-      FFV1_0( w_fp[104], w_fp[47], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 396 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 397 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 397
-      FFV1_1( w_fp[47], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[106] );
-
-      // Amplitude(s) for diagram number 397
-      FFV1_0( w_fp[52], w_fp[106], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 397 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 398 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 398
-      // (none)
-
-      // Amplitude(s) for diagram number 398
-      FFV1_0( w_fp[52], w_fp[47], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 398 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[99] += amp_sv[0];
-      jamp_sv[109] -= amp_sv[0];
-      jamp_sv[115] -= amp_sv[0];
-      jamp_sv[118] += amp_sv[0];
-
-      // *** DIAGRAM 399 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 399
-      // (none)
-
-      // Amplitude(s) for diagram number 399
-      FFV1_0( w_fp[104], w_fp[2], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 399 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[91] += amp_sv[0];
-      jamp_sv[94] -= amp_sv[0];
-      jamp_sv[115] -= amp_sv[0];
-      jamp_sv[118] += amp_sv[0];
-
-      // *** DIAGRAM 400 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 400
-      // (none)
-
-      // Amplitude(s) for diagram number 400
-      FFV1_0( w_fp[52], w_fp[102], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 400 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[9] += amp_sv[0];
-      jamp_sv[11] -= amp_sv[0];
-      jamp_sv[51] -= amp_sv[0];
-      jamp_sv[53] += amp_sv[0];
-
-      // *** DIAGRAM 401 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 401
-      // (none)
-
-      // Amplitude(s) for diagram number 401
-      FFV1_0( w_fp[52], w_fp[2], w_fp[95], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 401 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 402 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 402
-      // (none)
-
-      // Amplitude(s) for diagram number 402
-      FFV1_0( w_fp[71], w_fp[102], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 402 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[10] += amp_sv[0];
-      jamp_sv[11] -= amp_sv[0];
-      jamp_sv[52] -= amp_sv[0];
-      jamp_sv[53] += amp_sv[0];
-
-      // *** DIAGRAM 403 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 403
-      // (none)
-
-      // Amplitude(s) for diagram number 403
-      FFV1_0( w_fp[3], w_fp[102], w_fp[70], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 403 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 404 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 404
-      // (none)
-
-      // Amplitude(s) for diagram number 404
-      FFV1_0( w_fp[99], w_fp[94], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 404 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[40] += amp_sv[0];
-      jamp_sv[41] -= amp_sv[0];
-      jamp_sv[82] -= amp_sv[0];
-      jamp_sv[83] += amp_sv[0];
-
-      // *** DIAGRAM 405 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 405
-      // (none)
-
-      // Amplitude(s) for diagram number 405
-      FFV1_0( w_fp[99], w_fp[2], w_fp[70], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 405 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 406 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 406
-      // (none)
-
-      // Amplitude(s) for diagram number 406
-      FFV1_0( w_fp[3], w_fp[94], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 406 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 407 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 407
-      // (none)
-
-      // Amplitude(s) for diagram number 407
-      FFV1_0( w_fp[71], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 407 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 408 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 408
-      // (none)
-
-      // Amplitude(s) for diagram number 408
-      VVVV1_0( w_fp[92], w_fp[66], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[6] += amp_sv[0];
-      jamp_sv[8] -= amp_sv[0];
-      jamp_sv[36] -= amp_sv[0];
-      jamp_sv[38] += amp_sv[0];
-      jamp_sv[48] -= amp_sv[0];
-      jamp_sv[50] += amp_sv[0];
-      jamp_sv[78] += amp_sv[0];
-      jamp_sv[80] -= amp_sv[0];
-      jamp_sv[98] -= amp_sv[0];
-      jamp_sv[99] += amp_sv[0];
-      jamp_sv[106] += amp_sv[0];
-      jamp_sv[107] -= amp_sv[0];
-      jamp_sv[108] += amp_sv[0];
-      jamp_sv[109] -= amp_sv[0];
-      jamp_sv[116] -= amp_sv[0];
-      jamp_sv[117] += amp_sv[0];
-      VVVV3_0( w_fp[92], w_fp[66], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[6] += amp_sv[0];
-      jamp_sv[8] -= amp_sv[0];
-      jamp_sv[10] -= amp_sv[0];
-      jamp_sv[11] += amp_sv[0];
-      jamp_sv[40] -= amp_sv[0];
-      jamp_sv[41] += amp_sv[0];
-      jamp_sv[48] -= amp_sv[0];
-      jamp_sv[50] += amp_sv[0];
-      jamp_sv[52] += amp_sv[0];
-      jamp_sv[53] -= amp_sv[0];
-      jamp_sv[82] += amp_sv[0];
-      jamp_sv[83] -= amp_sv[0];
-      jamp_sv[106] += amp_sv[0];
-      jamp_sv[107] -= amp_sv[0];
-      jamp_sv[116] -= amp_sv[0];
-      jamp_sv[117] += amp_sv[0];
-      VVVV4_0( w_fp[92], w_fp[66], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[10] -= amp_sv[0];
-      jamp_sv[11] += amp_sv[0];
-      jamp_sv[36] += amp_sv[0];
-      jamp_sv[38] -= amp_sv[0];
-      jamp_sv[40] -= amp_sv[0];
-      jamp_sv[41] += amp_sv[0];
-      jamp_sv[52] += amp_sv[0];
-      jamp_sv[53] -= amp_sv[0];
-      jamp_sv[78] -= amp_sv[0];
-      jamp_sv[80] += amp_sv[0];
-      jamp_sv[82] += amp_sv[0];
-      jamp_sv[83] -= amp_sv[0];
-      jamp_sv[98] += amp_sv[0];
-      jamp_sv[99] -= amp_sv[0];
-      jamp_sv[108] -= amp_sv[0];
-      jamp_sv[109] += amp_sv[0];
-
-      // *** DIAGRAM 409 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 409
-      VVV1P0_1( w_fp[92], w_fp[66], COUPs[0], 1.0, 0., 0., w_fp[104] );
-
-      // Amplitude(s) for diagram number 409
-      VVV1_0( w_fp[8], w_fp[6], w_fp[104], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 409 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[6] += amp_sv[0];
-      jamp_sv[8] -= amp_sv[0];
-      jamp_sv[36] -= amp_sv[0];
-      jamp_sv[38] += amp_sv[0];
-      jamp_sv[48] -= amp_sv[0];
-      jamp_sv[50] += amp_sv[0];
-      jamp_sv[78] += amp_sv[0];
-      jamp_sv[80] -= amp_sv[0];
-      jamp_sv[98] -= amp_sv[0];
-      jamp_sv[99] += amp_sv[0];
-      jamp_sv[106] += amp_sv[0];
-      jamp_sv[107] -= amp_sv[0];
-      jamp_sv[108] += amp_sv[0];
-      jamp_sv[109] -= amp_sv[0];
-      jamp_sv[116] -= amp_sv[0];
-      jamp_sv[117] += amp_sv[0];
-
-      // *** DIAGRAM 410 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 410
-      VVV1P0_1( w_fp[92], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[107] );
-
-      // Amplitude(s) for diagram number 410
-      VVV1_0( w_fp[66], w_fp[6], w_fp[107], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 410 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[6] += amp_sv[0];
-      jamp_sv[8] -= amp_sv[0];
-      jamp_sv[10] -= amp_sv[0];
-      jamp_sv[11] += amp_sv[0];
-      jamp_sv[40] -= amp_sv[0];
-      jamp_sv[41] += amp_sv[0];
-      jamp_sv[48] -= amp_sv[0];
-      jamp_sv[50] += amp_sv[0];
-      jamp_sv[52] += amp_sv[0];
-      jamp_sv[53] -= amp_sv[0];
-      jamp_sv[82] += amp_sv[0];
-      jamp_sv[83] -= amp_sv[0];
-      jamp_sv[106] += amp_sv[0];
-      jamp_sv[107] -= amp_sv[0];
-      jamp_sv[116] -= amp_sv[0];
-      jamp_sv[117] += amp_sv[0];
-
-      // *** DIAGRAM 411 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 411
-      // (none)
-
-      // Amplitude(s) for diagram number 411
-      VVV1_0( w_fp[66], w_fp[8], w_fp[86], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 411 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[10] -= amp_sv[0];
-      jamp_sv[11] += amp_sv[0];
-      jamp_sv[36] += amp_sv[0];
-      jamp_sv[38] -= amp_sv[0];
-      jamp_sv[40] -= amp_sv[0];
-      jamp_sv[41] += amp_sv[0];
-      jamp_sv[52] += amp_sv[0];
-      jamp_sv[53] -= amp_sv[0];
-      jamp_sv[78] -= amp_sv[0];
-      jamp_sv[80] += amp_sv[0];
-      jamp_sv[82] += amp_sv[0];
-      jamp_sv[83] -= amp_sv[0];
-      jamp_sv[98] += amp_sv[0];
-      jamp_sv[99] -= amp_sv[0];
-      jamp_sv[108] -= amp_sv[0];
-      jamp_sv[109] += amp_sv[0];
-
-      // *** DIAGRAM 412 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 412
-      // (none)
-
-      // Amplitude(s) for diagram number 412
-      FFV1_0( w_fp[3], w_fp[47], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 412 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 413 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 413
-      // (none)
-
-      // Amplitude(s) for diagram number 413
-      FFV1_0( w_fp[3], w_fp[106], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 413 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[98] += amp_sv[0];
-      jamp_sv[99] -= amp_sv[0];
-      jamp_sv[108] -= amp_sv[0];
-      jamp_sv[109] += amp_sv[0];
-
-      // *** DIAGRAM 414 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 414
-      // (none)
-
-      // Amplitude(s) for diagram number 414
-      FFV1_0( w_fp[99], w_fp[47], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 414 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[106] += amp_sv[0];
-      jamp_sv[107] -= amp_sv[0];
-      jamp_sv[116] -= amp_sv[0];
-      jamp_sv[117] += amp_sv[0];
-
-      // *** DIAGRAM 415 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 415
-      // (none)
-
-      // Amplitude(s) for diagram number 415
-      FFV1_0( w_fp[41], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 415 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 416 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 416
-      // (none)
-
-      // Amplitude(s) for diagram number 416
-      FFV1_0( w_fp[41], w_fp[102], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 416 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[6] += amp_sv[0];
-      jamp_sv[8] -= amp_sv[0];
-      jamp_sv[48] -= amp_sv[0];
-      jamp_sv[50] += amp_sv[0];
-
-      // *** DIAGRAM 417 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 417
-      // (none)
-
-      // Amplitude(s) for diagram number 417
-      FFV1_0( w_fp[101], w_fp[2], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 417 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[36] += amp_sv[0];
-      jamp_sv[38] -= amp_sv[0];
-      jamp_sv[78] -= amp_sv[0];
-      jamp_sv[80] += amp_sv[0];
-
-      // *** DIAGRAM 418 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 418
-      // (none)
-
-      // Amplitude(s) for diagram number 418
-      FFV1_0( w_fp[76], w_fp[102], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 418 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[8] += amp_sv[0];
-      jamp_sv[9] -= amp_sv[0];
-      jamp_sv[50] -= amp_sv[0];
-      jamp_sv[51] += amp_sv[0];
-
-      // *** DIAGRAM 419 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 419
-      // (none)
-
-      // Amplitude(s) for diagram number 419
-      FFV1_0( w_fp[3], w_fp[102], w_fp[75], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 419 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 420 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 420
-      // (none)
-
-      // Amplitude(s) for diagram number 420
-      FFV1_0( w_fp[99], w_fp[97], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 420 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[46] += amp_sv[0];
-      jamp_sv[47] -= amp_sv[0];
-      jamp_sv[106] -= amp_sv[0];
-      jamp_sv[107] += amp_sv[0];
-
-      // *** DIAGRAM 421 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 421
-      // (none)
-
-      // Amplitude(s) for diagram number 421
-      FFV1_0( w_fp[99], w_fp[2], w_fp[75], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 421 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 422 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 422
-      // (none)
-
-      // Amplitude(s) for diagram number 422
-      FFV1_0( w_fp[3], w_fp[97], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 422 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 423 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 423
-      // (none)
-
-      // Amplitude(s) for diagram number 423
-      FFV1_0( w_fp[76], w_fp[2], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 423 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 424 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 424
-      // (none)
-
-      // Amplitude(s) for diagram number 424
-      VVVV1_0( w_fp[92], w_fp[72], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[7] += amp_sv[0];
-      jamp_sv[10] -= amp_sv[0];
-      jamp_sv[42] -= amp_sv[0];
-      jamp_sv[44] += amp_sv[0];
-      jamp_sv[49] -= amp_sv[0];
-      jamp_sv[52] += amp_sv[0];
-      jamp_sv[74] -= amp_sv[0];
-      jamp_sv[75] += amp_sv[0];
-      jamp_sv[82] += amp_sv[0];
-      jamp_sv[83] -= amp_sv[0];
-      jamp_sv[84] += amp_sv[0];
-      jamp_sv[85] -= amp_sv[0];
-      jamp_sv[92] -= amp_sv[0];
-      jamp_sv[93] += amp_sv[0];
-      jamp_sv[102] += amp_sv[0];
-      jamp_sv[104] -= amp_sv[0];
-      VVVV3_0( w_fp[92], w_fp[72], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[7] += amp_sv[0];
-      jamp_sv[8] -= amp_sv[0];
-      jamp_sv[9] += amp_sv[0];
-      jamp_sv[10] -= amp_sv[0];
-      jamp_sv[46] -= amp_sv[0];
-      jamp_sv[47] += amp_sv[0];
-      jamp_sv[49] -= amp_sv[0];
-      jamp_sv[50] += amp_sv[0];
-      jamp_sv[51] -= amp_sv[0];
-      jamp_sv[52] += amp_sv[0];
-      jamp_sv[82] += amp_sv[0];
-      jamp_sv[83] -= amp_sv[0];
-      jamp_sv[92] -= amp_sv[0];
-      jamp_sv[93] += amp_sv[0];
-      jamp_sv[106] += amp_sv[0];
-      jamp_sv[107] -= amp_sv[0];
-      VVVV4_0( w_fp[92], w_fp[72], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[8] -= amp_sv[0];
-      jamp_sv[9] += amp_sv[0];
-      jamp_sv[42] += amp_sv[0];
-      jamp_sv[44] -= amp_sv[0];
-      jamp_sv[46] -= amp_sv[0];
-      jamp_sv[47] += amp_sv[0];
-      jamp_sv[50] += amp_sv[0];
-      jamp_sv[51] -= amp_sv[0];
-      jamp_sv[74] += amp_sv[0];
-      jamp_sv[75] -= amp_sv[0];
-      jamp_sv[84] -= amp_sv[0];
-      jamp_sv[85] += amp_sv[0];
-      jamp_sv[102] -= amp_sv[0];
-      jamp_sv[104] += amp_sv[0];
-      jamp_sv[106] += amp_sv[0];
-      jamp_sv[107] -= amp_sv[0];
-
-      // *** DIAGRAM 425 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 425
-      VVV1P0_1( w_fp[92], w_fp[72], COUPs[0], 1.0, 0., 0., w_fp[104] );
-
-      // Amplitude(s) for diagram number 425
-      VVV1_0( w_fp[8], w_fp[5], w_fp[104], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 425 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[7] += amp_sv[0];
-      jamp_sv[10] -= amp_sv[0];
-      jamp_sv[42] -= amp_sv[0];
-      jamp_sv[44] += amp_sv[0];
-      jamp_sv[49] -= amp_sv[0];
-      jamp_sv[52] += amp_sv[0];
-      jamp_sv[74] -= amp_sv[0];
-      jamp_sv[75] += amp_sv[0];
-      jamp_sv[82] += amp_sv[0];
-      jamp_sv[83] -= amp_sv[0];
-      jamp_sv[84] +=
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[92] -= amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[102] += amp_sv[0];
- jamp_sv[104] -= amp_sv[0];
-
- // *** DIAGRAM 426 OF 1240 ***
-
- // Wavefunction(s) for diagram number 426
- // (none)
-
- // Amplitude(s) for diagram number 426
- VVV1_0( w_fp[72], w_fp[5], w_fp[107], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 426 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[7] += amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[9] += amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[47] += amp_sv[0];
- jamp_sv[49] -= amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[92] -= amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
-
- // *** DIAGRAM 427 OF 1240 ***
-
- // Wavefunction(s) for diagram number 427
- // (none)
-
- // Amplitude(s) for diagram number 427
- VVV1_0( w_fp[72], w_fp[8], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 427 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[9] += amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[47] += amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
-
- // *** DIAGRAM 428 OF 1240 ***
-
- // Wavefunction(s) for diagram number 428
- // (none)
-
- // Amplitude(s) for diagram number 428
- FFV1_0( w_fp[3], w_fp[39], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 428 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 429 OF 1240 ***
-
- // Wavefunction(s) for diagram number 429
- // (none)
-
- // Amplitude(s) for diagram number 429
- FFV1_0( w_fp[3], w_fp[105], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 429 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[74] += amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
-
- // *** DIAGRAM 430 OF 1240 ***
-
- // Wavefunction(s) for diagram number 430
- // (none)
-
- // Amplitude(s) for diagram number 430
- FFV1_0( w_fp[99], w_fp[39], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 430 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[82] += amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[92] -= amp_sv[0];
- jamp_sv[93] += amp_sv[0];
-
- // *** DIAGRAM 431 OF 1240 ***
-
- // Wavefunction(s) for diagram number 431
- // (none)
-
- // Amplitude(s) for diagram number 431
- FFV1_0( w_fp[38], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 431 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 432 OF 1240 ***
-
- // Wavefunction(s) for diagram number 432
- // (none)
-
- // Amplitude(s) for diagram number 432
- FFV1_0( w_fp[38], w_fp[102], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 432 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[7] += amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[49] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
-
- // *** DIAGRAM 433 OF 1240 ***
-
- // Wavefunction(s) for diagram number 433
- // (none)
-
- // Amplitude(s) for diagram number 433
- FFV1_0( w_fp[98], w_fp[2], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 433 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[42] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
-
- // *** DIAGRAM 434 OF 1240 ***
-
- // Wavefunction(s) for diagram number 434
- VVV1P0_1( w_fp[92], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[104] );
-
- // Amplitude(s) for diagram number 434
- VVV1_0( w_fp[104], w_fp[10], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 434 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[25] += amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[49] += amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- jamp_sv[93] -= amp_sv[0];
- jamp_sv[94] += amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
-
- // *** DIAGRAM 435 OF 1240 ***
-
- // Wavefunction(s) for diagram number 435
- // (none)
-
- // Amplitude(s) for diagram number 435
- VVV1_0( w_fp[104], w_fp[11], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 435 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[48] += amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[116] += amp_sv[0];
- jamp_sv[117] -= amp_sv[0];
- jamp_sv[118] += amp_sv[0];
-
- // *** DIAGRAM 436 OF 1240 ***
-
- // Wavefunction(s) for diagram number 436
- // (none)
-
- // Amplitude(s) for diagram number 436
- VVVV1_0( w_fp[8], w_fp[5], w_fp[6], w_fp[104], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[25] += amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[49] += amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- jamp_sv[93] -= amp_sv[0];
- jamp_sv[94] += amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
- VVVV3_0( w_fp[8], w_fp[5], w_fp[6], w_fp[104], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[48] += amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[116] += amp_sv[0];
- jamp_sv[117] -= amp_sv[0];
- jamp_sv[118] += amp_sv[0];
- VVVV4_0( w_fp[8], w_fp[5], w_fp[6], w_fp[104], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[25] -= amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[48] += amp_sv[0];
- jamp_sv[49] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[92] -= amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[94] -= amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[116] += amp_sv[0];
- jamp_sv[117] -= amp_sv[0];
- jamp_sv[118] += amp_sv[0];
-
- // *** DIAGRAM 437 OF 1240 ***
-
- // Wavefunction(s) for diagram number 437
- VVV1P0_1( w_fp[1], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[108] );
-
- // Amplitude(s) for diagram number 437
- VVV1_0( w_fp[62], w_fp[108], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 437 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[9] += amp_sv[0];
- jamp_sv[24] -= amp_sv[0];
- jamp_sv[30] += amp_sv[0];
- jamp_sv[36] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[47] += amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[109] += amp_sv[0];
- jamp_sv[115] += amp_sv[0];
- jamp_sv[118] -= amp_sv[0];
-
- // *** DIAGRAM 438 OF 1240 ***
-
- // Wavefunction(s) for diagram number 438
- // (none)
-
- // Amplitude(s) for diagram number 438
- VVV1_0( w_fp[62], w_fp[1], w_fp[11], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 438 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[8] += amp_sv[0];
- jamp_sv[24] -= amp_sv[0];
- jamp_sv[30] += amp_sv[0];
- jamp_sv[36] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[50] -= amp_sv[0];
- jamp_sv[74] -= amp_sv[0];
- jamp_sv[84] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[102] += amp_sv[0];
- jamp_sv[104] -= amp_sv[0];
- jamp_sv[106] -= amp_sv[0];
- jamp_sv[107] += amp_sv[0];
- jamp_sv[109] += amp_sv[0];
- jamp_sv[115] += amp_sv[0];
- jamp_sv[118] -= amp_sv[0];
-
- // *** DIAGRAM 439 OF 1240 ***
-
- // Wavefunction(s) for diagram number 439
- // (none)
-
- // Amplitude(s) for diagram number 439
- VVVV1_0( w_fp[1], w_fp[8], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[9] += amp_sv[0];
- jamp_sv[24] -= amp_sv[0];
- jamp_sv[30] += amp_sv[0];
- jamp_sv[36] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[47] += amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[109] += amp_sv[0];
- jamp_sv[115] += amp_sv[0];
- jamp_sv[118] -= amp_sv[0];
- VVVV3_0( w_fp[1], w_fp[8], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[9] += amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[47] += amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- VVVV4_0( w_fp[1], w_fp[8], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[36] -= amp_sv[0];
- jamp_sv[38] += amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[99] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- jamp_sv[109] -= amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[118] += amp_sv[0];
-
- // *** DIAGRAM 440 OF 1240 ***
-
- // Wavefunction(s) for diagram number 440
- // (none)
-
- // Amplitude(s) for diagram number 440
- VVV1_0( w_fp[86], w_fp[108], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 440 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[11] += amp_sv[0];
- jamp_sv[25] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[36] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[94] -= amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[109] += amp_sv[0];
-
- // *** DIAGRAM 441 OF 1240 ***
-
- // Wavefunction(s) for diagram number 441
- // (none)
-
- // Amplitude(s) for diagram number 441
- VVV1_0( w_fp[86], w_fp[1], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 441 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[10] += amp_sv[0];
- jamp_sv[25] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[52] -= amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[78] += amp_sv[0];
- jamp_sv[80] -= amp_sv[0];
- jamp_sv[82] -= amp_sv[0];
- jamp_sv[83] += amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[94] -= amp_sv[0];
- jamp_sv[98] -= amp_sv[0];
- jamp_sv[108] += amp_sv[0];
-
- // *** DIAGRAM 442 OF 1240 ***
-
- // Wavefunction(s) for diagram number 442
- // (none)
-
- // Amplitude(s) for diagram number 442
- VVVV1_0( w_fp[1], w_fp[8], w_fp[5], w_fp[86], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[11] += amp_sv[0];
- jamp_sv[25] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[36] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[94] -= amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[109] += amp_sv[0];
- VVVV3_0( w_fp[1], w_fp[8], w_fp[5], w_fp[86], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[11] += amp_sv[0];
- jamp_sv[36] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
- jamp_sv[109] += amp_sv[0];
- VVVV4_0( w_fp[1], w_fp[8], w_fp[5], w_fp[86], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[25] += amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[44] += amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[75] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[94] += amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
-
- // *** DIAGRAM 443 OF 1240 ***
-
- // Wavefunction(s) for diagram number 443
- VVVV1P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[109] );
- VVVV3P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[110] );
- VVVV4P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[111] );
-
- // Amplitude(s) for diagram number 443
- VVV1_0( w_fp[8], w_fp[6], w_fp[109], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[6] += amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[36] -= amp_sv[0];
- jamp_sv[38] += amp_sv[0];
- jamp_sv[48] -= amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[78] += amp_sv[0];
- jamp_sv[80] -= amp_sv[0];
- jamp_sv[98] -= amp_sv[0];
- jamp_sv[99] += amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- jamp_sv[108] += amp_sv[0];
- jamp_sv[109] -= amp_sv[0];
- jamp_sv[116] -= amp_sv[0];
- jamp_sv[117] += amp_sv[0];
- VVV1_0( w_fp[8], w_fp[6], w_fp[110], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[36] -= amp_sv[0];
- jamp_sv[38] += amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[99] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- jamp_sv[109] -= amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[118] += amp_sv[0];
- VVV1_0( w_fp[8], w_fp[6], w_fp[111], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[48] += amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[116] += amp_sv[0];
- jamp_sv[117] -= amp_sv[0];
- jamp_sv[118] += amp_sv[0];
-
- // *** DIAGRAM 444 OF 1240 ***
-
- // Wavefunction(s) for diagram number 444
- VVVV1P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[112] );
- VVVV3P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[113] );
- VVVV4P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[114] );
-
- // Amplitude(s) for diagram number 444
- VVV1_0( w_fp[8], w_fp[5], w_fp[112], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[7] += amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[44] += amp_sv[0];
- jamp_sv[49] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[74] -= amp_sv[0];
- jamp_sv[75] += amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[84] += amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[92] -= amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[102] += amp_sv[0];
- jamp_sv[104] -= amp_sv[0];
- VVV1_0( w_fp[8], w_fp[5], w_fp[113], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[25] += amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[44] += amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[75] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[94] += amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
- VVV1_0( w_fp[8], w_fp[5], w_fp[114], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[25] += amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[49] += amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- jamp_sv[93] -= amp_sv[0];
- jamp_sv[94] += amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
-
- // *** DIAGRAM 445 OF 1240 ***
-
- // Wavefunction(s) for diagram number 445
- // (none)
-
- // Amplitude(s) for diagram number 445
- VVV1_0( w_fp[1], w_fp[8], w_fp[88], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[9] -= amp_sv[0];
- jamp_sv[11] += amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[25] -= amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- jamp_sv[46] += amp_sv[0];
- jamp_sv[47] -= amp_sv[0];
- jamp_sv[51] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[94] -= amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[118] += amp_sv[0];
- VVV1_0( w_fp[1], w_fp[8], w_fp[90], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[11] += amp_sv[0];
- jamp_sv[25] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[36] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[94] -= amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[109] += amp_sv[0];
- VVV1_0( w_fp[1], w_fp[8], w_fp[96], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[9] += amp_sv[0];
- jamp_sv[24] -= amp_sv[0];
- jamp_sv[30] += amp_sv[0];
- jamp_sv[36] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[47] += amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[109] += amp_sv[0];
- jamp_sv[115] += amp_sv[0];
- jamp_sv[118] -= amp_sv[0];
-
- // *** DIAGRAM 446 OF 1240 ***
-
- // Wavefunction(s) for diagram number 446
- // (none)
-
- // Amplitude(s) for diagram number 446
- VVVV1_0( w_fp[92], w_fp[1], w_fp[8], w_fp[29], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[6] += amp_sv[0];
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[24] -= amp_sv[0];
- jamp_sv[25] += amp_sv[0];
- jamp_sv[30] += amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[48] -= amp_sv[0];
- jamp_sv[49] += amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- jamp_sv[93] -= amp_sv[0];
- jamp_sv[94] += amp_sv[0];
- jamp_sv[115] += amp_sv[0];
- jamp_sv[116] -= amp_sv[0];
- jamp_sv[117] += amp_sv[0];
- jamp_sv[118] -= amp_sv[0];
- VVVV3_0( w_fp[92], w_fp[1], w_fp[8], w_fp[29], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[6] += amp_sv[0];
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[9] -= amp_sv[0];
- jamp_sv[11] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- jamp_sv[46] += amp_sv[0];
- jamp_sv[47] -= amp_sv[0];
- jamp_sv[48] -= amp_sv[0];
- jamp_sv[49] += amp_sv[0];
- jamp_sv[51] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- jamp_sv[93] -= amp_sv[0];
- jamp_sv[116] -= amp_sv[0];
- jamp_sv[117] += amp_sv[0];
- VVVV4_0( w_fp[92], w_fp[1], w_fp[8], w_fp[29], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[9] -= amp_sv[0];
- jamp_sv[11] += amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[25] -= amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- jamp_sv[46] += amp_sv[0];
- jamp_sv[47] -= amp_sv[0];
- jamp_sv[51] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[94] -= amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[118] += amp_sv[0];
-
- // *** DIAGRAM 447 OF 1240 ***
-
- // Wavefunction(s) for diagram number 447
- // (none)
-
- // Amplitude(s) for diagram number 447
- VVV1_0( w_fp[8], w_fp[29], w_fp[104], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 447 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[6] += amp_sv[0];
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[24] -= amp_sv[0];
- jamp_sv[25] += amp_sv[0];
- jamp_sv[30] += amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[48] -= amp_sv[0];
- jamp_sv[49] += amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- jamp_sv[93] -= amp_sv[0];
- jamp_sv[94] += amp_sv[0];
- jamp_sv[115] += amp_sv[0];
- jamp_sv[116] -= amp_sv[0];
- jamp_sv[117] += amp_sv[0];
- jamp_sv[118] -= amp_sv[0];
-
- // *** DIAGRAM 448 OF 1240 ***
-
- // Wavefunction(s) for diagram number 448
- // (none)
-
- // Amplitude(s) for diagram number 448
- VVV1_0( w_fp[1], w_fp[29], w_fp[107], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 448 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[6] += amp_sv[0];
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[9] -= amp_sv[0];
- jamp_sv[11] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- jamp_sv[46] += amp_sv[0];
- jamp_sv[47] -= amp_sv[0];
- jamp_sv[48] -= amp_sv[0];
- jamp_sv[49] += amp_sv[0];
- jamp_sv[51] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- jamp_sv[93] -= amp_sv[0];
- jamp_sv[116] -= amp_sv[0];
- jamp_sv[117] += amp_sv[0];
-
- // *** DIAGRAM 449 OF 1240 ***
-
- // Wavefunction(s) for diagram number 449
- // (none)
-
- // Amplitude(s) for diagram number 449
- VVV1_0( w_fp[1], w_fp[8], w_fp[95], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 449 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[9] -= amp_sv[0];
- jamp_sv[11] += amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[25] -= amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- jamp_sv[46] += amp_sv[0];
- jamp_sv[47] -= amp_sv[0];
- jamp_sv[51] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[94] -= amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[118] += amp_sv[0];
-
- // *** DIAGRAM 450 OF 1240 ***
-
- // Wavefunction(s) for diagram number 450
- // (none)
-
- // Amplitude(s) for diagram number 450
- VVV1_0( w_fp[104], w_fp[45], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 450 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 451 OF 1240 ***
-
- // Wavefunction(s) for diagram number 451
- // (none)
-
- // Amplitude(s) for diagram number 451
- FFV1_0( w_fp[3], w_fp[44], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 451 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[91] += amp_sv[0];
- jamp_sv[92] -= amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[94] -= amp_sv[0];
-
- // *** DIAGRAM 452 OF 1240 ***
-
- // Wavefunction(s) for diagram number 452
- // (none)
-
- // Amplitude(s) for diagram number 452
- FFV1_0( w_fp[99], w_fp[89], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 452 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 453 OF 1240 ***
-
- // Wavefunction(s) for diagram number 453
- // (none)
-
- // Amplitude(s) for diagram number 453
- FFV1_0( w_fp[99], w_fp[44], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 453 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 454 OF 1240 ***
-
- // Wavefunction(s) for diagram number 454
- // (none)
-
- // Amplitude(s) for diagram number 454
- FFV1_0( w_fp[3], w_fp[89], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 454 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[78] += amp_sv[0];
- jamp_sv[80] -= amp_sv[0];
- jamp_sv[82] -= amp_sv[0];
- jamp_sv[83] += amp_sv[0];
-
- // *** DIAGRAM 455 OF 1240 ***
-
- // Wavefunction(s) for diagram number 455
- // (none)
-
- // Amplitude(s) for diagram number 455
- VVV1_0( w_fp[86], w_fp[1], w_fp[45], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 455 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 456 OF 1240 ***
-
- // Wavefunction(s) for diagram number 456
- // (none)
-
- // Amplitude(s) for diagram number 456
- FFV1_0( w_fp[3], w_fp[39], w_fp[112], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[39], w_fp[113], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[39], w_fp[114], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 457 OF 1240 ***
-
- // Wavefunction(s) for diagram number 457
- // (none)
-
- // Amplitude(s) for diagram number 457
- FFV1_0( w_fp[41], w_fp[39], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 457 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[74] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
-
- // *** DIAGRAM 458 OF 1240 ***
-
- // Wavefunction(s) for diagram number 458
- // (none)
-
- // Amplitude(s) for diagram number 458
- FFV1_0( w_fp[41], w_fp[105], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 458 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 459 OF 1240 ***
-
- // Wavefunction(s) for diagram number 459
- // (none)
-
- // Amplitude(s) for diagram number 459
- FFV1_0( w_fp[101], w_fp[39], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 459 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 460 OF 1240 ***
-
- // Wavefunction(s) for diagram number 460
- // (none)
-
- // Amplitude(s) for diagram number 460
- VVV1_0( w_fp[104], w_fp[51], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 460 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 461 OF 1240 ***
-
- // Wavefunction(s) for diagram number 461
- // (none)
-
- // Amplitude(s) for diagram number 461
- FFV1_0( w_fp[3], w_fp[50], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 461 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[115] += amp_sv[0];
- jamp_sv[116] -= amp_sv[0];
- jamp_sv[117] += amp_sv[0];
- jamp_sv[118] -= amp_sv[0];
-
- // *** DIAGRAM 462 OF 1240 ***
-
- // Wavefunction(s) for diagram number 462
- // (none)
-
- // Amplitude(s) for diagram number 462
- FFV1_0( w_fp[99], w_fp[91], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 462 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 463 OF 1240 ***
-
- // Wavefunction(s) for diagram number 463
- // (none)
-
- // Amplitude(s) for diagram number 463
- FFV1_0( w_fp[99], w_fp[50], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 463 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 464 OF 1240 ***
-
- // Wavefunction(s) for diagram number 464
- // (none)
-
- // Amplitude(s) for diagram number 464
- FFV1_0( w_fp[3], w_fp[91], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 464 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[102] += amp_sv[0];
- jamp_sv[104] -= amp_sv[0];
- jamp_sv[106] -= amp_sv[0];
- jamp_sv[107] += amp_sv[0];
-
- // *** DIAGRAM 465 OF 1240 ***
-
- // Wavefunction(s) for diagram number 465
- // (none)
-
- // Amplitude(s) for diagram number 465
- VVV1_0( w_fp[62], w_fp[1], w_fp[51], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 465 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 466 OF 1240 ***
-
- // Wavefunction(s) for diagram number 466
- // (none)
-
- // Amplitude(s) for diagram number 466
- FFV1_0( w_fp[3], w_fp[47], w_fp[109], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[47], w_fp[110], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[47], w_fp[111], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 467 OF 1240 ***
-
- // Wavefunction(s) for diagram number 467
- // (none)
-
- // Amplitude(s) for diagram number 467
- FFV1_0( w_fp[38], w_fp[47], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 467 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[98] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
-
- // *** DIAGRAM 468 OF 1240 ***
-
- // Wavefunction(s) for diagram number 468
- // (none)
-
- // Amplitude(s) for diagram number 468
- FFV1_0( w_fp[38], w_fp[106], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 468 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 469 OF 1240 ***
-
- // Wavefunction(s) for diagram number 469
- // (none)
-
- // Amplitude(s) for diagram number 469
- FFV1_0( w_fp[98], w_fp[47], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 469 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 470 OF 1240 ***
-
- // Wavefunction(s) for diagram number 470
- // (none)
-
- // Amplitude(s) for diagram number 470
- VVV1_0( w_fp[104], w_fp[23], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 470 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 471 OF 1240 ***
-
- // Wavefunction(s) for diagram number 471
- // (none)
-
- // Amplitude(s) for diagram number 471
- FFV1_0( w_fp[48], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 471 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[7] += amp_sv[0];
- jamp_sv[25] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[49] -= amp_sv[0];
-
- // *** DIAGRAM 472 OF 1240 ***
-
- // Wavefunction(s) for diagram number 472
- // (none)
-
- // Amplitude(s) for diagram number 472
- FFV1_0( w_fp[58], w_fp[102], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 472 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 473 OF 1240 ***
-
- // Wavefunction(s) for diagram number 473
- // (none)
-
- // Amplitude(s) for diagram number 473
- FFV1_0( w_fp[48], w_fp[102], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 473 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 474 OF 1240 ***
-
- // Wavefunction(s) for diagram number 474
- // (none)
-
- // Amplitude(s) for diagram number 474
- FFV1_0( w_fp[58], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 474 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[10] += amp_sv[0];
- jamp_sv[52] -= amp_sv[0];
- jamp_sv[98] -= amp_sv[0];
- jamp_sv[108] += amp_sv[0];
-
- // *** DIAGRAM 475 OF 1240 ***
-
- // Wavefunction(s) for diagram number 475
- // (none)
-
- // Amplitude(s) for diagram number 475
- VVV1_0( w_fp[86], w_fp[1], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 475 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 476 OF 1240 ***
-
- // Wavefunction(s) for diagram number 476
- // (none)
-
- // Amplitude(s) for diagram number 476
- FFV1_0( w_fp[38], w_fp[2], w_fp[112], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[38], w_fp[2], w_fp[113], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[38], w_fp[2], w_fp[114], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 477 OF 1240 ***
-
- // Wavefunction(s) for diagram number 477
- // (none)
-
- // Amplitude(s) for diagram number 477
- VVV1_0( w_fp[104], w_fp[20], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 477 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 478 OF 1240 *** - - // Wavefunction(s) for diagram number 478 - // (none) - - // Amplitude(s) for diagram number 478 - FFV1_0( w_fp[40], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 478 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[6] += amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[48] -= amp_sv[0]; - - // *** DIAGRAM 479 OF 1240 *** - - // Wavefunction(s) for diagram number 479 - // (none) - - // Amplitude(s) for diagram number 479 - FFV1_0( w_fp[60], w_fp[102], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 479 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 480 OF 1240 *** - - // Wavefunction(s) for diagram number 480 - // (none) - - // Amplitude(s) for diagram number 480 - FFV1_0( w_fp[40], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 480 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 481 OF 1240 *** - - // Wavefunction(s) for diagram number 481 - // (none) - - // Amplitude(s) for diagram number 481 - FFV1_0( w_fp[60], w_fp[2], w_fp[62], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 481 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[8] += amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[84] += amp_sv[0]; - - // *** DIAGRAM 482 OF 1240 *** - - // Wavefunction(s) for diagram number 482 - // (none) - - // Amplitude(s) for diagram number 482 - VVV1_0( w_fp[62], w_fp[1], w_fp[20], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 482 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 483 OF 1240 *** - - // Wavefunction(s) for diagram number 483 - // (none) - - // Amplitude(s) for diagram number 483 - FFV1_0( w_fp[41], w_fp[2], w_fp[109], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] -= cxtype( 0, 1 ) * 
amp_sv[0]; - jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[110], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[111], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 484 OF 1240 *** - - // Wavefunction(s) for diagram number 484 - // (none) - - // Amplitude(s) for diagram number 484 - FFV1_0( w_fp[3], w_fp[18], w_fp[104], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 484 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 485 OF 1240 *** - - // Wavefunction(s) for diagram number 485 - // (none) - - // Amplitude(s) for diagram number 485 - FFV1_0( w_fp[12], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 485 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 486 OF 1240 *** - - // Wavefunction(s) for diagram number 486 - // (none) - - // Amplitude(s) for diagram number 486 - FFV1_0( w_fp[3], w_fp[102], w_fp[67], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 486 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 487 OF 1240 *** - - // Wavefunction(s) for diagram number 487 - // (none) - - // Amplitude(s) for diagram number 487 - FFV1_0( w_fp[12], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 487 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[6] += amp_sv[0]; - jamp_sv[7] -= amp_sv[0]; - jamp_sv[48] -= 
amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - - // *** DIAGRAM 488 OF 1240 *** - - // Wavefunction(s) for diagram number 488 - // (none) - - // Amplitude(s) for diagram number 488 - FFV1_0( w_fp[99], w_fp[2], w_fp[67], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 488 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 489 OF 1240 *** - - // Wavefunction(s) for diagram number 489 - // (none) - - // Amplitude(s) for diagram number 489 - FFV1_0( w_fp[99], w_fp[18], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 489 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[92] += amp_sv[0]; - jamp_sv[93] -= amp_sv[0]; - jamp_sv[116] -= amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - - // *** DIAGRAM 490 OF 1240 *** - - // Wavefunction(s) for diagram number 490 - // (none) - - // Amplitude(s) for diagram number 490 - FFV1_0( w_fp[3], w_fp[102], w_fp[55], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[102], w_fp[83], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[102], w_fp[84], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 491 OF 1240 *** - - // Wavefunction(s) for diagram number 491 - // (none) - - // Amplitude(s) for diagram number 491 - FFV1_0( w_fp[99], w_fp[2], w_fp[55], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[99], w_fp[2], w_fp[83], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] -= cxtype( 0, 1 ) * 
-      jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[99], w_fp[2], w_fp[84], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 492 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 492
-      // (none)
-
-      // Amplitude(s) for diagram number 492
-      VVV1_0( w_fp[92], w_fp[55], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
-      jamp_sv[6] -= amp_sv[0];
-      jamp_sv[7] += amp_sv[0];
-      jamp_sv[9] += amp_sv[0];
-      jamp_sv[11] -= amp_sv[0];
-      jamp_sv[40] += amp_sv[0];
-      jamp_sv[41] -= amp_sv[0];
-      jamp_sv[46] -= amp_sv[0];
-      jamp_sv[47] += amp_sv[0];
-      jamp_sv[48] += amp_sv[0];
-      jamp_sv[49] -= amp_sv[0];
-      jamp_sv[51] -= amp_sv[0];
-      jamp_sv[53] += amp_sv[0];
-      jamp_sv[92] -= amp_sv[0];
-      jamp_sv[93] += amp_sv[0];
-      jamp_sv[116] += amp_sv[0];
-      jamp_sv[117] -= amp_sv[0];
-      VVV1_0( w_fp[92], w_fp[83], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
-      jamp_sv[7] += amp_sv[0];
-      jamp_sv[8] -= amp_sv[0];
-      jamp_sv[9] += amp_sv[0];
-      jamp_sv[10] -= amp_sv[0];
-      jamp_sv[46] -= amp_sv[0];
-      jamp_sv[47] += amp_sv[0];
-      jamp_sv[49] -= amp_sv[0];
-      jamp_sv[50] += amp_sv[0];
-      jamp_sv[51] -= amp_sv[0];
-      jamp_sv[52] += amp_sv[0];
-      jamp_sv[82] += amp_sv[0];
-      jamp_sv[83] -= amp_sv[0];
-      jamp_sv[92] -= amp_sv[0];
-      jamp_sv[93] += amp_sv[0];
-      jamp_sv[106] += amp_sv[0];
-      jamp_sv[107] -= amp_sv[0];
-      VVV1_0( w_fp[92], w_fp[84], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
-      jamp_sv[6] += amp_sv[0];
-      jamp_sv[8] -= amp_sv[0];
-      jamp_sv[10] -= amp_sv[0];
-      jamp_sv[11] += amp_sv[0];
-      jamp_sv[40] -= amp_sv[0];
-      jamp_sv[41] += amp_sv[0];
-      jamp_sv[48] -= amp_sv[0];
-      jamp_sv[50] += amp_sv[0];
-      jamp_sv[52] += amp_sv[0];
-      jamp_sv[53] -= amp_sv[0];
-      jamp_sv[82] += amp_sv[0];
-      jamp_sv[83] -= amp_sv[0];
-      jamp_sv[106] += amp_sv[0];
-      jamp_sv[107] -= amp_sv[0];
-      jamp_sv[116] -= amp_sv[0];
-      jamp_sv[117] += amp_sv[0];
-
-      // *** DIAGRAM 493 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 493
-      VVV1P0_1( w_fp[0], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[92] );
-      FFV1_2( w_fp[3], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] );
-
-      // Amplitude(s) for diagram number 493
-      FFV1_0( w_fp[99], w_fp[87], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 493 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 494 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 494
-      // (none)
-
-      // Amplitude(s) for diagram number 494
-      FFV1_0( w_fp[99], w_fp[85], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 494 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 495 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 495
-      VVV1P0_1( w_fp[92], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[102] );
-
-      // Amplitude(s) for diagram number 495
-      VVV1_0( w_fp[102], w_fp[34], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 495 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 496 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 496
-      // (none)
-
-      // Amplitude(s) for diagram number 496
-      FFV1_0( w_fp[3], w_fp[85], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 496 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[43] += amp_sv[0];
-      jamp_sv[44] -= amp_sv[0];
-      jamp_sv[45] += amp_sv[0];
-      jamp_sv[46] -= amp_sv[0];
-
-      // *** DIAGRAM 497 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 497
-      VVV1P0_1( w_fp[92], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[104] );
-
-      // Amplitude(s) for diagram number 497
-      VVV1_0( w_fp[104], w_fp[34], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 497 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 498 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 498
-      // (none)
-
-      // Amplitude(s) for diagram number 498
-      FFV1_0( w_fp[3], w_fp[87], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 498 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[30] += amp_sv[0];
-      jamp_sv[32] -= amp_sv[0];
-      jamp_sv[34] -= amp_sv[0];
-      jamp_sv[35] += amp_sv[0];
-
-      // *** DIAGRAM 499 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 499
-      VVVV1P0_1( w_fp[92], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[111] );
-      VVVV3P0_1( w_fp[92], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[110] );
-      VVVV4P0_1( w_fp[92], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[109] );
-
-      // Amplitude(s) for diagram number 499
-      FFV1_0( w_fp[3], w_fp[77], w_fp[111], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[3], w_fp[77], w_fp[110], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[3], w_fp[77], w_fp[109], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 500 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 500
-      FFV1_1( w_fp[77], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[62] );
-
-      // Amplitude(s) for diagram number 500
-      FFV1_0( w_fp[46], w_fp[62], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 500 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 501 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 501
-      FFV1_2( w_fp[46], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[114] );
-
-      // Amplitude(s) for diagram number 501
-      FFV1_0( w_fp[114], w_fp[77], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 501 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 502 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 502
-      // (none)
-
-      // Amplitude(s) for diagram number 502
-      FFV1_0( w_fp[46], w_fp[77], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 502 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[27] += amp_sv[0];
-      jamp_sv[37] -= amp_sv[0];
-      jamp_sv[43] -= amp_sv[0];
-      jamp_sv[46] += amp_sv[0];
-
-      // *** DIAGRAM 503 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 503
-      // (none)
-
-      // Amplitude(s) for diagram number 503
-      FFV1_0( w_fp[41], w_fp[62], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 503 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 504 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 504
-      FFV1_2( w_fp[41], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[113] );
-
-      // Amplitude(s) for diagram number 504
-      FFV1_0( w_fp[113], w_fp[77], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 504 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 505 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 505
-      // (none)
-
-      // Amplitude(s) for diagram number 505
-      FFV1_0( w_fp[41], w_fp[77], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 505 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[26] += amp_sv[0];
-      jamp_sv[30] -= amp_sv[0];
-      jamp_sv[32] += amp_sv[0];
-      jamp_sv[36] -= amp_sv[0];
-
-      // *** DIAGRAM 506 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 506
-      // (none)
-
-      // Amplitude(s) for diagram number 506
-      FFV1_0( w_fp[3], w_fp[62], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 506 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[26] += amp_sv[0];
-      jamp_sv[27] -= amp_sv[0];
-      jamp_sv[36] -= amp_sv[0];
-      jamp_sv[37] += amp_sv[0];
-
-      // *** DIAGRAM 507 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 507
-      // (none)
-
-      // Amplitude(s) for diagram number 507
-      FFV1_0( w_fp[99], w_fp[77], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 507 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[34] += amp_sv[0];
-      jamp_sv[35] -= amp_sv[0];
-      jamp_sv[44] -= amp_sv[0];
-      jamp_sv[45] += amp_sv[0];
-
-      // *** DIAGRAM 508 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 508
-      VVV1P0_1( w_fp[92], w_fp[27], COUPs[0], 1.0, 0., 0., w_fp[62] );
-
-      // Amplitude(s) for diagram number 508
-      FFV1_0( w_fp[3], w_fp[77], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 508 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 509 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 509
-      FFV1_1( w_fp[2], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[112] );
-
-      // Amplitude(s) for diagram number 509
-      FFV1_0( w_fp[56], w_fp[112], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 509 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 510 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 510
-      // (none)
-
-      // Amplitude(s) for diagram number 510
-      FFV1_0( w_fp[21], w_fp[112], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 510 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 511 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 511
-      // (none)
-
-      // Amplitude(s) for diagram number 511
-      VVV1_0( w_fp[102], w_fp[103], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 511 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 512 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 512
-      // (none)
-
-      // Amplitude(s) for diagram number 512
-      FFV1_0( w_fp[21], w_fp[2], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 512 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[15] += amp_sv[0];
-      jamp_sv[51] -= amp_sv[0];
-      jamp_sv[61] += amp_sv[0];
-      jamp_sv[75] -= amp_sv[0];
-
-      // *** DIAGRAM 513 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 513
-      // (none)
-
-      // Amplitude(s) for diagram number 513
-      VVV1_0( w_fp[104], w_fp[103], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 513 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 514 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 514
-      // (none)
-
-      // Amplitude(s) for diagram number 514
-      FFV1_0( w_fp[56], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 514 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[17] += amp_sv[0];
-      jamp_sv[77] -= amp_sv[0];
-      jamp_sv[101] -= amp_sv[0];
-      jamp_sv[115] += amp_sv[0];
-
-      // *** DIAGRAM 515 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 515
-      // (none)
-
-      // Amplitude(s) for diagram number 515
-      FFV1_0( w_fp[52], w_fp[2], w_fp[111], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[52], w_fp[2], w_fp[110], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[52], w_fp[2], w_fp[109], COUPs[1], 1.0, &amp_fp[0] );
-      jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 516 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 516
-      FFV1_2( w_fp[52], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[86] );
-
-      // Amplitude(s) for diagram number 516
-      FFV1_0( w_fp[86], w_fp[33], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 516 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 517 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 517
-      FFV1_1( w_fp[33], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[98] );
-
-      // Amplitude(s) for diagram number 517
-      FFV1_0( w_fp[52], w_fp[98], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 517 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 518 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 518
-      // (none)
-
-      // Amplitude(s) for diagram number 518
-      FFV1_0( w_fp[52], w_fp[33], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 518 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[51] += amp_sv[0];
-      jamp_sv[61] -= amp_sv[0];
-      jamp_sv[67] -= amp_sv[0];
-      jamp_sv[70] += amp_sv[0];
-
-      // *** DIAGRAM 519 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 519
-      // (none)
-
-      // Amplitude(s) for diagram number 519
-      FFV1_0( w_fp[86], w_fp[47], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 519 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 520 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 520
-      FFV1_1( w_fp[47], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[106] );
-
-      // Amplitude(s) for diagram number 520
-      FFV1_0( w_fp[52], w_fp[106], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 520 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 521 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 521
-      // (none)
-
-      // Amplitude(s) for diagram number 521
-      FFV1_0( w_fp[52], w_fp[47], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 521 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[101] += amp_sv[0];
-      jamp_sv[109] -= amp_sv[0];
-      jamp_sv[112] += amp_sv[0];
-      jamp_sv[115] -= amp_sv[0];
-
-      // *** DIAGRAM 522 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 522
-      // (none)
-
-      // Amplitude(s) for diagram number 522
-      FFV1_0( w_fp[86], w_fp[2], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 522 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[67] += amp_sv[0];
-      jamp_sv[70] -= amp_sv[0];
-      jamp_sv[109] -= amp_sv[0];
-      jamp_sv[112] += amp_sv[0];
-
-      // *** DIAGRAM 523 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 523
-      // (none)
-
-      // Amplitude(s) for diagram number 523
-      FFV1_0( w_fp[52], w_fp[112], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 523 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[15] += amp_sv[0];
-      jamp_sv[17] -= amp_sv[0];
-      jamp_sv[75] -= amp_sv[0];
-      jamp_sv[77] += amp_sv[0];
-
-      // *** DIAGRAM 524 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 524
-      // (none)
-
-      // Amplitude(s) for diagram number 524
-      FFV1_0( w_fp[52], w_fp[2], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 524 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 525 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 525
-      // (none)
-
-      // Amplitude(s) for diagram number 525
-      FFV1_0( w_fp[65], w_fp[112], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 525 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[16] += amp_sv[0];
-      jamp_sv[17] -= amp_sv[0];
-      jamp_sv[76] -= amp_sv[0];
-      jamp_sv[77] += amp_sv[0];
-
-      // *** DIAGRAM 526 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 526
-      // (none)
-
-      // Amplitude(s) for diagram number 526
-      FFV1_0( w_fp[3], w_fp[112], w_fp[64], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 526 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 527 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 527
-      // (none)
-
-      // Amplitude(s) for diagram number 527
-      FFV1_0( w_fp[99], w_fp[93], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 527 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[34] += amp_sv[0];
-      jamp_sv[35] -= amp_sv[0];
-      jamp_sv[58] -= amp_sv[0];
-      jamp_sv[59] += amp_sv[0];
-
-      // *** DIAGRAM 528 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 528
-      // (none)
-
-      // Amplitude(s) for diagram number 528
-      FFV1_0( w_fp[99], w_fp[2], w_fp[64], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 528 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 529 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 529
-      // (none)
-
-      // Amplitude(s) for diagram number 529
-      FFV1_0( w_fp[3], w_fp[93], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 529 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 530 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 530
-      // (none)
-
-      // Amplitude(s) for diagram number 530
-      FFV1_0( w_fp[65], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 530 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 531 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 531
-      // (none)
-
-      // Amplitude(s) for diagram number 531
-      VVVV1_0( w_fp[92], w_fp[61], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[12] += amp_sv[0];
-      jamp_sv[14] -= amp_sv[0];
-      jamp_sv[30] -= amp_sv[0];
-      jamp_sv[32] += amp_sv[0];
-      jamp_sv[54] += amp_sv[0];
-      jamp_sv[56] -= amp_sv[0];
-      jamp_sv[72] -= amp_sv[0];
-      jamp_sv[74] += amp_sv[0];
-      jamp_sv[100] -= amp_sv[0];
-      jamp_sv[101] += amp_sv[0];
-      jamp_sv[104] += amp_sv[0];
-      jamp_sv[105] -= amp_sv[0];
-      jamp_sv[110] -= amp_sv[0];
-      jamp_sv[111] += amp_sv[0];
-      jamp_sv[114] += amp_sv[0];
-      jamp_sv[115] -= amp_sv[0];
-      VVVV3_0( w_fp[92], w_fp[61], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[12] += amp_sv[0];
-      jamp_sv[14] -= amp_sv[0];
-      jamp_sv[16] -= amp_sv[0];
-      jamp_sv[17] += amp_sv[0];
-      jamp_sv[34] -= amp_sv[0];
-      jamp_sv[35] += amp_sv[0];
-      jamp_sv[58] += amp_sv[0];
-      jamp_sv[59] -= amp_sv[0];
-      jamp_sv[72] -= amp_sv[0];
-      jamp_sv[74] += amp_sv[0];
-      jamp_sv[76] += amp_sv[0];
-      jamp_sv[77] -= amp_sv[0];
-      jamp_sv[104] += amp_sv[0];
-      jamp_sv[105] -= amp_sv[0];
-      jamp_sv[110] -= amp_sv[0];
-      jamp_sv[111] += amp_sv[0];
-      VVVV4_0( w_fp[92], w_fp[61], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[16] -= amp_sv[0];
-      jamp_sv[17] += amp_sv[0];
-      jamp_sv[30] += amp_sv[0];
-      jamp_sv[32] -= amp_sv[0];
-      jamp_sv[34] -= amp_sv[0];
-      jamp_sv[35] += amp_sv[0];
-      jamp_sv[54] -= amp_sv[0];
-      jamp_sv[56] += amp_sv[0];
-      jamp_sv[58] += amp_sv[0];
-      jamp_sv[59] -= amp_sv[0];
-      jamp_sv[76] += amp_sv[0];
-      jamp_sv[77] -= amp_sv[0];
-      jamp_sv[100] += amp_sv[0];
-      jamp_sv[101] -= amp_sv[0];
-      jamp_sv[114] -= amp_sv[0];
-      jamp_sv[115] += amp_sv[0];
-
-      // *** DIAGRAM 532 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 532
-      VVV1P0_1( w_fp[92], w_fp[61], COUPs[0], 1.0, 0., 0., w_fp[86] );
-
-      // Amplitude(s) for diagram number 532
-      VVV1_0( w_fp[8], w_fp[6], w_fp[86], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 532 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[12] += amp_sv[0];
-      jamp_sv[14] -= amp_sv[0];
-      jamp_sv[30] -= amp_sv[0];
-      jamp_sv[32] += amp_sv[0];
-      jamp_sv[54] += amp_sv[0];
-      jamp_sv[56] -= amp_sv[0];
-      jamp_sv[72] -= amp_sv[0];
-      jamp_sv[74] += amp_sv[0];
-      jamp_sv[100] -= amp_sv[0];
-      jamp_sv[101] += amp_sv[0];
-      jamp_sv[104] += amp_sv[0];
-      jamp_sv[105] -= amp_sv[0];
-      jamp_sv[110] -= amp_sv[0];
-      jamp_sv[111] += amp_sv[0];
-      jamp_sv[114] += amp_sv[0];
-      jamp_sv[115] -= amp_sv[0];
-
-      // *** DIAGRAM 533 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 533
-      VVV1P0_1( w_fp[92], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[101] );
-
-      // Amplitude(s) for diagram number 533
-      VVV1_0( w_fp[61], w_fp[6], w_fp[101], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 533 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[12] += amp_sv[0];
-      jamp_sv[14] -= amp_sv[0];
-      jamp_sv[16] -= amp_sv[0];
-      jamp_sv[17] += amp_sv[0];
-      jamp_sv[34] -= amp_sv[0];
-      jamp_sv[35] += amp_sv[0];
-      jamp_sv[58] += amp_sv[0];
-      jamp_sv[59] -= amp_sv[0];
-      jamp_sv[72] -= amp_sv[0];
-      jamp_sv[74] += amp_sv[0];
-      jamp_sv[76] += amp_sv[0];
-      jamp_sv[77] -= amp_sv[0];
-      jamp_sv[104] += amp_sv[0];
-      jamp_sv[105] -= amp_sv[0];
-      jamp_sv[110] -= amp_sv[0];
-      jamp_sv[111] += amp_sv[0];
-
-      // *** DIAGRAM 534 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 534
-      // (none)
-
-      // Amplitude(s) for diagram number 534
-      VVV1_0( w_fp[61], w_fp[8], w_fp[104], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 534 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[16] -= amp_sv[0];
-      jamp_sv[17] += amp_sv[0];
-      jamp_sv[30] += amp_sv[0];
-      jamp_sv[32] -= amp_sv[0];
-      jamp_sv[34] -= amp_sv[0];
-      jamp_sv[35] += amp_sv[0];
-      jamp_sv[54] -= amp_sv[0];
-      jamp_sv[56] += amp_sv[0];
-      jamp_sv[58] += amp_sv[0];
-      jamp_sv[59] -= amp_sv[0];
-      jamp_sv[76] += amp_sv[0];
-      jamp_sv[77] -= amp_sv[0];
-      jamp_sv[100] += amp_sv[0];
-      jamp_sv[101] -= amp_sv[0];
-      jamp_sv[114] -= amp_sv[0];
-      jamp_sv[115] += amp_sv[0];
-
-      // *** DIAGRAM 535 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 535
-      // (none)
-
-      // Amplitude(s) for diagram number 535
-      FFV1_0( w_fp[3], w_fp[47], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 535 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 536 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 536
-      // (none)
-
-      // Amplitude(s) for diagram number 536
-      FFV1_0( w_fp[3], w_fp[106], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 536 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[100] += amp_sv[0];
-      jamp_sv[101] -= amp_sv[0];
-      jamp_sv[114] -= amp_sv[0];
-      jamp_sv[115] += amp_sv[0];
-
-      // *** DIAGRAM 537 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 537
-      // (none)
-
-      // Amplitude(s) for diagram number 537
-      FFV1_0( w_fp[99], w_fp[47], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 537 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[104] += amp_sv[0];
-      jamp_sv[105] -= amp_sv[0];
-      jamp_sv[110] -= amp_sv[0];
-      jamp_sv[111] += amp_sv[0];
-
-      // *** DIAGRAM 538 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 538
-      // (none)
-
-      // Amplitude(s) for diagram number 538
-      FFV1_0( w_fp[41], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 538 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 539 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 539
-      // (none)
-
-      // Amplitude(s) for diagram number 539
-      FFV1_0( w_fp[41], w_fp[112], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 539 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[12] += amp_sv[0];
-      jamp_sv[14] -= amp_sv[0];
-      jamp_sv[72] -= amp_sv[0];
-      jamp_sv[74] += amp_sv[0];
-
-      // *** DIAGRAM 540 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 540
-      // (none)
-
-      // Amplitude(s) for diagram number 540
-      FFV1_0( w_fp[113], w_fp[2], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 540 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[30] += amp_sv[0];
-      jamp_sv[32] -= amp_sv[0];
-      jamp_sv[54] -= amp_sv[0];
-      jamp_sv[56] += amp_sv[0];
-
-      // *** DIAGRAM 541 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 541
-      // (none)
-
-      // Amplitude(s) for diagram number 541
-      FFV1_0( w_fp[76], w_fp[112], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 541 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[14] += amp_sv[0];
-      jamp_sv[15] -= amp_sv[0];
-      jamp_sv[74] -= amp_sv[0];
-      jamp_sv[75] += amp_sv[0];
-
-      // *** DIAGRAM 542 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 542
-      // (none)
-
-      // Amplitude(s) for diagram number 542
-      FFV1_0( w_fp[3], w_fp[112], w_fp[74], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 542 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 543 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 543
-      // (none)
-
-      // Amplitude(s) for diagram number 543
-      FFV1_0( w_fp[99], w_fp[97], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 543 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[44] += amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[104] -= amp_sv[0];
-      jamp_sv[105] += amp_sv[0];
-
-      // *** DIAGRAM 544 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 544
-      // (none)
-
-      // Amplitude(s) for diagram number 544
-      FFV1_0( w_fp[99], w_fp[2], w_fp[74], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 544 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 545 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 545
-      // (none)
-
-      // Amplitude(s) for diagram number 545
-      FFV1_0( w_fp[3], w_fp[97], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 545 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 546 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 546
-      // (none)
-
-      // Amplitude(s) for diagram number 546
-      FFV1_0( w_fp[76], w_fp[2], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 546 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 547 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 547
-      // (none)
-
-      // Amplitude(s) for diagram number 547
-      VVVV1_0( w_fp[92], w_fp[72], w_fp[8], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[13] += amp_sv[0];
-      jamp_sv[16] -= amp_sv[0];
-      jamp_sv[43] -= amp_sv[0];
-      jamp_sv[46] += amp_sv[0];
-      jamp_sv[50] -= amp_sv[0];
-      jamp_sv[51] += amp_sv[0];
-      jamp_sv[58] += amp_sv[0];
-      jamp_sv[59] -= amp_sv[0];
-      jamp_sv[60] += amp_sv[0];
-      jamp_sv[61] -= amp_sv[0];
-      jamp_sv[68] -= amp_sv[0];
-      jamp_sv[69] += amp_sv[0];
-      jamp_sv[73] -= amp_sv[0];
-      jamp_sv[76] += amp_sv[0];
-      jamp_sv[103] += amp_sv[0];
-      jamp_sv[106] -= amp_sv[0];
-      VVVV3_0( w_fp[92], w_fp[72], w_fp[8], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[13] += amp_sv[0];
-      jamp_sv[14] -= amp_sv[0];
-      jamp_sv[15] += amp_sv[0];
-      jamp_sv[16] -= amp_sv[0];
-      jamp_sv[44] -= amp_sv[0];
-      jamp_sv[45] += amp_sv[0];
-      jamp_sv[58] += amp_sv[0];
-      jamp_sv[59] -= amp_sv[0];
-      jamp_sv[68] -= amp_sv[0];
-      jamp_sv[69] += amp_sv[0];
-      jamp_sv[73] -= amp_sv[0];
-      jamp_sv[74] += amp_sv[0];
-      jamp_sv[75] -= amp_sv[0];
-      jamp_sv[76] += amp_sv[0];
-      jamp_sv[104] += amp_sv[0];
-      jamp_sv[105] -= amp_sv[0];
-      VVVV4_0( w_fp[92], w_fp[72], w_fp[8], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[14] -= amp_sv[0];
-      jamp_sv[15] += amp_sv[0];
-      jamp_sv[43] += amp_sv[0];
-      jamp_sv[44] -= amp_sv[0];
-      jamp_sv[45] += amp_sv[0];
-      jamp_sv[46] -= amp_sv[0];
-      jamp_sv[50] += amp_sv[0];
-      jamp_sv[51] -= amp_sv[0];
-      jamp_sv[60] -= amp_sv[0];
-      jamp_sv[61] += amp_sv[0];
-      jamp_sv[74] += amp_sv[0];
-      jamp_sv[75] -= amp_sv[0];
-      jamp_sv[103] -= amp_sv[0];
-      jamp_sv[104] += amp_sv[0];
-      jamp_sv[105] -= amp_sv[0];
-      jamp_sv[106] += amp_sv[0];
-
-      // *** DIAGRAM 548 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 548
-      VVV1P0_1( w_fp[92], w_fp[72], COUPs[0], 1.0, 0., 0., w_fp[86] );
-
-      // Amplitude(s) for diagram number 548
-      VVV1_0( w_fp[8], w_fp[4], w_fp[86], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 548 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[13] += amp_sv[0];
-      jamp_sv[16] -= amp_sv[0];
-      jamp_sv[43] -= amp_sv[0];
-      jamp_sv[46] += amp_sv[0];
-      jamp_sv[50] -= amp_sv[0];
-      jamp_sv[51] += amp_sv[0];
-      jamp_sv[58] += amp_sv[0];
-      jamp_sv[59] -= amp_sv[0];
-      jamp_sv[60] += amp_sv[0];
-      jamp_sv[61] -= amp_sv[0];
-      jamp_sv[68] -= amp_sv[0];
-      jamp_sv[69] += amp_sv[0];
-      jamp_sv[73] -= amp_sv[0];
-      jamp_sv[76] += amp_sv[0];
-      jamp_sv[103] += amp_sv[0];
-      jamp_sv[106] -= amp_sv[0];
-
-      // *** DIAGRAM 549 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 549
-      // (none)
-
-      // Amplitude(s) for diagram number 549
-      VVV1_0( w_fp[72], w_fp[4], w_fp[101], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 549 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[13] += amp_sv[0];
-      jamp_sv[14] -= amp_sv[0];
-      jamp_sv[15] += amp_sv[0];
-      jamp_sv[16] -= amp_sv[0];
-      jamp_sv[44] -= amp_sv[0];
-      jamp_sv[45] += amp_sv[0];
-      jamp_sv[58] += amp_sv[0];
-      jamp_sv[59] -= amp_sv[0];
-      jamp_sv[68] -= amp_sv[0];
-      jamp_sv[69] += amp_sv[0];
-      jamp_sv[73] -= amp_sv[0];
-      jamp_sv[74] += amp_sv[0];
-      jamp_sv[75] -= amp_sv[0];
-      jamp_sv[76] += amp_sv[0];
-      jamp_sv[104] += amp_sv[0];
-      jamp_sv[105] -= amp_sv[0];
-
-      // *** DIAGRAM 550 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 550
-      // (none)
-
-      // Amplitude(s) for diagram number 550
-      VVV1_0( w_fp[72], w_fp[8], w_fp[102], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 550 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[14] -= amp_sv[0];
-      jamp_sv[15] += amp_sv[0];
-      jamp_sv[43] += amp_sv[0];
-      jamp_sv[44] -= amp_sv[0];
-      jamp_sv[45] += amp_sv[0];
-      jamp_sv[46] -= amp_sv[0];
-      jamp_sv[50] += amp_sv[0];
-      jamp_sv[51] -= amp_sv[0];
-      jamp_sv[60] -= amp_sv[0];
-      jamp_sv[61] += amp_sv[0];
-      jamp_sv[74] += amp_sv[0];
-      jamp_sv[75] -= amp_sv[0];
-      jamp_sv[103] -= amp_sv[0];
-      jamp_sv[104] += amp_sv[0];
-      jamp_sv[105] -= amp_sv[0];
-      jamp_sv[106] += amp_sv[0];
-
-      // *** DIAGRAM 551 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 551
-      // (none)
-
-      // Amplitude(s) for diagram number 551
-      FFV1_0( w_fp[3], w_fp[33], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 551 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 552 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 552
-      // (none)
-
-      // Amplitude(s) for diagram number 552
-      FFV1_0( w_fp[3], w_fp[98], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 552 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[50] += amp_sv[0];
-      jamp_sv[51] -= amp_sv[0];
-      jamp_sv[60] -= amp_sv[0];
-      jamp_sv[61] += amp_sv[0];
-
-      // *** DIAGRAM 553 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 553
-      // (none)
-
-      // Amplitude(s) for diagram number 553
-      FFV1_0( w_fp[99], w_fp[33], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 553 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[58] += amp_sv[0];
-      jamp_sv[59] -= amp_sv[0];
-      jamp_sv[68] -= amp_sv[0];
-      jamp_sv[69] += amp_sv[0];
-
-      // *** DIAGRAM 554 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 554
-      // (none)
-
-      // Amplitude(s) for diagram number 554
-      FFV1_0( w_fp[46], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 554 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 555 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 555
-      // (none)
-
-      // Amplitude(s) for diagram number 555
-      FFV1_0( w_fp[46], w_fp[112], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 555 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[13] += amp_sv[0];
-      jamp_sv[16] -= amp_sv[0];
-      jamp_sv[73] -= amp_sv[0];
-      jamp_sv[76] += amp_sv[0];
-
-      // *** DIAGRAM 556 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 556
-      // (none)
-
-      // Amplitude(s) for diagram number 556
-      FFV1_0( w_fp[114], w_fp[2], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 556 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[43] += amp_sv[0];
-      jamp_sv[46] -= amp_sv[0];
-      jamp_sv[103] -= amp_sv[0];
-      jamp_sv[106] += amp_sv[0];
-
-      // *** DIAGRAM 557 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 557
-      VVV1P0_1( w_fp[92], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[86] );
-
-      // Amplitude(s) for diagram number 557
-      VVV1_0( w_fp[86], w_fp[13], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 557 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[13] -= amp_sv[0];
-      jamp_sv[27] += amp_sv[0];
-      jamp_sv[37] -= amp_sv[0];
-      jamp_sv[50] += amp_sv[0];
-      jamp_sv[54] -= amp_sv[0];
-      jamp_sv[56] += amp_sv[0];
-      jamp_sv[60] -= amp_sv[0];
-      jamp_sv[67] -= amp_sv[0];
-      jamp_sv[68] += amp_sv[0];
-      jamp_sv[69] -= amp_sv[0];
-      jamp_sv[70] += amp_sv[0];
-      jamp_sv[73] += amp_sv[0];
-      jamp_sv[100] += amp_sv[0];
-      jamp_sv[103] -= amp_sv[0];
-      jamp_sv[106] += amp_sv[0];
-      jamp_sv[114] -= amp_sv[0];
-
-      // *** DIAGRAM 558 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 558
-      // (none)
-
-      // Amplitude(s) for diagram number 558
-      VVV1_0( w_fp[86], w_fp[11], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 558 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[12] -= amp_sv[0];
-      jamp_sv[26] += amp_sv[0];
-      jamp_sv[36] -= amp_sv[0];
-      jamp_sv[50] += amp_sv[0];
-      jamp_sv[54] -= amp_sv[0];
-      jamp_sv[56] += amp_sv[0];
-      jamp_sv[60] -= amp_sv[0];
-      jamp_sv[72] += amp_sv[0];
-      jamp_sv[100] += amp_sv[0];
-      jamp_sv[103] -= amp_sv[0];
-      jamp_sv[106] += amp_sv[0];
-      jamp_sv[109] -= amp_sv[0];
-      jamp_sv[110] += amp_sv[0];
-      jamp_sv[111] -= amp_sv[0];
-      jamp_sv[112] += amp_sv[0];
-      jamp_sv[114] -= amp_sv[0];
-
-      // *** DIAGRAM 559 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 559
-      // (none)
-
-      // Amplitude(s) for diagram number 559
-      VVVV1_0( w_fp[8], w_fp[4], w_fp[6], w_fp[86], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[13] -= amp_sv[0];
-      jamp_sv[27] += amp_sv[0];
-      jamp_sv[37] -= amp_sv[0];
-      jamp_sv[50] += amp_sv[0];
-      jamp_sv[54] -= amp_sv[0];
-      jamp_sv[56] += amp_sv[0];
-      jamp_sv[60] -= amp_sv[0];
-      jamp_sv[67] -= amp_sv[0];
-      jamp_sv[68] += amp_sv[0];
-      jamp_sv[69] -= amp_sv[0];
-      jamp_sv[70] += amp_sv[0];
-      jamp_sv[73] += amp_sv[0];
-      jamp_sv[100] += amp_sv[0];
-      jamp_sv[103] -= amp_sv[0];
-      jamp_sv[106] += amp_sv[0];
-      jamp_sv[114] -= amp_sv[0];
-      VVVV3_0( w_fp[8], w_fp[4], w_fp[6], w_fp[86], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[12] -= amp_sv[0];
-      jamp_sv[26] += amp_sv[0];
-      jamp_sv[36] -= amp_sv[0];
-      jamp_sv[50] += amp_sv[0];
-      jamp_sv[54] -= amp_sv[0];
-      jamp_sv[56] += amp_sv[0];
-      jamp_sv[60] -= amp_sv[0];
-      jamp_sv[72] += amp_sv[0];
-      jamp_sv[100] += amp_sv[0];
-      jamp_sv[103] -= amp_sv[0];
-      jamp_sv[106] += amp_sv[0];
-      jamp_sv[109] -= amp_sv[0];
-      jamp_sv[110] += amp_sv[0];
-      jamp_sv[111] -= amp_sv[0];
-      jamp_sv[112] += amp_sv[0];
-      jamp_sv[114] -= amp_sv[0];
-      VVVV4_0( w_fp[8], w_fp[4], w_fp[6], w_fp[86], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[12] -= amp_sv[0];
-      jamp_sv[13] += amp_sv[0];
-      jamp_sv[26] += amp_sv[0];
-      jamp_sv[27] -= amp_sv[0];
-      jamp_sv[36] -= amp_sv[0];
-      jamp_sv[37] += amp_sv[0];
-      jamp_sv[67] += amp_sv[0];
-      jamp_sv[68] -= amp_sv[0];
-      jamp_sv[69] += amp_sv[0];
-      jamp_sv[70] -= amp_sv[0];
-      jamp_sv[72] += amp_sv[0];
-      jamp_sv[73] -= amp_sv[0];
-      jamp_sv[109] -= amp_sv[0];
-      jamp_sv[110] += amp_sv[0];
-      jamp_sv[111] -= amp_sv[0];
-      jamp_sv[112] += amp_sv[0];
-
-      // *** DIAGRAM 560 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 560
-      // (none)
-
-      // Amplitude(s) for diagram number 560
-      VVV1_0( w_fp[102], w_fp[108], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 560 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[15] += amp_sv[0];
-      jamp_sv[26] -= amp_sv[0];
-      jamp_sv[30] += amp_sv[0];
-      jamp_sv[32] -= amp_sv[0];
-      jamp_sv[36] += amp_sv[0];
-      jamp_sv[43] += amp_sv[0];
-      jamp_sv[44] -= amp_sv[0];
-      jamp_sv[45] += amp_sv[0];
-      jamp_sv[46] -= amp_sv[0];
-      jamp_sv[51] -= amp_sv[0];
-      jamp_sv[61] += amp_sv[0];
-      jamp_sv[75] -= amp_sv[0];
-      jamp_sv[101] -= amp_sv[0];
-      jamp_sv[109] += amp_sv[0];
-      jamp_sv[112] -= amp_sv[0];
-      jamp_sv[115] += amp_sv[0];
-
-      // *** DIAGRAM 561 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 561
-      // (none)
-
-      // Amplitude(s) for diagram number 561
-      VVV1_0( w_fp[102], w_fp[1], w_fp[11], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 561 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[14] += amp_sv[0];
-      jamp_sv[26] -= amp_sv[0];
-      jamp_sv[30] += amp_sv[0];
-      jamp_sv[32] -= amp_sv[0];
-      jamp_sv[36] += amp_sv[0];
-      jamp_sv[50] -= amp_sv[0];
-      jamp_sv[60] += amp_sv[0];
-      jamp_sv[74] -= amp_sv[0];
-      jamp_sv[101] -= amp_sv[0];
-      jamp_sv[103] += amp_sv[0];
-      jamp_sv[104] -= amp_sv[0];
-      jamp_sv[105] += amp_sv[0];
-      jamp_sv[106] -= amp_sv[0];
-      jamp_sv[109] += amp_sv[0];
-      jamp_sv[112] -= amp_sv[0];
-      jamp_sv[115] += amp_sv[0];
-
-      // *** DIAGRAM 562 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 562
-      // (none)
-
-      // Amplitude(s) for diagram number 562
-      VVVV1_0( w_fp[1], w_fp[8], w_fp[6], w_fp[102], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[15] += amp_sv[0];
-      jamp_sv[26] -= amp_sv[0];
-      jamp_sv[30] += amp_sv[0];
-      jamp_sv[32] -= amp_sv[0];
-      jamp_sv[36] += amp_sv[0];
-      jamp_sv[43] += amp_sv[0];
-      jamp_sv[44] -= amp_sv[0];
-      jamp_sv[45] += amp_sv[0];
-      jamp_sv[46] -= amp_sv[0];
-      jamp_sv[51] -= amp_sv[0];
-      jamp_sv[61] += amp_sv[0];
-      jamp_sv[75] -= amp_sv[0];
-      jamp_sv[101] -= amp_sv[0];
-      jamp_sv[109] += amp_sv[0];
-      jamp_sv[112] -= amp_sv[0];
-      jamp_sv[115] += amp_sv[0];
-      VVVV3_0( w_fp[1], w_fp[8], w_fp[6], w_fp[102], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[14] -= amp_sv[0];
-      jamp_sv[15] += amp_sv[0];
-      jamp_sv[43] += amp_sv[0];
-      jamp_sv[44] -= amp_sv[0];
-      jamp_sv[45] += amp_sv[0];
-      jamp_sv[46] -= amp_sv[0];
-      jamp_sv[50] += amp_sv[0];
-      jamp_sv[51] -= amp_sv[0];
-      jamp_sv[60] -= amp_sv[0];
-      jamp_sv[61] += amp_sv[0];
-      jamp_sv[74] += amp_sv[0];
-      jamp_sv[75] -= amp_sv[0];
-      jamp_sv[103] -= amp_sv[0];
-      jamp_sv[104] += amp_sv[0];
-      jamp_sv[105] -= amp_sv[0];
-      jamp_sv[106] += amp_sv[0];
-      VVVV4_0( w_fp[1], w_fp[8], w_fp[6], w_fp[102], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[14] -= amp_sv[0];
-      jamp_sv[26] += amp_sv[0];
-      jamp_sv[30] -= amp_sv[0];
-      jamp_sv[32] += amp_sv[0];
-      jamp_sv[36] -= amp_sv[0];
-      jamp_sv[50] += amp_sv[0];
-      jamp_sv[60] -= amp_sv[0];
-      jamp_sv[74] += amp_sv[0];
-      jamp_sv[101] += amp_sv[0];
-      jamp_sv[103] -= amp_sv[0];
-      jamp_sv[104] += amp_sv[0];
-      jamp_sv[105] -= amp_sv[0];
-      jamp_sv[106] += amp_sv[0];
-      jamp_sv[109] -= amp_sv[0];
-      jamp_sv[112] += amp_sv[0];
-      jamp_sv[115] -= amp_sv[0];
-
-      // *** DIAGRAM 563 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 563
-      // (none)
-
-      // Amplitude(s) for diagram number 563
-      VVV1_0( w_fp[104], w_fp[108], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 563 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[17] += amp_sv[0];
-      jamp_sv[27] -= amp_sv[0];
-      jamp_sv[30] += amp_sv[0];
-      jamp_sv[32] -= amp_sv[0];
-      jamp_sv[34] -= amp_sv[0];
-      jamp_sv[35] += amp_sv[0];
-      jamp_sv[37] += amp_sv[0];
-      jamp_sv[43] += amp_sv[0];
-      jamp_sv[46] -= amp_sv[0];
-      jamp_sv[51] -= amp_sv[0];
-      jamp_sv[61] += amp_sv[0];
-      jamp_sv[67] += amp_sv[0];
-      jamp_sv[70] -= amp_sv[0];
-      jamp_sv[77] -= amp_sv[0];
-      jamp_sv[101] -= amp_sv[0];
-      jamp_sv[115] += amp_sv[0];
-
-      // *** DIAGRAM 564 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 564
-      // (none)
-
-      // Amplitude(s) for diagram number 564
-      VVV1_0( w_fp[104], w_fp[1], w_fp[13], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 564 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[16] += amp_sv[0];
-      jamp_sv[27] -= amp_sv[0];
-      jamp_sv[37] += amp_sv[0];
-      jamp_sv[43] += amp_sv[0];
-      jamp_sv[46] -= amp_sv[0];
-      jamp_sv[51] -= amp_sv[0];
-      jamp_sv[54] += amp_sv[0];
-      jamp_sv[56] -= amp_sv[0];
-      jamp_sv[58] -= amp_sv[0];
-      jamp_sv[59] += amp_sv[0];
-      jamp_sv[61] += amp_sv[0];
-      jamp_sv[67] += amp_sv[0];
-      jamp_sv[70] -= amp_sv[0];
-      jamp_sv[76] -= amp_sv[0];
-      jamp_sv[100] -= amp_sv[0];
-      jamp_sv[114] += amp_sv[0];
-
-      // *** DIAGRAM 565 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 565
-      // (none)
-
-      // Amplitude(s) for diagram number 565
-      VVVV1_0( w_fp[1], w_fp[8], w_fp[4], w_fp[104], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[17] += amp_sv[0];
-      jamp_sv[27] -= amp_sv[0];
-      jamp_sv[30] += amp_sv[0];
-      jamp_sv[32] -= amp_sv[0];
-      jamp_sv[34] -= amp_sv[0];
-      jamp_sv[35] += amp_sv[0];
-      jamp_sv[37] += amp_sv[0];
-      jamp_sv[43] += amp_sv[0];
-      jamp_sv[46] -= amp_sv[0];
-      jamp_sv[51] -= amp_sv[0];
-      jamp_sv[61] += amp_sv[0];
-      jamp_sv[67] += amp_sv[0];
-      jamp_sv[70] -= amp_sv[0];
-      jamp_sv[77] -= amp_sv[0];
-      jamp_sv[101] -= amp_sv[0];
-      jamp_sv[115] += amp_sv[0];
-      VVVV3_0( w_fp[1], w_fp[8], w_fp[4], w_fp[104], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[16] -= amp_sv[0];
-      jamp_sv[17] += amp_sv[0];
-      jamp_sv[30] += amp_sv[0];
-      jamp_sv[32] -= amp_sv[0];
-      jamp_sv[34] -= amp_sv[0];
-      jamp_sv[35] += amp_sv[0];
-      jamp_sv[54] -= amp_sv[0];
-      jamp_sv[56] += amp_sv[0];
-      jamp_sv[58] += amp_sv[0];
-      jamp_sv[59] -= amp_sv[0];
-      jamp_sv[76] += amp_sv[0];
-      jamp_sv[77] -= amp_sv[0];
-      jamp_sv[100] += amp_sv[0];
-      jamp_sv[101] -= amp_sv[0];
-      jamp_sv[114] -= amp_sv[0];
-      jamp_sv[115] += amp_sv[0];
-      VVVV4_0( w_fp[1], w_fp[8], w_fp[4], w_fp[104], COUPs[2], 1.0, &amp_fp[0] );
-      jamp_sv[16] -= amp_sv[0];
-      jamp_sv[27] += amp_sv[0];
-      jamp_sv[37] -= amp_sv[0];
-      jamp_sv[43] -= amp_sv[0];
-      jamp_sv[46] += amp_sv[0];
-      jamp_sv[51] += amp_sv[0];
-      jamp_sv[54] -= amp_sv[0];
-      jamp_sv[56] += amp_sv[0];
-      jamp_sv[58] += amp_sv[0];
-      jamp_sv[59] -= amp_sv[0];
-      jamp_sv[61] -= amp_sv[0];
-      jamp_sv[67] -= amp_sv[0];
-      jamp_sv[70] += amp_sv[0];
-      jamp_sv[76] += amp_sv[0];
-      jamp_sv[100] += amp_sv[0];
-      jamp_sv[114] -= amp_sv[0];
-
-      // *** DIAGRAM 566 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 566
-      VVVV1P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[105] );
-      VVVV3P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[95] );
-      VVVV4P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[107] );
-
-      // Amplitude(s) for diagram number 566
-      VVV1_0( w_fp[8], w_fp[6], w_fp[105], COUPs[0], 1.0, &amp_fp[0] );
-      jamp_sv[12] += amp_sv[0];
-      jamp_sv[14] -= amp_sv[0];
-      jamp_sv[30] -= amp_sv[0];
-      jamp_sv[32] += amp_sv[0];
-      jamp_sv[54] += amp_sv[0];
-      jamp_sv[56] -= amp_sv[0];
-      jamp_sv[72] -= amp_sv[0];
-      jamp_sv[74] += amp_sv[0];
-      jamp_sv[100] -= amp_sv[0];
-      jamp_sv[101] += amp_sv[0];
-      jamp_sv[104] += amp_sv[0];
-      jamp_sv[105] -= amp_sv[0];
-      jamp_sv[110] -= amp_sv[0];
-      jamp_sv[111] += amp_sv[0];
-      jamp_sv[114] += amp_sv[0];
-      jamp_sv[115] -= amp_sv[0];
-      VVV1_0( w_fp[8], w_fp[6], w_fp[95], COUPs[0], 1.0, &amp_fp[0] );
-      jamp_sv[14] -= amp_sv[0];
-      jamp_sv[26] += amp_sv[0];
-      jamp_sv[30] -= amp_sv[0];
-      jamp_sv[32] += amp_sv[0];
-      jamp_sv[36] -= amp_sv[0];
-      jamp_sv[50] += amp_sv[0];
-      jamp_sv[60] -= amp_sv[0];
-      jamp_sv[74] += amp_sv[0];
-      jamp_sv[101] += amp_sv[0];
-      jamp_sv[103] -= amp_sv[0];
-      jamp_sv[104] += amp_sv[0];
-      jamp_sv[105] -= amp_sv[0];
-      jamp_sv[106] += amp_sv[0];
-      jamp_sv[109] -= amp_sv[0];
-      jamp_sv[112] += amp_sv[0];
-      jamp_sv[115] -= amp_sv[0];
-      VVV1_0( w_fp[8], w_fp[6], w_fp[107], COUPs[0], 1.0, &amp_fp[0] );
-      jamp_sv[12] -= amp_sv[0];
-      jamp_sv[26] += amp_sv[0];
-      jamp_sv[36] -= amp_sv[0];
-      jamp_sv[50] += amp_sv[0];
-      jamp_sv[54] -= amp_sv[0];
-      jamp_sv[56] += amp_sv[0];
-      jamp_sv[60] -= amp_sv[0];
-      jamp_sv[72] += amp_sv[0];
-      jamp_sv[100] += amp_sv[0];
-      jamp_sv[103] -= amp_sv[0];
-      jamp_sv[106] += amp_sv[0];
-      jamp_sv[109] -= amp_sv[0];
-      jamp_sv[110] += amp_sv[0];
-      jamp_sv[111] -= amp_sv[0];
-      jamp_sv[112] += amp_sv[0];
-      jamp_sv[114] -= amp_sv[0];
-
-      // *** DIAGRAM 567 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 567
-      VVVV1P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[96] );
-      VVVV3P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[90] );
-      VVVV4P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[88] );
-
-      // Amplitude(s) for diagram number 567
-      VVV1_0( w_fp[8], w_fp[4], w_fp[96], COUPs[0], 1.0, &amp_fp[0] );
-      jamp_sv[13] += amp_sv[0];
-      jamp_sv[16] -= amp_sv[0];
-      jamp_sv[43] -= amp_sv[0];
-      jamp_sv[46] += amp_sv[0];
-      jamp_sv[50] -= amp_sv[0];
-      jamp_sv[51] += amp_sv[0];
-      jamp_sv[58] += amp_sv[0];
-      jamp_sv[59] -= amp_sv[0];
-      jamp_sv[60] += amp_sv[0];
-      jamp_sv[61] -= amp_sv[0];
-      jamp_sv[68] -= amp_sv[0];
-      jamp_sv[69] += amp_sv[0];
-      jamp_sv[73] -= amp_sv[0];
-      jamp_sv[76] += amp_sv[0];
-      jamp_sv[103] += amp_sv[0];
-      jamp_sv[106] -= amp_sv[0];
-      VVV1_0( w_fp[8], w_fp[4], w_fp[90], COUPs[0], 1.0, &amp_fp[0] );
-      jamp_sv[16] -= amp_sv[0];
-      jamp_sv[27] += amp_sv[0];
-      jamp_sv[37] -= amp_sv[0];
-      jamp_sv[43] -= amp_sv[0];
-      jamp_sv[46] += amp_sv[0];
-      jamp_sv[51] += amp_sv[0];
-      jamp_sv[54] -= amp_sv[0];
-      jamp_sv[56] += amp_sv[0];
-      jamp_sv[58] += amp_sv[0];
-      jamp_sv[59] -= amp_sv[0];
-      jamp_sv[61] -= amp_sv[0];
-      jamp_sv[67] -= amp_sv[0];
-      jamp_sv[70] += amp_sv[0];
-      jamp_sv[76] += amp_sv[0];
-      jamp_sv[100] += amp_sv[0];
-      jamp_sv[114] -= amp_sv[0];
-      VVV1_0( w_fp[8], w_fp[4], w_fp[88], COUPs[0], 1.0, &amp_fp[0] );
-      jamp_sv[13] -= amp_sv[0];
-      jamp_sv[27] += amp_sv[0];
-      jamp_sv[37] -= amp_sv[0];
-      jamp_sv[50] += amp_sv[0];
-      jamp_sv[54] -= amp_sv[0];
-      jamp_sv[56] += amp_sv[0];
-      jamp_sv[60] -= amp_sv[0];
-      jamp_sv[67] -= amp_sv[0];
-      jamp_sv[68] += amp_sv[0];
-      jamp_sv[69] -= amp_sv[0];
-      jamp_sv[70] += amp_sv[0];
-      jamp_sv[73] += amp_sv[0];
-      jamp_sv[100] += amp_sv[0];
-      jamp_sv[103] -= amp_sv[0];
-      jamp_sv[106] += amp_sv[0];
-      jamp_sv[114] -= amp_sv[0];
-
-      // *** DIAGRAM 568 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 568
-      // (none)
-
-      // Amplitude(s) for diagram number 568
-      VVV1_0( w_fp[1], w_fp[8], w_fp[111], COUPs[0], 1.0, &amp_fp[0] );
-      jamp_sv[15] -= amp_sv[0];
-      jamp_sv[17] += amp_sv[0];
-      jamp_sv[26] += amp_sv[0];
-      jamp_sv[27] -= amp_sv[0];
-      jamp_sv[34] -= amp_sv[0];
-      jamp_sv[35] += amp_sv[0];
-      jamp_sv[36] -= amp_sv[0];
-      jamp_sv[37] += amp_sv[0];
-      jamp_sv[44] += amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[67] += amp_sv[0];
-      jamp_sv[70] -= amp_sv[0];
-      jamp_sv[75] += amp_sv[0];
-      jamp_sv[77] -= amp_sv[0];
-      jamp_sv[109] -= amp_sv[0];
-      jamp_sv[112] += amp_sv[0];
-      VVV1_0( w_fp[1], w_fp[8], w_fp[110], COUPs[0], 1.0, &amp_fp[0] );
-      jamp_sv[17] += amp_sv[0];
-      jamp_sv[27] -= amp_sv[0];
-      jamp_sv[30] += amp_sv[0];
-      jamp_sv[32] -= amp_sv[0];
-      jamp_sv[34] -= amp_sv[0];
-      jamp_sv[35] += amp_sv[0];
-      jamp_sv[37] += amp_sv[0];
-      jamp_sv[43] += amp_sv[0];
-      jamp_sv[46] -= amp_sv[0];
-      jamp_sv[51] -= amp_sv[0];
-      jamp_sv[61] += amp_sv[0];
-      jamp_sv[67] += amp_sv[0];
-      jamp_sv[70] -= amp_sv[0];
-      jamp_sv[77] -= amp_sv[0];
-      jamp_sv[101] -= amp_sv[0];
-      jamp_sv[115] += amp_sv[0];
-      VVV1_0( w_fp[1], w_fp[8], w_fp[109], COUPs[0], 1.0, &amp_fp[0] );
-      jamp_sv[15] += amp_sv[0];
-      jamp_sv[26] -= amp_sv[0];
-      jamp_sv[30] += amp_sv[0];
-      jamp_sv[32] -= amp_sv[0];
-      jamp_sv[36] += amp_sv[0];
amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - - // *** DIAGRAM 569 OF 1240 *** - - // Wavefunction(s) for diagram number 569 - // (none) - - // Amplitude(s) for diagram number 569 - VVVV1_0( w_fp[92], w_fp[1], w_fp[8], w_fp[27], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[12] += amp_sv[0]; - jamp_sv[13] -= amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[69] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - VVVV3_0( w_fp[92], w_fp[1], w_fp[8], w_fp[27], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[12] += amp_sv[0]; - jamp_sv[13] -= amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[44] += amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[69] -= amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[75] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - VVVV4_0( w_fp[92], w_fp[1], w_fp[8], w_fp[27], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[15] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[44] += amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[75] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - - // *** DIAGRAM 570 OF 1240 *** - - // Wavefunction(s) for diagram number 570 - // (none) - - // Amplitude(s) for diagram number 570 - VVV1_0( w_fp[8], w_fp[27], w_fp[86], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 570 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[12] += amp_sv[0]; - jamp_sv[13] -= amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[69] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - - // *** DIAGRAM 571 OF 1240 *** - - // Wavefunction(s) for diagram number 571 - // (none) - - // Amplitude(s) for diagram number 571 - VVV1_0( w_fp[1], w_fp[27], w_fp[101], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 571 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[12] += amp_sv[0]; - jamp_sv[13] -= amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[44] += amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[69] -= amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - 
jamp_sv[73] += amp_sv[0]; - jamp_sv[75] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - - // *** DIAGRAM 572 OF 1240 *** - - // Wavefunction(s) for diagram number 572 - // (none) - - // Amplitude(s) for diagram number 572 - VVV1_0( w_fp[1], w_fp[8], w_fp[62], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 572 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[15] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[44] += amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[75] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - - // *** DIAGRAM 573 OF 1240 *** - - // Wavefunction(s) for diagram number 573 - // (none) - - // Amplitude(s) for diagram number 573 - VVV1_0( w_fp[86], w_fp[37], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 573 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 574 OF 1240 *** - - // Wavefunction(s) for diagram number 574 - // (none) - - // Amplitude(s) for diagram number 574 - FFV1_0( w_fp[3], w_fp[36], w_fp[86], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 574 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[67] += amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - - // *** DIAGRAM 575 OF 1240 *** - - // Wavefunction(s) for diagram number 575 - // (none) - - // Amplitude(s) for diagram number 575 - FFV1_0( w_fp[99], w_fp[100], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 575 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 576 OF 1240 *** - - // Wavefunction(s) for diagram number 576 - // (none) - - // Amplitude(s) for diagram number 576 - FFV1_0( w_fp[99], w_fp[36], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 576 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 577 OF 1240 *** - - // Wavefunction(s) for diagram number 577 - // (none) - - // Amplitude(s) for diagram number 577 - FFV1_0( w_fp[3], w_fp[100], w_fp[104], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 577 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[54] += amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[58] -= 
amp_sv[0]; - jamp_sv[59] += amp_sv[0]; - - // *** DIAGRAM 578 OF 1240 *** - - // Wavefunction(s) for diagram number 578 - // (none) - - // Amplitude(s) for diagram number 578 - VVV1_0( w_fp[104], w_fp[1], w_fp[37], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 578 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 579 OF 1240 *** - - // Wavefunction(s) for diagram number 579 - // (none) - - // Amplitude(s) for diagram number 579 - FFV1_0( w_fp[3], w_fp[33], w_fp[96], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[90], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[88], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 580 OF 1240 *** - - // Wavefunction(s) for diagram number 580 - // (none) - - // Amplitude(s) for diagram number 580 - FFV1_0( w_fp[41], w_fp[33], w_fp[86], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 580 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[50] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - - // *** DIAGRAM 581 OF 1240 *** - - // Wavefunction(s) for diagram number 581 - // (none) - - // Amplitude(s) for diagram number 581 - FFV1_0( w_fp[41], w_fp[98], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 581 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 582 OF 1240 *** - - // Wavefunction(s) for diagram number 582 - // (none) - - // Amplitude(s) for diagram number 582 - FFV1_0( w_fp[113], w_fp[33], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 582 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; - 
jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 583 OF 1240 *** - - // Wavefunction(s) for diagram number 583 - // (none) - - // Amplitude(s) for diagram number 583 - VVV1_0( w_fp[86], w_fp[51], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 583 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 584 OF 1240 *** - - // Wavefunction(s) for diagram number 584 - // (none) - - // Amplitude(s) for diagram number 584 - FFV1_0( w_fp[3], w_fp[49], w_fp[86], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 584 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[109] += amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - - // *** DIAGRAM 585 OF 1240 *** - - // Wavefunction(s) for diagram number 585 - // (none) - - // Amplitude(s) for diagram number 585 - FFV1_0( w_fp[99], w_fp[91], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 585 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 586 OF 1240 *** - - // Wavefunction(s) for diagram number 586 - // (none) - - // Amplitude(s) for diagram number 586 - FFV1_0( w_fp[99], w_fp[49], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 586 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 587 OF 1240 *** - - // Wavefunction(s) for diagram number 587 - // (none) - - // Amplitude(s) for diagram number 587 - FFV1_0( w_fp[3], w_fp[91], w_fp[102], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 587 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[103] += amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - - // *** DIAGRAM 588 OF 1240 *** - - // Wavefunction(s) for diagram number 588 - // (none) - - // Amplitude(s) for diagram number 588 - VVV1_0( w_fp[102], w_fp[1], w_fp[51], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 588 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 589 OF 1240 *** - - // Wavefunction(s) for diagram number 589 - // (none) - - // Amplitude(s) for 
diagram number 589 - FFV1_0( w_fp[3], w_fp[47], w_fp[105], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[95], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[107], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 590 OF 1240 *** - - // Wavefunction(s) for diagram number 590 - // (none) - - // Amplitude(s) for diagram number 590 - FFV1_0( w_fp[46], w_fp[47], w_fp[86], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 590 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[100] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - - // *** DIAGRAM 591 OF 1240 *** - - // Wavefunction(s) for diagram number 591 - // (none) - - // Amplitude(s) for diagram number 591 - FFV1_0( w_fp[46], w_fp[106], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 591 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 592 OF 1240 *** - - // Wavefunction(s) for diagram number 592 - // (none) - - // Amplitude(s) for diagram number 592 - FFV1_0( w_fp[114], w_fp[47], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 592 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 593 OF 1240 *** - - // Wavefunction(s) for diagram number 593 - // (none) - - // Amplitude(s) for diagram number 593 - VVV1_0( w_fp[86], w_fp[54], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 593 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 594 OF 1240 *** - - // Wavefunction(s) for diagram 
number 594 - // (none) - - // Amplitude(s) for diagram number 594 - FFV1_0( w_fp[53], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 594 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[13] += amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[73] -= amp_sv[0]; - - // *** DIAGRAM 595 OF 1240 *** - - // Wavefunction(s) for diagram number 595 - // (none) - - // Amplitude(s) for diagram number 595 - FFV1_0( w_fp[78], w_fp[112], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 595 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 596 OF 1240 *** - - // Wavefunction(s) for diagram number 596 - // (none) - - // Amplitude(s) for diagram number 596 - FFV1_0( w_fp[53], w_fp[112], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 596 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 597 OF 1240 *** - - // Wavefunction(s) for diagram number 597 - // (none) - - // Amplitude(s) for diagram number 597 - FFV1_0( w_fp[78], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 597 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[16] += amp_sv[0]; - jamp_sv[76] -= amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[114] += amp_sv[0]; - - // *** DIAGRAM 598 OF 1240 *** - - // Wavefunction(s) for diagram number 598 - // (none) - - // Amplitude(s) for diagram number 598 - VVV1_0( w_fp[104], w_fp[1], w_fp[54], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 598 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 599 OF 1240 *** - - // Wavefunction(s) for diagram number 599 - // (none) - - // Amplitude(s) for diagram number 599 - FFV1_0( w_fp[46], w_fp[2], w_fp[96], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[90], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] += cxtype( 0, 1 ) * 
amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[88], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 600 OF 1240 *** - - // Wavefunction(s) for diagram number 600 - // (none) - - // Amplitude(s) for diagram number 600 - VVV1_0( w_fp[86], w_fp[20], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 600 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 601 OF 1240 *** - - // Wavefunction(s) for diagram number 601 - // (none) - - // Amplitude(s) for diagram number 601 - FFV1_0( w_fp[28], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 601 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[12] += amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - - // *** DIAGRAM 602 OF 1240 *** - - // Wavefunction(s) for diagram number 602 - // (none) - - // Amplitude(s) for diagram number 602 - FFV1_0( w_fp[60], w_fp[112], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 602 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 603 OF 1240 *** - - // Wavefunction(s) for diagram number 603 - // (none) - - // Amplitude(s) for diagram number 603 - FFV1_0( w_fp[28], w_fp[112], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 603 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 604 OF 1240 *** - - // Wavefunction(s) for diagram number 604 - // (none) - - // Amplitude(s) for diagram number 604 - FFV1_0( w_fp[60], w_fp[2], w_fp[102], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 604 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[14] += amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[60] += amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - - // *** DIAGRAM 605 OF 1240 *** - - // Wavefunction(s) for diagram number 605 - // (none) - - // Amplitude(s) for diagram number 605 - VVV1_0( w_fp[102], w_fp[1], w_fp[20], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 605 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] -= 
cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 606 OF 1240 *** - - // Wavefunction(s) for diagram number 606 - // (none) - - // Amplitude(s) for diagram number 606 - FFV1_0( w_fp[41], w_fp[2], w_fp[105], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[95], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[107], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 607 OF 1240 *** - - // Wavefunction(s) for diagram number 607 - // (none) - - // Amplitude(s) for diagram number 607 - FFV1_0( w_fp[3], w_fp[15], w_fp[86], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 607 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 608 OF 1240 *** - - // Wavefunction(s) for diagram number 608 - // (none) - - // Amplitude(s) for diagram number 608 - FFV1_0( w_fp[14], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 608 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 609 OF 1240 *** - - // Wavefunction(s) for diagram number 609 - // (none) - - // Amplitude(s) for diagram number 609 - FFV1_0( w_fp[3], w_fp[112], w_fp[68], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 609 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; 
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 610 OF 1240 *** - - // Wavefunction(s) for diagram number 610 - // (none) - - // Amplitude(s) for diagram number 610 - FFV1_0( w_fp[14], w_fp[112], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 610 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[12] += amp_sv[0]; - jamp_sv[13] -= amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - - // *** DIAGRAM 611 OF 1240 *** - - // Wavefunction(s) for diagram number 611 - // (none) - - // Amplitude(s) for diagram number 611 - FFV1_0( w_fp[99], w_fp[2], w_fp[68], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 611 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 612 OF 1240 *** - - // Wavefunction(s) for diagram number 612 - // (none) - - // Amplitude(s) for diagram number 612 - FFV1_0( w_fp[99], w_fp[15], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 612 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[68] += amp_sv[0]; - jamp_sv[69] -= amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - - // *** DIAGRAM 613 OF 1240 *** - - // Wavefunction(s) for diagram number 613 - // (none) - - // Amplitude(s) for diagram number 613 - FFV1_0( w_fp[3], w_fp[112], w_fp[57], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[112], w_fp[81], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[112], w_fp[82], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 614 OF 1240 *** - - // Wavefunction(s) for diagram number 614 - // (none) - - // Amplitude(s) for 
diagram number 614 - FFV1_0( w_fp[99], w_fp[2], w_fp[57], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[99], w_fp[2], w_fp[81], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[99], w_fp[2], w_fp[82], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 615 OF 1240 *** - - // Wavefunction(s) for diagram number 615 - // (none) - - // Amplitude(s) for diagram number 615 - VVV1_0( w_fp[92], w_fp[57], w_fp[8], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[12] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[73] -= amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - VVV1_0( w_fp[92], w_fp[81], w_fp[8], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[13] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[73] -= amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - VVV1_0( w_fp[92], w_fp[82], w_fp[8], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[12] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - - // *** DIAGRAM 616 OF 1240 *** - - // Wavefunction(s) for diagram number 616 - VVV1P0_1( w_fp[0], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[92] ); - FFV1_2( w_fp[3], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] ); - - // Amplitude(s) for diagram number 616 - FFV1_0( w_fp[99], w_fp[87], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 616 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[33] -= cxtype( 
0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 617 OF 1240 *** - - // Wavefunction(s) for diagram number 617 - // (none) - - // Amplitude(s) for diagram number 617 - FFV1_0( w_fp[99], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 617 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 618 OF 1240 *** - - // Wavefunction(s) for diagram number 618 - VVV1P0_1( w_fp[92], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[112] ); - - // Amplitude(s) for diagram number 618 - VVV1_0( w_fp[112], w_fp[34], w_fp[5], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 618 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 619 OF 1240 *** - - // Wavefunction(s) for diagram number 619 - // (none) - - // Amplitude(s) for diagram number 619 - FFV1_0( w_fp[3], w_fp[9], w_fp[112], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 619 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[37] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - - // *** DIAGRAM 620 OF 1240 *** - - // Wavefunction(s) for diagram number 620 - VVV1P0_1( w_fp[92], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[86] ); - - // Amplitude(s) for diagram number 620 - VVV1_0( w_fp[86], w_fp[34], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 620 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 621 OF 1240 *** - - // Wavefunction(s) for diagram number 621 - // (none) - - // Amplitude(s) for diagram number 621 - FFV1_0( w_fp[3], w_fp[87], w_fp[86], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 621 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[31] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - - // *** DIAGRAM 622 OF 1240 *** - - // Wavefunction(s) for diagram number 622 - VVVV1P0_1( w_fp[92], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[107] ); - VVVV3P0_1( w_fp[92], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[95] ); - VVVV4P0_1( w_fp[92], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[105] ); - - // Amplitude(s) for diagram number 622 - FFV1_0( w_fp[3], w_fp[77], w_fp[107], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[32] += 
cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[77], w_fp[95], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[77], w_fp[105], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 623 OF 1240 *** - - // Wavefunction(s) for diagram number 623 - FFV1_1( w_fp[77], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[102] ); - - // Amplitude(s) for diagram number 623 - FFV1_0( w_fp[46], w_fp[102], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 623 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 624 OF 1240 *** - - // Wavefunction(s) for diagram number 624 - FFV1_2( w_fp[46], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[88] ); - - // Amplitude(s) for diagram number 624 - FFV1_0( w_fp[88], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 624 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 625 OF 1240 *** - - // Wavefunction(s) for diagram number 625 - // (none) - - // Amplitude(s) for diagram number 625 - FFV1_0( w_fp[46], w_fp[77], w_fp[86], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 625 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[29] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - - // *** DIAGRAM 626 OF 1240 *** - - // Wavefunction(s) for diagram number 626 - // (none) - - // Amplitude(s) for diagram number 626 - FFV1_0( w_fp[38], w_fp[102], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 626 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 627 OF 1240 *** - - // Wavefunction(s) for diagram number 627 - FFV1_2( w_fp[38], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[90] ); - - // Amplitude(s) for diagram number 627 - FFV1_0( w_fp[90], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 627 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - 
jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 628 OF 1240 *** - - // Wavefunction(s) for diagram number 628 - // (none) - - // Amplitude(s) for diagram number 628 - FFV1_0( w_fp[38], w_fp[77], w_fp[112], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 628 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[28] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - - // *** DIAGRAM 629 OF 1240 *** - - // Wavefunction(s) for diagram number 629 - // (none) - - // Amplitude(s) for diagram number 629 - FFV1_0( w_fp[3], w_fp[102], w_fp[24], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 629 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[28] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - - // *** DIAGRAM 630 OF 1240 *** - - // Wavefunction(s) for diagram number 630 - // (none) - - // Amplitude(s) for diagram number 630 - FFV1_0( w_fp[99], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 630 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[32] += amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - - // *** DIAGRAM 631 OF 1240 *** - - // Wavefunction(s) for diagram number 631 - VVV1P0_1( w_fp[92], w_fp[24], COUPs[0], 1.0, 0., 0., w_fp[102] ); - - // Amplitude(s) for diagram number 631 - FFV1_0( w_fp[3], w_fp[77], w_fp[102], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 631 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 632 OF 1240 *** - - // Wavefunction(s) for diagram number 632 - FFV1_1( w_fp[2], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[96] ); - - // Amplitude(s) for diagram number 632 - FFV1_0( w_fp[56], w_fp[96], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 632 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 633 OF 1240 *** - - // Wavefunction(s) for diagram number 633 - // (none) - - // Amplitude(s) for diagram number 633 - FFV1_0( w_fp[22], w_fp[96], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 633 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 634 OF 1240 *** - - // Wavefunction(s) for diagram number 634 - // (none) - - // Amplitude(s) for diagram number 634 - VVV1_0( w_fp[112], w_fp[103], w_fp[5], COUPs[0], 1.0, &_fp[0] ); -#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 634 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 635 OF 1240 *** - - // Wavefunction(s) for diagram number 635 - // (none) - - // Amplitude(s) for diagram number 635 - FFV1_0( w_fp[22], w_fp[2], w_fp[112], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 635 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[21] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - - // *** DIAGRAM 636 OF 1240 *** - - // Wavefunction(s) for diagram number 636 - // (none) - - // Amplitude(s) for diagram number 636 - VVV1_0( w_fp[86], w_fp[103], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 636 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 637 OF 1240 *** - - // Wavefunction(s) for diagram number 637 - // (none) - - // Amplitude(s) for diagram number 637 - FFV1_0( w_fp[56], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 637 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[23] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - - // *** DIAGRAM 638 OF 1240 *** - - // Wavefunction(s) for diagram number 638 - // (none) - - // Amplitude(s) for diagram number 638 - FFV1_0( w_fp[52], w_fp[2], w_fp[107], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[95], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[105], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] -= cxtype( 0, 1 ) * 
amp_sv[0]; - jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 639 OF 1240 *** - - // Wavefunction(s) for diagram number 639 - FFV1_2( w_fp[52], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[104] ); - - // Amplitude(s) for diagram number 639 - FFV1_0( w_fp[104], w_fp[33], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 639 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 640 OF 1240 *** - - // Wavefunction(s) for diagram number 640 - FFV1_1( w_fp[33], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[114] ); - - // Amplitude(s) for diagram number 640 - FFV1_0( w_fp[52], w_fp[114], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 640 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 641 OF 1240 *** - - // Wavefunction(s) for diagram number 641 - // (none) - - // Amplitude(s) for diagram number 641 - FFV1_0( w_fp[52], w_fp[33], w_fp[86], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 641 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[53] += amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - - // *** DIAGRAM 642 OF 1240 *** - - // Wavefunction(s) for diagram number 642 - // (none) - - // Amplitude(s) for diagram number 642 - FFV1_0( w_fp[104], w_fp[39], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 642 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 643 OF 1240 *** - - // Wavefunction(s) for diagram number 643 - FFV1_1( w_fp[39], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[106] ); - - // Amplitude(s) for diagram number 643 - FFV1_0( w_fp[52], w_fp[106], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 643 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 644 OF 1240 *** - - // Wavefunction(s) for diagram number 644 - // (none) - - // Amplitude(s) for diagram number 644 - FFV1_0( w_fp[52], w_fp[39], w_fp[112], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 644 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[77] += amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - - // *** DIAGRAM 645 OF 1240 *** - - // Wavefunction(s) for diagram number 645 - // (none) - - // Amplitude(s) for diagram number 645 - FFV1_0( w_fp[104], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 645 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[61] += amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - 
jamp_sv[85] -= amp_sv[0];
- jamp_sv[88] += amp_sv[0];
-
- // *** DIAGRAM 646 OF 1240 ***
-
- // Wavefunction(s) for diagram number 646
- // (none)
-
- // Amplitude(s) for diagram number 646
- FFV1_0( w_fp[52], w_fp[96], w_fp[24], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 646 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[21] += amp_sv[0];
- jamp_sv[23] -= amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[101] += amp_sv[0];
-
- // *** DIAGRAM 647 OF 1240 ***
-
- // Wavefunction(s) for diagram number 647
- // (none)
-
- // Amplitude(s) for diagram number 647
- FFV1_0( w_fp[52], w_fp[2], w_fp[102], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 647 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 648 OF 1240 ***
-
- // Wavefunction(s) for diagram number 648
- // (none)
-
- // Amplitude(s) for diagram number 648
- FFV1_0( w_fp[65], w_fp[96], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 648 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[22] += amp_sv[0];
- jamp_sv[23] -= amp_sv[0];
- jamp_sv[100] -= amp_sv[0];
- jamp_sv[101] += amp_sv[0];
-
- // *** DIAGRAM 649 OF 1240 ***
-
- // Wavefunction(s) for diagram number 649
- // (none)
-
- // Amplitude(s) for diagram number 649
- FFV1_0( w_fp[3], w_fp[96], w_fp[63], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 649 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 650 OF 1240 ***
-
- // Wavefunction(s) for diagram number 650
- // (none)
-
- // Amplitude(s) for diagram number 650
- FFV1_0( w_fp[99], w_fp[93], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 650 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[32] += amp_sv[0];
- jamp_sv[33] -= amp_sv[0];
- jamp_sv[56] -= amp_sv[0];
- jamp_sv[57] += amp_sv[0];
-
- // *** DIAGRAM 651 OF 1240 ***
-
- // Wavefunction(s) for diagram number 651
- // (none)
-
- // Amplitude(s) for diagram number 651
- FFV1_0( w_fp[99], w_fp[2], w_fp[63], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 651 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 652 OF 1240 ***
-
- // Wavefunction(s) for diagram number 652
- // (none)
-
- // Amplitude(s) for diagram number 652
- FFV1_0( w_fp[3], w_fp[93], w_fp[86], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 652 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 653 OF 1240 ***
-
- // Wavefunction(s) for diagram number 653
- // (none)
-
- // Amplitude(s) for diagram number 653
- FFV1_0( w_fp[65], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 653 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 654 OF 1240 ***
-
- // Wavefunction(s) for diagram number 654
- // (none)
-
- // Amplitude(s) for diagram number 654
- VVVV1_0( w_fp[92], w_fp[61], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] );
- jamp_sv[18] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[34] += amp_sv[0];
- jamp_sv[55] += amp_sv[0];
- jamp_sv[58] -= amp_sv[0];
- jamp_sv[76] -= amp_sv[0];
- jamp_sv[77] += amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[86] -= amp_sv[0];
- jamp_sv[87] += amp_sv[0];
- jamp_sv[90] += amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[96] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- VVVV3_0( w_fp[92], w_fp[61], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] );
- jamp_sv[18] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[33] += amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[86] -= amp_sv[0];
- jamp_sv[87] += amp_sv[0];
- jamp_sv[96] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
- VVVV4_0( w_fp[92], w_fp[61], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] );
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[33] += amp_sv[0];
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[77] -= amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
-
- // *** DIAGRAM 655 OF 1240 ***
-
- // Wavefunction(s) for diagram number 655
- VVV1P0_1( w_fp[92], w_fp[61], COUPs[0], 1.0, 0., 0., w_fp[104] );
-
- // Amplitude(s) for diagram number 655
- VVV1_0( w_fp[8], w_fp[5], w_fp[104], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 655 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[18] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[34] += amp_sv[0];
- jamp_sv[55] += amp_sv[0];
- jamp_sv[58] -= amp_sv[0];
- jamp_sv[76] -= amp_sv[0];
- jamp_sv[77] += amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[86] -= amp_sv[0];
- jamp_sv[87] += amp_sv[0];
- jamp_sv[90] += amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[96] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
-
- // *** DIAGRAM 656 OF 1240 ***
-
- // Wavefunction(s) for diagram number 656
- VVV1P0_1( w_fp[92], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[113] );
-
- // Amplitude(s) for diagram number 656
- VVV1_0( w_fp[61], w_fp[5], w_fp[113], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 656 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[18] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[33] += amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[86] -= amp_sv[0];
- jamp_sv[87] += amp_sv[0];
- jamp_sv[96] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
-
- // *** DIAGRAM 657 OF 1240 ***
-
- // Wavefunction(s) for diagram number 657
- // (none)
-
- // Amplitude(s) for diagram number 657
- VVV1_0( w_fp[61], w_fp[8], w_fp[86], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 657 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[33] += amp_sv[0];
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[77] -= amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
-
- // *** DIAGRAM 658 OF 1240 ***
-
- // Wavefunction(s) for diagram number 658
- // (none)
-
- // Amplitude(s) for diagram number 658
- FFV1_0( w_fp[3], w_fp[39], w_fp[104], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 658 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 659 OF 1240 ***
-
- // Wavefunction(s) for diagram number 659
- // (none)
-
- // Amplitude(s) for diagram number 659
- FFV1_0( w_fp[3], w_fp[106], w_fp[61], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 659 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[76] += amp_sv[0];
- jamp_sv[77] -= amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
-
- // *** DIAGRAM 660 OF 1240 ***
-
- // Wavefunction(s) for diagram number 660
- // (none)
-
- // Amplitude(s) for diagram number 660
- FFV1_0( w_fp[99], w_fp[39], w_fp[61], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 660 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[80] += amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[86] -= amp_sv[0];
- jamp_sv[87] += amp_sv[0];
-
- // *** DIAGRAM 661 OF 1240 ***
-
- // Wavefunction(s) for diagram number 661
- // (none)
-
- // Amplitude(s) for diagram number 661
- FFV1_0( w_fp[38], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 661 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 662 OF 1240 ***
-
- // Wavefunction(s) for diagram number 662
- // (none)
-
- // Amplitude(s) for diagram number 662
- FFV1_0( w_fp[38], w_fp[96], w_fp[61], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 662 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[18] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[96] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
-
- // *** DIAGRAM 663 OF 1240 ***
-
- // Wavefunction(s) for diagram number 663
- // (none)
-
- // Amplitude(s) for diagram number 663
- FFV1_0( w_fp[90], w_fp[2], w_fp[61], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 663 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[31] += amp_sv[0];
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[58] += amp_sv[0];
-
- // *** DIAGRAM 664 OF 1240 ***
-
- // Wavefunction(s) for diagram number 664
- // (none)
-
- // Amplitude(s) for diagram number 664
- FFV1_0( w_fp[71], w_fp[96], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 664 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[20] += amp_sv[0];
- jamp_sv[21] -= amp_sv[0];
- jamp_sv[98] -= amp_sv[0];
- jamp_sv[99] += amp_sv[0];
-
- // *** DIAGRAM 665 OF 1240 ***
-
- // Wavefunction(s) for diagram number 665
- // (none)
-
- // Amplitude(s) for diagram number 665
- FFV1_0( w_fp[3], w_fp[96], w_fp[69], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 665 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 666 OF 1240 ***
-
- // Wavefunction(s) for diagram number 666
- // (none)
-
- // Amplitude(s) for diagram number 666
- FFV1_0( w_fp[99], w_fp[94], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 666 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[38] += amp_sv[0];
- jamp_sv[39] -= amp_sv[0];
- jamp_sv[80] -= amp_sv[0];
- jamp_sv[81] += amp_sv[0];
-
- // *** DIAGRAM 667 OF 1240 ***
-
- // Wavefunction(s) for diagram number 667
- // (none)
-
- // Amplitude(s) for diagram number 667
- FFV1_0( w_fp[99], w_fp[2], w_fp[69], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 667 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 668 OF 1240 ***
-
- // Wavefunction(s) for diagram number 668
- // (none)
-
- // Amplitude(s) for diagram number 668
- FFV1_0( w_fp[3], w_fp[94], w_fp[112], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 668 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 669 OF 1240 ***
-
- // Wavefunction(s) for diagram number 669
- // (none)
-
- // Amplitude(s) for diagram number 669
- FFV1_0( w_fp[71], w_fp[2], w_fp[112], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 669 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 670 OF 1240 ***
-
- // Wavefunction(s) for diagram number 670
- // (none)
-
- // Amplitude(s) for diagram number 670
- VVVV1_0( w_fp[92], w_fp[66], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] );
- jamp_sv[19] += amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[37] -= amp_sv[0];
- jamp_sv[40] += amp_sv[0];
- jamp_sv[52] -= amp_sv[0];
- jamp_sv[53] += amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[62] -= amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[66] += amp_sv[0];
- jamp_sv[67] -= amp_sv[0];
- jamp_sv[79] += amp_sv[0];
- jamp_sv[82] -= amp_sv[0];
- jamp_sv[97] -= amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- VVVV3_0( w_fp[92], w_fp[66], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] );
- jamp_sv[19] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[21] += amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[39] += amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[62] -= amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[97] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- VVVV4_0( w_fp[92], w_fp[66], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] );
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[21] += amp_sv[0];
- jamp_sv[37] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[39] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[67] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
-
- // *** DIAGRAM 671 OF 1240 ***
-
- // Wavefunction(s) for diagram number 671
- VVV1P0_1( w_fp[92], w_fp[66], COUPs[0], 1.0, 0., 0., w_fp[104] );
-
- // Amplitude(s) for diagram number 671
- VVV1_0( w_fp[8], w_fp[4], w_fp[104], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 671 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[19] += amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[37] -= amp_sv[0];
- jamp_sv[40] += amp_sv[0];
- jamp_sv[52] -= amp_sv[0];
- jamp_sv[53] += amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[62] -= amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[66] += amp_sv[0];
- jamp_sv[67] -= amp_sv[0];
- jamp_sv[79] += amp_sv[0];
- jamp_sv[82] -= amp_sv[0];
- jamp_sv[97] -= amp_sv[0];
- jamp_sv[100] += amp_sv[0];
-
- // *** DIAGRAM 672 OF 1240 ***
-
- // Wavefunction(s) for diagram number 672
- // (none)
-
- // Amplitude(s) for diagram number 672
- VVV1_0( w_fp[66], w_fp[4], w_fp[113], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 672 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[19] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[21] += amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[39] += amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[62] -= amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[97] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[100] += amp_sv[0];
-
- // *** DIAGRAM 673 OF 1240 ***
-
- // Wavefunction(s) for diagram number 673
- // (none)
-
- // Amplitude(s) for diagram number 673
- VVV1_0( w_fp[66], w_fp[8], w_fp[112], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 673 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[21] += amp_sv[0];
- jamp_sv[37] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[39] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[67] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
-
- // *** DIAGRAM 674 OF 1240 ***
-
- // Wavefunction(s) for diagram number 674
- // (none)
-
- // Amplitude(s) for diagram number 674
- FFV1_0( w_fp[3], w_fp[33], w_fp[104], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 674 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 675 OF 1240 ***
-
- // Wavefunction(s) for diagram number 675
- // (none)
-
- // Amplitude(s) for diagram number 675
- FFV1_0( w_fp[3], w_fp[114], w_fp[66], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 675 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[52] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[67] += amp_sv[0];
-
- // *** DIAGRAM 676 OF 1240 ***
-
- // Wavefunction(s) for diagram number 676
- // (none)
-
- // Amplitude(s) for diagram number 676
- FFV1_0( w_fp[99], w_fp[33], w_fp[66], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 676 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[56] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[62] -= amp_sv[0];
- jamp_sv[63] += amp_sv[0];
-
- // *** DIAGRAM 677 OF 1240 ***
-
- // Wavefunction(s) for diagram number 677
- // (none)
-
- // Amplitude(s) for diagram number 677
- FFV1_0( w_fp[46], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 677 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 678 OF 1240 ***
-
- // Wavefunction(s) for diagram number 678
- // (none)
-
- // Amplitude(s) for diagram number 678
- FFV1_0( w_fp[46], w_fp[96], w_fp[66], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 678 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[19] += amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[97] -= amp_sv[0];
- jamp_sv[100] += amp_sv[0];
-
- // *** DIAGRAM 679 OF 1240 ***
-
- // Wavefunction(s) for diagram number 679
- // (none)
-
- // Amplitude(s) for diagram number 679
- FFV1_0( w_fp[88], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 679 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[37] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[82] += amp_sv[0];
-
- // *** DIAGRAM 680 OF 1240 ***
-
- // Wavefunction(s) for diagram number 680
- VVV1P0_1( w_fp[92], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[104] );
-
- // Amplitude(s) for diagram number 680
- VVV1_0( w_fp[104], w_fp[13], w_fp[5], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 680 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[19] -= amp_sv[0];
- jamp_sv[29] += amp_sv[0];
- jamp_sv[43] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[61] -= amp_sv[0];
- jamp_sv[62] += amp_sv[0];
- jamp_sv[63] -= amp_sv[0];
- jamp_sv[64] += amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[97] += amp_sv[0];
-
- // *** DIAGRAM 681 OF 1240 ***
-
- // Wavefunction(s) for diagram number 681
- // (none)
-
- // Amplitude(s) for diagram number 681
- VVV1_0( w_fp[104], w_fp[10], w_fp[4], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 681 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[28] += amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[86] += amp_sv[0];
- jamp_sv[87] -= amp_sv[0];
- jamp_sv[88] += amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[96] += amp_sv[0];
-
- // *** DIAGRAM 682 OF 1240 ***
-
- // Wavefunction(s) for diagram number 682
- // (none)
-
- // Amplitude(s) for diagram number 682
- VVVV1_0( w_fp[8], w_fp[4], w_fp[5], w_fp[104], COUPs[2], 1.0, &_fp[0] );
- jamp_sv[19] -= amp_sv[0];
- jamp_sv[29] += amp_sv[0];
- jamp_sv[43] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[61] -= amp_sv[0];
- jamp_sv[62] += amp_sv[0];
- jamp_sv[63] -= amp_sv[0];
- jamp_sv[64] += amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[97] += amp_sv[0];
- VVVV3_0( w_fp[8], w_fp[4], w_fp[5], w_fp[104], COUPs[2], 1.0, &_fp[0] );
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[28] += amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[86] += amp_sv[0];
- jamp_sv[87] -= amp_sv[0];
- jamp_sv[88] += amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[96] += amp_sv[0];
- VVVV4_0( w_fp[8], w_fp[4], w_fp[5], w_fp[104], COUPs[2], 1.0, &_fp[0] );
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[19] += amp_sv[0];
- jamp_sv[28] += amp_sv[0];
- jamp_sv[29] -= amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[43] += amp_sv[0];
- jamp_sv[61] += amp_sv[0];
- jamp_sv[62] -= amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[64] -= amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[86] += amp_sv[0];
- jamp_sv[87] -= amp_sv[0];
- jamp_sv[88] += amp_sv[0];
- jamp_sv[96] += amp_sv[0];
- jamp_sv[97] -= amp_sv[0];
-
- // *** DIAGRAM 683 OF 1240 ***
-
- // Wavefunction(s) for diagram number 683
- // (none)
-
- // Amplitude(s) for diagram number 683
- VVV1_0( w_fp[112], w_fp[108], w_fp[5], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 683 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[21] += amp_sv[0];
- jamp_sv[28] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[37] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[39] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[67] += amp_sv[0];
- jamp_sv[77] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[88] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
-
- // *** DIAGRAM 684 OF 1240 ***
-
- // Wavefunction(s) for diagram number 684
- // (none)
-
- // Amplitude(s) for diagram number 684
- VVV1_0( w_fp[112], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 684 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[20] += amp_sv[0];
- jamp_sv[28] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[52] -= amp_sv[0];
- jamp_sv[66] += amp_sv[0];
- jamp_sv[77] -= amp_sv[0];
- jamp_sv[79] += amp_sv[0];
- jamp_sv[80] -= amp_sv[0];
- jamp_sv[81] += amp_sv[0];
- jamp_sv[82] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[88] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[98] -= amp_sv[0];
-
- // *** DIAGRAM 685 OF 1240 ***
-
- // Wavefunction(s) for diagram number 685
- // (none)
-
- // Amplitude(s) for diagram number 685
- VVVV1_0( w_fp[1], w_fp[8], w_fp[5], w_fp[112], COUPs[2], 1.0, &_fp[0] );
- jamp_sv[21] += amp_sv[0];
- jamp_sv[28] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[37] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[39] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[67] += amp_sv[0];
- jamp_sv[77] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[88] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- VVVV3_0( w_fp[1], w_fp[8], w_fp[5], w_fp[112], COUPs[2], 1.0, &_fp[0] );
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[21] += amp_sv[0];
- jamp_sv[37] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[39] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[67] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- VVVV4_0( w_fp[1], w_fp[8], w_fp[5], w_fp[112], COUPs[2], 1.0, &_fp[0] );
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[28] += amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[34] += amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[77] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[88] += amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
-
- // *** DIAGRAM 686 OF 1240 ***
-
- // Wavefunction(s) for diagram number 686
- // (none)
-
- // Amplitude(s) for diagram number 686
- VVV1_0( w_fp[86], w_fp[108], w_fp[4], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 686 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[23] += amp_sv[0];
- jamp_sv[29] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[33] += amp_sv[0];
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[37] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[43] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[61] += amp_sv[0];
- jamp_sv[64] -= amp_sv[0];
- jamp_sv[67] += amp_sv[0];
- jamp_sv[77] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
-
- // *** DIAGRAM 687 OF 1240 ***
-
- // Wavefunction(s) for diagram number 687
- // (none)
-
- // Amplitude(s) for diagram number 687
- VVV1_0( w_fp[86], w_fp[1], w_fp[13], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 687 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[22] += amp_sv[0];
- jamp_sv[29] -= amp_sv[0];
- jamp_sv[37] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[43] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[55] += amp_sv[0];
- jamp_sv[56] -= amp_sv[0];
- jamp_sv[57] += amp_sv[0];
- jamp_sv[58] -= amp_sv[0];
- jamp_sv[61] += amp_sv[0];
- jamp_sv[64] -= amp_sv[0];
- jamp_sv[67] += amp_sv[0];
- jamp_sv[76] -= amp_sv[0];
- jamp_sv[90] += amp_sv[0];
- jamp_sv[100] -= amp_sv[0];
-
- // *** DIAGRAM 688 OF 1240 ***
-
- // Wavefunction(s) for diagram number 688
- // (none)
-
- // Amplitude(s) for diagram number 688
- VVVV1_0( w_fp[1], w_fp[8], w_fp[4], w_fp[86], COUPs[2], 1.0, &_fp[0] );
- jamp_sv[23] += amp_sv[0];
- jamp_sv[29] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[33] += amp_sv[0];
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[37] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[43] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[61] += amp_sv[0];
- jamp_sv[64] -= amp_sv[0];
- jamp_sv[67] += amp_sv[0];
- jamp_sv[77] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
- VVVV3_0( w_fp[1], w_fp[8], w_fp[4], w_fp[86], COUPs[2], 1.0, &_fp[0] );
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[33] += amp_sv[0];
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[77] -= amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
- VVVV4_0( w_fp[1], w_fp[8], w_fp[4], w_fp[86], COUPs[2], 1.0, &_fp[0] );
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[29] += amp_sv[0];
- jamp_sv[37] -= amp_sv[0];
- jamp_sv[40] += amp_sv[0];
- jamp_sv[43] -= amp_sv[0];
- jamp_sv[53] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[61] -= amp_sv[0];
- jamp_sv[64] += amp_sv[0];
- jamp_sv[67] -= amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[100] += amp_sv[0];
-
- // *** DIAGRAM 689 OF 1240 ***
-
- // Wavefunction(s) for diagram number 689
- VVVV1P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[98] );
- VVVV3P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[62] );
- VVVV4P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[101] );
-
- // Amplitude(s) for diagram number 689
- VVV1_0( w_fp[8], w_fp[5], w_fp[98], COUPs[0], 1.0, &_fp[0] );
- jamp_sv[18] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[34] += amp_sv[0];
- jamp_sv[55] += amp_sv[0];
- jamp_sv[58] -= amp_sv[0];
- jamp_sv[76] -= amp_sv[0];
- jamp_sv[77] += amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[86] -= amp_sv[0];
- jamp_sv[87] += amp_sv[0];
- jamp_sv[90] += amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[96] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- VVV1_0( w_fp[8], w_fp[5], w_fp[62], COUPs[0], 1.0, &_fp[0] );
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[28] += amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[34] += amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[77] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[88] += amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- VVV1_0( w_fp[8], w_fp[5], w_fp[101], COUPs[0], 1.0, &_fp[0] );
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[28] += amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[86] += amp_sv[0];
- jamp_sv[87] -= amp_sv[0];
- jamp_sv[88] += amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[96] += amp_sv[0];
-
- // *** DIAGRAM 690 OF 1240 ***
-
- // Wavefunction(s) for diagram number 690
- VVVV1P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[109] );
- VVVV3P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[110] );
- VVVV4P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[111] );
-
- // Amplitude(s) for diagram number 690
- VVV1_0( w_fp[8], w_fp[4], w_fp[109], COUPs[0], 1.0, &_fp[0] );
- jamp_sv[19] += amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[37] -= amp_sv[0];
- jamp_sv[40] += amp_sv[0];
- jamp_sv[52] -= amp_sv[0];
- jamp_sv[53] += amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[62] -= amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[66] += amp_sv[0];
- jamp_sv[67] -= amp_sv[0];
- jamp_sv[79] += amp_sv[0];
- jamp_sv[82] -= amp_sv[0];
- jamp_sv[97] -= amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- VVV1_0( w_fp[8], w_fp[4], w_fp[110], COUPs[0], 1.0, &_fp[0] );
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[29] += amp_sv[0];
- jamp_sv[37] -= amp_sv[0];
- jamp_sv[40] += amp_sv[0];
- jamp_sv[43] -= amp_sv[0];
- jamp_sv[53] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[61] -= amp_sv[0];
- jamp_sv[64] += amp_sv[0];
- jamp_sv[67] -= amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- VVV1_0( w_fp[8], w_fp[4], w_fp[111], COUPs[0], 1.0, &_fp[0] );
- jamp_sv[19] -= amp_sv[0];
- jamp_sv[29] += amp_sv[0];
- jamp_sv[43] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[61] -= amp_sv[0];
- jamp_sv[62] += amp_sv[0];
- jamp_sv[63] -= amp_sv[0];
- jamp_sv[64] += amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[97] += amp_sv[0];
-
- // *** DIAGRAM 691 OF 1240 ***
-
- // Wavefunction(s) for diagram number 691
- // (none)
-
- // Amplitude(s) for diagram number 691
- VVV1_0( w_fp[1], w_fp[8], w_fp[107], COUPs[0], 1.0, &_fp[0] );
- jamp_sv[21] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
- jamp_sv[28] += amp_sv[0];
- jamp_sv[29] -= amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[33] += amp_sv[0];
- jamp_sv[38] += amp_sv[0];
- jamp_sv[39] -= amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[43] += amp_sv[0];
- jamp_sv[61] += amp_sv[0];
- jamp_sv[64] -= amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[88] += amp_sv[0];
- jamp_sv[99] += amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
- VVV1_0( w_fp[1], w_fp[8], w_fp[95], COUPs[0], 1.0, &_fp[0] );
- jamp_sv[23] += amp_sv[0];
- jamp_sv[29] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[33] += amp_sv[0];
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[37] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[43] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[61] += amp_sv[0];
- jamp_sv[64] -= amp_sv[0];
- jamp_sv[67] += amp_sv[0];
- jamp_sv[77] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
- VVV1_0( w_fp[1], w_fp[8], w_fp[105], COUPs[0], 1.0, &_fp[0] );
- jamp_sv[21] += amp_sv[0];
- jamp_sv[28] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[37] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[39] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[67] += amp_sv[0];
- jamp_sv[77] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[88] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
-
- // *** DIAGRAM 692 OF 1240 ***
-
- // Wavefunction(s) for diagram number 692
- // (none)
-
- // Amplitude(s) for diagram number 692
- VVVV1_0( w_fp[92], w_fp[1], w_fp[8], w_fp[24], COUPs[2], 1.0, &_fp[0] );
- jamp_sv[18] += amp_sv[0];
- jamp_sv[19] -= amp_sv[0];
- jamp_sv[28] -= amp_sv[0];
- jamp_sv[29] += amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[43] -= amp_sv[0];
- jamp_sv[61] -= amp_sv[0];
- jamp_sv[62] += amp_sv[0];
- jamp_sv[63] -= amp_sv[0];
- jamp_sv[64] += amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[86] -= amp_sv[0];
- jamp_sv[87] += amp_sv[0];
- jamp_sv[88] -= amp_sv[0];
- jamp_sv[96] -= amp_sv[0];
- jamp_sv[97] += amp_sv[0];
- VVVV3_0( w_fp[92], w_fp[1], w_fp[8], w_fp[24], COUPs[2], 1.0, &_fp[0] );
- jamp_sv[18] += amp_sv[0];
- jamp_sv[19] -= amp_sv[0];
- jamp_sv[21] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[33] += amp_sv[0];
- jamp_sv[38] += amp_sv[0];
- jamp_sv[39] -= amp_sv[0];
- jamp_sv[62] += amp_sv[0];
- jamp_sv[63] -= amp_sv[0];
- jamp_sv[86] -= amp_sv[0];
- jamp_sv[87] += amp_sv[0];
- jamp_sv[96] -= amp_sv[0];
- jamp_sv[97] += amp_sv[0];
- jamp_sv[99] += amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
- VVVV4_0( w_fp[92], w_fp[1], w_fp[8], w_fp[24], COUPs[2], 1.0, &_fp[0] );
- jamp_sv[21] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
- jamp_sv[28] += amp_sv[0];
- jamp_sv[29] -= amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[33] += amp_sv[0];
- jamp_sv[38] += amp_sv[0];
- jamp_sv[39] -= amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[43] += amp_sv[0];
- jamp_sv[61] += amp_sv[0];
- jamp_sv[64] -= amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[88] += amp_sv[0];
- jamp_sv[99] += amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
-
- // *** DIAGRAM 693 OF 1240 ***
-
- // Wavefunction(s) for diagram number 693
- // (none)
-
- // Amplitude(s) for diagram number 693
- VVV1_0( w_fp[8], w_fp[24], w_fp[104], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 693 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[18] += amp_sv[0];
- jamp_sv[19] -= amp_sv[0];
- jamp_sv[28] -= amp_sv[0];
- jamp_sv[29] += amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[43] -= amp_sv[0];
- jamp_sv[61] -= amp_sv[0];
- jamp_sv[62] += amp_sv[0];
- jamp_sv[63] -= amp_sv[0];
- jamp_sv[64] += amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[86] -= amp_sv[0];
- jamp_sv[87] += amp_sv[0];
- jamp_sv[88] -= amp_sv[0];
- jamp_sv[96] -= amp_sv[0];
- jamp_sv[97] += amp_sv[0];
-
- // *** DIAGRAM 694 OF 1240 ***
-
- // Wavefunction(s) for diagram number 694
- // (none)
-
- // Amplitude(s) for diagram number 694
- VVV1_0( w_fp[1], w_fp[24], w_fp[113], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 694 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[18] += amp_sv[0];
- jamp_sv[19] -= amp_sv[0];
- jamp_sv[21] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[33] += amp_sv[0];
- jamp_sv[38] += amp_sv[0];
- jamp_sv[39] -= amp_sv[0];
- jamp_sv[62] += amp_sv[0];
- jamp_sv[63] -= amp_sv[0];
- jamp_sv[86] -= amp_sv[0];
- jamp_sv[87] += amp_sv[0];
- jamp_sv[96] -= amp_sv[0];
- jamp_sv[97] += amp_sv[0];
- jamp_sv[99] += amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
-
- // *** DIAGRAM 695 OF 1240 ***
-
- // Wavefunction(s) for diagram number 695
- // (none)
-
- // Amplitude(s) for diagram number 695
- VVV1_0( w_fp[1], w_fp[8], w_fp[102], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 695 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[21] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
- jamp_sv[28] += amp_sv[0];
- jamp_sv[29] -= amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[33] += amp_sv[0];
- jamp_sv[38] += amp_sv[0];
- jamp_sv[39] -= amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[43] += amp_sv[0];
- jamp_sv[61] += amp_sv[0];
- jamp_sv[64] -= amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[88] += amp_sv[0];
- jamp_sv[99] += amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
-
- // *** DIAGRAM 696 OF 1240 ***
-
- // Wavefunction(s) for diagram number 696
- // (none)
-
- // Amplitude(s) for diagram number 696
- VVV1_0( w_fp[104], w_fp[37], w_fp[5], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 696 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 697 OF 1240 ***
-
- // Wavefunction(s) for diagram number 697
- // (none)
-
- // Amplitude(s) for diagram number 697
- FFV1_0( w_fp[3], w_fp[35], w_fp[104], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 697 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[61] += amp_sv[0];
- jamp_sv[62] -= amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[64] -= amp_sv[0];
-
- // *** DIAGRAM 698 OF 1240 ***
-
- // Wavefunction(s) for diagram number 698
- // (none)
-
- // Amplitude(s) for diagram number 698
- FFV1_0( w_fp[99], w_fp[100], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 698 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 699 OF 1240 ***
-
- // Wavefunction(s) for diagram number 699
- // (none)
-
- // Amplitude(s) for diagram number 699
- FFV1_0( w_fp[99], w_fp[35], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 699 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 700 OF 1240 ***
-
- // Wavefunction(s) for diagram number 700
- // (none)
-
- // Amplitude(s) for diagram number 700
- FFV1_0( w_fp[3], w_fp[100], w_fp[86], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 700 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[55] += amp_sv[0];
- jamp_sv[56] -= amp_sv[0];
- jamp_sv[57] += amp_sv[0];
- jamp_sv[58] -= amp_sv[0];
-
- // *** DIAGRAM 701 OF 1240 ***
-
- // Wavefunction(s) for diagram number 701
- // (none)
-
- // Amplitude(s) for diagram number 701
- VVV1_0( w_fp[86], w_fp[1], w_fp[37], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 701 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 702 OF 1240 ***
-
- // Wavefunction(s) for diagram number 702
- // (none)
-
- // Amplitude(s) for diagram number 702
- FFV1_0( w_fp[3], w_fp[33], w_fp[109], COUPs[1], 1.0, &_fp[0] );
- jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[33], w_fp[110], COUPs[1], 1.0, &_fp[0] );
- jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[33], w_fp[111], COUPs[1], 1.0, &_fp[0] );
- jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 703 OF 1240 ***
-
- // Wavefunction(s) for diagram number 703
- // (none)
-
- // Amplitude(s) for diagram number 703
- FFV1_0( w_fp[38], w_fp[33], w_fp[104], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 703 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[52] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
-
- // *** DIAGRAM 704 OF 1240 ***
-
- // Wavefunction(s) for diagram number 704
- // (none)
-
- // Amplitude(s) for diagram number 704
- FFV1_0( w_fp[38], w_fp[114], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 704 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 705 OF 1240 ***
-
- // Wavefunction(s) for diagram number 705
- // (none)
-
- // Amplitude(s) for diagram number 705
- FFV1_0( w_fp[90], w_fp[33], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 705 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 706 OF 1240 ***
-
- // Wavefunction(s) for diagram number 706
- // (none)
-
- // Amplitude(s) for diagram number 706
- VVV1_0( w_fp[104], w_fp[45], w_fp[4], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 706 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 707 OF 1240 ***
-
- // Wavefunction(s) for diagram number 707
- // (none)
-
- // Amplitude(s) for diagram number 707
- FFV1_0( w_fp[3], w_fp[43], w_fp[104], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 707 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[85] += amp_sv[0];
- jamp_sv[86] -= amp_sv[0];
- jamp_sv[87] += amp_sv[0];
- jamp_sv[88] -= amp_sv[0];
-
- // *** DIAGRAM 708 OF 1240 ***
-
- // Wavefunction(s) for diagram number 708
- // (none)
-
- // Amplitude(s) for diagram number 708
- FFV1_0( w_fp[99], w_fp[89], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 708 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 709 OF 1240 ***
-
- // Wavefunction(s) for diagram number 709
- // (none)
-
- // Amplitude(s) for diagram number 709
- FFV1_0( w_fp[99], w_fp[43], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 709 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 710 OF 1240 ***
-
- // Wavefunction(s) for diagram number 710
- // (none)
-
- // Amplitude(s) for diagram number 710
- FFV1_0( w_fp[3], w_fp[89], w_fp[112], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 710 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[79] += amp_sv[0];
- jamp_sv[80] -= amp_sv[0];
- jamp_sv[81] += amp_sv[0];
- jamp_sv[82] -= amp_sv[0];
-
- // *** DIAGRAM 711 OF 1240 ***
-
- // Wavefunction(s) for diagram number 711
- // (none)
-
- // Amplitude(s) for diagram number 711
- VVV1_0( w_fp[112], w_fp[1], w_fp[45], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 711 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 712 OF 1240 ***
-
- // Wavefunction(s) for diagram number 712
- // (none)
-
- // Amplitude(s) for diagram number 712
- FFV1_0( w_fp[3], w_fp[39], w_fp[98], COUPs[1], 1.0, &_fp[0] );
- jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[39], w_fp[62], COUPs[1], 1.0, &_fp[0] );
- jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[39], w_fp[101], COUPs[1], 1.0, &_fp[0] );
- jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 713 OF 1240 ***
-
- // Wavefunction(s) for diagram number 713
- // (none)
-
- // Amplitude(s) for diagram number 713
- FFV1_0( w_fp[46], w_fp[39], w_fp[104], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 713 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[76] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
-
- // *** DIAGRAM 714 OF 1240 ***
-
- // Wavefunction(s) for diagram number 714
- // (none)
-
- // Amplitude(s) for diagram number 714
- FFV1_0( w_fp[46], w_fp[106], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 714 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 715 OF 1240 ***
-
- // Wavefunction(s) for diagram number 715
- // (none)
-
- // Amplitude(s) for diagram number 715
- FFV1_0( w_fp[88], w_fp[39], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 715 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 716 OF 1240 ***
-
- // Wavefunction(s) for diagram number 716
- // (none)
-
- // Amplitude(s) for diagram number 716
- VVV1_0( w_fp[104], w_fp[54], w_fp[5], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 716 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 717 OF 1240 ***
-
- // Wavefunction(s) for diagram number 717
- // (none)
-
- // Amplitude(s) for diagram number 717
- FFV1_0( w_fp[7], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 717 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[19] += amp_sv[0];
- jamp_sv[29] -= amp_sv[0];
- jamp_sv[43] += amp_sv[0];
- jamp_sv[97] -= amp_sv[0];
-
- // *** DIAGRAM 718 OF 1240 ***
-
- // Wavefunction(s) for diagram number 718
- // (none)
-
- // Amplitude(s) for diagram number 718
- FFV1_0( w_fp[78], w_fp[96], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 718 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 719 OF 1240 ***
-
- // Wavefunction(s) for diagram number 719
- // (none)
-
- // Amplitude(s) for diagram number 719
- FFV1_0( w_fp[7], w_fp[96], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 719 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 720 OF 1240 ***
-
- // Wavefunction(s) for diagram number 720
- // (none)
-
- // Amplitude(s) for diagram number 720
- FFV1_0( w_fp[78], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 720 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[22] += amp_sv[0];
- jamp_sv[76] -= amp_sv[0];
- jamp_sv[90] += amp_sv[0];
- jamp_sv[100] -= amp_sv[0];
-
- // *** DIAGRAM 721 OF 1240 ***
-
- // Wavefunction(s) for diagram number 721
- // (none)
-
- // Amplitude(s) for diagram number 721
- VVV1_0( w_fp[86], w_fp[1], w_fp[54], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 721 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 722 OF 1240 ***
-
- // Wavefunction(s) for diagram number 722
- // (none)
-
- // Amplitude(s) for diagram number 722
- FFV1_0( w_fp[46], w_fp[2], w_fp[109], COUPs[1], 1.0, &_fp[0] );
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[46], w_fp[2], w_fp[110], COUPs[1], 1.0, &_fp[0] );
- jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[46], w_fp[2], w_fp[111], COUPs[1], 1.0, &_fp[0] );
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 723 OF 1240 ***
-
- // Wavefunction(s) for diagram number 723
- // (none)
-
- // Amplitude(s) for diagram number 723
- VVV1_0( w_fp[104], w_fp[23], w_fp[4], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 723 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 724 OF 1240 ***
-
- // Wavefunction(s) for diagram number 724
- // (none)
-
- // Amplitude(s) for diagram number 724
- FFV1_0( w_fp[25], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 724 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[18] += amp_sv[0];
- jamp_sv[28] -= amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[96] -= amp_sv[0];
-
- // *** DIAGRAM 725 OF 1240 ***
-
- // Wavefunction(s) for diagram number 725
- // (none)
-
- // Amplitude(s) for diagram number 725
- FFV1_0( w_fp[58], w_fp[96], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 725 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 726 OF 1240 ***
-
- // Wavefunction(s) for diagram number 726
- // (none)
-
- // Amplitude(s) for diagram number 726
- FFV1_0( w_fp[25], w_fp[96], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 726 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 727 OF 1240 ***
-
- // Wavefunction(s) for diagram number 727
- // (none)
-
- // Amplitude(s) for diagram number 727
- FFV1_0( w_fp[58], w_fp[2], w_fp[112], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 727 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[20] += amp_sv[0];
- jamp_sv[52] -= amp_sv[0];
- jamp_sv[66] += amp_sv[0];
- jamp_sv[98] -= amp_sv[0];
-
- // *** DIAGRAM 728 OF 1240 ***
-
- // Wavefunction(s) for diagram number 728
- // (none)
-
- // Amplitude(s) for diagram number 728
- VVV1_0( w_fp[112], w_fp[1], w_fp[23], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 728 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 729 OF 1240 ***
-
- // Wavefunction(s) for diagram number 729
- // (none)
-
- // Amplitude(s) for diagram number 729
- FFV1_0( w_fp[38], w_fp[2], w_fp[98], COUPs[1], 1.0, &_fp[0] );
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[38], w_fp[2], w_fp[62], COUPs[1], 1.0, &_fp[0] );
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[38], w_fp[2], w_fp[101], COUPs[1], 1.0, &_fp[0] );
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 730 OF 1240 ***
-
- // Wavefunction(s) for diagram number 730
- // (none)
-
- // Amplitude(s) for diagram number 730
- FFV1_0( w_fp[3], w_fp[17], w_fp[104], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 730 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 731 OF 1240 ***
-
- // Wavefunction(s) for diagram number 731
- // (none)
-
- // Amplitude(s) for diagram number 731
- FFV1_0( w_fp[26], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 731 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 732 OF 1240 ***
-
- // Wavefunction(s) for diagram number 732
- // (none)
-
- // Amplitude(s) for diagram number 732
- FFV1_0( w_fp[3], w_fp[96], w_fp[59], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 732 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 733 OF 1240 ***
-
- // Wavefunction(s) for diagram number 733
- // (none)
-
- // Amplitude(s) for diagram number 733
- FFV1_0( w_fp[26], w_fp[96], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 733 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[18] += amp_sv[0];
- jamp_sv[19] -= amp_sv[0];
- jamp_sv[96] -= amp_sv[0];
- jamp_sv[97] += amp_sv[0];
-
- // *** DIAGRAM 734 OF 1240 ***
-
- // Wavefunction(s) for diagram number 734
- // (none)
-
- // Amplitude(s) for diagram number 734
- FFV1_0( w_fp[99], w_fp[2], w_fp[59], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 734 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 735 OF 1240 ***
-
- // Wavefunction(s) for diagram number 735
- // (none)
-
- // Amplitude(s) for diagram number 735
- FFV1_0( w_fp[99], w_fp[17], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 735 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[62] += amp_sv[0];
- jamp_sv[63] -= amp_sv[0];
- jamp_sv[86] -= amp_sv[0];
- jamp_sv[87] += amp_sv[0];
-
- // *** DIAGRAM 736 OF 1240 ***
-
- // Wavefunction(s) for diagram number 736
- // (none)
-
- // Amplitude(s) for diagram number 736
- FFV1_0( w_fp[3], w_fp[96], w_fp[73], COUPs[1], 1.0, &_fp[0] );
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[96], w_fp[79], COUPs[1], 1.0, &_fp[0] );
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[96], w_fp[80], COUPs[1], 1.0, &_fp[0] );
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 737 OF 1240 ***
-
- // Wavefunction(s) for diagram number 737
- // (none)
-
- // Amplitude(s) for diagram number 737
- FFV1_0( w_fp[99], w_fp[2], w_fp[73], COUPs[1], 1.0, &_fp[0] );
- jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[99], w_fp[2], w_fp[79], COUPs[1], 1.0, &_fp[0] );
- jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[99], w_fp[2], w_fp[80], COUPs[1], 1.0, &_fp[0] );
- jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 738 OF 1240 ***
-
- // Wavefunction(s) for diagram number 738
- // (none)
-
- // Amplitude(s) for diagram number 738
- VVV1_0( w_fp[92], w_fp[73], w_fp[8], COUPs[0], 1.0, &_fp[0] );
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[19] += amp_sv[0];
- jamp_sv[21] += amp_sv[0];
- jamp_sv[23] -= amp_sv[0];
- jamp_sv[32] += amp_sv[0];
- jamp_sv[33] -= amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[39] += amp_sv[0];
- jamp_sv[62] -= amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[86] += amp_sv[0];
- jamp_sv[87] -= amp_sv[0];
- jamp_sv[96] += amp_sv[0];
- jamp_sv[97] -= amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[101] += amp_sv[0];
- VVV1_0( w_fp[92], w_fp[79], w_fp[8], COUPs[0], 1.0, &_fp[0] );
- jamp_sv[19] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[21] += amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[39] += amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[62] -= amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[97] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- VVV1_0( w_fp[92], w_fp[80], w_fp[8], COUPs[0], 1.0, &_fp[0] );
- jamp_sv[18] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[33] += amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[86] -= amp_sv[0];
- jamp_sv[87] += amp_sv[0];
- jamp_sv[96] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
-
- // *** DIAGRAM 739 OF 
-
-    // Wavefunction(s) for diagram number 739
-    FFV1_1( w_fp[77], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[92] );
-
-    // Amplitude(s) for diagram number 739
-    FFV1_0( w_fp[7], w_fp[92], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 739 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[29] -= amp_sv[0];
-
-    // *** DIAGRAM 740 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 740
-    // (none)
-
-    // Amplitude(s) for diagram number 740
-    FFV1_0( w_fp[53], w_fp[92], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 740 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[27] -= amp_sv[0];
-
-    // *** DIAGRAM 741 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 741
-    FFV1_2( w_fp[46], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] );
-
-    // Amplitude(s) for diagram number 741
-    FFV1_0( w_fp[99], w_fp[9], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 741 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[40] -= amp_sv[0];
-
-    // *** DIAGRAM 742 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 742
-    // (none)
-
-    // Amplitude(s) for diagram number 742
-    FFV1_0( w_fp[99], w_fp[85], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 742 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[46] -= amp_sv[0];
-
-    // *** DIAGRAM 743 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 743
-    // (none)
-
-    // Amplitude(s) for diagram number 743
-    FFV1_0( w_fp[53], w_fp[9], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 743 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[37] -= amp_sv[0];
-
-    // *** DIAGRAM 744 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 744
-    // (none)
-
-    // Amplitude(s) for diagram number 744
-    FFV1_0( w_fp[7], w_fp[85], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 744 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[43] -= amp_sv[0];
-
-    // *** DIAGRAM 745 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 745
-    // (none)
-
-    // Amplitude(s) for diagram number 745
-    FFV1_0( w_fp[46], w_fp[92], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 745 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 746 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 746
-    // (none)
-
-    // Amplitude(s) for diagram number 746
-    FFV1_0( w_fp[99], w_fp[77], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 746 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 747 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 747
-    VVV1P0_1( w_fp[0], w_fp[29], COUPs[0], 1.0, 0., 0., w_fp[96] );
-
-    // Amplitude(s) for diagram number 747
-    FFV1_0( w_fp[46], w_fp[77], w_fp[96], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 747 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[27] += amp_sv[0];
-    jamp_sv[29] -= amp_sv[0];
-    jamp_sv[40] -= amp_sv[0];
-    jamp_sv[46] += amp_sv[0];
-
-    // *** DIAGRAM 748 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 748
-    // (none)
-
-    // Amplitude(s) for diagram number 748
-    FFV1_0( w_fp[25], w_fp[92], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 748 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[28] -= amp_sv[0];
-
-    // *** DIAGRAM 749 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 749
-    // (none)
-
-    // Amplitude(s) for diagram number 749
-    FFV1_0( w_fp[48], w_fp[92], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 749 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[25] -= amp_sv[0];
-
-    // *** DIAGRAM 750 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 750
-    FFV1_2( w_fp[38], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[104] );
-
-    // Amplitude(s) for diagram number 750
-    FFV1_0( w_fp[104], w_fp[87], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 750 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[34] -= amp_sv[0];
-
-    // *** DIAGRAM 751 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 751
-    // (none)
-
-    // Amplitude(s) for diagram number 751
-    FFV1_0( w_fp[104], w_fp[85], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 751 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[44] -= amp_sv[0];
-
-    // *** DIAGRAM 752 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 752
-    // (none)
-
-    // Amplitude(s) for diagram number 752
-    FFV1_0( w_fp[48], w_fp[87], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 752 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[31] -= amp_sv[0];
-
-    // *** DIAGRAM 753 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 753
-    // (none)
-
-    // Amplitude(s) for diagram number 753
-    FFV1_0( w_fp[25], w_fp[85], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 753 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[42] -= amp_sv[0];
-
-    // *** DIAGRAM 754 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 754
-    // (none)
-
-    // Amplitude(s) for diagram number 754
-    FFV1_0( w_fp[38], w_fp[92], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 754 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 755 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 755
-    // (none)
-
-    // Amplitude(s) for diagram number 755
-    FFV1_0( w_fp[104], w_fp[77], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 755 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 756 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 756
-    VVV1P0_1( w_fp[0], w_fp[27], COUPs[0], 1.0, 0., 0., w_fp[101] );
-
-    // Amplitude(s) for diagram number 756
-    FFV1_0( w_fp[38], w_fp[77], w_fp[101], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 756 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[25] += amp_sv[0];
-    jamp_sv[28] -= amp_sv[0];
-    jamp_sv[34] -= amp_sv[0];
-    jamp_sv[44] += amp_sv[0];
-
-    // *** DIAGRAM 757 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 757
-    // (none)
-
-    // Amplitude(s) for diagram number 757
-    FFV1_0( w_fp[28], w_fp[92], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 757 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[26] -= amp_sv[0];
-
-    // *** DIAGRAM 758 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 758
-    // (none)
-
-    // Amplitude(s) for diagram number 758
-    FFV1_0( w_fp[40], w_fp[92], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 758 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[24] -= amp_sv[0];
-
-    // *** DIAGRAM 759 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 759
-    FFV1_2( w_fp[41], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[62] );
-
-    // Amplitude(s) for diagram number 759
-    FFV1_0( w_fp[62], w_fp[87], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 759 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[32] -= amp_sv[0];
-
-    // *** DIAGRAM 760 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 760
-    // (none)
-
-    // Amplitude(s) for diagram number 760
-    FFV1_0( w_fp[62], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 760 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[38] -= amp_sv[0];
-
-    // *** DIAGRAM 761 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 761
-    // (none)
-
-    // Amplitude(s) for diagram number 761
-    FFV1_0( w_fp[40], w_fp[87], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 761 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[30] -= amp_sv[0];
-
-    // *** DIAGRAM 762 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 762
-    // (none)
-
-    // Amplitude(s) for diagram number 762
-    FFV1_0( w_fp[28], w_fp[9], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 762 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[36] -= amp_sv[0];
-
-    // *** DIAGRAM 763 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 763
-    // (none)
-
-    // Amplitude(s) for diagram number 763
-    FFV1_0( w_fp[41], w_fp[92], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 763 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 764 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 764
-    // (none)
-
-    // Amplitude(s) for diagram number 764
-    FFV1_0( w_fp[62], w_fp[77], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 764 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 765 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 765
-    VVV1P0_1( w_fp[0], w_fp[24], COUPs[0], 1.0, 0., 0., w_fp[98] );
-
-    // Amplitude(s) for diagram number 765
-    FFV1_0( w_fp[41], w_fp[77], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 765 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[24] += amp_sv[0];
-    jamp_sv[26] -= amp_sv[0];
-    jamp_sv[32] -= amp_sv[0];
-    jamp_sv[38] += amp_sv[0];
-
-    // *** DIAGRAM 766 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 766
-    // (none)
-
-    // Amplitude(s) for diagram number 766
-    FFV1_0( w_fp[26], w_fp[92], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 766 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 767 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 767
-    // (none)
-
-    // Amplitude(s) for diagram number 767
-    FFV1_0( w_fp[3], w_fp[92], w_fp[42], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 767 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[24] += amp_sv[0];
-    jamp_sv[26] -= amp_sv[0];
-    jamp_sv[28] -= amp_sv[0];
-    jamp_sv[29] += amp_sv[0];
-
-    // *** DIAGRAM 768 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 768
-    // (none)
-
-    // Amplitude(s) for diagram number 768
-    VVV1_0( w_fp[98], w_fp[34], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 768 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 769 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 769
-    // (none)
-
-    // Amplitude(s) for diagram number 769
-    FFV1_0( w_fp[3], w_fp[85], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 769 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[42] += amp_sv[0];
-    jamp_sv[43] -= amp_sv[0];
-    jamp_sv[45] -= amp_sv[0];
-    jamp_sv[47] += amp_sv[0];
-
-    // *** DIAGRAM 770 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 770
-    // (none)
-
-    // Amplitude(s) for diagram number 770
-    VVV1_0( w_fp[0], w_fp[34], w_fp[42], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 770 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 771 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 771
-    // (none)
-
-    // Amplitude(s) for diagram number 771
-    FFV1_0( w_fp[26], w_fp[85], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 771 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 772 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 772
-    VVVV1P0_1( w_fp[0], w_fp[24], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[85] );
-    VVVV3P0_1( w_fp[0], w_fp[24], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[112] );
-    VVVV4P0_1( w_fp[0], w_fp[24], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[111] );
-
-    // Amplitude(s) for diagram number 772
-    FFV1_0( w_fp[3], w_fp[77], w_fp[85], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[77], w_fp[112], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[77], w_fp[111], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 773 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 773
-    // (none)
-
-    // Amplitude(s) for diagram number 773
-    FFV1_0( w_fp[14], w_fp[92], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 773 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 774 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 774
-    // (none)
-
-    // Amplitude(s) for diagram number 774
-    FFV1_0( w_fp[3], w_fp[92], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 774 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[25] += amp_sv[0];
-    jamp_sv[26] -= amp_sv[0];
-    jamp_sv[27] += amp_sv[0];
-    jamp_sv[28] -= amp_sv[0];
-
-    // *** DIAGRAM 775 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 775
-    // (none)
-
-    // Amplitude(s) for diagram number 775
-    VVV1_0( w_fp[101], w_fp[34], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 775 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 776 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 776
-    // (none)
-
-    // Amplitude(s) for diagram number 776
-    FFV1_0( w_fp[3], w_fp[9], w_fp[101], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 776 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[36] += amp_sv[0];
-    jamp_sv[37] -= amp_sv[0];
-    jamp_sv[39] -= amp_sv[0];
-    jamp_sv[41] += amp_sv[0];
-
-    // *** DIAGRAM 777 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 777
-    // (none)
-
-    // Amplitude(s) for diagram number 777
-    VVV1_0( w_fp[0], w_fp[34], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 777 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 778 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 778
-    // (none)
-
-    // Amplitude(s) for diagram number 778
-    FFV1_0( w_fp[14], w_fp[9], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 778 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 779 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 779
-    VVVV1P0_1( w_fp[0], w_fp[27], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[9] );
-    VVVV3P0_1( w_fp[0], w_fp[27], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[110] );
-    VVVV4P0_1( w_fp[0], w_fp[27], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[109] );
-
-    // Amplitude(s) for diagram number 779
-    FFV1_0( w_fp[3], w_fp[77], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[77], w_fp[110], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[77], w_fp[109], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 780 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 780
-    // (none)
-
-    // Amplitude(s) for diagram number 780
-    FFV1_0( w_fp[12], w_fp[92], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 780 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 781 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 781
-    // (none)
-
-    // Amplitude(s) for diagram number 781
-    FFV1_0( w_fp[3], w_fp[92], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 781 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[24] += amp_sv[0];
-    jamp_sv[25] -= amp_sv[0];
-    jamp_sv[27] -= amp_sv[0];
-    jamp_sv[29] += amp_sv[0];
-
-    // *** DIAGRAM 782 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 782
-    // (none)
-
-    // Amplitude(s) for diagram number 782
-    VVV1_0( w_fp[96], w_fp[34], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 782 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 783 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 783
-    // (none)
-
-    // Amplitude(s) for diagram number 783
-    FFV1_0( w_fp[3], w_fp[87], w_fp[96], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 783 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[30] += amp_sv[0];
-    jamp_sv[31] -= amp_sv[0];
-    jamp_sv[33] -= amp_sv[0];
-    jamp_sv[35] += amp_sv[0];
-
-    // *** DIAGRAM 784 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 784
-    // (none)
-
-    // Amplitude(s) for diagram number 784
-    VVV1_0( w_fp[0], w_fp[34], w_fp[19], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 784 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 785 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 785
-    // (none)
-
-    // Amplitude(s) for diagram number 785
-    FFV1_0( w_fp[12], w_fp[87], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 785 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 786 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 786
-    VVVV1P0_1( w_fp[0], w_fp[4], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[87] );
-    VVVV3P0_1( w_fp[0], w_fp[4], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[34] );
-    VVVV4P0_1( w_fp[0], w_fp[4], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[86] );
-
-    // Amplitude(s) for diagram number 786
-    FFV1_0( w_fp[3], w_fp[77], w_fp[87], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[77], w_fp[34], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[77], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 787 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 787
-    // (none)
-
-    // Amplitude(s) for diagram number 787
-    FFV1_0( w_fp[3], w_fp[92], w_fp[30], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[24] += amp_sv[0];
-    jamp_sv[25] -= amp_sv[0];
-    jamp_sv[27] -= amp_sv[0];
-    jamp_sv[29] += amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[92], w_fp[31], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[25] -= amp_sv[0];
-    jamp_sv[26] += amp_sv[0];
-    jamp_sv[27] -= amp_sv[0];
-    jamp_sv[28] += amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[92], w_fp[32], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[24] -= amp_sv[0];
-    jamp_sv[26] += amp_sv[0];
-    jamp_sv[28] += amp_sv[0];
-    jamp_sv[29] -= amp_sv[0];
-
-    // *** DIAGRAM 788 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 788
-    VVV1P0_1( w_fp[0], w_fp[30], COUPs[0], 1.0, 0., 0., w_fp[92] );
-    VVV1P0_1( w_fp[0], w_fp[31], COUPs[0], 1.0, 0., 0., w_fp[88] );
-    VVV1P0_1( w_fp[0], w_fp[32], COUPs[0], 1.0, 0., 0., w_fp[106] );
-
-    // Amplitude(s) for diagram number 788
-    FFV1_0( w_fp[3], w_fp[77], w_fp[92], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[77], w_fp[88], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[77], w_fp[106], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 789 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 789
-    FFV1_2( w_fp[52], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[90] );
-
-    // Amplitude(s) for diagram number 789
-    FFV1_0( w_fp[90], w_fp[35], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 789 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[64] -= amp_sv[0];
-
-    // *** DIAGRAM 790 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 790
-    // (none)
-
-    // Amplitude(s) for diagram number 790
-    FFV1_0( w_fp[90], w_fp[36], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 790 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[70] -= amp_sv[0];
-
-    // *** DIAGRAM 791 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 791
-    FFV1_1( w_fp[33], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[114] );
-
-    // Amplitude(s) for diagram number 791
-    FFV1_0( w_fp[22], w_fp[114], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 791 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[53] -= amp_sv[0];
-
-    // *** DIAGRAM 792 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 792
-    // (none)
-
-    // Amplitude(s) for diagram number 792
-    FFV1_0( w_fp[21], w_fp[114], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 792 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[51] -= amp_sv[0];
-
-    // *** DIAGRAM 793 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 793
-    // (none)
-
-    // Amplitude(s) for diagram number 793
-    FFV1_0( w_fp[22], w_fp[36], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 793 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[67] -= amp_sv[0];
-
-    // *** DIAGRAM 794 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 794
-    // (none)
-
-    // Amplitude(s) for diagram number 794
-    FFV1_0( w_fp[21], w_fp[35], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 794 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[61] -= amp_sv[0];
-
-    // *** DIAGRAM 795 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 795
-    // (none)
-
-    // Amplitude(s) for diagram number 795
-    FFV1_0( w_fp[90], w_fp[33], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 795 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 796 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 796
-    // (none)
-
-    // Amplitude(s) for diagram number 796
-    FFV1_0( w_fp[52], w_fp[114], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 796 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 797 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 797
-    // (none)
-
-    // Amplitude(s) for diagram number 797
-    FFV1_0( w_fp[52], w_fp[33], w_fp[96], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 797 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[51] += amp_sv[0];
-    jamp_sv[53] -= amp_sv[0];
-    jamp_sv[64] -= amp_sv[0];
-    jamp_sv[70] += amp_sv[0];
-
-    // *** DIAGRAM 798 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 798
-    // (none)
-
-    // Amplitude(s) for diagram number 798
-    FFV1_0( w_fp[90], w_fp[43], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 798 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[88] -= amp_sv[0];
-
-    // *** DIAGRAM 799 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 799
-    // (none)
-
-    // Amplitude(s) for diagram number 799
-    FFV1_0( w_fp[90], w_fp[44], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 799 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[94] -= amp_sv[0];
-
-    // *** DIAGRAM 800 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 800
-    FFV1_1( w_fp[39], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[102] );
-
-    // Amplitude(s) for diagram number 800
-    FFV1_0( w_fp[56], w_fp[102], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 800 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[77] -= amp_sv[0];
-
-    // *** DIAGRAM 801 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 801
-    // (none)
-
-    // Amplitude(s) for diagram number 801
-    FFV1_0( w_fp[21], w_fp[102], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 801 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[75] -= amp_sv[0];
-
-    // *** DIAGRAM 802 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 802
-    // (none)
-
-    // Amplitude(s) for diagram number 802
-    FFV1_0( w_fp[56], w_fp[44], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 802 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[91] -= amp_sv[0];
-
-    // *** DIAGRAM 803 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 803
-    // (none)
-
-    // Amplitude(s) for diagram number 803
-    FFV1_0( w_fp[21], w_fp[43], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 803 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[85] -= amp_sv[0];
-
-    // *** DIAGRAM 804 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 804
-    // (none)
-
-    // Amplitude(s) for diagram number 804
-    FFV1_0( w_fp[90], w_fp[39], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 804 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 805 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 805
-    // (none)
-
-    // Amplitude(s) for diagram number 805
-    FFV1_0( w_fp[52], w_fp[102], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 805 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 806 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 806
-    // (none)
-
-    // Amplitude(s) for diagram number 806
-    FFV1_0( w_fp[52], w_fp[39], w_fp[101], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 806 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[75] += amp_sv[0];
-    jamp_sv[77] -= amp_sv[0];
-    jamp_sv[88] -= amp_sv[0];
-    jamp_sv[94] += amp_sv[0];
-
-    // *** DIAGRAM 807 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 807
-    // (none)
-
-    // Amplitude(s) for diagram number 807
-    FFV1_0( w_fp[90], w_fp[49], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 807 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[112] -= amp_sv[0];
-
-    // *** DIAGRAM 808 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 808
-    // (none)
-
-    // Amplitude(s) for diagram number 808
-    FFV1_0( w_fp[90], w_fp[50], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 808 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[118] -= amp_sv[0];
-
-    // *** DIAGRAM 809 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 809
-    FFV1_1( w_fp[47], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[113] );
-
-    // Amplitude(s) for diagram number 809
-    FFV1_0( w_fp[56], w_fp[113], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 809 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[101] -= amp_sv[0];
-
-    // *** DIAGRAM 810 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 810
-    // (none)
-
-    // Amplitude(s) for diagram number 810
-    FFV1_0( w_fp[22], w_fp[113], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 810 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[99] -= amp_sv[0];
-
-    // *** DIAGRAM 811 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 811
-    // (none)
-
-    // Amplitude(s) for diagram number 811
-    FFV1_0( w_fp[56], w_fp[50], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 811 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[115] -= amp_sv[0];
-
-    // *** DIAGRAM 812 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 812
-    // (none)
-
-    // Amplitude(s) for diagram number 812
-    FFV1_0( w_fp[22], w_fp[49], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 812 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[109] -= amp_sv[0];
-
-    // *** DIAGRAM 813 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 813
-    // (none)
-
-    // Amplitude(s) for diagram number 813
-    FFV1_0( w_fp[90], w_fp[47], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 813 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 814 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 814
-    // (none)
-
-    // Amplitude(s) for diagram number 814
-    FFV1_0( w_fp[52], w_fp[113], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 814 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 815 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 815
-    // (none)
-
-    // Amplitude(s) for diagram number 815
-    FFV1_0( w_fp[52], w_fp[47], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 815 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[99] += amp_sv[0];
-    jamp_sv[101] -= amp_sv[0];
-    jamp_sv[112] -= amp_sv[0];
-    jamp_sv[118] += amp_sv[0];
-
-    // *** DIAGRAM 816 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 816
-    // (none)
-
-    // Amplitude(s) for diagram number 816
-    FFV1_0( w_fp[90], w_fp[17], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 816 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 817 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 817
-    // (none)
-
-    // Amplitude(s) for diagram number 817
-    FFV1_0( w_fp[90], w_fp[2], w_fp[42], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 817 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[64] += amp_sv[0];
-    jamp_sv[88] -= amp_sv[0];
-    jamp_sv[112] -= amp_sv[0];
-    jamp_sv[118] += amp_sv[0];
-
-    // *** DIAGRAM 818 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 818
-    // (none)
-
-    // Amplitude(s) for diagram number 818
-    VVV1_0( w_fp[98], w_fp[103], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 818 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 819 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 819
-    // (none)
-
-    // Amplitude(s) for diagram number 819
-    FFV1_0( w_fp[21], w_fp[2], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 819 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[9] += amp_sv[0];
-    jamp_sv[15] -= amp_sv[0];
-    jamp_sv[61] -= amp_sv[0];
-    jamp_sv[85] += amp_sv[0];
-
-    // *** DIAGRAM 820 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 820
-    // (none)
-
-    // Amplitude(s) for diagram number 820
-    VVV1_0( w_fp[0], w_fp[103], w_fp[42], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 820 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 821 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 821
-    // (none)
-
-    // Amplitude(s) for diagram number 821
-    FFV1_0( w_fp[21], w_fp[17], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 821 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 822 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 822
-    // (none)
-
-    // Amplitude(s) for diagram number 822
-    FFV1_0( w_fp[52], w_fp[2], w_fp[85], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[52], w_fp[2], w_fp[112], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[52], w_fp[2], w_fp[111], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 823 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 823
-    // (none)
-
-    // Amplitude(s) for diagram number 823
-    FFV1_0( w_fp[90], w_fp[15], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 823 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 824 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 824
-    // (none)
-
-    // Amplitude(s) for diagram number 824
-    FFV1_0( w_fp[90], w_fp[2], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 824 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[70] += amp_sv[0];
-    jamp_sv[88] -= amp_sv[0];
-    jamp_sv[94] += amp_sv[0];
-    jamp_sv[112] -= amp_sv[0];
-
-    // *** DIAGRAM 825 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 825
-    // (none)
-
-    // Amplitude(s) for diagram number 825
-    VVV1_0( w_fp[101], w_fp[103], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 825 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 826 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 826
-    // (none)
-
-    // Amplitude(s) for diagram number 826
-    FFV1_0( w_fp[22], w_fp[2], w_fp[101], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 826 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[11] += amp_sv[0];
-    jamp_sv[21] -= amp_sv[0];
-    jamp_sv[67] -= amp_sv[0];
-    jamp_sv[109] += amp_sv[0];
-
-    // *** DIAGRAM 827 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 827
-    // (none)
-
-    // Amplitude(s) for diagram number 827
-    VVV1_0( w_fp[0], w_fp[103], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 827 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 828 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 828
-    // (none)
-
-    // Amplitude(s) for diagram number 828
-    FFV1_0( w_fp[22], w_fp[15], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 828 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 829 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 829
-    // (none)
-
-    // Amplitude(s) for diagram number 829
-    FFV1_0( w_fp[52], w_fp[2], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[52], w_fp[2], w_fp[110], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[52], w_fp[2], w_fp[109], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 830 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 830
-    // (none)
-
-    // Amplitude(s) for diagram number 830
-    FFV1_0( w_fp[90], w_fp[18], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 830 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 831 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 831
-    // (none)
-
-    // Amplitude(s) for diagram number 831
-    FFV1_0( w_fp[90], w_fp[2], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 831 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[64] += amp_sv[0];
-    jamp_sv[70] -= amp_sv[0];
-    jamp_sv[94] -= amp_sv[0];
-    jamp_sv[118] += amp_sv[0];
-
-    // *** DIAGRAM 832 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 832
-    // (none)
-
-    // Amplitude(s) for diagram number 832
-    VVV1_0( w_fp[96], w_fp[103], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 832 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 833 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 833
-    // (none)
-
-    // Amplitude(s) for diagram number 833
-    FFV1_0( w_fp[56], w_fp[2], w_fp[96], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 833 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[17] += amp_sv[0];
-    jamp_sv[23] -= amp_sv[0];
-    jamp_sv[91] -= amp_sv[0];
-    jamp_sv[115] += amp_sv[0];
-
-    // *** DIAGRAM 834 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 834
-    // (none)
-
-    // Amplitude(s) for diagram number 834
-    VVV1_0( w_fp[0], w_fp[103], w_fp[19], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 834 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 835 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 835
-    // (none)
-
-    // Amplitude(s) for diagram number 835
-    FFV1_0( w_fp[56], w_fp[18], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 835 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 836 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 836
-    // (none)
-
-    // Amplitude(s) for diagram number 836
-    FFV1_0( w_fp[52], w_fp[2], w_fp[87], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[52], w_fp[2], w_fp[34], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[52], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 837 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 837
-    // (none)
-
-    // Amplitude(s) for diagram number 837
-    FFV1_0( w_fp[90], w_fp[2], w_fp[30], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[64] += amp_sv[0];
-    jamp_sv[70] -= amp_sv[0];
-    jamp_sv[94] -= amp_sv[0];
-    jamp_sv[118] += amp_sv[0];
-    FFV1_0( w_fp[90], w_fp[2], w_fp[31], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[70] -= amp_sv[0];
-    jamp_sv[88] += amp_sv[0];
-    jamp_sv[94] -= amp_sv[0];
-    jamp_sv[112] += amp_sv[0];
-    FFV1_0( w_fp[90], w_fp[2], w_fp[32], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[64] -= amp_sv[0];
-    jamp_sv[88] += amp_sv[0];
-    jamp_sv[112] += amp_sv[0];
-    jamp_sv[118] -= amp_sv[0];
-
-    // *** DIAGRAM 838 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 838
-    // (none)
-
-    // Amplitude(s) for diagram number 838
-    FFV1_0( w_fp[52], w_fp[2], w_fp[92], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[52], w_fp[2], w_fp[88], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[52], w_fp[2], w_fp[106], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 839 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 839
-    VVV1P0_1( w_fp[0], w_fp[61], COUPs[0], 1.0, 0., 0., w_fp[90] );
-
-    // Amplitude(s) for diagram number 839
-    VVV1_0( w_fp[90], w_fp[10], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 839 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[1] -= amp_sv[0];
-    jamp_sv[7] += amp_sv[0];
-    jamp_sv[31] += amp_sv[0];
-    jamp_sv[55] -= amp_sv[0];
-    jamp_sv[72] += amp_sv[0];
-    jamp_sv[74] -= amp_sv[0];
-    jamp_sv[80] -= amp_sv[0];
-    jamp_sv[86] += amp_sv[0];
-    jamp_sv[90] -= amp_sv[0];
-    jamp_sv[91] += amp_sv[0];
-    jamp_sv[93] += amp_sv[0];
-    jamp_sv[95] -= amp_sv[0];
-    jamp_sv[96] += amp_sv[0];
-    jamp_sv[98] -= amp_sv[0];
-    jamp_sv[104] -= amp_sv[0];
-    jamp_sv[110] += amp_sv[0];
-
-    // *** DIAGRAM 840 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 840
-    // (none)
-
-    // Amplitude(s) for diagram number 840
-    VVV1_0( w_fp[90], w_fp[11], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 840 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[0] -= amp_sv[0];
-    jamp_sv[6] += amp_sv[0];
-    jamp_sv[30] += amp_sv[0];
-    jamp_sv[54] -= amp_sv[0];
-    jamp_sv[72] += amp_sv[0];
-    jamp_sv[74] -= amp_sv[0];
-    jamp_sv[80] -= amp_sv[0];
-    jamp_sv[86] += amp_sv[0];
-    jamp_sv[96] += amp_sv[0];
-    jamp_sv[98] -= amp_sv[0];
-    jamp_sv[104] -= amp_sv[0];
-    jamp_sv[110] += amp_sv[0];
-    jamp_sv[114] -= amp_sv[0];
-    jamp_sv[115] += amp_sv[0];
-    jamp_sv[117] += amp_sv[0];
-    jamp_sv[119] -= amp_sv[0];
-
-    // *** DIAGRAM 841 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 841
-    // (none)
-
-    // Amplitude(s) for diagram number 841
-    VVVV1_0( w_fp[8], w_fp[5], w_fp[6], w_fp[90], COUPs[2], 1.0, &amp_fp[0] );
-    jamp_sv[1] -= amp_sv[0];
-    jamp_sv[7] += amp_sv[0];
-    jamp_sv[31] += amp_sv[0];
-    jamp_sv[55] -= amp_sv[0];
-    jamp_sv[72] += amp_sv[0];
-    jamp_sv[74] -= amp_sv[0];
-    jamp_sv[80] -= amp_sv[0];
-    jamp_sv[86] += amp_sv[0];
-    jamp_sv[90] -= amp_sv[0];
-    jamp_sv[91] += amp_sv[0];
-    jamp_sv[93] += amp_sv[0];
-    jamp_sv[95] -= amp_sv[0];
-    jamp_sv[96] += amp_sv[0];
-    jamp_sv[98] -= amp_sv[0];
-    jamp_sv[104] -= amp_sv[0];
-    jamp_sv[110] += amp_sv[0];
-    VVVV3_0( w_fp[8], w_fp[5], w_fp[6], w_fp[90], COUPs[2], 1.0, &amp_fp[0] );
-    jamp_sv[0] -= amp_sv[0];
-    jamp_sv[6] += amp_sv[0];
-    jamp_sv[30] += amp_sv[0];
-    jamp_sv[54] -= amp_sv[0];
-    jamp_sv[72] += amp_sv[0];
-    jamp_sv[74] -= amp_sv[0];
-    jamp_sv[80] -= amp_sv[0];
-    jamp_sv[86] += amp_sv[0];
-    jamp_sv[96] += amp_sv[0];
-    jamp_sv[98] -= amp_sv[0];
-    jamp_sv[104] -= amp_sv[0];
-    jamp_sv[110] += amp_sv[0];
-    jamp_sv[114] -= amp_sv[0];
-    jamp_sv[115] += amp_sv[0];
-    jamp_sv[117] += amp_sv[0];
-    jamp_sv[119] -= amp_sv[0];
-    VVVV4_0( w_fp[8], w_fp[5], w_fp[6], w_fp[90], COUPs[2], 1.0, &amp_fp[0] );
-    jamp_sv[0] -= amp_sv[0];
-    jamp_sv[1] += amp_sv[0];
-    jamp_sv[6] += amp_sv[0];
-    jamp_sv[7] -= amp_sv[0];
-    jamp_sv[30] += amp_sv[0];
-    jamp_sv[31] -= amp_sv[0];
jamp_sv[54] -= amp_sv[0]; - jamp_sv[55] += amp_sv[0]; - jamp_sv[90] += amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[93] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - - // *** DIAGRAM 842 OF 1240 *** - - // Wavefunction(s) for diagram number 842 - VVV1P0_1( w_fp[0], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[56] ); - - // Amplitude(s) for diagram number 842 - VVV1_0( w_fp[56], w_fp[63], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 842 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 843 OF 1240 *** - - // Wavefunction(s) for diagram number 843 - // (none) - - // Amplitude(s) for diagram number 843 - VVV1_0( w_fp[56], w_fp[64], w_fp[5], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 843 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[7] -= amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[16] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[59] += amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[93] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - - // *** DIAGRAM 844 OF 1240 *** - - // Wavefunction(s) for diagram number 844 - // (none) - - // Amplitude(s) for diagram number 844 - VVVV1_0( w_fp[61], w_fp[5], w_fp[6], w_fp[56], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[0] += amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVVV3_0( w_fp[61], w_fp[5], w_fp[6], w_fp[56], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[1] += amp_sv[0]; - jamp_sv[7] -= amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[16] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[59] += amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[93] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - VVVV4_0( w_fp[61], w_fp[5], w_fp[6], w_fp[56], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[0] -= amp_sv[0]; - jamp_sv[1] += amp_sv[0]; - jamp_sv[6] += amp_sv[0]; - jamp_sv[7] -= amp_sv[0]; - jamp_sv[16] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - 
jamp_sv[59] += amp_sv[0]; - jamp_sv[93] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - - // *** DIAGRAM 845 OF 1240 *** - - // Wavefunction(s) for diagram number 845 - // (none) - - // Amplitude(s) for diagram number 845 - VVV1_0( w_fp[0], w_fp[63], w_fp[11], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 845 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[86] -= amp_sv[0]; - jamp_sv[96] -= amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 846 OF 1240 *** - - // Wavefunction(s) for diagram number 846 - // (none) - - // Amplitude(s) for diagram number 846 - VVV1_0( w_fp[0], w_fp[64], w_fp[10], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 846 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[7] -= amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[93] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - - // *** DIAGRAM 847 OF 1240 *** - - // Wavefunction(s) for diagram number 847 - VVVV1P0_1( w_fp[0], w_fp[61], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[103] ); - VVVV3P0_1( w_fp[0], w_fp[61], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[22] ); - VVVV4P0_1( w_fp[0], w_fp[61], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[21] ); - - // Amplitude(s) for diagram number 847 - VVV1_0( w_fp[8], w_fp[6], w_fp[103], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[0] += amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[86] -= amp_sv[0]; - jamp_sv[96] -= amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[22], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[12] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[21], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[0] -= amp_sv[0]; - jamp_sv[6] += amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - 
jamp_sv[110] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - - // *** DIAGRAM 848 OF 1240 *** - - // Wavefunction(s) for diagram number 848 - VVVV1P0_1( w_fp[0], w_fp[61], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[105] ); - VVVV3P0_1( w_fp[0], w_fp[61], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[95] ); - VVVV4P0_1( w_fp[0], w_fp[61], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[107] ); - - // Amplitude(s) for diagram number 848 - VVV1_0( w_fp[8], w_fp[5], w_fp[105], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[1] += amp_sv[0]; - jamp_sv[7] -= amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[93] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[95], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[18] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[107], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[1] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - - // *** DIAGRAM 849 OF 1240 *** - - // Wavefunction(s) for diagram number 849 - VVVV1P0_1( w_fp[0], w_fp[8], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[115] ); - VVVV3P0_1( w_fp[0], w_fp[8], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[116] ); - VVVV4P0_1( w_fp[0], w_fp[8], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[117] ); - - // Amplitude(s) for diagram number 849 - VVV1_0( w_fp[61], w_fp[6], w_fp[115], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[1] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[18] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[58] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[76] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[87] += amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - VVV1_0( w_fp[61], w_fp[6], w_fp[116], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[12] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[16] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[58] -= amp_sv[0]; - jamp_sv[59] += amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[76] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - VVV1_0( w_fp[61], w_fp[6], w_fp[117], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[1] += amp_sv[0]; - jamp_sv[7] -= amp_sv[0]; - jamp_sv[12] 
-= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[16] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[59] += amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[93] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - - // *** DIAGRAM 850 OF 1240 *** - - // Wavefunction(s) for diagram number 850 - VVVV1P0_1( w_fp[0], w_fp[8], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[118] ); - VVVV3P0_1( w_fp[0], w_fp[8], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[119] ); - VVVV4P0_1( w_fp[0], w_fp[8], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[120] ); - - // Amplitude(s) for diagram number 850 - VVV1_0( w_fp[61], w_fp[5], w_fp[118], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[0] -= amp_sv[0]; - jamp_sv[6] += amp_sv[0]; - jamp_sv[12] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - VVV1_0( w_fp[61], w_fp[5], w_fp[119], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[18] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - VVV1_0( w_fp[61], w_fp[5], w_fp[120], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[0] += amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 851 OF 1240 *** - - // Wavefunction(s) for diagram number 851 - // (none) - - // Amplitude(s) for diagram number 851 - VVVV1_0( w_fp[0], w_fp[61], w_fp[8], w_fp[29], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[30] -= amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[54] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[114] += amp_sv[0]; - jamp_sv[115] -= amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[61], w_fp[8], w_fp[29], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[61], w_fp[8], w_fp[29], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[16] -= 
amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[55] += amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[90] += amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - - // *** DIAGRAM 852 OF 1240 *** - - // Wavefunction(s) for diagram number 852 - // (none) - - // Amplitude(s) for diagram number 852 - VVV1_0( w_fp[8], w_fp[29], w_fp[90], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 852 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[30] -= amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[54] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[114] += amp_sv[0]; - jamp_sv[115] -= amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 853 OF 1240 *** - - // Wavefunction(s) for diagram number 853 - // (none) - - // Amplitude(s) for diagram number 853 - VVV1_0( w_fp[61], w_fp[29], w_fp[56], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 853 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 854 OF 1240 *** - - // Wavefunction(s) for diagram number 854 - // (none) - - // Amplitude(s) for diagram number 854 - VVV1_0( w_fp[61], w_fp[8], w_fp[96], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 854 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[16] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[55] += amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[90] += amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - - // *** DIAGRAM 855 OF 1240 *** - - // Wavefunction(s) for diagram number 855 - // (none) - - // Amplitude(s) for diagram number 855 - VVV1_0( w_fp[90], w_fp[45], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 855 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] 
+= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 856 OF 1240 *** - - // Wavefunction(s) for diagram number 856 - // (none) - - // Amplitude(s) for diagram number 856 - FFV1_0( w_fp[3], w_fp[44], w_fp[90], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 856 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[90] += amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[93] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - - // *** DIAGRAM 857 OF 1240 *** - - // Wavefunction(s) for diagram number 857 - // (none) - - // Amplitude(s) for diagram number 857 - FFV1_0( w_fp[65], w_fp[102], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 857 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 858 OF 1240 *** - - // Wavefunction(s) for diagram number 858 - // (none) - - // Amplitude(s) for diagram number 858 - FFV1_0( w_fp[3], w_fp[102], w_fp[64], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 858 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[72] += amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[76] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - - // *** DIAGRAM 859 OF 1240 *** - - // Wavefunction(s) for diagram number 859 - // (none) - - // Amplitude(s) for diagram number 859 - FFV1_0( w_fp[65], w_fp[44], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 859 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 860 OF 1240 *** - - // Wavefunction(s) for diagram number 860 - // (none) - - // Amplitude(s) for diagram number 860 - VVV1_0( w_fp[0], w_fp[64], w_fp[45], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 860 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 861 OF 1240 *** - - // Wavefunction(s) for diagram number 861 - // (none) - - // Amplitude(s) for diagram number 861 - FFV1_0( w_fp[3], w_fp[39], w_fp[105], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[39], w_fp[95], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] += cxtype( 0, 1 ) * 
amp_sv[0]; - jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[39], w_fp[107], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 862 OF 1240 *** - - // Wavefunction(s) for diagram number 862 - // (none) - - // Amplitude(s) for diagram number 862 - FFV1_0( w_fp[41], w_fp[39], w_fp[90], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 862 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[72] += amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - - // *** DIAGRAM 863 OF 1240 *** - - // Wavefunction(s) for diagram number 863 - // (none) - - // Amplitude(s) for diagram number 863 - FFV1_0( w_fp[41], w_fp[102], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 863 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 864 OF 1240 *** - - // Wavefunction(s) for diagram number 864 - // (none) - - // Amplitude(s) for diagram number 864 - FFV1_0( w_fp[62], w_fp[39], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 864 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 865 OF 1240 *** - - // Wavefunction(s) for diagram number 865 - // (none) - - // Amplitude(s) for diagram number 865 - VVV1_0( w_fp[90], w_fp[51], w_fp[5], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 865 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 866 OF 1240 *** - - // Wavefunction(s) for diagram number 866 - // (none) - - // Amplitude(s) for diagram number 866 - FFV1_0( w_fp[3], w_fp[50], w_fp[90], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 866 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[114] += amp_sv[0]; - jamp_sv[115] -= amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 867 OF 1240 *** - - // Wavefunction(s) for diagram number 867 - // (none) - - // Amplitude(s) for diagram number 867 - FFV1_0( w_fp[65], w_fp[113], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 867 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) 
denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 868 OF 1240 *** - - // Wavefunction(s) for diagram number 868 - // (none) - - // Amplitude(s) for diagram number 868 - FFV1_0( w_fp[3], w_fp[113], w_fp[63], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 868 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[96] += amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - - // *** DIAGRAM 869 OF 1240 *** - - // Wavefunction(s) for diagram number 869 - // (none) - - // Amplitude(s) for diagram number 869 - FFV1_0( w_fp[65], w_fp[50], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 869 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 870 OF 1240 *** - - // Wavefunction(s) for diagram number 870 - // (none) - - // Amplitude(s) for diagram number 870 - VVV1_0( w_fp[0], w_fp[63], w_fp[51], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 870 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 871 OF 1240 *** - - // Wavefunction(s) for diagram number 871 - // (none) - - // Amplitude(s) for diagram number 871 - FFV1_0( w_fp[3], w_fp[47], w_fp[103], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[22], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[21], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 872 OF 1240 *** - - // Wavefunction(s) for diagram number 872 - // (none) - - // Amplitude(s) for diagram number 872 - FFV1_0( w_fp[38], w_fp[47], w_fp[90], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 872 ) numerators_sv += cxabs2( 
amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[96] += amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - - // *** DIAGRAM 873 OF 1240 *** - - // Wavefunction(s) for diagram number 873 - // (none) - - // Amplitude(s) for diagram number 873 - FFV1_0( w_fp[38], w_fp[113], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 873 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 874 OF 1240 *** - - // Wavefunction(s) for diagram number 874 - // (none) - - // Amplitude(s) for diagram number 874 - FFV1_0( w_fp[104], w_fp[47], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 874 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 875 OF 1240 *** - - // Wavefunction(s) for diagram number 875 - // (none) - - // Amplitude(s) for diagram number 875 - VVV1_0( w_fp[90], w_fp[23], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 875 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 876 OF 1240 *** - - // Wavefunction(s) for diagram number 876 - // (none) - - // Amplitude(s) for diagram number 876 - FFV1_0( w_fp[48], w_fp[2], w_fp[90], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 876 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[7] -= amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[55] += amp_sv[0]; - - // *** DIAGRAM 877 OF 1240 *** - - // Wavefunction(s) for diagram number 877 - // (none) - - // Amplitude(s) for diagram number 877 - FFV1_0( w_fp[104], w_fp[93], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 877 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 878 OF 1240 *** - - // Wavefunction(s) for diagram number 878 - // (none) - - // Amplitude(s) for diagram number 878 - FFV1_0( w_fp[104], w_fp[2], w_fp[64], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 878 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[34] += amp_sv[0]; - jamp_sv[58] -= amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - - // *** DIAGRAM 879 OF 1240 *** - - // Wavefunction(s) for diagram number 879 - // (none) - - // Amplitude(s) for diagram number 879 - FFV1_0( w_fp[48], w_fp[93], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId 
== 879 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 880 OF 1240 *** - - // Wavefunction(s) for diagram number 880 - // (none) - - // Amplitude(s) for diagram number 880 - VVV1_0( w_fp[0], w_fp[64], w_fp[23], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 880 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 881 OF 1240 *** - - // Wavefunction(s) for diagram number 881 - // (none) - - // Amplitude(s) for diagram number 881 - FFV1_0( w_fp[38], w_fp[2], w_fp[105], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[95], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[107], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 882 OF 1240 *** - - // Wavefunction(s) for diagram number 882 - // (none) - - // Amplitude(s) for diagram number 882 - VVV1_0( w_fp[90], w_fp[20], w_fp[5], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 882 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 883 OF 1240 *** - - // Wavefunction(s) for diagram number 883 - // (none) - - // Amplitude(s) for diagram number 883 - FFV1_0( w_fp[40], w_fp[2], w_fp[90], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 883 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[30] -= amp_sv[0]; - jamp_sv[54] 
+= amp_sv[0]; - - // *** DIAGRAM 884 OF 1240 *** - - // Wavefunction(s) for diagram number 884 - // (none) - - // Amplitude(s) for diagram number 884 - FFV1_0( w_fp[62], w_fp[93], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 884 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 885 OF 1240 *** - - // Wavefunction(s) for diagram number 885 - // (none) - - // Amplitude(s) for diagram number 885 - FFV1_0( w_fp[62], w_fp[2], w_fp[63], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 885 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[32] += amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - - // *** DIAGRAM 886 OF 1240 *** - - // Wavefunction(s) for diagram number 886 - // (none) - - // Amplitude(s) for diagram number 886 - FFV1_0( w_fp[40], w_fp[93], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 886 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 887 OF 1240 *** - - // Wavefunction(s) for diagram number 887 - // (none) - - // Amplitude(s) for diagram number 887 - VVV1_0( w_fp[0], w_fp[63], w_fp[20], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 887 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 888 OF 1240 *** - - // Wavefunction(s) for diagram number 888 - // (none) - - // Amplitude(s) for diagram number 888 - FFV1_0( w_fp[41], w_fp[2], w_fp[103], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[22], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] -= cxtype( 0, 1 ) * 
amp_sv[0]; - - // *** DIAGRAM 889 OF 1240 *** - - // Wavefunction(s) for diagram number 889 - // (none) - - // Amplitude(s) for diagram number 889 - FFV1_0( w_fp[3], w_fp[18], w_fp[90], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 889 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 890 OF 1240 *** - - // Wavefunction(s) for diagram number 890 - // (none) - - // Amplitude(s) for diagram number 890 - FFV1_0( w_fp[12], w_fp[2], w_fp[90], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 890 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 891 OF 1240 *** - - // Wavefunction(s) for diagram number 891 - // (none) - - // Amplitude(s) for diagram number 891 - FFV1_0( w_fp[3], w_fp[93], w_fp[96], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 891 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 892 OF 1240 *** - - // Wavefunction(s) for diagram number 892 - // (none) - - // Amplitude(s) for diagram number 892 - FFV1_0( w_fp[65], w_fp[2], w_fp[96], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 892 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 893 OF 1240 *** - - // Wavefunction(s) for diagram number 893 - // (none) - - // Amplitude(s) for diagram number 893 - FFV1_0( w_fp[12], w_fp[93], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 893 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[30] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[55] += amp_sv[0]; - - // *** DIAGRAM 894 OF 1240 *** - - // Wavefunction(s) for diagram number 894 - // (none) - - // Amplitude(s) for diagram 
number 894 - FFV1_0( w_fp[65], w_fp[18], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 894 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[90] += amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - - // *** DIAGRAM 895 OF 1240 *** - - // Wavefunction(s) for diagram number 895 - VVV1P0_1( w_fp[0], w_fp[66], COUPs[0], 1.0, 0., 0., w_fp[65] ); - - // Amplitude(s) for diagram number 895 - VVV1_0( w_fp[65], w_fp[13], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 895 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - - // *** DIAGRAM 896 OF 1240 *** - - // Wavefunction(s) for diagram number 896 - // (none) - - // Amplitude(s) for diagram number 896 - VVV1_0( w_fp[65], w_fp[11], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 896 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= amp_sv[0]; - jamp_sv[12] += amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - - // *** DIAGRAM 897 OF 1240 *** - - // Wavefunction(s) for diagram number 897 - // (none) - - // Amplitude(s) for diagram number 897 - VVVV1_0( w_fp[8], w_fp[4], w_fp[6], w_fp[65], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[3] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - VVVV3_0( w_fp[8], w_fp[4], w_fp[6], w_fp[65], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[2] -= amp_sv[0]; - jamp_sv[12] += amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - VVVV4_0( w_fp[8], w_fp[4], w_fp[6], w_fp[65], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[2] -= amp_sv[0]; - jamp_sv[3] += amp_sv[0]; - jamp_sv[12] += amp_sv[0]; - jamp_sv[13] -= amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[66] += amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - 
-    jamp_sv[69] -= amp_sv[0];
-    jamp_sv[71] += amp_sv[0];
-    jamp_sv[78] -= amp_sv[0];
-    jamp_sv[79] += amp_sv[0];
-    jamp_sv[108] -= amp_sv[0];
-    jamp_sv[109] += amp_sv[0];
-    jamp_sv[111] += amp_sv[0];
-    jamp_sv[113] -= amp_sv[0];
-
-    // *** DIAGRAM 898 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 898
-    // (none)
-
-    // Amplitude(s) for diagram number 898
-    VVV1_0( w_fp[56], w_fp[69], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 898 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[2] += amp_sv[0];
-    jamp_sv[6] -= amp_sv[0];
-    jamp_sv[8] += amp_sv[0];
-    jamp_sv[12] -= amp_sv[0];
-    jamp_sv[19] -= amp_sv[0];
-    jamp_sv[20] += amp_sv[0];
-    jamp_sv[21] -= amp_sv[0];
-    jamp_sv[22] += amp_sv[0];
-    jamp_sv[39] -= amp_sv[0];
-    jamp_sv[57] += amp_sv[0];
-    jamp_sv[63] -= amp_sv[0];
-    jamp_sv[81] += amp_sv[0];
-    jamp_sv[107] += amp_sv[0];
-    jamp_sv[111] -= amp_sv[0];
-    jamp_sv[113] += amp_sv[0];
-    jamp_sv[117] -= amp_sv[0];
-
-    // *** DIAGRAM 899 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 899
-    // (none)
-
-    // Amplitude(s) for diagram number 899
-    VVV1_0( w_fp[56], w_fp[70], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 899 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[3] += amp_sv[0];
-    jamp_sv[6] -= amp_sv[0];
-    jamp_sv[8] += amp_sv[0];
-    jamp_sv[10] += amp_sv[0];
-    jamp_sv[11] -= amp_sv[0];
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[19] -= amp_sv[0];
-    jamp_sv[22] += amp_sv[0];
-    jamp_sv[41] -= amp_sv[0];
-    jamp_sv[57] += amp_sv[0];
-    jamp_sv[63] -= amp_sv[0];
-    jamp_sv[69] -= amp_sv[0];
-    jamp_sv[71] += amp_sv[0];
-    jamp_sv[83] += amp_sv[0];
-    jamp_sv[107] += amp_sv[0];
-    jamp_sv[117] -= amp_sv[0];
-
-    // *** DIAGRAM 900 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 900
-    // (none)
-
-    // Amplitude(s) for diagram number 900
-    VVVV1_0( w_fp[66], w_fp[4], w_fp[6], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
-    jamp_sv[2] += amp_sv[0];
-    jamp_sv[6] -= amp_sv[0];
-    jamp_sv[8] += amp_sv[0];
-    jamp_sv[12] -= amp_sv[0];
-    jamp_sv[19] -= amp_sv[0];
-    jamp_sv[20] += amp_sv[0];
-    jamp_sv[21] -= amp_sv[0];
-    jamp_sv[22] += amp_sv[0];
-    jamp_sv[39] -= amp_sv[0];
-    jamp_sv[57] += amp_sv[0];
-    jamp_sv[63] -= amp_sv[0];
-    jamp_sv[81] += amp_sv[0];
-    jamp_sv[107] += amp_sv[0];
-    jamp_sv[111] -= amp_sv[0];
-    jamp_sv[113] += amp_sv[0];
-    jamp_sv[117] -= amp_sv[0];
-    VVVV3_0( w_fp[66], w_fp[4], w_fp[6], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
-    jamp_sv[3] += amp_sv[0];
-    jamp_sv[6] -= amp_sv[0];
-    jamp_sv[8] += amp_sv[0];
-    jamp_sv[10] += amp_sv[0];
-    jamp_sv[11] -= amp_sv[0];
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[19] -= amp_sv[0];
-    jamp_sv[22] += amp_sv[0];
-    jamp_sv[41] -= amp_sv[0];
-    jamp_sv[57] += amp_sv[0];
-    jamp_sv[63] -= amp_sv[0];
-    jamp_sv[69] -= amp_sv[0];
-    jamp_sv[71] += amp_sv[0];
-    jamp_sv[83] += amp_sv[0];
-    jamp_sv[107] += amp_sv[0];
-    jamp_sv[117] -= amp_sv[0];
-    VVVV4_0( w_fp[66], w_fp[4], w_fp[6], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
-    jamp_sv[2] -= amp_sv[0];
-    jamp_sv[3] += amp_sv[0];
-    jamp_sv[10] += amp_sv[0];
-    jamp_sv[11] -= amp_sv[0];
-    jamp_sv[12] += amp_sv[0];
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[20] -= amp_sv[0];
-    jamp_sv[21] += amp_sv[0];
-    jamp_sv[39] += amp_sv[0];
-    jamp_sv[41] -= amp_sv[0];
-    jamp_sv[69] -= amp_sv[0];
-    jamp_sv[71] += amp_sv[0];
-    jamp_sv[81] -= amp_sv[0];
-    jamp_sv[83] += amp_sv[0];
-    jamp_sv[111] += amp_sv[0];
-    jamp_sv[113] -= amp_sv[0];
-
-    // *** DIAGRAM 901 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 901
-    // (none)
-
-    // Amplitude(s) for diagram number 901
-    VVV1_0( w_fp[0], w_fp[69], w_fp[11], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 901 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[2] += amp_sv[0];
-    jamp_sv[6] -= amp_sv[0];
-    jamp_sv[8] += amp_sv[0];
-    jamp_sv[12] -= amp_sv[0];
-    jamp_sv[38] -= amp_sv[0];
-    jamp_sv[56] += amp_sv[0];
-    jamp_sv[62] -= amp_sv[0];
-    jamp_sv[80] += amp_sv[0];
-    jamp_sv[97] -= amp_sv[0];
-    jamp_sv[98] += amp_sv[0];
-    jamp_sv[99] -= amp_sv[0];
-    jamp_sv[100] += amp_sv[0];
-    jamp_sv[107] += amp_sv[0];
-    jamp_sv[111] -= amp_sv[0];
-    jamp_sv[113] += amp_sv[0];
-    jamp_sv[117] -= amp_sv[0];
-
-    // *** DIAGRAM 902 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 902
-    // (none)
-
-    // Amplitude(s) for diagram number 902
-    VVV1_0( w_fp[0], w_fp[70], w_fp[13], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 902 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[3] += amp_sv[0];
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[19] -= amp_sv[0];
-    jamp_sv[22] += amp_sv[0];
-    jamp_sv[40] -= amp_sv[0];
-    jamp_sv[48] -= amp_sv[0];
-    jamp_sv[50] += amp_sv[0];
-    jamp_sv[52] += amp_sv[0];
-    jamp_sv[53] -= amp_sv[0];
-    jamp_sv[57] += amp_sv[0];
-    jamp_sv[63] -= amp_sv[0];
-    jamp_sv[69] -= amp_sv[0];
-    jamp_sv[71] += amp_sv[0];
-    jamp_sv[82] += amp_sv[0];
-    jamp_sv[106] += amp_sv[0];
-    jamp_sv[116] -= amp_sv[0];
-
-    // *** DIAGRAM 903 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 903
-    VVVV1P0_1( w_fp[0], w_fp[66], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[93] );
-    VVVV3P0_1( w_fp[0], w_fp[66], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[90] );
-    VVVV4P0_1( w_fp[0], w_fp[66], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[21] );
-
-    // Amplitude(s) for diagram number 903
-    VVV1_0( w_fp[8], w_fp[6], w_fp[93], COUPs[0], 1.0, &amp_fp[0] );
-    jamp_sv[2] += amp_sv[0];
-    jamp_sv[6] -= amp_sv[0];
-    jamp_sv[8] += amp_sv[0];
-    jamp_sv[12] -= amp_sv[0];
-    jamp_sv[38] -= amp_sv[0];
-    jamp_sv[56] += amp_sv[0];
-    jamp_sv[62] -= amp_sv[0];
-    jamp_sv[80] += amp_sv[0];
-    jamp_sv[97] -= amp_sv[0];
-    jamp_sv[98] += amp_sv[0];
-    jamp_sv[99] -= amp_sv[0];
-    jamp_sv[100] += amp_sv[0];
-    jamp_sv[107] += amp_sv[0];
-    jamp_sv[111] -= amp_sv[0];
-    jamp_sv[113] += amp_sv[0];
-    jamp_sv[117] -= amp_sv[0];
-    VVV1_0( w_fp[8], w_fp[6], w_fp[90], COUPs[0], 1.0, &amp_fp[0] );
-    jamp_sv[6] -= amp_sv[0];
-    jamp_sv[8] += amp_sv[0];
-    jamp_sv[36] += amp_sv[0];
-    jamp_sv[38] -= amp_sv[0];
-    jamp_sv[48] += amp_sv[0];
-    jamp_sv[50] -= amp_sv[0];
-    jamp_sv[78] -= amp_sv[0];
-    jamp_sv[80] += amp_sv[0];
-    jamp_sv[98] += amp_sv[0];
-    jamp_sv[99] -= amp_sv[0];
-    jamp_sv[106] -= amp_sv[0];
-    jamp_sv[107] += amp_sv[0];
-    jamp_sv[108] -= amp_sv[0];
-    jamp_sv[109] += amp_sv[0];
-    jamp_sv[116] += amp_sv[0];
-    jamp_sv[117] -= amp_sv[0];
-    VVV1_0( w_fp[8], w_fp[6], w_fp[21], COUPs[0], 1.0, &amp_fp[0] );
-    jamp_sv[2] -= amp_sv[0];
-    jamp_sv[12] += amp_sv[0];
-    jamp_sv[36] += amp_sv[0];
-    jamp_sv[48] += amp_sv[0];
-    jamp_sv[50] -= amp_sv[0];
-    jamp_sv[56] -= amp_sv[0];
-    jamp_sv[62] += amp_sv[0];
-    jamp_sv[78] -= amp_sv[0];
-    jamp_sv[97] += amp_sv[0];
-    jamp_sv[100] -= amp_sv[0];
-    jamp_sv[106] -= amp_sv[0];
-    jamp_sv[108] -= amp_sv[0];
-    jamp_sv[109] += amp_sv[0];
-    jamp_sv[111] += amp_sv[0];
-    jamp_sv[113] -= amp_sv[0];
-    jamp_sv[116] += amp_sv[0];
-
-    // *** DIAGRAM 904 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 904
-    VVVV1P0_1( w_fp[0], w_fp[66], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[22] );
-    VVVV3P0_1( w_fp[0], w_fp[66], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[103] );
-    VVVV4P0_1( w_fp[0], w_fp[66], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[63] );
-
-    // Amplitude(s) for diagram number 904
-    VVV1_0( w_fp[8], w_fp[4], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
-    jamp_sv[3] += amp_sv[0];
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[19] -= amp_sv[0];
-    jamp_sv[22] += amp_sv[0];
-    jamp_sv[40] -= amp_sv[0];
-    jamp_sv[48] -= amp_sv[0];
-    jamp_sv[50] += amp_sv[0];
-    jamp_sv[52] += amp_sv[0];
-    jamp_sv[53] -= amp_sv[0];
-    jamp_sv[57] += amp_sv[0];
-    jamp_sv[63] -= amp_sv[0];
-    jamp_sv[69] -= amp_sv[0];
-    jamp_sv[71] += amp_sv[0];
-    jamp_sv[82] += amp_sv[0];
-    jamp_sv[106] += amp_sv[0];
-    jamp_sv[116] -= amp_sv[0];
-    VVV1_0( w_fp[8], w_fp[4], w_fp[103], COUPs[0], 1.0, &amp_fp[0] );
-    jamp_sv[19] -= amp_sv[0];
-    jamp_sv[22] += amp_sv[0];
-    jamp_sv[37] += amp_sv[0];
-    jamp_sv[40] -= amp_sv[0];
-    jamp_sv[52] += amp_sv[0];
-    jamp_sv[53] -= amp_sv[0];
-    jamp_sv[56] -= amp_sv[0];
-    jamp_sv[57] += amp_sv[0];
-    jamp_sv[62] += amp_sv[0];
-    jamp_sv[63] -= amp_sv[0];
-    jamp_sv[66] -= amp_sv[0];
-    jamp_sv[67] += amp_sv[0];
-    jamp_sv[79] -= amp_sv[0];
-    jamp_sv[82] += amp_sv[0];
-    jamp_sv[97] += amp_sv[0];
-    jamp_sv[100] -= amp_sv[0];
-    VVV1_0( w_fp[8], w_fp[4], w_fp[63], COUPs[0], 1.0, &amp_fp[0] );
-    jamp_sv[3] -= amp_sv[0];
-    jamp_sv[13] += amp_sv[0];
-    jamp_sv[37] += amp_sv[0];
-    jamp_sv[48] += amp_sv[0];
-    jamp_sv[50] -= amp_sv[0];
-    jamp_sv[56] -= amp_sv[0];
-    jamp_sv[62] += amp_sv[0];
-    jamp_sv[66] -= amp_sv[0];
-    jamp_sv[67] += amp_sv[0];
-    jamp_sv[69] += amp_sv[0];
-    jamp_sv[71] -= amp_sv[0];
-    jamp_sv[79] -= amp_sv[0];
-    jamp_sv[97] += amp_sv[0];
-    jamp_sv[100] -= amp_sv[0];
-    jamp_sv[106] -= amp_sv[0];
-    jamp_sv[116] += amp_sv[0];
-
-    // *** DIAGRAM 905 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 905
-    VVVV1P0_1( w_fp[0], w_fp[8], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[107] );
-    VVVV3P0_1( w_fp[0], w_fp[8], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[95] );
-    VVVV4P0_1( w_fp[0], w_fp[8], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[105] );
-
-    // Amplitude(s) for diagram number 905
-    VVV1_0( w_fp[66], w_fp[6], w_fp[107], COUPs[0], 1.0, &amp_fp[0] );
-    jamp_sv[3] -= amp_sv[0];
-    jamp_sv[13] += amp_sv[0];
-    jamp_sv[19] += amp_sv[0];
-    jamp_sv[22] -= amp_sv[0];
-    jamp_sv[40] += amp_sv[0];
-    jamp_sv[48] += amp_sv[0];
-    jamp_sv[50] -= amp_sv[0];
-    jamp_sv[52] -= amp_sv[0];
-    jamp_sv[53] += amp_sv[0];
-    jamp_sv[57] -= amp_sv[0];
-    jamp_sv[63] += amp_sv[0];
-    jamp_sv[69] += amp_sv[0];
-    jamp_sv[71] -= amp_sv[0];
-    jamp_sv[82] -= amp_sv[0];
-    jamp_sv[106] -= amp_sv[0];
-    jamp_sv[116] += amp_sv[0];
-    VVV1_0( w_fp[66], w_fp[6], w_fp[95], COUPs[0], 1.0, &amp_fp[0] );
-    jamp_sv[6] -= amp_sv[0];
-    jamp_sv[8] += amp_sv[0];
-    jamp_sv[10] += amp_sv[0];
-    jamp_sv[11] -= amp_sv[0];
-    jamp_sv[40] += amp_sv[0];
-    jamp_sv[41] -= amp_sv[0];
-    jamp_sv[48] += amp_sv[0];
-    jamp_sv[50] -= amp_sv[0];
-    jamp_sv[52] -= amp_sv[0];
-    jamp_sv[53] += amp_sv[0];
-    jamp_sv[82] -= amp_sv[0];
-    jamp_sv[83] += amp_sv[0];
-    jamp_sv[106] -= amp_sv[0];
-    jamp_sv[107] += amp_sv[0];
-    jamp_sv[116] += amp_sv[0];
-    jamp_sv[117] -= amp_sv[0];
-    VVV1_0( w_fp[66], w_fp[6], w_fp[105], COUPs[0], 1.0, &amp_fp[0] );
-    jamp_sv[3] += amp_sv[0];
-    jamp_sv[6] -= amp_sv[0];
-    jamp_sv[8] += amp_sv[0];
-    jamp_sv[10] += amp_sv[0];
-    jamp_sv[11] -= amp_sv[0];
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[19] -= amp_sv[0];
-    jamp_sv[22] += amp_sv[0];
-    jamp_sv[41] -= amp_sv[0];
-    jamp_sv[57] += amp_sv[0];
-    jamp_sv[63] -= amp_sv[0];
-    jamp_sv[69] -= amp_sv[0];
-    jamp_sv[71] += amp_sv[0];
-    jamp_sv[83] += amp_sv[0];
-    jamp_sv[107] += amp_sv[0];
-    jamp_sv[117] -= amp_sv[0];
-
-    // *** DIAGRAM 906 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 906
-    // (none)
-
-    // Amplitude(s) for diagram number 906
-    VVV1_0( w_fp[66], w_fp[4], w_fp[118], COUPs[0], 1.0, &amp_fp[0] );
-    jamp_sv[2] -= amp_sv[0];
-    jamp_sv[6] += amp_sv[0];
-    jamp_sv[8] -= amp_sv[0];
-    jamp_sv[12] += amp_sv[0];
-    jamp_sv[38] += amp_sv[0];
-    jamp_sv[56] -= amp_sv[0];
-    jamp_sv[62] += amp_sv[0];
-    jamp_sv[80] -= amp_sv[0];
-    jamp_sv[97] += amp_sv[0];
-    jamp_sv[98] -= amp_sv[0];
-    jamp_sv[99] += amp_sv[0];
-    jamp_sv[100] -= amp_sv[0];
-    jamp_sv[107] -= amp_sv[0];
-    jamp_sv[111] += amp_sv[0];
-    jamp_sv[113] -= amp_sv[0];
-    jamp_sv[117] += amp_sv[0];
-    VVV1_0( w_fp[66], w_fp[4], w_fp[119], COUPs[0], 1.0, &amp_fp[0] );
-    jamp_sv[19] -= amp_sv[0];
-    jamp_sv[20] += amp_sv[0];
-    jamp_sv[21] -= amp_sv[0];
-    jamp_sv[22] += amp_sv[0];
-    jamp_sv[38] += amp_sv[0];
-    jamp_sv[39] -= amp_sv[0];
-    jamp_sv[56] -= amp_sv[0];
-    jamp_sv[57] += amp_sv[0];
-    jamp_sv[62] += amp_sv[0];
-    jamp_sv[63] -= amp_sv[0];
-    jamp_sv[80] -= amp_sv[0];
-    jamp_sv[81] += amp_sv[0];
-    jamp_sv[97] += amp_sv[0];
-    jamp_sv[98] -= amp_sv[0];
-    jamp_sv[99] += amp_sv[0];
-    jamp_sv[100] -= amp_sv[0];
-    VVV1_0( w_fp[66], w_fp[4], w_fp[120], COUPs[0], 1.0, &amp_fp[0] );
-    jamp_sv[2] += amp_sv[0];
-    jamp_sv[6] -= amp_sv[0];
-    jamp_sv[8] += amp_sv[0];
-    jamp_sv[12] -= amp_sv[0];
-    jamp_sv[19] -= amp_sv[0];
-    jamp_sv[20] += amp_sv[0];
-    jamp_sv[21] -= amp_sv[0];
-    jamp_sv[22] += amp_sv[0];
-    jamp_sv[39] -= amp_sv[0];
-    jamp_sv[57] += amp_sv[0];
-    jamp_sv[63] -= amp_sv[0];
-    jamp_sv[81] += amp_sv[0];
-    jamp_sv[107] += amp_sv[0];
-    jamp_sv[111] -= amp_sv[0];
-    jamp_sv[113] += amp_sv[0];
-    jamp_sv[117] -= amp_sv[0];
-
-    // *** DIAGRAM 907 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 907
-    // (none)
-
-    // Amplitude(s) for diagram number 907
-    VVVV1_0( w_fp[0], w_fp[66], w_fp[8], w_fp[27], COUPs[2], 1.0, &amp_fp[0] );
-    jamp_sv[2] += amp_sv[0];
-    jamp_sv[3] -= amp_sv[0];
-    jamp_sv[12] -= amp_sv[0];
-    jamp_sv[13] += amp_sv[0];
-    jamp_sv[36] -= amp_sv[0];
-    jamp_sv[37] += amp_sv[0];
-    jamp_sv[66] -= amp_sv[0];
-    jamp_sv[67] += amp_sv[0];
-    jamp_sv[69] += amp_sv[0];
-    jamp_sv[71] -= amp_sv[0];
-    jamp_sv[78] += amp_sv[0];
-    jamp_sv[79] -= amp_sv[0];
-    jamp_sv[108] += amp_sv[0];
-    jamp_sv[109] -= amp_sv[0];
-    jamp_sv[111] -= amp_sv[0];
-    jamp_sv[113] += amp_sv[0];
-    VVVV3_0( w_fp[0], w_fp[66], w_fp[8], w_fp[27], COUPs[2], 1.0, &amp_fp[0] );
-    jamp_sv[2] += amp_sv[0];
-    jamp_sv[3] -= amp_sv[0];
-    jamp_sv[10] -= amp_sv[0];
-    jamp_sv[11] += amp_sv[0];
-    jamp_sv[12] -= amp_sv[0];
-    jamp_sv[13] += amp_sv[0];
-    jamp_sv[20] += amp_sv[0];
-    jamp_sv[21] -= amp_sv[0];
-    jamp_sv[39] -= amp_sv[0];
-    jamp_sv[41] += amp_sv[0];
-    jamp_sv[69] += amp_sv[0];
-    jamp_sv[71] -= amp_sv[0];
-    jamp_sv[81] += amp_sv[0];
-    jamp_sv[83] -= amp_sv[0];
-    jamp_sv[111] -= amp_sv[0];
-    jamp_sv[113] += amp_sv[0];
-    VVVV4_0( w_fp[0], w_fp[66], w_fp[8], w_fp[27], COUPs[2], 1.0, &amp_fp[0] );
-    jamp_sv[10] -= amp_sv[0];
-    jamp_sv[11] += amp_sv[0];
-    jamp_sv[20] += amp_sv[0];
-    jamp_sv[21] -= amp_sv[0];
-    jamp_sv[36] += amp_sv[0];
-    jamp_sv[37] -= amp_sv[0];
-    jamp_sv[39] -= amp_sv[0];
-    jamp_sv[41] += amp_sv[0];
-    jamp_sv[66] += amp_sv[0];
-    jamp_sv[67] -= amp_sv[0];
-    jamp_sv[78] -= amp_sv[0];
-    jamp_sv[79] += amp_sv[0];
-    jamp_sv[81] += amp_sv[0];
-    jamp_sv[83] -= amp_sv[0];
-    jamp_sv[108] -= amp_sv[0];
-    jamp_sv[109] += amp_sv[0];
-
-    // *** DIAGRAM 908 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 908
-    // (none)
-
-    // Amplitude(s) for diagram number 908
-    VVV1_0( w_fp[8], w_fp[27], w_fp[65], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 908 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[2] += amp_sv[0];
-    jamp_sv[3] -= amp_sv[0];
-    jamp_sv[12] -= amp_sv[0];
-    jamp_sv[13] += amp_sv[0];
-    jamp_sv[36] -= amp_sv[0];
-    jamp_sv[37] += amp_sv[0];
-    jamp_sv[66] -= amp_sv[0];
-    jamp_sv[67] += amp_sv[0];
-    jamp_sv[69] += amp_sv[0];
-    jamp_sv[71] -= amp_sv[0];
-    jamp_sv[78] += amp_sv[0];
-    jamp_sv[79] -= amp_sv[0];
-    jamp_sv[108] += amp_sv[0];
-    jamp_sv[109] -= amp_sv[0];
-    jamp_sv[111] -= amp_sv[0];
-    jamp_sv[113] += amp_sv[0];
-
-    // *** DIAGRAM 909 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 909
-    // (none)
-
-    // Amplitude(s) for diagram number 909
-    VVV1_0( w_fp[66], w_fp[27], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 909 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[2] += amp_sv[0];
-    jamp_sv[3] -= amp_sv[0];
-    jamp_sv[10] -= amp_sv[0];
-    jamp_sv[11] += amp_sv[0];
-    jamp_sv[12] -= amp_sv[0];
-    jamp_sv[13] += amp_sv[0];
-    jamp_sv[20] += amp_sv[0];
-    jamp_sv[21] -= amp_sv[0];
-    jamp_sv[39] -= amp_sv[0];
-    jamp_sv[41] += amp_sv[0];
-    jamp_sv[69] += amp_sv[0];
-    jamp_sv[71] -= amp_sv[0];
-    jamp_sv[81] += amp_sv[0];
-    jamp_sv[83] -= amp_sv[0];
-    jamp_sv[111] -= amp_sv[0];
-    jamp_sv[113] += amp_sv[0];
-
-    // *** DIAGRAM 910 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 910
-    // (none)
-
-    // Amplitude(s) for diagram number 910
-    VVV1_0( w_fp[66], w_fp[8], w_fp[101], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 910 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[10] -= amp_sv[0];
-    jamp_sv[11] += amp_sv[0];
-    jamp_sv[20] += amp_sv[0];
-    jamp_sv[21] -= amp_sv[0];
-    jamp_sv[36] += amp_sv[0];
-    jamp_sv[37] -= amp_sv[0];
-    jamp_sv[39] -= amp_sv[0];
-    jamp_sv[41] += amp_sv[0];
-    jamp_sv[66] += amp_sv[0];
-    jamp_sv[67] -= amp_sv[0];
-    jamp_sv[78] -= amp_sv[0];
-    jamp_sv[79] += amp_sv[0];
-    jamp_sv[81] += amp_sv[0];
-    jamp_sv[83] -= amp_sv[0];
-    jamp_sv[108] -= amp_sv[0];
-    jamp_sv[109] += amp_sv[0];
-
-    // *** DIAGRAM 911 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 911
-    // (none)
-
-    // Amplitude(s) for diagram number 911
-    VVV1_0( w_fp[65], w_fp[37], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 911 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 912 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 912
-    // (none)
-
-    // Amplitude(s) for diagram number 912
-    FFV1_0( w_fp[3], w_fp[36], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 912 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[66] += amp_sv[0];
-    jamp_sv[67] -= amp_sv[0];
-    jamp_sv[69] -= amp_sv[0];
-    jamp_sv[71] += amp_sv[0];
-
-    // *** DIAGRAM 913 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 913
-    // (none)
-
-    // Amplitude(s) for diagram number 913
-    FFV1_0( w_fp[71], w_fp[114], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 913 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 914 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 914
-    // (none)
-
-    // Amplitude(s) for diagram number 914
-    FFV1_0( w_fp[3], w_fp[114], w_fp[70], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 914 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[48] += amp_sv[0];
-    jamp_sv[50] -= amp_sv[0];
-    jamp_sv[52] -= amp_sv[0];
-    jamp_sv[53] += amp_sv[0];
-
-    // *** DIAGRAM 915 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 915
-    // (none)
-
-    // Amplitude(s) for diagram number 915
-    FFV1_0( w_fp[71], w_fp[36], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 915 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 916 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 916
-    // (none)
-
-    // Amplitude(s) for diagram number 916
-    VVV1_0( w_fp[0], w_fp[70], w_fp[37], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 916 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 917 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 917
-    // (none)
-
-    // Amplitude(s) for diagram number 917
-    FFV1_0( w_fp[3], w_fp[33], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[33], w_fp[103], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[33], w_fp[63], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 918 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 918
-    // (none)
-
-    // Amplitude(s) for diagram number 918
-    FFV1_0( w_fp[41], w_fp[33], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 918 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[48] += amp_sv[0];
-    jamp_sv[50] -= amp_sv[0];
-    jamp_sv[56] -= amp_sv[0];
-    jamp_sv[62] += amp_sv[0];
-
-    // *** DIAGRAM 919 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 919
-    // (none)
-
-    // Amplitude(s) for diagram number 919
-    FFV1_0( w_fp[41], w_fp[114], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 919 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 920 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 920
-    // (none)
-
-    // Amplitude(s) for diagram number 920
-    FFV1_0( w_fp[62], w_fp[33], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 920 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 921 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 921
-    // (none)
-
-    // Amplitude(s) for diagram number 921
-    VVV1_0( w_fp[65], w_fp[51], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 921 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 922 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 922
-    // (none)
-
-    // Amplitude(s) for diagram number 922
-    FFV1_0( w_fp[3], w_fp[49], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 922 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[108] += amp_sv[0];
-    jamp_sv[109] -= amp_sv[0];
-    jamp_sv[111] -= amp_sv[0];
-    jamp_sv[113] += amp_sv[0];
-
-    // *** DIAGRAM 923 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 923
-    // (none)
-
-    // Amplitude(s) for diagram number 923
-    FFV1_0( w_fp[71], w_fp[113], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 923 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 924 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 924
-    // (none)
-
-    // Amplitude(s) for diagram number 924
-    FFV1_0( w_fp[3], w_fp[113], w_fp[69], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 924 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[97] += amp_sv[0];
-    jamp_sv[98] -= amp_sv[0];
-    jamp_sv[99] += amp_sv[0];
-    jamp_sv[100] -= amp_sv[0];
-
-    // *** DIAGRAM 925 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 925
-    // (none)
-
-    // Amplitude(s) for diagram number 925
-    FFV1_0( w_fp[71], w_fp[49], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 925 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 926 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 926
-    // (none)
-
-    // Amplitude(s) for diagram number 926
-    VVV1_0( w_fp[0], w_fp[69], w_fp[51], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 926 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 927 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 927
-    // (none)
-
-    // Amplitude(s) for diagram number 927
-    FFV1_0( w_fp[3], w_fp[47], w_fp[93], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[47], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[47], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 928 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 928
-    // (none)
-
-    // Amplitude(s) for diagram number 928
-    FFV1_0( w_fp[46], w_fp[47], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 928 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[97] += amp_sv[0];
-    jamp_sv[100] -= amp_sv[0];
-    jamp_sv[106] -= amp_sv[0];
-    jamp_sv[116] += amp_sv[0];
-
-    // *** DIAGRAM 929 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 929
-    // (none)
-
-    // Amplitude(s) for diagram number 929
-    FFV1_0( w_fp[46], w_fp[113], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 929 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 930 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 930
-    // (none)
-
-    // Amplitude(s) for diagram number 930
-    FFV1_0( w_fp[99], w_fp[47], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 930 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 931 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 931
-    // (none)
-
-    // Amplitude(s) for diagram number 931
-    VVV1_0( w_fp[65], w_fp[54], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 931 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 932 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 932
-    // (none)
-
-    // Amplitude(s) for diagram number 932
-    FFV1_0( w_fp[53], w_fp[2], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 932 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[3] += amp_sv[0];
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[37] -= amp_sv[0];
-    jamp_sv[79] += amp_sv[0];
-
-    // *** DIAGRAM 933 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 933
-    // (none)
-
-    // Amplitude(s) for diagram number 933
-    FFV1_0( w_fp[99], w_fp[94], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 933 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 934 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 934
-    // (none)
-
-    // Amplitude(s) for diagram number 934
-    FFV1_0( w_fp[99], w_fp[2], w_fp[70], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 934 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[40] += amp_sv[0];
-    jamp_sv[82] -= amp_sv[0];
-    jamp_sv[106] -= amp_sv[0];
-    jamp_sv[116] += amp_sv[0];
-
-    // *** DIAGRAM 935 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 935
-    // (none)
-
-    // Amplitude(s) for diagram number 935
-    FFV1_0( w_fp[53], w_fp[94], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 935 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 936 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 936
-    // (none)
-
-    // Amplitude(s) for diagram number 936
-    VVV1_0( w_fp[0], w_fp[70], w_fp[54], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 936 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 937 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 937
-    // (none)
-
-    // Amplitude(s) for diagram number 937
-    FFV1_0( w_fp[46], w_fp[2], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[46], w_fp[2], w_fp[103], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[46], w_fp[2], w_fp[63], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 938 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 938
-    // (none)
-
-    // Amplitude(s) for diagram number 938
-    VVV1_0( w_fp[65], w_fp[20], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 938 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 939 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 939
-    // (none)
-
-    // Amplitude(s) for diagram number 939
-    FFV1_0( w_fp[28], w_fp[2], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 939 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[2] += amp_sv[0];
-    jamp_sv[12] -= amp_sv[0];
-    jamp_sv[36] -= amp_sv[0];
-    jamp_sv[78] += amp_sv[0];
-
-    // *** DIAGRAM 940 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 940
-    // (none)
-
-    // Amplitude(s) for diagram number 940
-    FFV1_0( w_fp[62], w_fp[94], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 940 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 941 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 941
-    // (none)
-
-    // Amplitude(s) for diagram number 941
-    FFV1_0( w_fp[62], w_fp[2], w_fp[69], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 941 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[38] += amp_sv[0];
-    jamp_sv[56] -= amp_sv[0];
-    jamp_sv[62] += amp_sv[0];
-    jamp_sv[80] -= amp_sv[0];
-
-    // *** DIAGRAM 942 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 942
-    // (none)
-
-    // Amplitude(s) for diagram number 942
-    FFV1_0( w_fp[28], w_fp[94], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 942 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 943 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 943
-    // (none)
-
-    // Amplitude(s) for diagram number 943
-    VVV1_0( w_fp[0], w_fp[69], w_fp[20], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 943 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 944 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 944
-    // (none)
-
-    // Amplitude(s) for diagram number 944
-    FFV1_0( w_fp[41], w_fp[2], w_fp[93], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[41], w_fp[2], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[41], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 945 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 945
-    // (none)
-
-    // Amplitude(s) for diagram number 945
-    FFV1_0( w_fp[3], w_fp[15], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 945 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 946 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 946
-    // (none)
-
-    // Amplitude(s) for diagram number 946
-    FFV1_0( w_fp[14], w_fp[2], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 946 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 947 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 947
-    // (none)
-
-    // Amplitude(s) for diagram number 947
-    FFV1_0( w_fp[3], w_fp[94], w_fp[101], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 947 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 948 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 948
-    // (none)
-
-    // Amplitude(s) for diagram number 948
-    FFV1_0( w_fp[71], w_fp[2], w_fp[101], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 948 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 949 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 949
-    // (none)
-
-    // Amplitude(s) for diagram number 949
-    FFV1_0( w_fp[14], w_fp[94], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 949 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[36] += amp_sv[0];
-    jamp_sv[37] -= amp_sv[0];
-    jamp_sv[78] -= amp_sv[0];
-    jamp_sv[79] += amp_sv[0];
-
-    // *** DIAGRAM 950 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 950
-    // (none)
-
-    // Amplitude(s) for diagram number 950
-    FFV1_0( w_fp[71], w_fp[15], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 950 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[66] += amp_sv[0];
-    jamp_sv[67] -= amp_sv[0];
-    jamp_sv[108] -= amp_sv[0];
-    jamp_sv[109] += amp_sv[0];
-
-    // *** DIAGRAM 951 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 951
-    VVV1P0_1( w_fp[0], w_fp[72], COUPs[0], 1.0, 0., 0., w_fp[71] );
-
-    // Amplitude(s) for diagram number 951
-    VVV1_0( w_fp[71], w_fp[13], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 951 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[5] -= amp_sv[0];
-    jamp_sv[19] += amp_sv[0];
-    jamp_sv[43] += amp_sv[0];
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[52] -= amp_sv[0];
-    jamp_sv[58] -= amp_sv[0];
-    jamp_sv[60] -= amp_sv[0];
-    jamp_sv[61] += amp_sv[0];
-    jamp_sv[63] += amp_sv[0];
-    jamp_sv[65] -= amp_sv[0];
-    jamp_sv[68] += amp_sv[0];
-    jamp_sv[73] += amp_sv[0];
-    jamp_sv[76] -= amp_sv[0];
-    jamp_sv[82] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-    jamp_sv[103] -= amp_sv[0];
-
-    // *** DIAGRAM 952 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 952
-    // (none)
-
-    // Amplitude(s) for diagram number 952
-    VVV1_0( w_fp[71], w_fp[10], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 952 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[4] -= amp_sv[0];
-    jamp_sv[18] += amp_sv[0];
-    jamp_sv[42] += amp_sv[0];
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[52] -= amp_sv[0];
-    jamp_sv[58] -= amp_sv[0];
-    jamp_sv[68] += amp_sv[0];
-    jamp_sv[73] += amp_sv[0];
-    jamp_sv[76] -= amp_sv[0];
-    jamp_sv[82] -= amp_sv[0];
-    jamp_sv[84] -= amp_sv[0];
-    jamp_sv[85] += amp_sv[0];
-    jamp_sv[87] += amp_sv[0];
-    jamp_sv[89] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-    jamp_sv[102] -= amp_sv[0];
-
-    // *** DIAGRAM 953 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 953
-    // (none)
-
-    // Amplitude(s) for diagram number 953
-    VVVV1_0( w_fp[8], w_fp[4], w_fp[5], w_fp[71], COUPs[2], 1.0, &amp_fp[0] );
-    jamp_sv[5] -= amp_sv[0];
-    jamp_sv[19] += amp_sv[0];
-    jamp_sv[43] += amp_sv[0];
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[52] -= amp_sv[0];
-    jamp_sv[58] -= amp_sv[0];
-    jamp_sv[60] -= amp_sv[0];
-    jamp_sv[61] += amp_sv[0];
-    jamp_sv[63] += amp_sv[0];
-    jamp_sv[65] -= amp_sv[0];
-    jamp_sv[68] += amp_sv[0];
-    jamp_sv[73] += amp_sv[0];
-    jamp_sv[76] -= amp_sv[0];
-    jamp_sv[82] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-    jamp_sv[103] -= amp_sv[0];
-    VVVV3_0( w_fp[8], w_fp[4], w_fp[5], w_fp[71], COUPs[2], 1.0, &amp_fp[0] );
-    jamp_sv[4] -= amp_sv[0];
-    jamp_sv[18] += amp_sv[0];
-    jamp_sv[42] += amp_sv[0];
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[52] -= amp_sv[0];
-    jamp_sv[58] -= amp_sv[0];
-    jamp_sv[68] += amp_sv[0];
-    jamp_sv[73] += amp_sv[0];
-    jamp_sv[76] -= amp_sv[0];
-    jamp_sv[82] -= amp_sv[0];
-    jamp_sv[84] -= amp_sv[0];
-    jamp_sv[85] += amp_sv[0];
-    jamp_sv[87] += amp_sv[0];
-    jamp_sv[89] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-    jamp_sv[102] -= amp_sv[0];
-    VVVV4_0( w_fp[8], w_fp[4], w_fp[5], w_fp[71], COUPs[2], 1.0, &amp_fp[0] );
-    jamp_sv[4] -= amp_sv[0];
-    jamp_sv[5] += amp_sv[0];
-    jamp_sv[18] += amp_sv[0];
-    jamp_sv[19] -= amp_sv[0];
-    jamp_sv[42] += amp_sv[0];
-    jamp_sv[43] -= amp_sv[0];
-    jamp_sv[60] += amp_sv[0];
-    jamp_sv[61] -= amp_sv[0];
-    jamp_sv[63] -= amp_sv[0];
-    jamp_sv[65] += amp_sv[0];
-    jamp_sv[84] -= amp_sv[0];
-    jamp_sv[85] += amp_sv[0];
-    jamp_sv[87] += amp_sv[0];
-    jamp_sv[89] -= amp_sv[0];
-    jamp_sv[102] -= amp_sv[0];
-    jamp_sv[103] += amp_sv[0];
-
-    // *** DIAGRAM 954 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 954
-    // (none)
-
-    // Amplitude(s) for diagram number 954
-    VVV1_0( w_fp[56], w_fp[74], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 954 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[4] += amp_sv[0];
-    jamp_sv[7] -= amp_sv[0];
-    jamp_sv[10] += amp_sv[0];
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[14] += amp_sv[0];
-    jamp_sv[15] -= amp_sv[0];
-    jamp_sv[16] += amp_sv[0];
-    jamp_sv[18] -= amp_sv[0];
-    jamp_sv[45] -= amp_sv[0];
-    jamp_sv[59] += amp_sv[0];
-    jamp_sv[69] -= amp_sv[0];
-    jamp_sv[83] += amp_sv[0];
-    jamp_sv[87] -= amp_sv[0];
-    jamp_sv[89] += amp_sv[0];
-    jamp_sv[93] -= amp_sv[0];
-    jamp_sv[105] += amp_sv[0];
-
-    // *** DIAGRAM 955 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 955
-    // (none)
-
-    // Amplitude(s) for diagram number 955
-    VVV1_0( w_fp[56], w_fp[75], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 955 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[5] += amp_sv[0];
-    jamp_sv[7] -= amp_sv[0];
-    jamp_sv[8] += amp_sv[0];
-    jamp_sv[9] -= amp_sv[0];
-    jamp_sv[10] += amp_sv[0];
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[16] += amp_sv[0];
-    jamp_sv[19] -= amp_sv[0];
-    jamp_sv[47] -= amp_sv[0];
-    jamp_sv[59] += amp_sv[0];
-    jamp_sv[63] -= amp_sv[0];
-    jamp_sv[65] += amp_sv[0];
-    jamp_sv[69] -= amp_sv[0];
-    jamp_sv[83] += amp_sv[0];
-    jamp_sv[93] -= amp_sv[0];
-    jamp_sv[107] += amp_sv[0];
-
-    // *** DIAGRAM 956 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 956
-    // (none)
-
-    // Amplitude(s) for diagram number 956
-    VVVV1_0( w_fp[72], w_fp[4], w_fp[5], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
-    jamp_sv[4] += amp_sv[0];
-    jamp_sv[7] -= amp_sv[0];
-    jamp_sv[10] += amp_sv[0];
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[14] += amp_sv[0];
-    jamp_sv[15] -= amp_sv[0];
-    jamp_sv[16] += amp_sv[0];
-    jamp_sv[18] -= amp_sv[0];
-    jamp_sv[45] -= amp_sv[0];
-    jamp_sv[59] += amp_sv[0];
-    jamp_sv[69] -= amp_sv[0];
-    jamp_sv[83] += amp_sv[0];
-    jamp_sv[87] -= amp_sv[0];
-    jamp_sv[89] += amp_sv[0];
-    jamp_sv[93] -= amp_sv[0];
-    jamp_sv[105] += amp_sv[0];
-    VVVV3_0( w_fp[72], w_fp[4], w_fp[5], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
-    jamp_sv[5] += amp_sv[0];
-    jamp_sv[7] -= amp_sv[0];
-    jamp_sv[8] += amp_sv[0];
-    jamp_sv[9] -= amp_sv[0];
-    jamp_sv[10] += amp_sv[0];
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[16] += amp_sv[0];
-    jamp_sv[19] -= amp_sv[0];
-    jamp_sv[47] -= amp_sv[0];
-    jamp_sv[59] += amp_sv[0];
-    jamp_sv[63] -= amp_sv[0];
-    jamp_sv[65] += amp_sv[0];
-    jamp_sv[69] -= amp_sv[0];
-    jamp_sv[83] += amp_sv[0];
-    jamp_sv[93] -= amp_sv[0];
-    jamp_sv[107] += amp_sv[0];
-    VVVV4_0( w_fp[72], w_fp[4], w_fp[5], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
-    jamp_sv[4] -= amp_sv[0];
-    jamp_sv[5] += amp_sv[0];
-    jamp_sv[8] += amp_sv[0];
-    jamp_sv[9] -= amp_sv[0];
-    jamp_sv[14] -= amp_sv[0];
-    jamp_sv[15] += amp_sv[0];
-    jamp_sv[18] += amp_sv[0];
-    jamp_sv[19] -= amp_sv[0];
-    jamp_sv[45] += amp_sv[0];
-    jamp_sv[47] -= amp_sv[0];
-    jamp_sv[63] -= amp_sv[0];
-    jamp_sv[65] += amp_sv[0];
-    jamp_sv[87] += amp_sv[0];
-    jamp_sv[89] -= amp_sv[0];
-    jamp_sv[105] -= amp_sv[0];
-    jamp_sv[107] += amp_sv[0];
-
-    // *** DIAGRAM 957 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 957
-    // (none)
-
-    // Amplitude(s) for diagram number 957
-    VVV1_0( w_fp[0], w_fp[74], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 957 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[4] += amp_sv[0];
-    jamp_sv[7] -= amp_sv[0];
-    jamp_sv[10] += amp_sv[0];
-    jamp_sv[18] -= amp_sv[0];
-    jamp_sv[44] -= amp_sv[0];
-    jamp_sv[58] += amp_sv[0];
-    jamp_sv[68] -= amp_sv[0];
-    jamp_sv[73] -= amp_sv[0];
-    jamp_sv[74] += amp_sv[0];
-    jamp_sv[75] -= amp_sv[0];
-    jamp_sv[76] += amp_sv[0];
-    jamp_sv[83] += amp_sv[0];
-    jamp_sv[87] -= amp_sv[0];
-    jamp_sv[89] += amp_sv[0];
-    jamp_sv[93] -= amp_sv[0];
-    jamp_sv[104] += amp_sv[0];
-
-    // *** DIAGRAM 958 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 958
-    // (none)
-
-    // Amplitude(s) for diagram number 958
-    VVV1_0( w_fp[0], w_fp[75], w_fp[13], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 958 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[5] += amp_sv[0];
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[16] += amp_sv[0];
-    jamp_sv[19] -= amp_sv[0];
-    jamp_sv[46] -= amp_sv[0];
-    jamp_sv[49] -= amp_sv[0];
-    jamp_sv[50] += amp_sv[0];
-    jamp_sv[51] -= amp_sv[0];
-    jamp_sv[52] += amp_sv[0];
-    jamp_sv[59] += amp_sv[0];
-    jamp_sv[63] -= amp_sv[0];
-    jamp_sv[65] += amp_sv[0];
-    jamp_sv[69] -= amp_sv[0];
-    jamp_sv[82] += amp_sv[0];
-    jamp_sv[92] -= amp_sv[0];
-    jamp_sv[106] += amp_sv[0];
-
-    // *** DIAGRAM 959 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 959
-    VVVV1P0_1( w_fp[0], w_fp[72], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[94] );
-    VVVV3P0_1( w_fp[0], w_fp[72], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[65] );
-    VVVV4P0_1( w_fp[0], w_fp[72], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[21] );
-
-    // Amplitude(s) for diagram number 959
-    VVV1_0( w_fp[8], w_fp[5], w_fp[94], COUPs[0], 1.0, &amp_fp[0] );
-    jamp_sv[4] += amp_sv[0];
-    jamp_sv[7] -= amp_sv[0];
-    jamp_sv[10] += amp_sv[0];
-    jamp_sv[18] -= amp_sv[0];
-    jamp_sv[44] -= amp_sv[0];
-    jamp_sv[58] += amp_sv[0];
-    jamp_sv[68] -= amp_sv[0];
-    jamp_sv[73] -= amp_sv[0];
-    jamp_sv[74] += amp_sv[0];
-    jamp_sv[75] -= amp_sv[0];
-    jamp_sv[76] += amp_sv[0];
-    jamp_sv[83] += amp_sv[0];
-    jamp_sv[87] -= amp_sv[0];
-    jamp_sv[89] += amp_sv[0];
-    jamp_sv[93] -= amp_sv[0];
-    jamp_sv[104] += amp_sv[0];
-    VVV1_0( w_fp[8], w_fp[5], w_fp[65], COUPs[0], 1.0, &amp_fp[0] );
-    jamp_sv[7] -= amp_sv[0];
-    jamp_sv[10] += amp_sv[0];
-    jamp_sv[42] += amp_sv[0];
-    jamp_sv[44] -= amp_sv[0];
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[52] -= amp_sv[0];
-    jamp_sv[74] += amp_sv[0];
-    jamp_sv[75] -= amp_sv[0];
-    jamp_sv[82] -= amp_sv[0];
-    jamp_sv[83] += amp_sv[0];
-    jamp_sv[84] -= amp_sv[0];
-    jamp_sv[85] += amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-    jamp_sv[93] -= amp_sv[0];
-    jamp_sv[102] -= amp_sv[0];
-    jamp_sv[104] += amp_sv[0];
-    VVV1_0( w_fp[8], w_fp[5], w_fp[21], COUPs[0], 1.0, &amp_fp[0] );
-    jamp_sv[4] -= amp_sv[0];
-    jamp_sv[18] += amp_sv[0];
-    jamp_sv[42] += amp_sv[0];
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[52] -= amp_sv[0];
-    jamp_sv[58] -= amp_sv[0];
-    jamp_sv[68] += amp_sv[0];
-    jamp_sv[73] += amp_sv[0];
-    jamp_sv[76] -= amp_sv[0];
-    jamp_sv[82] -= amp_sv[0];
-    jamp_sv[84] -= amp_sv[0];
-    jamp_sv[85] += amp_sv[0];
-    jamp_sv[87] += amp_sv[0];
-    jamp_sv[89] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-    jamp_sv[102] -= amp_sv[0];
-
-    // *** DIAGRAM 960 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 960
-    VVVV1P0_1( w_fp[0], w_fp[72], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[90] );
-    VVVV3P0_1( w_fp[0], w_fp[72], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[93] );
-    VVVV4P0_1( w_fp[0], w_fp[72], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[69] );
-
-    // Amplitude(s) for diagram number 960
-    VVV1_0( w_fp[8], w_fp[4], w_fp[90], COUPs[0], 1.0, &amp_fp[0] );
-    jamp_sv[5] += amp_sv[0];
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[16] += amp_sv[0];
-    jamp_sv[19] -= amp_sv[0];
-    jamp_sv[46] -= amp_sv[0];
-    jamp_sv[49] -= amp_sv[0];
-    jamp_sv[50] += amp_sv[0];
-    jamp_sv[51] -= amp_sv[0];
-    jamp_sv[52] += amp_sv[0];
-    jamp_sv[59] += amp_sv[0];
-    jamp_sv[63] -= amp_sv[0];
-    jamp_sv[65] += amp_sv[0];
-    jamp_sv[69] -= amp_sv[0];
-    jamp_sv[82] += amp_sv[0];
-    jamp_sv[92] -= amp_sv[0];
-    jamp_sv[106] += amp_sv[0];
-    VVV1_0( w_fp[8], w_fp[4], w_fp[93], COUPs[0], 1.0, &amp_fp[0] );
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[16] += amp_sv[0];
-    jamp_sv[43] += amp_sv[0];
-    jamp_sv[46] -= amp_sv[0];
-    jamp_sv[50] += amp_sv[0];
-    jamp_sv[51] -= amp_sv[0];
-    jamp_sv[58] -= amp_sv[0];
-    jamp_sv[59] += amp_sv[0];
-    jamp_sv[60] -= amp_sv[0];
-    jamp_sv[61] += amp_sv[0];
-    jamp_sv[68] += amp_sv[0];
-    jamp_sv[69] -= amp_sv[0];
-    jamp_sv[73] += amp_sv[0];
-    jamp_sv[76] -= amp_sv[0];
-    jamp_sv[103] -= amp_sv[0];
-    jamp_sv[106] += amp_sv[0];
-    VVV1_0( w_fp[8], w_fp[4], w_fp[69], COUPs[0], 1.0, &amp_fp[0] );
-    jamp_sv[5] -= amp_sv[0];
-    jamp_sv[19] += amp_sv[0];
-    jamp_sv[43] += amp_sv[0];
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[52] -= amp_sv[0];
-    jamp_sv[58] -= amp_sv[0];
-    jamp_sv[60] -= amp_sv[0];
-    jamp_sv[61] += amp_sv[0];
-    jamp_sv[63] += amp_sv[0];
-    jamp_sv[65] -= amp_sv[0];
-    jamp_sv[68] += amp_sv[0];
-    jamp_sv[73] += amp_sv[0];
-    jamp_sv[76] -= amp_sv[0];
-    jamp_sv[82] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-    jamp_sv[103] -= amp_sv[0];
-
-    // *** DIAGRAM 961 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 961
-    // (none)
-
-    // Amplitude(s) for diagram number 961
-    VVV1_0( w_fp[72], w_fp[5], w_fp[107], COUPs[0], 1.0, &amp_fp[0] );
-    jamp_sv[5] -= amp_sv[0];
-    jamp_sv[13] += amp_sv[0];
-    jamp_sv[16] -= amp_sv[0];
-    jamp_sv[19] += amp_sv[0];
-    jamp_sv[46] += amp_sv[0];
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[50] -= amp_sv[0];
-    jamp_sv[51] += amp_sv[0];
-    jamp_sv[52] -= amp_sv[0];
-    jamp_sv[59] -= amp_sv[0];
-    jamp_sv[63] += amp_sv[0];
-    jamp_sv[65] -= amp_sv[0];
-    jamp_sv[69] += amp_sv[0];
-    jamp_sv[82] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-    jamp_sv[106] -= amp_sv[0];
-    VVV1_0( w_fp[72], w_fp[5], w_fp[95], COUPs[0], 1.0, &amp_fp[0] );
-    jamp_sv[7] -= amp_sv[0];
-    jamp_sv[8] += amp_sv[0];
-    jamp_sv[9] -= amp_sv[0];
-    jamp_sv[10] += amp_sv[0];
-    jamp_sv[46] += amp_sv[0];
-    jamp_sv[47] -= amp_sv[0];
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[50] -= amp_sv[0];
-    jamp_sv[51] += amp_sv[0];
-    jamp_sv[52] -= amp_sv[0];
-    jamp_sv[82] -= amp_sv[0];
-    jamp_sv[83] += amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-    jamp_sv[93] -= amp_sv[0];
-    jamp_sv[106] -= amp_sv[0];
-    jamp_sv[107] += amp_sv[0];
-    VVV1_0( w_fp[72], w_fp[5], w_fp[105], COUPs[0], 1.0, &amp_fp[0] );
-    jamp_sv[5] += amp_sv[0];
-    jamp_sv[7] -= amp_sv[0];
-    jamp_sv[8] += amp_sv[0];
-    jamp_sv[9] -= amp_sv[0];
-    jamp_sv[10] += amp_sv[0];
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[16] += amp_sv[0];
-    jamp_sv[19] -= amp_sv[0];
-    jamp_sv[47] -= amp_sv[0];
-    jamp_sv[59] += amp_sv[0];
-    jamp_sv[63] -= amp_sv[0];
-    jamp_sv[65] += amp_sv[0];
-    jamp_sv[69] -= amp_sv[0];
-    jamp_sv[83] += amp_sv[0];
-    jamp_sv[93] -= amp_sv[0];
-    jamp_sv[107] += amp_sv[0];
-
-    // *** DIAGRAM 962 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 962
-    // (none)
-
-    // Amplitude(s) for diagram number 962
-    VVV1_0( w_fp[72], w_fp[4], w_fp[115], COUPs[0], 1.0, &amp_fp[0] );
-    jamp_sv[4] -= amp_sv[0];
-    jamp_sv[7] += amp_sv[0];
-    jamp_sv[10] -= amp_sv[0];
-    jamp_sv[18] += amp_sv[0];
-    jamp_sv[44] += amp_sv[0];
-    jamp_sv[58] -= amp_sv[0];
-    jamp_sv[68] += amp_sv[0];
-    jamp_sv[73] += amp_sv[0];
-    jamp_sv[74] -= amp_sv[0];
-    jamp_sv[75] += amp_sv[0];
-    jamp_sv[76] -= amp_sv[0];
-    jamp_sv[83] -= amp_sv[0];
-    jamp_sv[87] += amp_sv[0];
-    jamp_sv[89] -= amp_sv[0];
-    jamp_sv[93] += amp_sv[0];
-    jamp_sv[104] -= amp_sv[0];
-    VVV1_0( w_fp[72], w_fp[4], w_fp[116], COUPs[0], 1.0, &amp_fp[0] );
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[14] += amp_sv[0];
-    jamp_sv[15] -= amp_sv[0];
-    jamp_sv[16] += amp_sv[0];
-    jamp_sv[44] += amp_sv[0];
-    jamp_sv[45] -= amp_sv[0];
-    jamp_sv[58] -= amp_sv[0];
-    jamp_sv[59] += amp_sv[0];
-    jamp_sv[68] += amp_sv[0];
-    jamp_sv[69] -= amp_sv[0];
-    jamp_sv[73] += amp_sv[0];
-    jamp_sv[74] -= amp_sv[0];
-    jamp_sv[75] += amp_sv[0];
-    jamp_sv[76] -= amp_sv[0];
-    jamp_sv[104] -= amp_sv[0];
-    jamp_sv[105] += amp_sv[0];
-    VVV1_0( w_fp[72], w_fp[4], w_fp[117], COUPs[0], 1.0, &amp_fp[0] );
-    jamp_sv[4] += amp_sv[0];
-    jamp_sv[7] -= amp_sv[0];
-    jamp_sv[10] += amp_sv[0];
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[14] += amp_sv[0];
-    jamp_sv[15] -= amp_sv[0];
-    jamp_sv[16] += amp_sv[0];
-    jamp_sv[18] -= amp_sv[0];
-    jamp_sv[45] -= amp_sv[0];
-    jamp_sv[59] += amp_sv[0];
-    jamp_sv[69] -= amp_sv[0];
-    jamp_sv[83] += amp_sv[0];
-    jamp_sv[87] -= amp_sv[0];
-    jamp_sv[89] += amp_sv[0];
-    jamp_sv[93] -= amp_sv[0];
-    jamp_sv[105] += amp_sv[0];
-
-    // *** DIAGRAM 963 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 963
-    // (none)
-
-    // Amplitude(s) for diagram number 963
-    VVVV1_0( w_fp[0], w_fp[72], w_fp[8], w_fp[24], COUPs[2], 1.0, &amp_fp[0] );
-    jamp_sv[4] += amp_sv[0];
-    jamp_sv[5] -= amp_sv[0];
-    jamp_sv[18] -= amp_sv[0];
-    jamp_sv[19] += amp_sv[0];
-    jamp_sv[42] -= amp_sv[0];
-    jamp_sv[43] += amp_sv[0];
-    jamp_sv[60] -= amp_sv[0];
-    jamp_sv[61] += amp_sv[0];
-    jamp_sv[63] += amp_sv[0];
-    jamp_sv[65] -= amp_sv[0];
-    jamp_sv[84] += amp_sv[0];
-    jamp_sv[85] -= amp_sv[0];
-    jamp_sv[87] -= amp_sv[0];
-    jamp_sv[89] += amp_sv[0];
-    jamp_sv[102] += amp_sv[0];
-    jamp_sv[103] -= amp_sv[0];
-    VVVV3_0( w_fp[0], w_fp[72], w_fp[8], w_fp[24], COUPs[2], 1.0, &amp_fp[0] );
-    jamp_sv[4] += amp_sv[0];
-    jamp_sv[5] -= amp_sv[0];
-    jamp_sv[8] -= amp_sv[0];
-    jamp_sv[9] += amp_sv[0];
-    jamp_sv[14] += amp_sv[0];
-    jamp_sv[15] -= amp_sv[0];
-    jamp_sv[18] -= amp_sv[0];
-    jamp_sv[19] += amp_sv[0];
-    jamp_sv[45] -= amp_sv[0];
-    jamp_sv[47] += amp_sv[0];
-    jamp_sv[63] += amp_sv[0];
-    jamp_sv[65] -= amp_sv[0];
-    jamp_sv[87] -= amp_sv[0];
-    jamp_sv[89] += amp_sv[0];
-    jamp_sv[105] += amp_sv[0];
-    jamp_sv[107] -= amp_sv[0];
-    VVVV4_0( w_fp[0], w_fp[72], w_fp[8], w_fp[24], COUPs[2], 1.0, &amp_fp[0] );
-    jamp_sv[8] -= amp_sv[0];
-    jamp_sv[9] += amp_sv[0];
-    jamp_sv[14] += amp_sv[0];
-    jamp_sv[15] -= amp_sv[0];
-    jamp_sv[42] += amp_sv[0];
-    jamp_sv[43] -= amp_sv[0];
-    jamp_sv[45] -= amp_sv[0];
-    jamp_sv[47] += amp_sv[0];
-    jamp_sv[60] += amp_sv[0];
-    jamp_sv[61] -= amp_sv[0];
-    jamp_sv[84] -= amp_sv[0];
-    jamp_sv[85] += amp_sv[0];
-    jamp_sv[102] -= amp_sv[0];
-    jamp_sv[103] += amp_sv[0];
-    jamp_sv[105] += amp_sv[0];
-    jamp_sv[107] -= amp_sv[0];
-
-    // *** DIAGRAM 964 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 964
-    // (none)
-
-    // Amplitude(s) for diagram number 964
-    VVV1_0( w_fp[8], w_fp[24], w_fp[71], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 964 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[4] += amp_sv[0];
-    jamp_sv[5] -= amp_sv[0];
-    jamp_sv[18] -= amp_sv[0];
-    jamp_sv[19] += amp_sv[0];
-    jamp_sv[42] -= amp_sv[0];
-    jamp_sv[43] += amp_sv[0];
-    jamp_sv[60] -= amp_sv[0];
-    jamp_sv[61] += amp_sv[0];
-    jamp_sv[63] += amp_sv[0];
-    jamp_sv[65] -= amp_sv[0];
-    jamp_sv[84] += amp_sv[0];
-    jamp_sv[85] -= amp_sv[0];
-    jamp_sv[87] -= amp_sv[0];
-    jamp_sv[89] += amp_sv[0];
-    jamp_sv[102] += amp_sv[0];
-    jamp_sv[103] -= amp_sv[0];
-
-    // *** DIAGRAM 965 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 965
-    // (none)
-
-    // Amplitude(s) for diagram number 965
-    VVV1_0( w_fp[72], w_fp[24], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 965 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[4] += amp_sv[0];
-    jamp_sv[5] -= amp_sv[0];
-    jamp_sv[8] -= amp_sv[0];
-    jamp_sv[9] += amp_sv[0];
-    jamp_sv[14] += amp_sv[0];
-    jamp_sv[15] -= amp_sv[0];
-    jamp_sv[18] -= amp_sv[0];
-    jamp_sv[19] += amp_sv[0];
-    jamp_sv[45] -= amp_sv[0];
-    jamp_sv[47] += amp_sv[0];
-    jamp_sv[63] += amp_sv[0];
-    jamp_sv[65] -= amp_sv[0];
-    jamp_sv[87] -= amp_sv[0];
-    jamp_sv[89] += amp_sv[0];
-    jamp_sv[105] += amp_sv[0];
-    jamp_sv[107] -= amp_sv[0];
-
-    // *** DIAGRAM 966 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 966
-    // (none)
-
-    // Amplitude(s) for diagram number 966
-    VVV1_0( w_fp[72], w_fp[8], w_fp[98], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 966 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[8] -= amp_sv[0];
-    jamp_sv[9] += amp_sv[0];
-    jamp_sv[14] += amp_sv[0];
-    jamp_sv[15] -= amp_sv[0];
-    jamp_sv[42] += amp_sv[0];
-    jamp_sv[43] -= amp_sv[0];
-    jamp_sv[45] -= amp_sv[0];
-    jamp_sv[47] += amp_sv[0];
-    jamp_sv[60] += amp_sv[0];
-    jamp_sv[61] -= amp_sv[0];
-    jamp_sv[84] -= amp_sv[0];
-    jamp_sv[85] += amp_sv[0];
-    jamp_sv[102] -= amp_sv[0];
-    jamp_sv[103] += amp_sv[0];
-    jamp_sv[105] += amp_sv[0];
-    jamp_sv[107] -= amp_sv[0];
-
-    // *** DIAGRAM 967 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 967
-    // (none)
-
-    // Amplitude(s) for diagram number 967
-    VVV1_0( w_fp[71], w_fp[37], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 967 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 968 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 968
-    // (none)
-
-    // Amplitude(s) for diagram number 968
-    FFV1_0( w_fp[3], w_fp[35], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 968 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[60] += amp_sv[0];
-    jamp_sv[61] -= amp_sv[0];
-    jamp_sv[63] -= amp_sv[0];
-    jamp_sv[65] += amp_sv[0];
-
-    // *** DIAGRAM 969 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 969
-    // (none)
-
-    // Amplitude(s) for diagram number 969
-    FFV1_0( w_fp[76], w_fp[114], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 969 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 970 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 970
-    // (none)
-
-    // Amplitude(s) for diagram number 970
-    FFV1_0( w_fp[3], w_fp[114], w_fp[75], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 970 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[50] -= amp_sv[0];
-    jamp_sv[51] += amp_sv[0];
-    jamp_sv[52] -= amp_sv[0];
-
-    // *** DIAGRAM 971 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 971
-    // (none)
-
-    // Amplitude(s) for diagram number 971
-    FFV1_0( w_fp[76], w_fp[35], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 971 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 972 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 972
-    // (none)
-
-    // Amplitude(s) for diagram number 972
-    VVV1_0( w_fp[0], w_fp[75], w_fp[37], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 972 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 973 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 973
-    // (none)
-
-    // Amplitude(s) for diagram number 973
-    FFV1_0( w_fp[3], w_fp[33], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[33], w_fp[93], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[33], w_fp[69], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 974 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 974
-    // (none)
-
-    // Amplitude(s) for diagram number 974
-    FFV1_0( w_fp[38], w_fp[33], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 974 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[52] -= amp_sv[0];
-    jamp_sv[58] -= amp_sv[0];
-    jamp_sv[68] += amp_sv[0];
-
-    // *** DIAGRAM 975 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 975
-    // (none)
-
-    // Amplitude(s) for diagram number 975
-    FFV1_0( w_fp[38], w_fp[114], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 975 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 976 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 976
-    // (none)
-
-    // Amplitude(s) for diagram number 976
-    FFV1_0( w_fp[104], w_fp[33], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 976 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 977 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 977
-    // (none)
-
-    // Amplitude(s) for diagram number 977
-    VVV1_0( w_fp[71], w_fp[45], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 977 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 978 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 978
-    // (none)
-
-    // Amplitude(s) for diagram number 978
-    FFV1_0( w_fp[3], w_fp[43], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 978 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[84] += amp_sv[0];
-    jamp_sv[85] -= amp_sv[0];
-    jamp_sv[87] -= amp_sv[0];
-    jamp_sv[89] += amp_sv[0];
-
-    // *** DIAGRAM 979 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 979
-    // (none)
-
-    // Amplitude(s) for diagram number 979
-    FFV1_0( w_fp[76], w_fp[102], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 979 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 980 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 980
-    // (none)
-
-    // Amplitude(s) for diagram number 980
-    FFV1_0( w_fp[3], w_fp[102], w_fp[74], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 980 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[73] += amp_sv[0];
-    jamp_sv[74] -= amp_sv[0];
-    jamp_sv[75] += amp_sv[0];
-    jamp_sv[76] -= amp_sv[0];
-
-    // *** DIAGRAM 981 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 981
-    // (none)
-
-    // Amplitude(s) for diagram number 981
-    FFV1_0( w_fp[76], w_fp[43], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 981 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 982 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 982
-    // (none)
-
-    // Amplitude(s) for diagram number 982
-    VVV1_0( w_fp[0], w_fp[74], w_fp[45], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 982 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 983 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 983
-    // (none)
-
-    // Amplitude(s) for diagram number 983
-    FFV1_0( w_fp[3], w_fp[39], w_fp[94], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[39], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[39], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
-    jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 984 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 984
-    // (none)
-
-    // Amplitude(s) for diagram number 984
-    FFV1_0( w_fp[46], w_fp[39], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 984 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[73] += amp_sv[0];
-    jamp_sv[76] -= amp_sv[0];
-    jamp_sv[82] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-
-    // *** DIAGRAM 985 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 985
-    // (none)
-
-    // Amplitude(s) for diagram number 985
-    FFV1_0( w_fp[46], w_fp[102], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 985 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 986 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 986
-    // (none)
-
-    // Amplitude(s) for diagram number 986
-    FFV1_0( w_fp[99], w_fp[39], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 986 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId !=
0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 987 OF 1240 *** - - // Wavefunction(s) for diagram number 987 - // (none) - - // Amplitude(s) for diagram number 987 - VVV1_0( w_fp[71], w_fp[54], w_fp[5], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 987 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 988 OF 1240 *** - - // Wavefunction(s) for diagram number 988 - // (none) - - // Amplitude(s) for diagram number 988 - FFV1_0( w_fp[7], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 988 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[5] += amp_sv[0]; - jamp_sv[19] -= amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[103] += amp_sv[0]; - - // *** DIAGRAM 989 OF 1240 *** - - // Wavefunction(s) for diagram number 989 - // (none) - - // Amplitude(s) for diagram number 989 - FFV1_0( w_fp[99], w_fp[97], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 989 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 990 OF 1240 *** - - // Wavefunction(s) for diagram number 990 - // (none) - - // Amplitude(s) for diagram number 990 - FFV1_0( w_fp[99], w_fp[2], w_fp[75], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 990 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[46] += amp_sv[0]; - jamp_sv[82] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - - // *** DIAGRAM 991 OF 1240 *** - - // Wavefunction(s) for diagram number 991 - // (none) - - // Amplitude(s) for diagram number 991 - FFV1_0( w_fp[7], w_fp[97], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 991 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 992 OF 1240 *** - - // Wavefunction(s) for diagram number 992 - // (none) - - // Amplitude(s) for diagram number 992 - VVV1_0( w_fp[0], w_fp[75], w_fp[54], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 992 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 993 OF 1240 *** - - // 
Wavefunction(s) for diagram number 993 - // (none) - - // Amplitude(s) for diagram number 993 - FFV1_0( w_fp[46], w_fp[2], w_fp[90], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[93], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[69], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 994 OF 1240 *** - - // Wavefunction(s) for diagram number 994 - // (none) - - // Amplitude(s) for diagram number 994 - VVV1_0( w_fp[71], w_fp[23], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 994 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 995 OF 1240 *** - - // Wavefunction(s) for diagram number 995 - // (none) - - // Amplitude(s) for diagram number 995 - FFV1_0( w_fp[25], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 995 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] += amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[102] += amp_sv[0]; - - // *** DIAGRAM 996 OF 1240 *** - - // Wavefunction(s) for diagram number 996 - // (none) - - // Amplitude(s) for diagram number 996 - FFV1_0( w_fp[104], w_fp[97], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 996 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 997 OF 1240 *** - - // Wavefunction(s) for diagram number 997 - // (none) - - // Amplitude(s) for diagram number 997 - FFV1_0( w_fp[104], w_fp[2], w_fp[74], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 997 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[44] += amp_sv[0]; - jamp_sv[58] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - - // *** 
DIAGRAM 998 OF 1240 *** - - // Wavefunction(s) for diagram number 998 - // (none) - - // Amplitude(s) for diagram number 998 - FFV1_0( w_fp[25], w_fp[97], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 998 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 999 OF 1240 *** - - // Wavefunction(s) for diagram number 999 - // (none) - - // Amplitude(s) for diagram number 999 - VVV1_0( w_fp[0], w_fp[74], w_fp[23], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 999 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1000 OF 1240 *** - - // Wavefunction(s) for diagram number 1000 - // (none) - - // Amplitude(s) for diagram number 1000 - FFV1_0( w_fp[38], w_fp[2], w_fp[94], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[65], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1001 OF 1240 *** - - // Wavefunction(s) for diagram number 1001 - // (none) - - // Amplitude(s) for diagram number 1001 - FFV1_0( w_fp[3], w_fp[17], w_fp[71], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1001 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1002 OF 1240 *** - - // Wavefunction(s) for diagram number 1002 - // (none) - - // Amplitude(s) for diagram number 1002 - FFV1_0( w_fp[26], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] ); -#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1002 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1003 OF 1240 *** - - // Wavefunction(s) for diagram number 1003 - // (none) - - // Amplitude(s) for diagram number 1003 - FFV1_0( w_fp[3], w_fp[97], w_fp[98], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1003 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1004 OF 1240 *** - - // Wavefunction(s) for diagram number 1004 - // (none) - - // Amplitude(s) for diagram number 1004 - FFV1_0( w_fp[76], w_fp[2], w_fp[98], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1004 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1005 OF 1240 *** - - // Wavefunction(s) for diagram number 1005 - // (none) - - // Amplitude(s) for diagram number 1005 - FFV1_0( w_fp[26], w_fp[97], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1005 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[42] += amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[103] += amp_sv[0]; - - // *** DIAGRAM 1006 OF 1240 *** - - // Wavefunction(s) for diagram number 1006 - // (none) - - // Amplitude(s) for diagram number 1006 - FFV1_0( w_fp[76], w_fp[17], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1006 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[60] += amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - - // *** DIAGRAM 1007 OF 1240 *** - - // Wavefunction(s) for diagram number 1007 - // (none) - - // Amplitude(s) for diagram number 1007 - VVV1_0( w_fp[56], w_fp[59], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1007 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[2] -= amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[19] += amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - 
jamp_sv[23] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 1008 OF 1240 *** - - // Wavefunction(s) for diagram number 1008 - // (none) - - // Amplitude(s) for diagram number 1008 - VVV1_0( w_fp[56], w_fp[1], w_fp[42], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1008 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[2] -= amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - jamp_sv[9] -= amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[47] -= amp_sv[0]; - jamp_sv[65] += amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 1009 OF 1240 *** - - // Wavefunction(s) for diagram number 1009 - // (none) - - // Amplitude(s) for diagram number 1009 - VVVV1_0( w_fp[1], w_fp[24], w_fp[6], w_fp[56], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[0] += amp_sv[0]; - jamp_sv[2] -= amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[19] += amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVVV3_0( w_fp[1], w_fp[24], w_fp[6], w_fp[56], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[4] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[19] += amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - VVVV4_0( w_fp[1], w_fp[24], w_fp[6], w_fp[56], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[0] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - - // *** DIAGRAM 1010 OF 1240 *** - - // Wavefunction(s) for diagram number 1010 - // (none) - - // Amplitude(s) for diagram number 1010 - VVV1_0( w_fp[98], w_fp[108], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1010 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[9] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[42] += amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - 
jamp_sv[112] += amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - - // *** DIAGRAM 1011 OF 1240 *** - - // Wavefunction(s) for diagram number 1011 - // (none) - - // Amplitude(s) for diagram number 1011 - VVV1_0( w_fp[98], w_fp[1], w_fp[11], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1011 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[8] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[84] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - jamp_sv[102] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[107] += amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - - // *** DIAGRAM 1012 OF 1240 *** - - // Wavefunction(s) for diagram number 1012 - // (none) - - // Amplitude(s) for diagram number 1012 - VVVV1_0( w_fp[1], w_fp[8], w_fp[6], w_fp[98], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[9] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[42] += amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - VVVV3_0( w_fp[1], w_fp[8], w_fp[6], w_fp[98], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[8] -= amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[42] += amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[60] += amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[103] += amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - VVVV4_0( w_fp[1], w_fp[8], w_fp[6], w_fp[98], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[8] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[38] += amp_sv[0]; - jamp_sv[60] += amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[99] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[103] += amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - - // *** DIAGRAM 1013 OF 1240 *** - - // Wavefunction(s) for diagram number 1013 - // (none) - - // Amplitude(s) for diagram number 1013 - VVV1_0( w_fp[0], w_fp[108], w_fp[42], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1013 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[9] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - - // *** DIAGRAM 1014 OF 1240 *** - - // Wavefunction(s) for diagram number 1014 - // (none) - - // Amplitude(s) for 
diagram number 1014 - VVV1_0( w_fp[0], w_fp[59], w_fp[11], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1014 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[2] -= amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[38] += amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[86] -= amp_sv[0]; - jamp_sv[96] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[99] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 1015 OF 1240 *** - - // Wavefunction(s) for diagram number 1015 - VVVV1P0_1( w_fp[0], w_fp[1], w_fp[8], COUPs[2], 1.0, 0., 0., w_fp[11] ); - VVVV3P0_1( w_fp[0], w_fp[1], w_fp[8], COUPs[2], 1.0, 0., 0., w_fp[42] ); - VVVV4P0_1( w_fp[0], w_fp[1], w_fp[8], COUPs[2], 1.0, 0., 0., w_fp[76] ); - - // Amplitude(s) for diagram number 1015 - VVV1_0( w_fp[24], w_fp[6], w_fp[11], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[9] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - VVV1_0( w_fp[24], w_fp[6], w_fp[42], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[0] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - VVV1_0( w_fp[24], w_fp[6], w_fp[76], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[0] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - - // *** DIAGRAM 1016 OF 1240 *** - - // Wavefunction(s) for diagram number 1016 - VVVV1P0_1( w_fp[0], w_fp[1], w_fp[24], COUPs[2], 1.0, 0., 0., w_fp[97] ); - VVVV3P0_1( w_fp[0], w_fp[1], w_fp[24], COUPs[2], 1.0, 0., 0., w_fp[71] ); - VVVV4P0_1( w_fp[0], w_fp[1], w_fp[24], COUPs[2], 1.0, 0., 0., w_fp[21] ); - - // Amplitude(s) for diagram number 1016 - VVV1_0( w_fp[8], w_fp[6], w_fp[97], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[0] += amp_sv[0]; - jamp_sv[2] -= amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[38] += amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[86] -= amp_sv[0]; - jamp_sv[96] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[99] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[71], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[8] -= 
amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[38] += amp_sv[0]; - jamp_sv[60] += amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[99] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[103] += amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[21], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[0] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[60] += amp_sv[0]; - jamp_sv[62] -= amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[97] -= amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[103] += amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - - // *** DIAGRAM 1017 OF 1240 *** - - // Wavefunction(s) for diagram number 1017 - // (none) - - // Amplitude(s) for diagram number 1017 - VVV1_0( w_fp[1], w_fp[24], w_fp[118], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[0] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[8] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[62] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[97] -= amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[107] += amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - VVV1_0( w_fp[1], w_fp[24], w_fp[119], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[18] -= amp_sv[0]; - jamp_sv[19] += amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[62] -= amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[97] -= amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[24], w_fp[120], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[0] += amp_sv[0]; - jamp_sv[2] -= amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[19] += amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 1018 OF 1240 *** - - // Wavefunction(s) for diagram number 1018 - // (none) - - // Amplitude(s) for diagram number 1018 - VVV1_0( w_fp[1], w_fp[8], w_fp[85], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[9] -= amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[47] -= amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[112], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[21] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - 
jamp_sv[39] += amp_sv[0]; - jamp_sv[42] += amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[111], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[9] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[42] += amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - - // *** DIAGRAM 1019 OF 1240 *** - - // Wavefunction(s) for diagram number 1019 - // (none) - - // Amplitude(s) for diagram number 1019 - VVV1_0( w_fp[56], w_fp[68], w_fp[5], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1019 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - - // *** DIAGRAM 1020 OF 1240 *** - - // Wavefunction(s) for diagram number 1020 - // (none) - - // Amplitude(s) for diagram number 1020 - VVV1_0( w_fp[56], w_fp[1], w_fp[16], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1020 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[2] -= amp_sv[0]; - jamp_sv[3] += amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - - // *** DIAGRAM 1021 OF 1240 *** - - // Wavefunction(s) for diagram number 1021 - // (none) - - // Amplitude(s) for diagram number 1021 - VVVV1_0( w_fp[1], w_fp[27], w_fp[5], w_fp[56], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[1] += amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - VVVV3_0( w_fp[1], w_fp[27], w_fp[5], w_fp[56], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[111] 
-= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - VVVV4_0( w_fp[1], w_fp[27], w_fp[5], w_fp[56], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[1] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - - // *** DIAGRAM 1022 OF 1240 *** - - // Wavefunction(s) for diagram number 1022 - // (none) - - // Amplitude(s) for diagram number 1022 - VVV1_0( w_fp[101], w_fp[108], w_fp[5], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1022 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[11] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - - // *** DIAGRAM 1023 OF 1240 *** - - // Wavefunction(s) for diagram number 1023 - // (none) - - // Amplitude(s) for diagram number 1023 - VVV1_0( w_fp[101], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1023 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[10] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[78] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[83] += amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[108] += amp_sv[0]; - - // *** DIAGRAM 1024 OF 1240 *** - - // Wavefunction(s) for diagram number 1024 - // (none) - - // Amplitude(s) for diagram number 1024 - VVVV1_0( w_fp[1], w_fp[8], w_fp[5], w_fp[101], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[11] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - VVVV3_0( w_fp[1], w_fp[8], w_fp[5], w_fp[101], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[10] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[66] += amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[79] += amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - VVVV4_0( w_fp[1], w_fp[8], w_fp[5], w_fp[101], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[10] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[25] += 
amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[44] += amp_sv[0]; - jamp_sv[66] += amp_sv[0]; - jamp_sv[75] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[79] += amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - - // *** DIAGRAM 1025 OF 1240 *** - - // Wavefunction(s) for diagram number 1025 - // (none) - - // Amplitude(s) for diagram number 1025 - VVV1_0( w_fp[0], w_fp[108], w_fp[16], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1025 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[11] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - - // *** DIAGRAM 1026 OF 1240 *** - - // Wavefunction(s) for diagram number 1026 - // (none) - - // Amplitude(s) for diagram number 1026 - VVV1_0( w_fp[0], w_fp[68], w_fp[10], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1026 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[44] += amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[75] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - - // *** DIAGRAM 1027 OF 1240 *** - - // Wavefunction(s) for diagram number 1027 - // (none) - - // Amplitude(s) for diagram number 1027 - VVV1_0( w_fp[27], w_fp[5], w_fp[11], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[11] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - VVV1_0( w_fp[27], w_fp[5], w_fp[42], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[1] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - VVV1_0( w_fp[27], w_fp[5], w_fp[76], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[1] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[88] -= 
amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - - // *** DIAGRAM 1028 OF 1240 *** - - // Wavefunction(s) for diagram number 1028 - VVVV1P0_1( w_fp[0], w_fp[1], w_fp[27], COUPs[2], 1.0, 0., 0., w_fp[10] ); - VVVV3P0_1( w_fp[0], w_fp[1], w_fp[27], COUPs[2], 1.0, 0., 0., w_fp[16] ); - VVVV4P0_1( w_fp[0], w_fp[1], w_fp[27], COUPs[2], 1.0, 0., 0., w_fp[111] ); - - // Amplitude(s) for diagram number 1028 - VVV1_0( w_fp[8], w_fp[5], w_fp[10], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[1] += amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[44] += amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[75] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[16], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[10] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[44] += amp_sv[0]; - jamp_sv[66] += amp_sv[0]; - jamp_sv[75] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[79] += amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[111], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[1] -= amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[66] += amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[73] -= amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[79] += amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - - // *** DIAGRAM 1029 OF 1240 *** - - // Wavefunction(s) for diagram number 1029 - // (none) - - // Amplitude(s) for diagram number 1029 - VVV1_0( w_fp[1], w_fp[27], w_fp[115], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[1] -= amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[10] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[73] -= amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[83] += amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[27], w_fp[116], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[12] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[73] -= amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - VVV1_0( w_fp[1], w_fp[27], w_fp[117], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[1] += amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[45] += 
amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - - // *** DIAGRAM 1030 OF 1240 *** - - // Wavefunction(s) for diagram number 1030 - // (none) - - // Amplitude(s) for diagram number 1030 - VVV1_0( w_fp[1], w_fp[8], w_fp[9], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[11] -= amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[110], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[15] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[109], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[11] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - - // *** DIAGRAM 1031 OF 1240 *** - - // Wavefunction(s) for diagram number 1031 - // (none) - - // Amplitude(s) for diagram number 1031 - VVV1_0( w_fp[56], w_fp[67], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1031 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - - // *** DIAGRAM 1032 OF 1240 *** - - // Wavefunction(s) for diagram number 1032 - // (none) - - // Amplitude(s) for diagram number 1032 - VVV1_0( w_fp[56], w_fp[1], w_fp[19], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1032 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - jamp_sv[9] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[47] -= amp_sv[0]; - jamp_sv[65] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 1033 OF 1240 *** - - // 
Wavefunction(s) for diagram number 1033 - // (none) - - // Amplitude(s) for diagram number 1033 - VVVV1_0( w_fp[1], w_fp[4], w_fp[29], w_fp[56], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVVV3_0( w_fp[1], w_fp[4], w_fp[29], w_fp[56], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[3] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - VVVV4_0( w_fp[1], w_fp[4], w_fp[29], w_fp[56], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[0] -= amp_sv[0]; - jamp_sv[1] += amp_sv[0]; - jamp_sv[3] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - - // *** DIAGRAM 1034 OF 1240 *** - - // Wavefunction(s) for diagram number 1034 - // (none) - - // Amplitude(s) for diagram number 1034 - VVV1_0( w_fp[96], w_fp[108], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1034 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[17] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - - // *** DIAGRAM 1035 OF 1240 *** - - // Wavefunction(s) for diagram number 1035 - // (none) - - // Amplitude(s) for diagram number 1035 - VVV1_0( w_fp[96], w_fp[1], w_fp[13], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1035 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[16] += amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[54] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[59] += amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[114] += amp_sv[0]; - - // *** DIAGRAM 1036 OF 1240 *** - - // Wavefunction(s) for diagram number 1036 - // (none) - - // Amplitude(s) for diagram number 1036 - VVVV1_0( w_fp[1], w_fp[8], w_fp[4], w_fp[96], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[17] += amp_sv[0]; - 
jamp_sv[23] -= amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - VVVV3_0( w_fp[1], w_fp[8], w_fp[4], w_fp[96], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[16] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[55] += amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[90] += amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - VVVV4_0( w_fp[1], w_fp[8], w_fp[4], w_fp[96], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[16] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[46] += amp_sv[0]; - jamp_sv[51] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[55] += amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[90] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - - // *** DIAGRAM 1037 OF 1240 *** - - // Wavefunction(s) for diagram number 1037 - // (none) - - // Amplitude(s) for diagram number 1037 - VVV1_0( w_fp[0], w_fp[108], w_fp[19], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1037 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[9] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - - // *** DIAGRAM 1038 OF 1240 *** - - // Wavefunction(s) for diagram number 1038 - // (none) - - // Amplitude(s) for diagram number 1038 - VVV1_0( w_fp[0], w_fp[67], w_fp[13], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1038 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[46] += amp_sv[0]; - jamp_sv[48] -= amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[51] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[116] -= amp_sv[0]; - - // *** DIAGRAM 1039 OF 1240 *** - - // Wavefunction(s) for diagram number 1039 - // (none) - - // Amplitude(s) for diagram number 1039 - VVV1_0( w_fp[4], w_fp[29], w_fp[11], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[9] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - 
jamp_sv[33] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - VVV1_0( w_fp[4], w_fp[29], w_fp[42], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[0] -= amp_sv[0]; - jamp_sv[1] += amp_sv[0]; - jamp_sv[3] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - VVV1_0( w_fp[4], w_fp[29], w_fp[76], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[0] -= amp_sv[0]; - jamp_sv[1] += amp_sv[0]; - jamp_sv[3] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - - // *** DIAGRAM 1040 OF 1240 *** - - // Wavefunction(s) for diagram number 1040 - VVVV1P0_1( w_fp[0], w_fp[1], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[76] ); - VVVV3P0_1( w_fp[0], w_fp[1], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[42] ); - VVVV4P0_1( w_fp[0], w_fp[1], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[11] ); - - // Amplitude(s) for diagram number 1040 - VVV1_0( w_fp[8], w_fp[4], w_fp[76], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[3] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[46] += amp_sv[0]; - jamp_sv[48] -= amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[51] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[116] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[42], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[16] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[46] += amp_sv[0]; - jamp_sv[51] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[55] += amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[90] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[11], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[3] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[49] -= amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[55] += amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[65] += amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[90] += amp_sv[0]; - jamp_sv[92] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - - // *** DIAGRAM 1041 OF 1240 *** - - // Wavefunction(s) for diagram number 1041 - // (none) - - // Amplitude(s) for diagram number 1041 - VVV1_0( w_fp[1], w_fp[29], w_fp[107], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[3] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - jamp_sv[16] += amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[49] -= 
amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[59] += amp_sv[0]; - jamp_sv[65] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[92] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[29], w_fp[95], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[6] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[49] -= amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[92] -= amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - VVV1_0( w_fp[1], w_fp[29], w_fp[105], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[3] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - - // *** DIAGRAM 1042 OF 1240 *** - - // Wavefunction(s) for diagram number 1042 - // (none) - - // Amplitude(s) for diagram number 1042 - VVV1_0( w_fp[1], w_fp[8], w_fp[87], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[9] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[47] -= amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[34], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[17] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[86], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[9] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - - // *** DIAGRAM 1043 OF 1240 *** - - // Wavefunction(s) for diagram number 1043 - // (none) - - // Amplitude(s) for diagram number 1043 - VVVV1_0( w_fp[0], w_fp[1], w_fp[8], w_fp[30], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[65] += amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[118] -= 
amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[1], w_fp[8], w_fp[30], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - jamp_sv[9] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[47] -= amp_sv[0]; - jamp_sv[65] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[1], w_fp[8], w_fp[30], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[9] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[47] -= amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - VVVV1_0( w_fp[0], w_fp[1], w_fp[8], w_fp[31], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[1] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[1], w_fp[8], w_fp[31], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[1] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[1], w_fp[8], w_fp[31], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[11] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - VVVV1_0( w_fp[0], w_fp[1], w_fp[8], w_fp[32], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[0] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[1], w_fp[8], w_fp[32], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[0] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[47] 
+= amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[1], w_fp[8], w_fp[32], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[9] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - - // *** DIAGRAM 1044 OF 1240 *** - - // Wavefunction(s) for diagram number 1044 - // (none) - - // Amplitude(s) for diagram number 1044 - VVV1_0( w_fp[1], w_fp[30], w_fp[56], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - jamp_sv[9] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[47] -= amp_sv[0]; - jamp_sv[65] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[31], w_fp[56], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[1] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[32], w_fp[56], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[0] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - - // *** DIAGRAM 1045 OF 1240 *** - - // Wavefunction(s) for diagram number 1045 - // (none) - - // Amplitude(s) for diagram number 1045 - VVV1_0( w_fp[1], w_fp[8], w_fp[92], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[9] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[47] -= amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[88], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[11] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[106], COUPs[0], 1.0, &_fp[0] ); - 
jamp_sv[9] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - - // *** DIAGRAM 1046 OF 1240 *** - - // Wavefunction(s) for diagram number 1046 - // (none) - - // Amplitude(s) for diagram number 1046 - FFV1_0( w_fp[58], w_fp[114], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1046 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[52] -= amp_sv[0]; - - // *** DIAGRAM 1047 OF 1240 *** - - // Wavefunction(s) for diagram number 1047 - // (none) - - // Amplitude(s) for diagram number 1047 - FFV1_0( w_fp[48], w_fp[114], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1047 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[49] -= amp_sv[0]; - - // *** DIAGRAM 1048 OF 1240 *** - - // Wavefunction(s) for diagram number 1048 - // (none) - - // Amplitude(s) for diagram number 1048 - FFV1_0( w_fp[104], w_fp[100], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1048 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[58] -= amp_sv[0]; - - // *** DIAGRAM 1049 OF 1240 *** - - // Wavefunction(s) for diagram number 1049 - // (none) - - // Amplitude(s) for diagram number 1049 - FFV1_0( w_fp[104], w_fp[36], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1049 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[68] -= amp_sv[0]; - - // *** DIAGRAM 1050 OF 1240 *** - - // Wavefunction(s) for diagram number 1050 - // (none) - - // Amplitude(s) for diagram number 1050 - FFV1_0( w_fp[48], w_fp[100], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1050 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[55] -= amp_sv[0]; - - // *** DIAGRAM 1051 OF 1240 *** - - // Wavefunction(s) for diagram number 1051 - // (none) - - // Amplitude(s) for diagram number 1051 - FFV1_0( w_fp[58], w_fp[36], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1051 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[66] -= amp_sv[0]; - - // *** DIAGRAM 1052 OF 1240 *** - - // Wavefunction(s) for diagram number 1052 - // (none) - - // Amplitude(s) for diagram number 1052 - FFV1_0( w_fp[60], w_fp[114], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1052 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[50] -= amp_sv[0]; - - // *** DIAGRAM 1053 OF 1240 *** - - // Wavefunction(s) for diagram number 1053 - // (none) - - // Amplitude(s) for diagram number 1053 - FFV1_0( w_fp[40], w_fp[114], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 
1053 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[48] -= amp_sv[0]; - - // *** DIAGRAM 1054 OF 1240 *** - - // Wavefunction(s) for diagram number 1054 - // (none) - - // Amplitude(s) for diagram number 1054 - FFV1_0( w_fp[62], w_fp[100], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1054 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[56] -= amp_sv[0]; - - // *** DIAGRAM 1055 OF 1240 *** - - // Wavefunction(s) for diagram number 1055 - // (none) - - // Amplitude(s) for diagram number 1055 - FFV1_0( w_fp[62], w_fp[35], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1055 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[62] -= amp_sv[0]; - - // *** DIAGRAM 1056 OF 1240 *** - - // Wavefunction(s) for diagram number 1056 - // (none) - - // Amplitude(s) for diagram number 1056 - FFV1_0( w_fp[40], w_fp[100], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1056 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[54] -= amp_sv[0]; - - // *** DIAGRAM 1057 OF 1240 *** - - // Wavefunction(s) for diagram number 1057 - // (none) - - // Amplitude(s) for diagram number 1057 - FFV1_0( w_fp[60], w_fp[35], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1057 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[60] -= amp_sv[0]; - - // *** DIAGRAM 1058 OF 1240 *** - - // Wavefunction(s) for diagram number 1058 - // (none) - - // Amplitude(s) for diagram number 1058 - FFV1_0( w_fp[3], w_fp[114], w_fp[67], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1058 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[48] += amp_sv[0]; - jamp_sv[49] -= amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - - // *** DIAGRAM 1059 OF 1240 *** - - // Wavefunction(s) for diagram number 1059 - // (none) - - // Amplitude(s) for diagram number 1059 - FFV1_0( w_fp[12], w_fp[114], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1059 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1060 OF 1240 *** - - // Wavefunction(s) for diagram number 1060 - // (none) - - // Amplitude(s) for diagram number 1060 - FFV1_0( w_fp[3], w_fp[100], w_fp[96], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1060 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[54] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[59] += amp_sv[0]; - - // *** DIAGRAM 1061 OF 1240 *** - - // Wavefunction(s) for diagram number 1061 - // (none) - - // Amplitude(s) for diagram number 1061 - VVV1_0( w_fp[96], w_fp[1], w_fp[37], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1061 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) 
denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1062 OF 1240 *** - - // Wavefunction(s) for diagram number 1062 - // (none) - - // Amplitude(s) for diagram number 1062 - FFV1_0( w_fp[12], w_fp[100], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1062 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1063 OF 1240 *** - - // Wavefunction(s) for diagram number 1063 - // (none) - - // Amplitude(s) for diagram number 1063 - VVV1_0( w_fp[0], w_fp[67], w_fp[37], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1063 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1064 OF 1240 *** - - // Wavefunction(s) for diagram number 1064 - // (none) - - // Amplitude(s) for diagram number 1064 - FFV1_0( w_fp[3], w_fp[33], w_fp[76], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[42], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[11], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1065 OF 1240 *** - - // Wavefunction(s) for diagram number 1065 - // (none) - - // Amplitude(s) for diagram number 1065 - FFV1_0( w_fp[78], w_fp[102], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1065 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[76] -= amp_sv[0]; - - // *** DIAGRAM 1066 OF 1240 *** - - // Wavefunction(s) for diagram number 1066 - // (none) - - // Amplitude(s) for 
diagram number 1066 - FFV1_0( w_fp[53], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1066 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[73] -= amp_sv[0]; - - // *** DIAGRAM 1067 OF 1240 *** - - // Wavefunction(s) for diagram number 1067 - // (none) - - // Amplitude(s) for diagram number 1067 - FFV1_0( w_fp[99], w_fp[89], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1067 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[82] -= amp_sv[0]; - - // *** DIAGRAM 1068 OF 1240 *** - - // Wavefunction(s) for diagram number 1068 - // (none) - - // Amplitude(s) for diagram number 1068 - FFV1_0( w_fp[99], w_fp[44], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1068 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[92] -= amp_sv[0]; - - // *** DIAGRAM 1069 OF 1240 *** - - // Wavefunction(s) for diagram number 1069 - // (none) - - // Amplitude(s) for diagram number 1069 - FFV1_0( w_fp[53], w_fp[89], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1069 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[79] -= amp_sv[0]; - - // *** DIAGRAM 1070 OF 1240 *** - - // Wavefunction(s) for diagram number 1070 - // (none) - - // Amplitude(s) for diagram number 1070 - FFV1_0( w_fp[78], w_fp[44], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1070 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[90] -= amp_sv[0]; - - // *** DIAGRAM 1071 OF 1240 *** - - // Wavefunction(s) for diagram number 1071 - // (none) - - // Amplitude(s) for diagram number 1071 - FFV1_0( w_fp[60], w_fp[102], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1071 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[74] -= amp_sv[0]; - - // *** DIAGRAM 1072 OF 1240 *** - - // Wavefunction(s) for diagram number 1072 - // (none) - - // Amplitude(s) for diagram number 1072 - FFV1_0( w_fp[28], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1072 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[72] -= amp_sv[0]; - - // *** DIAGRAM 1073 OF 1240 *** - - // Wavefunction(s) for diagram number 1073 - // (none) - - // Amplitude(s) for diagram number 1073 - FFV1_0( w_fp[62], w_fp[89], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1073 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[80] -= amp_sv[0]; - - // *** DIAGRAM 1074 OF 1240 *** - - // Wavefunction(s) for diagram number 1074 - // (none) - - // Amplitude(s) for diagram number 1074 - FFV1_0( w_fp[62], w_fp[43], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1074 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[86] -= amp_sv[0]; - - // *** DIAGRAM 1075 
OF 1240 *** - - // Wavefunction(s) for diagram number 1075 - // (none) - - // Amplitude(s) for diagram number 1075 - FFV1_0( w_fp[28], w_fp[89], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1075 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[78] -= amp_sv[0]; - - // *** DIAGRAM 1076 OF 1240 *** - - // Wavefunction(s) for diagram number 1076 - // (none) - - // Amplitude(s) for diagram number 1076 - FFV1_0( w_fp[60], w_fp[43], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1076 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[84] -= amp_sv[0]; - - // *** DIAGRAM 1077 OF 1240 *** - - // Wavefunction(s) for diagram number 1077 - // (none) - - // Amplitude(s) for diagram number 1077 - FFV1_0( w_fp[3], w_fp[102], w_fp[68], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1077 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[72] += amp_sv[0]; - jamp_sv[73] -= amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - - // *** DIAGRAM 1078 OF 1240 *** - - // Wavefunction(s) for diagram number 1078 - // (none) - - // Amplitude(s) for diagram number 1078 - FFV1_0( w_fp[14], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1078 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1079 OF 1240 *** - - // Wavefunction(s) for diagram number 1079 - // (none) - - // Amplitude(s) for diagram number 1079 - FFV1_0( w_fp[3], w_fp[89], w_fp[101], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1079 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[78] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[83] += amp_sv[0]; - - // *** DIAGRAM 1080 OF 1240 *** - - // Wavefunction(s) for diagram number 1080 - // (none) - - // Amplitude(s) for diagram number 1080 - VVV1_0( w_fp[101], w_fp[1], w_fp[45], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1080 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1081 OF 1240 *** - - // Wavefunction(s) for diagram number 1081 - // (none) - - // Amplitude(s) for diagram number 1081 - FFV1_0( w_fp[14], w_fp[89], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1081 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1082 OF 1240 *** - - // Wavefunction(s) for diagram number 1082 - // (none) - - // Amplitude(s) 
for diagram number 1082 - VVV1_0( w_fp[0], w_fp[68], w_fp[45], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1082 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1083 OF 1240 *** - - // Wavefunction(s) for diagram number 1083 - // (none) - - // Amplitude(s) for diagram number 1083 - FFV1_0( w_fp[3], w_fp[39], w_fp[10], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[39], w_fp[16], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[39], w_fp[111], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1084 OF 1240 *** - - // Wavefunction(s) for diagram number 1084 - // (none) - - // Amplitude(s) for diagram number 1084 - FFV1_0( w_fp[78], w_fp[113], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1084 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[100] -= amp_sv[0]; - - // *** DIAGRAM 1085 OF 1240 *** - - // Wavefunction(s) for diagram number 1085 - // (none) - - // Amplitude(s) for diagram number 1085 - FFV1_0( w_fp[7], w_fp[113], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1085 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[97] -= amp_sv[0]; - - // *** DIAGRAM 1086 OF 1240 *** - - // Wavefunction(s) for diagram number 1086 - // (none) - - // Amplitude(s) for diagram number 1086 - FFV1_0( w_fp[99], w_fp[91], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1086 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[106] -= amp_sv[0]; - - // *** DIAGRAM 1087 OF 1240 *** - - // Wavefunction(s) for diagram number 1087 - // (none) - - // Amplitude(s) for diagram number 1087 - FFV1_0( w_fp[99], w_fp[50], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1087 ) numerators_sv += cxabs2( 
amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[116] -= amp_sv[0]; - - // *** DIAGRAM 1088 OF 1240 *** - - // Wavefunction(s) for diagram number 1088 - // (none) - - // Amplitude(s) for diagram number 1088 - FFV1_0( w_fp[7], w_fp[91], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1088 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[103] -= amp_sv[0]; - - // *** DIAGRAM 1089 OF 1240 *** - - // Wavefunction(s) for diagram number 1089 - // (none) - - // Amplitude(s) for diagram number 1089 - FFV1_0( w_fp[78], w_fp[50], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1089 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[114] -= amp_sv[0]; - - // *** DIAGRAM 1090 OF 1240 *** - - // Wavefunction(s) for diagram number 1090 - // (none) - - // Amplitude(s) for diagram number 1090 - FFV1_0( w_fp[58], w_fp[113], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1090 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[98] -= amp_sv[0]; - - // *** DIAGRAM 1091 OF 1240 *** - - // Wavefunction(s) for diagram number 1091 - // (none) - - // Amplitude(s) for diagram number 1091 - FFV1_0( w_fp[25], w_fp[113], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1091 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[96] -= amp_sv[0]; - - // *** DIAGRAM 1092 OF 1240 *** - - // Wavefunction(s) for diagram number 1092 - // (none) - - // Amplitude(s) for diagram number 1092 - FFV1_0( w_fp[104], w_fp[91], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1092 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[104] -= amp_sv[0]; - - // *** DIAGRAM 1093 OF 1240 *** - - // Wavefunction(s) for diagram number 1093 - // (none) - - // Amplitude(s) for diagram number 1093 - FFV1_0( w_fp[104], w_fp[49], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1093 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[110] -= amp_sv[0]; - - // *** DIAGRAM 1094 OF 1240 *** - - // Wavefunction(s) for diagram number 1094 - // (none) - - // Amplitude(s) for diagram number 1094 - FFV1_0( w_fp[25], w_fp[91], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1094 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[102] -= amp_sv[0]; - - // *** DIAGRAM 1095 OF 1240 *** - - // Wavefunction(s) for diagram number 1095 - // (none) - - // Amplitude(s) for diagram number 1095 - FFV1_0( w_fp[58], w_fp[49], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1095 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[108] -= amp_sv[0]; - - // *** DIAGRAM 1096 OF 1240 *** - - // Wavefunction(s) for diagram number 1096 - // (none) - - // Amplitude(s) for diagram number 1096 - FFV1_0( w_fp[3], w_fp[113], w_fp[59], COUPs[1], 
1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1096 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[96] += amp_sv[0]; - jamp_sv[97] -= amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - - // *** DIAGRAM 1097 OF 1240 *** - - // Wavefunction(s) for diagram number 1097 - // (none) - - // Amplitude(s) for diagram number 1097 - FFV1_0( w_fp[26], w_fp[113], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1097 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1098 OF 1240 *** - - // Wavefunction(s) for diagram number 1098 - // (none) - - // Amplitude(s) for diagram number 1098 - FFV1_0( w_fp[3], w_fp[91], w_fp[98], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1098 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[102] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[107] += amp_sv[0]; - - // *** DIAGRAM 1099 OF 1240 *** - - // Wavefunction(s) for diagram number 1099 - // (none) - - // Amplitude(s) for diagram number 1099 - VVV1_0( w_fp[98], w_fp[1], w_fp[51], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1099 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1100 OF 1240 *** - - // Wavefunction(s) for diagram number 1100 - // (none) - - // Amplitude(s) for diagram number 1100 - FFV1_0( w_fp[26], w_fp[91], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1100 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1101 OF 1240 *** - - // Wavefunction(s) for diagram number 1101 - // (none) - - // Amplitude(s) for diagram number 1101 - VVV1_0( w_fp[0], w_fp[59], w_fp[51], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1101 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1102 OF 1240 *** - - // Wavefunction(s) for diagram number 1102 - // (none) - - // Amplitude(s) for diagram number 1102 - FFV1_0( w_fp[3], w_fp[47], w_fp[97], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[99] += cxtype( 0, 1 ) * 
amp_sv[0]; - jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[71], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[21], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1103 OF 1240 *** - - // Wavefunction(s) for diagram number 1103 - // (none) - - // Amplitude(s) for diagram number 1103 - FFV1_0( w_fp[99], w_fp[2], w_fp[67], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1103 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[40] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[92] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - - // *** DIAGRAM 1104 OF 1240 *** - - // Wavefunction(s) for diagram number 1104 - // (none) - - // Amplitude(s) for diagram number 1104 - FFV1_0( w_fp[99], w_fp[18], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1104 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1105 OF 1240 *** - - // Wavefunction(s) for diagram number 1105 - // (none) - - // Amplitude(s) for diagram number 1105 - FFV1_0( w_fp[78], w_fp[2], w_fp[96], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1105 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[16] += amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[114] += amp_sv[0]; - - // *** DIAGRAM 1106 OF 1240 *** - - // Wavefunction(s) for diagram number 1106 - // (none) - - // Amplitude(s) for diagram number 1106 - VVV1_0( w_fp[96], w_fp[1], w_fp[54], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1106 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1107 OF 1240 *** - - // Wavefunction(s) for diagram number 1107 - // (none) - - // Amplitude(s) for diagram number 1107 - FFV1_0( w_fp[78], w_fp[18], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 
1107 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1108 OF 1240 *** - - // Wavefunction(s) for diagram number 1108 - // (none) - - // Amplitude(s) for diagram number 1108 - VVV1_0( w_fp[0], w_fp[67], w_fp[54], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1108 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1109 OF 1240 *** - - // Wavefunction(s) for diagram number 1109 - // (none) - - // Amplitude(s) for diagram number 1109 - FFV1_0( w_fp[46], w_fp[2], w_fp[76], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[42], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[11], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1110 OF 1240 *** - - // Wavefunction(s) for diagram number 1110 - // (none) - - // Amplitude(s) for diagram number 1110 - FFV1_0( w_fp[104], w_fp[2], w_fp[68], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1110 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[34] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - - // *** DIAGRAM 1111 OF 1240 *** - - // Wavefunction(s) for diagram number 1111 - // (none) - - // Amplitude(s) for diagram number 1111 - FFV1_0( w_fp[104], w_fp[15], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1111 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1112 OF 1240 *** - - // Wavefunction(s) for diagram number 1112 - // (none) - - // Amplitude(s) for diagram number 1112 - FFV1_0( w_fp[58], w_fp[2], w_fp[101], COUPs[1], 1.0, &_fp[0] ); -#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1112 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[10] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[108] += amp_sv[0]; - - // *** DIAGRAM 1113 OF 1240 *** - - // Wavefunction(s) for diagram number 1113 - // (none) - - // Amplitude(s) for diagram number 1113 - VVV1_0( w_fp[101], w_fp[1], w_fp[23], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1113 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1114 OF 1240 *** - - // Wavefunction(s) for diagram number 1114 - // (none) - - // Amplitude(s) for diagram number 1114 - FFV1_0( w_fp[58], w_fp[15], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1114 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1115 OF 1240 *** - - // Wavefunction(s) for diagram number 1115 - // (none) - - // Amplitude(s) for diagram number 1115 - VVV1_0( w_fp[0], w_fp[68], w_fp[23], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1115 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1116 OF 1240 *** - - // Wavefunction(s) for diagram number 1116 - // (none) - - // Amplitude(s) for diagram number 1116 - FFV1_0( w_fp[38], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[111], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0]; - 
jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1117 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1117
- // (none)
-
- // Amplitude(s) for diagram number 1117
- FFV1_0( w_fp[62], w_fp[2], w_fp[59], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 1117 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[32] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[62] -= amp_sv[0];
- jamp_sv[86] += amp_sv[0];
-
- // *** DIAGRAM 1118 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1118
- // (none)
-
- // Amplitude(s) for diagram number 1118
- FFV1_0( w_fp[62], w_fp[17], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 1118 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1119 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1119
- // (none)
-
- // Amplitude(s) for diagram number 1119
- FFV1_0( w_fp[60], w_fp[2], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 1119 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[8] += amp_sv[0];
- jamp_sv[14] -= amp_sv[0];
- jamp_sv[60] -= amp_sv[0];
- jamp_sv[84] += amp_sv[0];
-
- // *** DIAGRAM 1120 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1120
- // (none)
-
- // Amplitude(s) for diagram number 1120
- VVV1_0( w_fp[98], w_fp[1], w_fp[20], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 1120 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1121 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1121
- // (none)
-
- // Amplitude(s) for diagram number 1121
- FFV1_0( w_fp[60], w_fp[17], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 1121 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1122 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1122
- // (none)
-
- // Amplitude(s) for diagram number 1122
- VVV1_0( w_fp[0], w_fp[59], w_fp[20], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 1122 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1123 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1123
- // (none)
-
- // Amplitude(s) for diagram number 1123
- FFV1_0( w_fp[41], w_fp[2], w_fp[97], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[41], w_fp[2], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[41], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1124 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1124
- VVVV1P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[21] );
- VVVV3P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[71] );
- VVVV4P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[97] );
-
- // Amplitude(s) for diagram number 1124
- VVVV1_0( w_fp[21], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[0] += amp_sv[0];
- jamp_sv[1] -= amp_sv[0];
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[54] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[95] -= amp_sv[0];
- jamp_sv[114] += amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[117] -= amp_sv[0];
- jamp_sv[119] += amp_sv[0];
- VVVV3_0( w_fp[21], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[0] += amp_sv[0];
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[54] += amp_sv[0];
- jamp_sv[72] -= amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[86] -= amp_sv[0];
- jamp_sv[96] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[110] -= amp_sv[0];
- jamp_sv[114] += amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[117] -= amp_sv[0];
- jamp_sv[119] += amp_sv[0];
- VVVV4_0( w_fp[21], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[1] += amp_sv[0];
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[55] += amp_sv[0];
- jamp_sv[72] -= amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[86] -= amp_sv[0];
- jamp_sv[90] += amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[93] -= amp_sv[0];
- jamp_sv[95] += amp_sv[0];
- jamp_sv[96] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[110] -= amp_sv[0];
- VVVV1_0( w_fp[71], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[25] -= amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[48] += amp_sv[0];
- jamp_sv[49] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[92] -= amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[94] -= amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[116] += amp_sv[0];
- jamp_sv[117] -= amp_sv[0];
- jamp_sv[118] += amp_sv[0];
- VVVV3_0( w_fp[71], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[48] += amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[116] += amp_sv[0];
- jamp_sv[117] -= amp_sv[0];
- jamp_sv[118] += amp_sv[0];
- VVVV4_0( w_fp[71], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[25] += amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[49] += amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- jamp_sv[93] -= amp_sv[0];
- jamp_sv[94] += amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
- VVVV1_0( w_fp[97], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[0] -= amp_sv[0];
- jamp_sv[1] += amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[25] -= amp_sv[0];
- jamp_sv[48] += amp_sv[0];
- jamp_sv[49] -= amp_sv[0];
- jamp_sv[54] -= amp_sv[0];
- jamp_sv[55] += amp_sv[0];
- jamp_sv[90] += amp_sv[0];
- jamp_sv[92] -= amp_sv[0];
- jamp_sv[94] -= amp_sv[0];
- jamp_sv[95] += amp_sv[0];
- jamp_sv[114] -= amp_sv[0];
- jamp_sv[116] += amp_sv[0];
- jamp_sv[118] += amp_sv[0];
- jamp_sv[119] -= amp_sv[0];
- VVVV3_0( w_fp[97], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[0] -= amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[48] += amp_sv[0];
- jamp_sv[54] -= amp_sv[0];
- jamp_sv[72] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[86] += amp_sv[0];
- jamp_sv[96] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
- jamp_sv[110] += amp_sv[0];
- jamp_sv[114] -= amp_sv[0];
- jamp_sv[116] += amp_sv[0];
- jamp_sv[118] += amp_sv[0];
- jamp_sv[119] -= amp_sv[0];
- VVVV4_0( w_fp[97], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[1] -= amp_sv[0];
- jamp_sv[25] += amp_sv[0];
- jamp_sv[49] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[72] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[86] += amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- jamp_sv[94] += amp_sv[0];
- jamp_sv[95] -= amp_sv[0];
- jamp_sv[96] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
- jamp_sv[110] += amp_sv[0];
-
- // *** DIAGRAM 1125 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1125
- VVV1P0_1( w_fp[21], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[59] );
- VVV1P0_1( w_fp[71], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[20] );
- VVV1P0_1( w_fp[97], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[60] );
-
- // Amplitude(s) for diagram number 1125
- VVV1_0( w_fp[8], w_fp[6], w_fp[59], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[0] += amp_sv[0];
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[54] += amp_sv[0];
- jamp_sv[72] -= amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[86] -= amp_sv[0];
- jamp_sv[96] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[110] -= amp_sv[0];
- jamp_sv[114] += amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[117] -= amp_sv[0];
- jamp_sv[119] += amp_sv[0];
- VVV1_0( w_fp[8], w_fp[6], w_fp[20], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[48] += amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[116] += amp_sv[0];
- jamp_sv[117] -= amp_sv[0];
- jamp_sv[118] += amp_sv[0];
- VVV1_0( w_fp[8], w_fp[6], w_fp[60], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[0] -= amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[48] += amp_sv[0];
- jamp_sv[54] -= amp_sv[0];
- jamp_sv[72] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[86] += amp_sv[0];
- jamp_sv[96] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
- jamp_sv[110] += amp_sv[0];
- jamp_sv[114] -= amp_sv[0];
- jamp_sv[116] += amp_sv[0];
- jamp_sv[118] += amp_sv[0];
- jamp_sv[119] -= amp_sv[0];
-
- // *** DIAGRAM 1126 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1126
- VVV1P0_1( w_fp[21], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[17] );
- VVV1P0_1( w_fp[71], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[98] );
- VVV1P0_1( w_fp[97], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[111] );
-
- // Amplitude(s) for diagram number 1126
- VVV1_0( w_fp[8], w_fp[5], w_fp[17], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[1] += amp_sv[0];
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[55] += amp_sv[0];
- jamp_sv[72] -= amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[86] -= amp_sv[0];
- jamp_sv[90] += amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[93] -= amp_sv[0];
- jamp_sv[95] += amp_sv[0];
- jamp_sv[96] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[110] -= amp_sv[0];
- VVV1_0( w_fp[8], w_fp[5], w_fp[98], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[25] += amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[49] += amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- jamp_sv[93] -= amp_sv[0];
- jamp_sv[94] += amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
- VVV1_0( w_fp[8], w_fp[5], w_fp[111], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[1] -= amp_sv[0];
- jamp_sv[25] += amp_sv[0];
- jamp_sv[49] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[72] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[86] += amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- jamp_sv[94] += amp_sv[0];
- jamp_sv[95] -= amp_sv[0];
- jamp_sv[96] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
- jamp_sv[110] += amp_sv[0];
-
- // *** DIAGRAM 1127 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1127
- // (none)
-
- // Amplitude(s) for diagram number 1127
- VVV1_0( w_fp[21], w_fp[8], w_fp[29], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[0] += amp_sv[0];
- jamp_sv[1] -= amp_sv[0];
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[54] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[95] -= amp_sv[0];
- jamp_sv[114] += amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[117] -= amp_sv[0];
- jamp_sv[119] += amp_sv[0];
- VVV1_0( w_fp[71], w_fp[8], w_fp[29], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[25] -= amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[48] += amp_sv[0];
- jamp_sv[49] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[92] -= amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[94] -= amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[116] += amp_sv[0];
- jamp_sv[117] -= amp_sv[0];
- jamp_sv[118] += amp_sv[0];
- VVV1_0( w_fp[97], w_fp[8], w_fp[29], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[0] -= amp_sv[0];
- jamp_sv[1] += amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[25] -= amp_sv[0];
- jamp_sv[48] += amp_sv[0];
- jamp_sv[49] -= amp_sv[0];
- jamp_sv[54] -= amp_sv[0];
- jamp_sv[55] += amp_sv[0];
- jamp_sv[90] += amp_sv[0];
- jamp_sv[92] -= amp_sv[0];
- jamp_sv[94] -= amp_sv[0];
- jamp_sv[95] += amp_sv[0];
- jamp_sv[114] -= amp_sv[0];
- jamp_sv[116] += amp_sv[0];
- jamp_sv[118] += amp_sv[0];
- jamp_sv[119] -= amp_sv[0];
-
- // *** DIAGRAM 1128 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1128
- FFV1_2( w_fp[3], w_fp[21], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] );
- FFV1_2( w_fp[3], w_fp[71], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[10] );
- FFV1_2( w_fp[3], w_fp[97], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[68] );
-
- // Amplitude(s) for diagram number 1128
- FFV1_0( w_fp[16], w_fp[39], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[90] += amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[93] -= amp_sv[0];
- jamp_sv[95] += amp_sv[0];
- FFV1_0( w_fp[10], w_fp[39], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- jamp_sv[93] -= amp_sv[0];
- jamp_sv[94] += amp_sv[0];
- FFV1_0( w_fp[68], w_fp[39], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- jamp_sv[94] += amp_sv[0];
- jamp_sv[95] -= amp_sv[0];
-
- // *** DIAGRAM 1129 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1129
- // (none)
-
- // Amplitude(s) for diagram number 1129
- FFV1_0( w_fp[3], w_fp[39], w_fp[17], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[39], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[39], w_fp[111], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1130 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1130
- // (none)
-
- // Amplitude(s) for diagram number 1130
- FFV1_0( w_fp[41], w_fp[39], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[72] += amp_sv[0];
- jamp_sv[74] -= amp_sv[0];
- jamp_sv[80] -= amp_sv[0];
- jamp_sv[86] += amp_sv[0];
- FFV1_0( w_fp[41], w_fp[39], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[74] -= amp_sv[0];
- jamp_sv[78] += amp_sv[0];
- jamp_sv[80] -= amp_sv[0];
- jamp_sv[84] += amp_sv[0];
- FFV1_0( w_fp[41], w_fp[39], w_fp[97], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[72] -= amp_sv[0];
- jamp_sv[78] += amp_sv[0];
- jamp_sv[84] += amp_sv[0];
- jamp_sv[86] -= amp_sv[0];
-
- // *** DIAGRAM 1131 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1131
- // (none)
-
- // Amplitude(s) for diagram number 1131
- FFV1_0( w_fp[16], w_fp[47], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[114] += amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[117] -= amp_sv[0];
- jamp_sv[119] += amp_sv[0];
- FFV1_0( w_fp[10], w_fp[47], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[116] += amp_sv[0];
- jamp_sv[117] -= amp_sv[0];
- jamp_sv[118] += amp_sv[0];
- FFV1_0( w_fp[68], w_fp[47], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[114] -= amp_sv[0];
- jamp_sv[116] += amp_sv[0];
- jamp_sv[118] += amp_sv[0];
- jamp_sv[119] -= amp_sv[0];
-
- // *** DIAGRAM 1132 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1132
- // (none)
-
- // Amplitude(s) for diagram number 1132
- FFV1_0( w_fp[3], w_fp[47], w_fp[59], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[47], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[47], w_fp[60], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1133 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1133
- // (none)
-
- // Amplitude(s) for diagram number 1133
- FFV1_0( w_fp[38], w_fp[47], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[96] += amp_sv[0];
- jamp_sv[98] -= amp_sv[0];
- jamp_sv[104] -= amp_sv[0];
- jamp_sv[110] += amp_sv[0];
- FFV1_0( w_fp[38], w_fp[47], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[98] -= amp_sv[0];
- jamp_sv[102] += amp_sv[0];
- jamp_sv[104] -= amp_sv[0];
- jamp_sv[108] += amp_sv[0];
- FFV1_0( w_fp[38], w_fp[47], w_fp[97], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[96] -= amp_sv[0];
- jamp_sv[102] += amp_sv[0];
- jamp_sv[108] += amp_sv[0];
- jamp_sv[110] -= amp_sv[0];
-
- // *** DIAGRAM 1134 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1134
- FFV1_1( w_fp[2], w_fp[21], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] );
- FFV1_1( w_fp[2], w_fp[71], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] );
- FFV1_1( w_fp[2], w_fp[97], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[71] );
-
- // Amplitude(s) for diagram number 1134
- FFV1_0( w_fp[38], w_fp[23], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[1] += amp_sv[0];
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[55] += amp_sv[0];
- FFV1_0( w_fp[38], w_fp[21], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[25] += amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[49] += amp_sv[0];
- FFV1_0( w_fp[38], w_fp[71], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[1] -= amp_sv[0];
- jamp_sv[25] += amp_sv[0];
- jamp_sv[49] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
-
- // *** DIAGRAM 1135 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1135
- // (none)
-
- // Amplitude(s) for diagram number 1135
- FFV1_0( w_fp[38], w_fp[2], w_fp[17], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[38], w_fp[2], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[38], w_fp[2], w_fp[111], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1136 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1136
- // (none)
-
- // Amplitude(s) for diagram number 1136
- FFV1_0( w_fp[41], w_fp[23], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[0] += amp_sv[0];
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[54] += amp_sv[0];
- FFV1_0( w_fp[41], w_fp[21], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[48] += amp_sv[0];
- FFV1_0( w_fp[41], w_fp[71], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[0] -= amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[48] += amp_sv[0];
- jamp_sv[54] -= amp_sv[0];
-
- // *** DIAGRAM 1137 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1137
- // (none)
-
- // Amplitude(s) for diagram number 1137
- FFV1_0( w_fp[41], w_fp[2], w_fp[59], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[41], w_fp[2], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[41], w_fp[2], w_fp[60], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1138 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1138
- // (none)
-
- // Amplitude(s) for diagram number 1138
- FFV1_0( w_fp[3], w_fp[23], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[21], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[71], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1139 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1139
- // (none)
-
- // Amplitude(s) for diagram number 1139
- FFV1_0( w_fp[16], w_fp[2], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[10], w_fp[2], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[68], w_fp[2], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1140 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1140
- VVVV1P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[68] );
- VVVV3P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[29] );
- VVVV4P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[10] );
-
- // Amplitude(s) for diagram number 1140
- VVVV1_0( w_fp[68], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[2] += amp_sv[0];
- jamp_sv[3] -= amp_sv[0];
- jamp_sv[12] -= amp_sv[0];
- jamp_sv[13] += amp_sv[0];
- jamp_sv[36] -= amp_sv[0];
- jamp_sv[37] += amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[67] += amp_sv[0];
- jamp_sv[69] += amp_sv[0];
- jamp_sv[71] -= amp_sv[0];
- jamp_sv[78] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[108] += amp_sv[0];
- jamp_sv[109] -= amp_sv[0];
- jamp_sv[111] -= amp_sv[0];
- jamp_sv[113] += amp_sv[0];
- VVVV3_0( w_fp[68], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[2] += amp_sv[0];
- jamp_sv[12] -= amp_sv[0];
- jamp_sv[36] -= amp_sv[0];
- jamp_sv[48] -= amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[62] -= amp_sv[0];
- jamp_sv[78] += amp_sv[0];
- jamp_sv[97] -= amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[108] += amp_sv[0];
- jamp_sv[109] -= amp_sv[0];
- jamp_sv[111] -= amp_sv[0];
- jamp_sv[113] += amp_sv[0];
- jamp_sv[116] -= amp_sv[0];
- VVVV4_0( w_fp[68], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[3] += amp_sv[0];
- jamp_sv[13] -= amp_sv[0];
- jamp_sv[37] -= amp_sv[0];
- jamp_sv[48] -= amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[62] -= amp_sv[0];
- jamp_sv[66] += amp_sv[0];
- jamp_sv[67] -= amp_sv[0];
- jamp_sv[69] -= amp_sv[0];
- jamp_sv[71] += amp_sv[0];
- jamp_sv[79] += amp_sv[0];
- jamp_sv[97] -= amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[116] -= amp_sv[0];
- VVVV1_0( w_fp[29], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[12] -= amp_sv[0];
- jamp_sv[13] += amp_sv[0];
- jamp_sv[26] += amp_sv[0];
- jamp_sv[27] -= amp_sv[0];
- jamp_sv[36] -= amp_sv[0];
- jamp_sv[37] += amp_sv[0];
- jamp_sv[67] += amp_sv[0];
- jamp_sv[68] -= amp_sv[0];
- jamp_sv[69] += amp_sv[0];
- jamp_sv[70] -= amp_sv[0];
- jamp_sv[72] += amp_sv[0];
- jamp_sv[73] -= amp_sv[0];
- jamp_sv[109] -= amp_sv[0];
- jamp_sv[110] += amp_sv[0];
- jamp_sv[111] -= amp_sv[0];
- jamp_sv[112] += amp_sv[0];
- VVVV3_0( w_fp[29], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[12] -= amp_sv[0];
- jamp_sv[26] += amp_sv[0];
- jamp_sv[36] -= amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[54] -= amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[60] -= amp_sv[0];
- jamp_sv[72] += amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- jamp_sv[103] -= amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[109] -= amp_sv[0];
- jamp_sv[110] += amp_sv[0];
- jamp_sv[111] -= amp_sv[0];
- jamp_sv[112] += amp_sv[0];
- jamp_sv[114] -= amp_sv[0];
- VVVV4_0( w_fp[29], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[13] -= amp_sv[0];
- jamp_sv[27] += amp_sv[0];
- jamp_sv[37] -= amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[54] -= amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[60] -= amp_sv[0];
- jamp_sv[67] -= amp_sv[0];
- jamp_sv[68] += amp_sv[0];
- jamp_sv[69] -= amp_sv[0];
- jamp_sv[70] += amp_sv[0];
- jamp_sv[73] += amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- jamp_sv[103] -= amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[114] -= amp_sv[0];
- VVVV1_0( w_fp[10], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[2] -= amp_sv[0];
- jamp_sv[3] += amp_sv[0];
- jamp_sv[26] += amp_sv[0];
- jamp_sv[27] -= amp_sv[0];
- jamp_sv[66] += amp_sv[0];
- jamp_sv[68] -= amp_sv[0];
- jamp_sv[70] -= amp_sv[0];
- jamp_sv[71] += amp_sv[0];
- jamp_sv[72] += amp_sv[0];
- jamp_sv[73] -= amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[79] += amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
- jamp_sv[110] += amp_sv[0];
- jamp_sv[112] += amp_sv[0];
- jamp_sv[113] -= amp_sv[0];
- VVVV3_0( w_fp[10], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[2] -= amp_sv[0];
- jamp_sv[26] += amp_sv[0];
- jamp_sv[48] += amp_sv[0];
- jamp_sv[54] -= amp_sv[0];
- jamp_sv[60] -= amp_sv[0];
- jamp_sv[62] += amp_sv[0];
- jamp_sv[72] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[97] += amp_sv[0];
- jamp_sv[103] -= amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
- jamp_sv[110] += amp_sv[0];
- jamp_sv[112] += amp_sv[0];
- jamp_sv[113] -= amp_sv[0];
- jamp_sv[114] -= amp_sv[0];
- jamp_sv[116] += amp_sv[0];
- VVVV4_0( w_fp[10], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[3] -= amp_sv[0];
- jamp_sv[27] += amp_sv[0];
- jamp_sv[48] += amp_sv[0];
- jamp_sv[54] -= amp_sv[0];
- jamp_sv[60] -= amp_sv[0];
- jamp_sv[62] += amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[68] += amp_sv[0];
- jamp_sv[70] += amp_sv[0];
- jamp_sv[71] -= amp_sv[0];
- jamp_sv[73] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[97] += amp_sv[0];
- jamp_sv[103] -= amp_sv[0];
- jamp_sv[114] -= amp_sv[0];
- jamp_sv[116] += amp_sv[0];
-
- // *** DIAGRAM 1141 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1141
- VVV1P0_1( w_fp[68], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[16] );
- VVV1P0_1( w_fp[29], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[71] );
- VVV1P0_1( w_fp[10], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[21] );
-
- // Amplitude(s) for diagram number 1141
- VVV1_0( w_fp[8], w_fp[6], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[2] += amp_sv[0];
- jamp_sv[12] -= amp_sv[0];
- jamp_sv[36] -= amp_sv[0];
- jamp_sv[48] -= amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[62] -= amp_sv[0];
- jamp_sv[78] += amp_sv[0];
- jamp_sv[97] -= amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[108] += amp_sv[0];
- jamp_sv[109] -= amp_sv[0];
- jamp_sv[111] -= amp_sv[0];
- jamp_sv[113] += amp_sv[0];
- jamp_sv[116] -= amp_sv[0];
- VVV1_0( w_fp[8], w_fp[6], w_fp[71], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[12] -= amp_sv[0];
- jamp_sv[26] += amp_sv[0];
- jamp_sv[36] -= amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[54] -= amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[60] -= amp_sv[0];
- jamp_sv[72] += amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- jamp_sv[103] -= amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[109] -= amp_sv[0];
- jamp_sv[110] += amp_sv[0];
- jamp_sv[111] -= amp_sv[0];
- jamp_sv[112] += amp_sv[0];
- jamp_sv[114] -= amp_sv[0];
- VVV1_0( w_fp[8], w_fp[6], w_fp[21], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[2] -= amp_sv[0];
- jamp_sv[26] += amp_sv[0];
- jamp_sv[48] += amp_sv[0];
- jamp_sv[54] -= amp_sv[0];
- jamp_sv[60] -= amp_sv[0];
- jamp_sv[62] += amp_sv[0];
- jamp_sv[72] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[97] += amp_sv[0];
- jamp_sv[103] -= amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
- jamp_sv[110] += amp_sv[0];
- jamp_sv[112] += amp_sv[0];
- jamp_sv[113] -= amp_sv[0];
- jamp_sv[114] -= amp_sv[0];
- jamp_sv[116] += amp_sv[0];
-
- // *** DIAGRAM 1142 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1142
- VVV1P0_1( w_fp[68], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[23] );
- VVV1P0_1( w_fp[29], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[60] );
- VVV1P0_1( w_fp[10], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[20] );
-
- // Amplitude(s) for diagram number 1142
- VVV1_0( w_fp[8], w_fp[4], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[3] += amp_sv[0];
- jamp_sv[13] -= amp_sv[0];
- jamp_sv[37] -= amp_sv[0];
- jamp_sv[48] -= amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[62] -= amp_sv[0];
- jamp_sv[66] += amp_sv[0];
- jamp_sv[67] -= amp_sv[0];
- jamp_sv[69] -= amp_sv[0];
- jamp_sv[71] += amp_sv[0];
- jamp_sv[79] += amp_sv[0];
- jamp_sv[97] -= amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[116] -= amp_sv[0];
- VVV1_0( w_fp[8], w_fp[4], w_fp[60], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[13] -= amp_sv[0];
- jamp_sv[27] += amp_sv[0];
- jamp_sv[37] -= amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[54] -= amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[60] -= amp_sv[0];
- jamp_sv[67] -= amp_sv[0];
- jamp_sv[68] += amp_sv[0];
- jamp_sv[69] -= amp_sv[0];
- jamp_sv[70] += amp_sv[0];
- jamp_sv[73] += amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- jamp_sv[103] -= amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[114] -= amp_sv[0];
- VVV1_0( w_fp[8], w_fp[4], w_fp[20], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[3] -= amp_sv[0];
- jamp_sv[27] += amp_sv[0];
- jamp_sv[48] += amp_sv[0];
- jamp_sv[54] -= amp_sv[0];
- jamp_sv[60] -= amp_sv[0];
- jamp_sv[62] += amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[68] += amp_sv[0];
- jamp_sv[70] += amp_sv[0];
- jamp_sv[71] -= amp_sv[0];
- jamp_sv[73] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[97] += amp_sv[0];
- jamp_sv[103] -= amp_sv[0];
- jamp_sv[114] -= amp_sv[0];
- jamp_sv[116] += amp_sv[0];
-
- // *** DIAGRAM 1143 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1143
- // (none)
-
- // Amplitude(s) for diagram number 1143
- VVV1_0( w_fp[68], w_fp[8], w_fp[27], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[2] += amp_sv[0];
- jamp_sv[3] -= amp_sv[0];
- jamp_sv[12] -= amp_sv[0];
- jamp_sv[13] += amp_sv[0];
- jamp_sv[36] -= amp_sv[0];
- jamp_sv[37] += amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[67] += amp_sv[0];
- jamp_sv[69] += amp_sv[0];
- jamp_sv[71] -= amp_sv[0];
- jamp_sv[78] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[108] += amp_sv[0];
- jamp_sv[109] -= amp_sv[0];
- jamp_sv[111] -= amp_sv[0];
- jamp_sv[113] += amp_sv[0];
- VVV1_0( w_fp[29], w_fp[8], w_fp[27], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[12] -= amp_sv[0];
- jamp_sv[13] += amp_sv[0];
- jamp_sv[26] += amp_sv[0];
- jamp_sv[27] -= amp_sv[0];
- jamp_sv[36] -= amp_sv[0];
- jamp_sv[37] += amp_sv[0];
- jamp_sv[67] += amp_sv[0];
- jamp_sv[68] -= amp_sv[0];
- jamp_sv[69] += amp_sv[0];
- jamp_sv[70] -= amp_sv[0];
- jamp_sv[72] += amp_sv[0];
- jamp_sv[73] -= amp_sv[0];
- jamp_sv[109] -= amp_sv[0];
- jamp_sv[110] += amp_sv[0];
- jamp_sv[111] -= amp_sv[0];
- jamp_sv[112] += amp_sv[0];
- VVV1_0( w_fp[10], w_fp[8], w_fp[27], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[2] -= amp_sv[0];
- jamp_sv[3] += amp_sv[0];
- jamp_sv[26] += amp_sv[0];
- jamp_sv[27] -= amp_sv[0];
- jamp_sv[66] += amp_sv[0];
- jamp_sv[68] -= amp_sv[0];
- jamp_sv[70] -= amp_sv[0];
- jamp_sv[71] += amp_sv[0];
- jamp_sv[72] += amp_sv[0];
- jamp_sv[73] -= amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[79] += amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
- jamp_sv[110] += amp_sv[0];
- jamp_sv[112] += amp_sv[0];
- jamp_sv[113] -= amp_sv[0];
-
- // *** DIAGRAM 1144 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1144
- FFV1_2( w_fp[3], w_fp[68], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[59] );
- FFV1_2( w_fp[3], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[111] );
- FFV1_2( w_fp[3], w_fp[10], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[98] );
-
- // Amplitude(s) for diagram number 1144
- FFV1_0( w_fp[59], w_fp[33], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[66] += amp_sv[0];
- jamp_sv[67] -= amp_sv[0];
- jamp_sv[69] -= amp_sv[0];
- jamp_sv[71] += amp_sv[0];
- FFV1_0( w_fp[111], w_fp[33], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[67] -= amp_sv[0];
- jamp_sv[68] += amp_sv[0];
- jamp_sv[69] -= amp_sv[0];
- jamp_sv[70] += amp_sv[0];
- FFV1_0( w_fp[98], w_fp[33], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[68] += amp_sv[0];
- jamp_sv[70] += amp_sv[0];
- jamp_sv[71] -= amp_sv[0];
-
- // *** DIAGRAM 1145 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1145
- // (none)
-
- // Amplitude(s) for diagram number 1145
- FFV1_0( w_fp[3], w_fp[33], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[33], w_fp[60], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[33], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1146 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1146
- // (none)
-
- // Amplitude(s) for diagram number 1146
- FFV1_0( w_fp[41], w_fp[33], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[48] += amp_sv[0];
- jamp_sv[50] -= amp_sv[0];
- jamp_sv[56] -= amp_sv[0];
- jamp_sv[62] += amp_sv[0];
- FFV1_0( w_fp[41], w_fp[33], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[50] -= amp_sv[0];
- jamp_sv[54] += amp_sv[0];
- jamp_sv[56] -= amp_sv[0];
- jamp_sv[60] += amp_sv[0];
- FFV1_0( w_fp[41], w_fp[33], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[48] -= amp_sv[0];
- jamp_sv[54] += amp_sv[0];
- jamp_sv[60] += amp_sv[0];
- jamp_sv[62] -= amp_sv[0];
-
- // *** DIAGRAM 1147 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1147
- // (none)
-
- // Amplitude(s) for diagram number 1147
- FFV1_0( w_fp[59], w_fp[47], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[108] += amp_sv[0];
- jamp_sv[109] -= amp_sv[0];
- jamp_sv[111] -= amp_sv[0];
- jamp_sv[113] += amp_sv[0];
- FFV1_0( w_fp[111], w_fp[47], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[109] -= amp_sv[0];
- jamp_sv[110] += amp_sv[0];
- jamp_sv[111] -= amp_sv[0];
- jamp_sv[112] += amp_sv[0];
- FFV1_0( w_fp[98], w_fp[47], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[108] -= amp_sv[0];
- jamp_sv[110] += amp_sv[0];
- jamp_sv[112] += amp_sv[0];
- jamp_sv[113] -= amp_sv[0];
-
- // *** DIAGRAM 1148 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1148
- // (none)
-
- // Amplitude(s) for diagram number 1148
- FFV1_0( w_fp[3], w_fp[47], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[47], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[47], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1149 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1149
- // (none)
-
- // Amplitude(s) for diagram number 1149
- FFV1_0( w_fp[46], w_fp[47], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[97] += amp_sv[0];
- jamp_sv[100] -= amp_sv[0];
- jamp_sv[106] -= amp_sv[0];
- jamp_sv[116] += amp_sv[0];
- FFV1_0( w_fp[46], w_fp[47], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[100] -= amp_sv[0];
- jamp_sv[103] += amp_sv[0];
- jamp_sv[106] -= amp_sv[0];
- jamp_sv[114] += amp_sv[0];
- FFV1_0( w_fp[46], w_fp[47], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[97] -= amp_sv[0];
- jamp_sv[103] += amp_sv[0];
- jamp_sv[114] += amp_sv[0];
- jamp_sv[116] -= amp_sv[0];
-
- // *** DIAGRAM 1150 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1150
- FFV1_1( w_fp[2], w_fp[68], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[17] );
- FFV1_1( w_fp[2], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[68] );
- FFV1_1( w_fp[2], w_fp[10], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[29] );
-
- // Amplitude(s) for diagram number 1150
- FFV1_0( w_fp[46], w_fp[17], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[3] += amp_sv[0];
- jamp_sv[13] -= amp_sv[0];
- jamp_sv[37] -= amp_sv[0];
- jamp_sv[79] += amp_sv[0];
- FFV1_0( w_fp[46], w_fp[68], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[13] -= amp_sv[0];
- jamp_sv[27] += amp_sv[0];
- jamp_sv[37] -= amp_sv[0];
- jamp_sv[73] += amp_sv[0];
- FFV1_0( w_fp[46], w_fp[29], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[3] -= amp_sv[0];
- jamp_sv[27] += amp_sv[0];
- jamp_sv[73] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
-
- // *** DIAGRAM 1151 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1151
- // (none)
-
- // Amplitude(s) for diagram number 1151
- FFV1_0( w_fp[46], w_fp[2], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[46], w_fp[2], w_fp[60], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[46], w_fp[2], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1152 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1152
- // (none)
-
- // Amplitude(s) for diagram number 1152
- FFV1_0( w_fp[41], w_fp[17], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[2] += amp_sv[0];
- jamp_sv[12] -= amp_sv[0];
- jamp_sv[36] -= amp_sv[0];
- jamp_sv[78] += amp_sv[0];
- FFV1_0( w_fp[41], w_fp[68], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[12] -= amp_sv[0];
- jamp_sv[26] += amp_sv[0];
- jamp_sv[36] -= amp_sv[0];
- jamp_sv[72] += amp_sv[0];
- FFV1_0( w_fp[41], w_fp[29], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[2] -= amp_sv[0];
- jamp_sv[26] += amp_sv[0];
- jamp_sv[72] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
-
- // *** DIAGRAM 1153 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1153
- // (none)
-
- // Amplitude(s) for diagram number 1153
- FFV1_0( w_fp[41], w_fp[2], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[41], w_fp[2], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[41], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1154 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1154
- // (none)
-
- // Amplitude(s) for diagram number 1154
- FFV1_0( w_fp[3], w_fp[17], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[68], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[29], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1155 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1155
- // (none)
-
- // Amplitude(s) for diagram number 1155
- FFV1_0( w_fp[59], w_fp[2], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[111], w_fp[2], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[98], w_fp[2], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1156 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1156
- VVVV1P0_1( w_fp[0], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[98] );
- VVVV3P0_1( w_fp[0], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[27] );
- VVVV4P0_1( w_fp[0], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[111] );
-
- // Amplitude(s) for diagram number 1156
- VVVV1_0( w_fp[98], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[4] += amp_sv[0];
- jamp_sv[5] -= amp_sv[0];
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[19] += amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[43] += amp_sv[0];
- jamp_sv[60] -= amp_sv[0];
- jamp_sv[61] += amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[65] -= amp_sv[0];
- jamp_sv[84] += amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[87] -= amp_sv[0];
- jamp_sv[89] += amp_sv[0];
- jamp_sv[102] += amp_sv[0];
- jamp_sv[103] -= amp_sv[0];
- VVVV3_0( w_fp[98], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[4] += amp_sv[0];
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[49] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[68] -= amp_sv[0];
- jamp_sv[73] -= amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[84] += amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[87] -= amp_sv[0];
- jamp_sv[89] += amp_sv[0];
- jamp_sv[92] -= amp_sv[0];
- jamp_sv[102] += amp_sv[0];
- VVVV4_0( w_fp[98], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[5] += amp_sv[0];
- jamp_sv[19] -= amp_sv[0];
- jamp_sv[43] -= amp_sv[0];
- jamp_sv[49] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[60] += amp_sv[0];
- jamp_sv[61] -= amp_sv[0];
- jamp_sv[63] -= amp_sv[0];
- jamp_sv[65] += amp_sv[0];
- jamp_sv[68] -= amp_sv[0];
- jamp_sv[73] -= amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[92] -= amp_sv[0];
- jamp_sv[103] += amp_sv[0];
- VVVV1_0( w_fp[27], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[19] += amp_sv[0];
- jamp_sv[28] += amp_sv[0];
- jamp_sv[29] -= amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[43] += amp_sv[0];
- jamp_sv[61] += amp_sv[0];
- jamp_sv[62] -= amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[64] -= amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[86] += amp_sv[0];
- jamp_sv[87] -= amp_sv[0];
- jamp_sv[88] += amp_sv[0];
- jamp_sv[96] += amp_sv[0];
- jamp_sv[97] -= amp_sv[0];
- VVVV3_0( w_fp[27], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[28] += amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[86] += amp_sv[0];
- jamp_sv[87] -= amp_sv[0];
- jamp_sv[88] += amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[96] += amp_sv[0];
- VVVV4_0( w_fp[27], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[19] -= amp_sv[0];
- jamp_sv[29] += amp_sv[0];
- jamp_sv[43] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[61] -= amp_sv[0];
- jamp_sv[62] += amp_sv[0];
- jamp_sv[63] -= amp_sv[0];
- jamp_sv[64] += amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[97] += amp_sv[0];
- VVVV1_0( w_fp[111], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[4] -= amp_sv[0];
- jamp_sv[5] += amp_sv[0];
- jamp_sv[28] += amp_sv[0];
- jamp_sv[29] -= amp_sv[0];
- jamp_sv[60] += amp_sv[0];
- jamp_sv[62] -= amp_sv[0];
- jamp_sv[64] -= amp_sv[0];
- jamp_sv[65] += amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[86] += amp_sv[0];
- jamp_sv[88] += amp_sv[0];
- jamp_sv[89] -= amp_sv[0];
- jamp_sv[96] += amp_sv[0];
- jamp_sv[97] -= amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[103] += amp_sv[0];
- VVVV3_0( w_fp[111], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[4] -= amp_sv[0];
- jamp_sv[28] += amp_sv[0];
- jamp_sv[49] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[68] += amp_sv[0];
- jamp_sv[73] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[86] += amp_sv[0];
- jamp_sv[88] += amp_sv[0];
- jamp_sv[89] -= amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- jamp_sv[96] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- VVVV4_0( w_fp[111], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[5] -= amp_sv[0];
- jamp_sv[29] += amp_sv[0];
- jamp_sv[49] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[60] -= amp_sv[0];
- jamp_sv[62] += amp_sv[0];
- jamp_sv[64] += amp_sv[0];
- jamp_sv[65] -= amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[68] += amp_sv[0];
- jamp_sv[73] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- jamp_sv[97] += amp_sv[0];
- jamp_sv[103] -= amp_sv[0];
-
- // *** DIAGRAM 1157 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1157
- VVV1P0_1( w_fp[98], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[59] );
- VVV1P0_1( w_fp[27], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[29] );
- VVV1P0_1( w_fp[111], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[68] );
-
- // Amplitude(s) for diagram number 1157
- VVV1_0( w_fp[8], w_fp[5], w_fp[59], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[4] += amp_sv[0];
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[49] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[68] -= amp_sv[0];
- jamp_sv[73] -= amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[84] += amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[87] -= amp_sv[0];
- jamp_sv[89] += amp_sv[0];
- jamp_sv[92] -= amp_sv[0];
- jamp_sv[102] += amp_sv[0];
- VVV1_0( w_fp[8], w_fp[5], w_fp[29], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[28] += amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[86] += amp_sv[0];
- jamp_sv[87] -= amp_sv[0];
- jamp_sv[88] += amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[96] += amp_sv[0];
- VVV1_0( w_fp[8], w_fp[5], w_fp[68], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[4] -= amp_sv[0];
- jamp_sv[28] += amp_sv[0];
- jamp_sv[49] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[68] += amp_sv[0];
- jamp_sv[73] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[86] += amp_sv[0];
- jamp_sv[88] += amp_sv[0];
- jamp_sv[89] -= amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- jamp_sv[96] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
-
- // *** DIAGRAM 1158 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1158
- VVV1P0_1( w_fp[98], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[17] );
- VVV1P0_1( w_fp[27], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[21] );
- VVV1P0_1( w_fp[111], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[71] );
-
- // Amplitude(s) for diagram number 1158
- VVV1_0( w_fp[8], w_fp[4], w_fp[17], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[5] += amp_sv[0];
- jamp_sv[19] -= amp_sv[0];
- jamp_sv[43] -= amp_sv[0];
- jamp_sv[49] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[60] += amp_sv[0];
- jamp_sv[61] -= amp_sv[0];
- jamp_sv[63] -= amp_sv[0];
- jamp_sv[65] += amp_sv[0];
- jamp_sv[68] -= amp_sv[0];
- jamp_sv[73] -= amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[92] -= amp_sv[0];
- jamp_sv[103] += amp_sv[0];
- VVV1_0( w_fp[8], w_fp[4], w_fp[21], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[19] -= amp_sv[0];
- jamp_sv[29] += amp_sv[0];
- jamp_sv[43] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[61] -= amp_sv[0];
- jamp_sv[62] += amp_sv[0];
- jamp_sv[63] -= amp_sv[0];
- jamp_sv[64] += amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[97] += amp_sv[0];
- VVV1_0( w_fp[8], w_fp[4], w_fp[71], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[5] -= amp_sv[0];
- jamp_sv[29] += amp_sv[0];
- jamp_sv[49] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[60] -= amp_sv[0];
- jamp_sv[62] += amp_sv[0];
- jamp_sv[64] += amp_sv[0];
- jamp_sv[65] -= amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[68] += amp_sv[0];
- jamp_sv[73] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- jamp_sv[97] += amp_sv[0];
- jamp_sv[103] -= amp_sv[0];
-
- // *** DIAGRAM 1159 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1159
- // (none)
-
- // Amplitude(s) for diagram number 1159
- VVV1_0( w_fp[98], w_fp[8], w_fp[24], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[4] += amp_sv[0];
- jamp_sv[5] -= amp_sv[0];
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[19] += amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[43] += amp_sv[0];
- jamp_sv[60] -= amp_sv[0];
- jamp_sv[61] += amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[65] -= amp_sv[0];
- jamp_sv[84] += amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[87] -= amp_sv[0];
- jamp_sv[89] += amp_sv[0];
- jamp_sv[102] += amp_sv[0];
- jamp_sv[103] -= amp_sv[0];
- VVV1_0( w_fp[27], w_fp[8], w_fp[24], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[19] += amp_sv[0];
- jamp_sv[28] += amp_sv[0];
- jamp_sv[29] -= amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[43] += amp_sv[0];
- jamp_sv[61] += amp_sv[0];
- jamp_sv[62] -= amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[64] -= amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[86] += amp_sv[0];
- jamp_sv[87] -= amp_sv[0];
- jamp_sv[88] += amp_sv[0];
- jamp_sv[96] += amp_sv[0];
- jamp_sv[97] -= amp_sv[0];
- VVV1_0( w_fp[111], w_fp[8], w_fp[24], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[4] -= amp_sv[0];
- jamp_sv[5] += amp_sv[0];
- jamp_sv[28] += amp_sv[0];
- jamp_sv[29] -= amp_sv[0];
- jamp_sv[60] += amp_sv[0];
- jamp_sv[62] -= amp_sv[0];
- jamp_sv[64] -= amp_sv[0];
- jamp_sv[65] += amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[86] += amp_sv[0];
- jamp_sv[88] += amp_sv[0];
- jamp_sv[89] -= amp_sv[0];
- jamp_sv[96] += amp_sv[0];
- jamp_sv[97] -= amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[103] += amp_sv[0];
-
- // *** DIAGRAM 1160 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1160
- FFV1_2( w_fp[3], w_fp[98], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] );
- FFV1_2( w_fp[3], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[20] );
- FFV1_2( w_fp[3], w_fp[111], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[60] );
-
- // Amplitude(s) for diagram number 1160
- FFV1_0( w_fp[16], w_fp[33], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[60] += amp_sv[0];
- jamp_sv[61] -= amp_sv[0];
- jamp_sv[63] -= amp_sv[0];
- jamp_sv[65] += amp_sv[0];
- FFV1_0( w_fp[20], w_fp[33], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[61] -= amp_sv[0];
- jamp_sv[62] += amp_sv[0];
- jamp_sv[63] -= amp_sv[0];
- jamp_sv[64] += amp_sv[0];
- FFV1_0( w_fp[60], w_fp[33], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[60] -= amp_sv[0];
- jamp_sv[62] += amp_sv[0];
- jamp_sv[64] += amp_sv[0];
- jamp_sv[65] -= amp_sv[0];
-
- // *** DIAGRAM 1161 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1161
- // (none)
-
- // Amplitude(s) for diagram number 1161
- FFV1_0( w_fp[3], w_fp[33], w_fp[17], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[33], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[33], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1162 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1162
- // (none)
-
- // Amplitude(s) for diagram number 1162
- FFV1_0( w_fp[38], w_fp[33], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[49] += amp_sv[0];
- jamp_sv[52] -= amp_sv[0];
- jamp_sv[58] -= amp_sv[0];
- jamp_sv[68] += amp_sv[0];
- FFV1_0( w_fp[38], w_fp[33], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[52] -= amp_sv[0];
- jamp_sv[55] += amp_sv[0];
- jamp_sv[58] -= amp_sv[0];
- jamp_sv[66] += amp_sv[0];
- FFV1_0( w_fp[38], w_fp[33], w_fp[111], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[49] -= amp_sv[0];
- jamp_sv[55] += amp_sv[0];
- jamp_sv[66] += amp_sv[0];
- jamp_sv[68] -= amp_sv[0];
-
- // *** DIAGRAM 1163 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1163
- // (none)
-
- // Amplitude(s) for diagram number 1163
- FFV1_0( w_fp[16], w_fp[39], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[84] += amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[87] -= amp_sv[0];
- jamp_sv[89] += amp_sv[0];
- FFV1_0( w_fp[20], w_fp[39], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[86] += amp_sv[0];
- jamp_sv[87] -= amp_sv[0];
- jamp_sv[88] += amp_sv[0];
- FFV1_0( w_fp[60], w_fp[39], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[86] += amp_sv[0];
- jamp_sv[88] += amp_sv[0];
- jamp_sv[89] -= amp_sv[0];
-
- // *** DIAGRAM 1164 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1164
- // (none)
-
- // Amplitude(s) for diagram number 1164
- FFV1_0( w_fp[3], w_fp[39], w_fp[59], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[39], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[39], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1165 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1165
- // (none)
-
- // Amplitude(s) for diagram number 1165
- FFV1_0( w_fp[46], w_fp[39], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[73] += amp_sv[0];
- jamp_sv[76] -= amp_sv[0];
- jamp_sv[82] -= amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- FFV1_0( w_fp[46], w_fp[39], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[76] -= amp_sv[0];
- jamp_sv[79] += amp_sv[0];
- jamp_sv[82] -= amp_sv[0];
- jamp_sv[90] += amp_sv[0];
- FFV1_0( w_fp[46], w_fp[39], w_fp[111], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[73] -= amp_sv[0];
- jamp_sv[79] += amp_sv[0];
- jamp_sv[90] += amp_sv[0];
- jamp_sv[92] -= amp_sv[0];
-
- // *** DIAGRAM 1166 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1166
- FFV1_1( w_fp[2], w_fp[98], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] );
- FFV1_1( w_fp[2], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[98] );
- FFV1_1( w_fp[2], w_fp[111], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[27] );
-
- // Amplitude(s) for diagram number 1166
- FFV1_0( w_fp[46], w_fp[23], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[5] += amp_sv[0];
- jamp_sv[19] -= amp_sv[0];
- jamp_sv[43] -= amp_sv[0];
- jamp_sv[103] += amp_sv[0];
- FFV1_0( w_fp[46], w_fp[98], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[19] -= amp_sv[0];
- jamp_sv[29] += amp_sv[0];
- jamp_sv[43] -= amp_sv[0];
- jamp_sv[97] += amp_sv[0];
- FFV1_0( w_fp[46], w_fp[27], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[5] -= amp_sv[0];
- jamp_sv[29] += amp_sv[0];
- jamp_sv[97] += amp_sv[0];
- jamp_sv[103] -= amp_sv[0];
-
- // *** DIAGRAM 1167 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1167
- // (none)
-
- // Amplitude(s) for diagram number 1167
- FFV1_0( w_fp[46], w_fp[2], w_fp[17], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[46], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[46], w_fp[2], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1168 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1168
- // (none)
-
- // Amplitude(s) for diagram number 1168
- FFV1_0( w_fp[38], w_fp[23], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[4] += amp_sv[0];
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[102] += amp_sv[0];
- FFV1_0( w_fp[38], w_fp[98], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[28] += amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[96] += amp_sv[0];
- FFV1_0( w_fp[38], w_fp[27], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[4] -= amp_sv[0];
- jamp_sv[28] += amp_sv[0];
- jamp_sv[96] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
-
- // *** DIAGRAM 1169 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1169
- // (none)
-
- // Amplitude(s) for diagram number 1169
- FFV1_0( w_fp[38], w_fp[2], w_fp[59], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[38], w_fp[2], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[38], w_fp[2], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1170 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1170
- // (none)
-
- // Amplitude(s) for diagram number 1170
- FFV1_0( w_fp[3], w_fp[23], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[98], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[27], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1171 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1171
- // (none)
-
- // Amplitude(s) for diagram number 1171
- FFV1_0( w_fp[16], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[20], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[63] += cxtype( 0, 1 ) *
amp_sv[0]; - jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[60], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1172 OF 1240 *** - - // Wavefunction(s) for diagram number 1172 - VVVV1P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[60] ); - VVVV3P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[24] ); - VVVV4P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[20] ); - FFV1_2( w_fp[3], w_fp[60], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); - FFV1_2( w_fp[3], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[27] ); - FFV1_2( w_fp[3], w_fp[20], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[98] ); - - // Amplitude(s) for diagram number 1172 - FFV1_0( w_fp[16], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[42] += amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - FFV1_0( w_fp[27], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[43] -= amp_sv[0]; - jamp_sv[44] += amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[46] += amp_sv[0]; - FFV1_0( w_fp[98], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[42] -= amp_sv[0]; - jamp_sv[44] += amp_sv[0]; - jamp_sv[46] += amp_sv[0]; - jamp_sv[47] -= amp_sv[0]; - - // *** DIAGRAM 1173 OF 1240 *** - - // Wavefunction(s) for diagram number 1173 - VVV1P0_1( w_fp[60], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[23] ); - VVV1P0_1( w_fp[24], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[68] ); - VVV1P0_1( w_fp[20], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[29] ); - - // Amplitude(s) for diagram number 1173 - FFV1_0( w_fp[3], w_fp[77], w_fp[23], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[77], w_fp[68], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[77], w_fp[29], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1174 OF 1240 *** - - // Wavefunction(s) for diagram number 1174 - // (none) - - // Amplitude(s) for diagram number 1174 - FFV1_0( w_fp[41], w_fp[77], w_fp[60], COUPs[1], 
1.0, &_fp[0] ); - jamp_sv[24] += amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[38] += amp_sv[0]; - FFV1_0( w_fp[41], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[26] -= amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - FFV1_0( w_fp[41], w_fp[77], w_fp[20], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[24] -= amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - - // *** DIAGRAM 1175 OF 1240 *** - - // Wavefunction(s) for diagram number 1175 - FFV1_1( w_fp[2], w_fp[60], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[59] ); - FFV1_1( w_fp[2], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[71] ); - FFV1_1( w_fp[2], w_fp[20], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] ); - - // Amplitude(s) for diagram number 1175 - FFV1_0( w_fp[52], w_fp[59], w_fp[6], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[9] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - FFV1_0( w_fp[52], w_fp[71], w_fp[6], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[15] -= amp_sv[0]; - jamp_sv[51] += amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[75] += amp_sv[0]; - FFV1_0( w_fp[52], w_fp[21], w_fp[6], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[9] -= amp_sv[0]; - jamp_sv[51] += amp_sv[0]; - jamp_sv[75] += amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - - // *** DIAGRAM 1176 OF 1240 *** - - // Wavefunction(s) for diagram number 1176 - // (none) - - // Amplitude(s) for diagram number 1176 - FFV1_0( w_fp[52], w_fp[2], w_fp[23], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[68], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[29], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1177 OF 1240 *** - - // Wavefunction(s) for diagram number 1177 - // (none) - - // Amplitude(s) for diagram number 1177 - FFV1_0( w_fp[52], w_fp[47], w_fp[60], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[99] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - FFV1_0( w_fp[52], w_fp[47], w_fp[24], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[101] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - FFV1_0( w_fp[52], w_fp[47], w_fp[20], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[99] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - - // *** DIAGRAM 1178 OF 1240 *** - - // Wavefunction(s) for diagram number 1178 - // 
(none) - - // Amplitude(s) for diagram number 1178 - FFV1_0( w_fp[3], w_fp[59], w_fp[72], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[71], w_fp[72], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[21], w_fp[72], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1179 OF 1240 *** - - // Wavefunction(s) for diagram number 1179 - // (none) - - // Amplitude(s) for diagram number 1179 - FFV1_0( w_fp[16], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[27], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[98], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1180 OF 1240 *** - - // Wavefunction(s) for diagram number 1180 - // (none) - - // Amplitude(s) for diagram number 1180 - VVV1_0( w_fp[60], w_fp[72], w_fp[8], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[8] -= amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[42] += amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[60] += amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[103] += amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - VVV1_0( w_fp[24], w_fp[72], w_fp[8], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[14] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - 
jamp_sv[44] += amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[46] += amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[51] += amp_sv[0]; - jamp_sv[60] += amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[75] += amp_sv[0]; - jamp_sv[103] += amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - VVV1_0( w_fp[20], w_fp[72], w_fp[8], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[8] += amp_sv[0]; - jamp_sv[9] -= amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[44] += amp_sv[0]; - jamp_sv[46] += amp_sv[0]; - jamp_sv[47] -= amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[51] += amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[75] += amp_sv[0]; - jamp_sv[84] += amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[102] += amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - jamp_sv[107] += amp_sv[0]; - - // *** DIAGRAM 1181 OF 1240 *** - - // Wavefunction(s) for diagram number 1181 - // (none) - - // Amplitude(s) for diagram number 1181 - VVVV1_0( w_fp[60], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[8] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[84] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - jamp_sv[102] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[107] += amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - VVVV3_0( w_fp[60], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[8] += amp_sv[0]; - jamp_sv[9] -= amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[47] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[84] += amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[102] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[107] += amp_sv[0]; - VVVV4_0( w_fp[60], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[9] -= amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[38] += amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[47] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[99] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - VVVV1_0( w_fp[24], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[14] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[30] -= amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[115] -= amp_sv[0]; - VVVV3_0( w_fp[24], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[14] -= amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[103] -= 
amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - VVVV4_0( w_fp[24], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[15] += amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - VVVV1_0( w_fp[20], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[8] -= amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[30] -= amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[38] += amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[99] += amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - jamp_sv[115] -= amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - VVVV3_0( w_fp[20], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[8] -= amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[42] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - VVVV4_0( w_fp[20], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[9] += amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[42] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - - // *** DIAGRAM 1182 OF 1240 *** - - // Wavefunction(s) for diagram number 1182 - VVV1P0_1( w_fp[60], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[72] ); - VVV1P0_1( w_fp[24], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[60] ); - VVV1P0_1( w_fp[20], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[24] ); - - // Amplitude(s) for diagram number 1182 - VVV1_0( w_fp[8], w_fp[6], w_fp[72], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[8] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[84] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - jamp_sv[102] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[107] += amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[60], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[14] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[30] -= amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[115] -= 
amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[24], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[8] -= amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[30] -= amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[38] += amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[99] += amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - jamp_sv[115] -= amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - - // *** DIAGRAM 1183 OF 1240 *** - - // Wavefunction(s) for diagram number 1183 - // (none) - - // Amplitude(s) for diagram number 1183 - VVV1_0( w_fp[1], w_fp[8], w_fp[23], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[9] -= amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[38] += amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[47] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[99] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[68], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[15] += amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[29], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[9] += amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[42] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - - // *** DIAGRAM 1184 OF 1240 *** - - // Wavefunction(s) for diagram number 1184 - // (none) - - // Amplitude(s) for diagram number 1184 - FFV1_0( w_fp[3], w_fp[47], w_fp[72], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[60], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[24], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] -= cxtype( 0, 1 ) * 
amp_sv[0]; - jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1185 OF 1240 *** - - // Wavefunction(s) for diagram number 1185 - // (none) - - // Amplitude(s) for diagram number 1185 - FFV1_0( w_fp[16], w_fp[47], w_fp[1], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[102] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[107] += amp_sv[0]; - FFV1_0( w_fp[27], w_fp[47], w_fp[1], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[103] -= amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - FFV1_0( w_fp[98], w_fp[47], w_fp[1], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[102] -= amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - - // *** DIAGRAM 1186 OF 1240 *** - - // Wavefunction(s) for diagram number 1186 - // (none) - - // Amplitude(s) for diagram number 1186 - FFV1_0( w_fp[41], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[60], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1187 OF 1240 *** - - // Wavefunction(s) for diagram number 1187 - // (none) - - // Amplitude(s) for diagram number 1187 - FFV1_0( w_fp[41], w_fp[59], w_fp[1], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[8] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[84] += amp_sv[0]; - FFV1_0( w_fp[41], w_fp[71], w_fp[1], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[14] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - FFV1_0( w_fp[41], w_fp[21], w_fp[1], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[8] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - - // *** DIAGRAM 1188 OF 1240 *** - - // Wavefunction(s) for diagram number 1188 - VVVV1P0_1( w_fp[0], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[21] ); - VVVV3P0_1( w_fp[0], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[71] ); - VVVV4P0_1( w_fp[0], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[59] ); - FFV1_2( w_fp[3], w_fp[21], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[24] ); - FFV1_2( w_fp[3], w_fp[71], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[60] ); - FFV1_2( w_fp[3], w_fp[59], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[72] ); - - // Amplitude(s) for diagram number 1188 - FFV1_0( w_fp[24], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[36] += amp_sv[0]; - jamp_sv[37] -= 
amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - FFV1_0( w_fp[60], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[37] -= amp_sv[0]; - jamp_sv[38] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - FFV1_0( w_fp[72], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[36] -= amp_sv[0]; - jamp_sv[38] += amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - - // *** DIAGRAM 1189 OF 1240 *** - - // Wavefunction(s) for diagram number 1189 - VVV1P0_1( w_fp[21], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[98] ); - VVV1P0_1( w_fp[71], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[27] ); - VVV1P0_1( w_fp[59], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[16] ); - - // Amplitude(s) for diagram number 1189 - FFV1_0( w_fp[3], w_fp[77], w_fp[98], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[77], w_fp[27], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[77], w_fp[16], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1190 OF 1240 *** - - // Wavefunction(s) for diagram number 1190 - // (none) - - // Amplitude(s) for diagram number 1190 - FFV1_0( w_fp[38], w_fp[77], w_fp[21], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[25] += amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[44] += amp_sv[0]; - FFV1_0( w_fp[38], w_fp[77], w_fp[71], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[28] -= amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[42] += amp_sv[0]; - FFV1_0( w_fp[38], w_fp[77], w_fp[59], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[25] -= amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[42] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - - // *** DIAGRAM 1191 OF 1240 *** - - // Wavefunction(s) for diagram number 1191 - FFV1_1( w_fp[2], w_fp[21], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[29] ); - FFV1_1( w_fp[2], w_fp[71], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[68] ); - FFV1_1( w_fp[2], w_fp[59], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] ); - - // Amplitude(s) for diagram number 1191 - FFV1_0( w_fp[52], w_fp[29], w_fp[5], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[11] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - FFV1_0( w_fp[52], w_fp[68], w_fp[5], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[21] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[99] += amp_sv[0]; - FFV1_0( w_fp[52], w_fp[23], w_fp[5], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[11] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[99] += amp_sv[0]; - 
jamp_sv[109] -= amp_sv[0]; - - // *** DIAGRAM 1192 OF 1240 *** - - // Wavefunction(s) for diagram number 1192 - // (none) - - // Amplitude(s) for diagram number 1192 - FFV1_0( w_fp[52], w_fp[2], w_fp[98], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1193 OF 1240 *** - - // Wavefunction(s) for diagram number 1193 - // (none) - - // Amplitude(s) for diagram number 1193 - FFV1_0( w_fp[52], w_fp[39], w_fp[21], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[75] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - FFV1_0( w_fp[52], w_fp[39], w_fp[71], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[77] -= amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - FFV1_0( w_fp[52], w_fp[39], w_fp[59], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[75] -= amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - - // *** DIAGRAM 1194 OF 1240 *** - - // Wavefunction(s) for diagram number 1194 - // (none) - - // Amplitude(s) for diagram number 1194 - FFV1_0( w_fp[3], w_fp[29], w_fp[66], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[68], w_fp[66], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[23], w_fp[66], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 
1195 OF 1240 *** - - // Wavefunction(s) for diagram number 1195 - // (none) - - // Amplitude(s) for diagram number 1195 - FFV1_0( w_fp[24], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[60], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[72], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1196 OF 1240 *** - - // Wavefunction(s) for diagram number 1196 - // (none) - - // Amplitude(s) for diagram number 1196 - VVV1_0( w_fp[21], w_fp[66], w_fp[8], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[10] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[66] += amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[79] += amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - VVV1_0( w_fp[71], w_fp[66], w_fp[8], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[20] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[38] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[52] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[66] += amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[79] += amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[82] -= amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - jamp_sv[99] += amp_sv[0]; - VVV1_0( w_fp[59], w_fp[66], w_fp[8], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[10] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[38] += amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[52] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[78] += amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[82] -= amp_sv[0]; - jamp_sv[83] += amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - jamp_sv[99] += amp_sv[0]; - jamp_sv[108] += amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - - // *** DIAGRAM 1197 OF 1240 *** - - // Wavefunction(s) for diagram number 1197 - // (none) - - // Amplitude(s) for diagram number 1197 - VVVV1_0( w_fp[21], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[10] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - 
jamp_sv[78] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[83] += amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[108] += amp_sv[0]; - VVVV3_0( w_fp[21], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[10] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[78] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[83] += amp_sv[0]; - jamp_sv[108] += amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - VVVV4_0( w_fp[21], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[11] -= amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[44] += amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[75] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - VVVV1_0( w_fp[71], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[20] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - VVVV3_0( w_fp[71], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[20] -= amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - VVVV4_0( w_fp[71], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[21] += amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[42] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - VVVV1_0( w_fp[59], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[10] -= amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[44] += amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[75] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - VVVV3_0( w_fp[59], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[10] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - 
jamp_sv[41] += amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - VVVV4_0( w_fp[59], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[11] += amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[42] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - - // *** DIAGRAM 1198 OF 1240 *** - - // Wavefunction(s) for diagram number 1198 - VVV1P0_1( w_fp[21], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[66] ); - VVV1P0_1( w_fp[71], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[21] ); - VVV1P0_1( w_fp[59], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[71] ); - - // Amplitude(s) for diagram number 1198 - VVV1_0( w_fp[8], w_fp[5], w_fp[66], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[10] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[78] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[83] += amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[108] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[21], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[20] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[71], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[10] -= amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[44] += amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[75] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - - // *** DIAGRAM 1199 OF 1240 *** - - // Wavefunction(s) for diagram number 1199 - // (none) - - // Amplitude(s) for diagram number 1199 - VVV1_0( w_fp[1], w_fp[8], w_fp[98], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[11] -= amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[44] += amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[75] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[27], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[21] += amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - 
jamp_sv[38] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[42] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[16], COUPs[0], 1.0, &_fp[0] ); - jamp_sv[11] += amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[42] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - - // *** DIAGRAM 1200 OF 1240 *** - - // Wavefunction(s) for diagram number 1200 - // (none) - - // Amplitude(s) for diagram number 1200 - FFV1_0( w_fp[3], w_fp[39], w_fp[66], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[39], w_fp[21], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[39], w_fp[71], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1201 OF 1240 *** - - // Wavefunction(s) for diagram number 1201 - // (none) - - // Amplitude(s) for diagram number 1201 - FFV1_0( w_fp[24], w_fp[39], w_fp[1], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[78] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[83] += amp_sv[0]; - FFV1_0( w_fp[60], w_fp[39], w_fp[1], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[79] -= amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - FFV1_0( w_fp[72], w_fp[39], w_fp[1], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[78] -= amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - - // *** DIAGRAM 1202 OF 1240 *** - - // Wavefunction(s) for diagram number 1202 - // (none) - - // Amplitude(s) for diagram number 1202 - FFV1_0( w_fp[38], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[20] += 
cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1203 OF 1240 *** - - // Wavefunction(s) for diagram number 1203 - // (none) - - // Amplitude(s) for diagram number 1203 - FFV1_0( w_fp[38], w_fp[29], w_fp[1], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[10] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[108] += amp_sv[0]; - FFV1_0( w_fp[38], w_fp[68], w_fp[1], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[20] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - FFV1_0( w_fp[38], w_fp[23], w_fp[1], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[10] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - - // *** DIAGRAM 1204 OF 1240 *** - - // Wavefunction(s) for diagram number 1204 - VVVV1P0_1( w_fp[0], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[23] ); - VVVV3P0_1( w_fp[0], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[68] ); - VVVV4P0_1( w_fp[0], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[29] ); - FFV1_2( w_fp[3], w_fp[23], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[71] ); - FFV1_2( w_fp[3], w_fp[68], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] ); - FFV1_2( w_fp[3], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[66] ); - - // Amplitude(s) for diagram number 1204 - FFV1_0( w_fp[71], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[30] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - FFV1_0( w_fp[21], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[31] -= amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - FFV1_0( w_fp[66], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[30] -= amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - - // *** DIAGRAM 1205 OF 1240 *** - - // Wavefunction(s) for diagram number 1205 - VVV1P0_1( w_fp[23], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[72] ); - VVV1P0_1( w_fp[68], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[60] ); - VVV1P0_1( w_fp[29], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[24] ); - - // Amplitude(s) for diagram number 1205 - FFV1_0( w_fp[3], w_fp[77], w_fp[72], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[77], w_fp[60], COUPs[1], 1.0, &_fp[0] ); - jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] 
+= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[77], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1206 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1206
- // (none)
-
- // Amplitude(s) for diagram number 1206
- FFV1_0( w_fp[46], w_fp[77], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[27] += amp_sv[0];
- jamp_sv[29] -= amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[46] += amp_sv[0];
- FFV1_0( w_fp[46], w_fp[77], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[29] -= amp_sv[0];
- jamp_sv[37] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[43] += amp_sv[0];
- FFV1_0( w_fp[46], w_fp[77], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[27] -= amp_sv[0];
- jamp_sv[37] += amp_sv[0];
- jamp_sv[43] += amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
-
- // *** DIAGRAM 1207 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1207
- FFV1_1( w_fp[2], w_fp[23], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[77] );
- FFV1_1( w_fp[2], w_fp[68], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] );
- FFV1_1( w_fp[2], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[27] );
-
- // Amplitude(s) for diagram number 1207
- FFV1_0( w_fp[52], w_fp[77], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[17] += amp_sv[0];
- jamp_sv[23] -= amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[115] += amp_sv[0];
- FFV1_0( w_fp[52], w_fp[16], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[23] -= amp_sv[0];
- jamp_sv[77] += amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[101] += amp_sv[0];
- FFV1_0( w_fp[52], w_fp[27], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[17] -= amp_sv[0];
- jamp_sv[77] += amp_sv[0];
- jamp_sv[101] += amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
-
- // *** DIAGRAM 1208 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1208
- // (none)
-
- // Amplitude(s) for diagram number 1208
- FFV1_0( w_fp[52], w_fp[2], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[52], w_fp[2], w_fp[60], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[52], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1209 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1209
- // (none)
-
- // Amplitude(s) for diagram number 1209
- FFV1_0( w_fp[52], w_fp[33], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[51] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[64] -= amp_sv[0];
- jamp_sv[70] += amp_sv[0];
- FFV1_0( w_fp[52], w_fp[33], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[61] += amp_sv[0];
- jamp_sv[64] -= amp_sv[0];
- jamp_sv[67] += amp_sv[0];
- FFV1_0( w_fp[52], w_fp[33], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[61] += amp_sv[0];
- jamp_sv[67] += amp_sv[0];
- jamp_sv[70] -= amp_sv[0];
-
- // *** DIAGRAM 1210 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1210
- // (none)
-
- // Amplitude(s) for diagram number 1210
- FFV1_0( w_fp[3], w_fp[77], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[16], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[27], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1211 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1211
- // (none)
-
- // Amplitude(s) for diagram number 1211
- FFV1_0( w_fp[71], w_fp[2], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[21], w_fp[2], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[66], w_fp[2], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1212 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1212
- // (none)
-
- // Amplitude(s) for diagram number 1212
- VVV1_0( w_fp[23], w_fp[61], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[17] += amp_sv[0];
- jamp_sv[22] += amp_sv[0];
- jamp_sv[23] -= amp_sv[0];
- jamp_sv[30] += amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[33] -= amp_sv[0];
- jamp_sv[35] += amp_sv[0];
- jamp_sv[54] -= amp_sv[0];
- jamp_sv[55] += amp_sv[0];
- jamp_sv[57] += amp_sv[0];
- jamp_sv[59] -= amp_sv[0];
- jamp_sv[90] += amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[114] -= amp_sv[0];
- jamp_sv[115] += amp_sv[0];
- VVV1_0( w_fp[68], w_fp[61], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[22] += amp_sv[0];
- jamp_sv[23] -= amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[32] += amp_sv[0];
- jamp_sv[33] -= amp_sv[0];
- jamp_sv[34] += amp_sv[0];
- jamp_sv[55] += amp_sv[0];
- jamp_sv[56] -= amp_sv[0];
- jamp_sv[57] += amp_sv[0];
- jamp_sv[58] -= amp_sv[0];
- jamp_sv[76] -= amp_sv[0];
- jamp_sv[77] += amp_sv[0];
- jamp_sv[90] += amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[100] -= amp_sv[0];
- jamp_sv[101] += amp_sv[0];
- VVV1_0( w_fp[29], w_fp[61], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[16] += amp_sv[0];
- jamp_sv[17] -= amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[32] += amp_sv[0];
- jamp_sv[34] += amp_sv[0];
- jamp_sv[35] -= amp_sv[0];
- jamp_sv[54] += amp_sv[0];
- jamp_sv[56] -= amp_sv[0];
- jamp_sv[58] -= amp_sv[0];
- jamp_sv[59] += amp_sv[0];
- jamp_sv[76] -= amp_sv[0];
- jamp_sv[77] += amp_sv[0];
- jamp_sv[100] -= amp_sv[0];
- jamp_sv[101] += amp_sv[0];
- jamp_sv[114] += amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
-
- // *** DIAGRAM 1213 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1213
- // (none)
-
- // Amplitude(s) for diagram number 1213
- VVVV1_0( w_fp[23], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[16] += amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[27] -= amp_sv[0];
- jamp_sv[29] += amp_sv[0];
- jamp_sv[40] += amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[53] += amp_sv[0];
- jamp_sv[54] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[59] += amp_sv[0];
- jamp_sv[64] += amp_sv[0];
- jamp_sv[70] -= amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[114] += amp_sv[0];
- VVVV3_0( w_fp[23], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[16] += amp_sv[0];
- jamp_sv[17] -= amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[33] += amp_sv[0];
- jamp_sv[35] -= amp_sv[0];
- jamp_sv[54] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[59] += amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[114] += amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- VVVV4_0( w_fp[23], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[17] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
- jamp_sv[27] += amp_sv[0];
- jamp_sv[29] -= amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[33] += amp_sv[0];
- jamp_sv[35] -= amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[46] += amp_sv[0];
- jamp_sv[51] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[64] -= amp_sv[0];
- jamp_sv[70] += amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- VVVV1_0( w_fp[68], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[29] += amp_sv[0];
- jamp_sv[37] -= amp_sv[0];
- jamp_sv[40] += amp_sv[0];
- jamp_sv[43] -= amp_sv[0];
- jamp_sv[53] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[61] -= amp_sv[0];
- jamp_sv[64] += amp_sv[0];
- jamp_sv[67] -= amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- VVVV3_0( w_fp[68], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[33] += amp_sv[0];
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[77] -= amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
- VVVV4_0( w_fp[68], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[23] += amp_sv[0];
- jamp_sv[29] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[33] += amp_sv[0];
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[37] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[43] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[61] += amp_sv[0];
- jamp_sv[64] -= amp_sv[0];
- jamp_sv[67] += amp_sv[0];
- jamp_sv[77] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
- VVVV1_0( w_fp[29], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[27] += amp_sv[0];
- jamp_sv[37] -= amp_sv[0];
- jamp_sv[43] -= amp_sv[0];
- jamp_sv[46] += amp_sv[0];
- jamp_sv[51] += amp_sv[0];
- jamp_sv[54] -= amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[59] -= amp_sv[0];
- jamp_sv[61] -= amp_sv[0];
- jamp_sv[67] -= amp_sv[0];
- jamp_sv[70] += amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- jamp_sv[114] -= amp_sv[0];
- VVVV3_0( w_fp[29], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[17] += amp_sv[0];
- jamp_sv[30] += amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[35] += amp_sv[0];
- jamp_sv[54] -= amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[59] -= amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[77] -= amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
- jamp_sv[114] -= amp_sv[0];
- jamp_sv[115] += amp_sv[0];
- VVVV4_0( w_fp[29], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[17] += amp_sv[0];
- jamp_sv[27] -= amp_sv[0];
- jamp_sv[30] += amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[35] += amp_sv[0];
- jamp_sv[37] += amp_sv[0];
- jamp_sv[43] += amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[61] += amp_sv[0];
- jamp_sv[67] += amp_sv[0];
- jamp_sv[70] -= amp_sv[0];
- jamp_sv[77] -= amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
- jamp_sv[115] += amp_sv[0];
-
- // *** DIAGRAM 1214 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1214
- VVV1P0_1( w_fp[23], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[61] );
- VVV1P0_1( w_fp[68], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[23] );
- VVV1P0_1( w_fp[29], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[68] );
-
- // Amplitude(s) for diagram number 1214
- VVV1_0( w_fp[8], w_fp[4], w_fp[61], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[16] += amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[27] -= amp_sv[0];
- jamp_sv[29] += amp_sv[0];
- jamp_sv[40] += amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[53] += amp_sv[0];
- jamp_sv[54] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[59] += amp_sv[0];
- jamp_sv[64] += amp_sv[0];
- jamp_sv[70] -= amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[114] += amp_sv[0];
- VVV1_0( w_fp[8], w_fp[4], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[29] += amp_sv[0];
- jamp_sv[37] -= amp_sv[0];
- jamp_sv[40] += amp_sv[0];
- jamp_sv[43] -= amp_sv[0];
- jamp_sv[53] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[61] -= amp_sv[0];
- jamp_sv[64] += amp_sv[0];
- jamp_sv[67] -= amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- VVV1_0( w_fp[8], w_fp[4], w_fp[68], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[27] += amp_sv[0];
- jamp_sv[37] -= amp_sv[0];
- jamp_sv[43] -= amp_sv[0];
- jamp_sv[46] += amp_sv[0];
- jamp_sv[51] += amp_sv[0];
- jamp_sv[54] -= amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[59] -= amp_sv[0];
- jamp_sv[61] -= amp_sv[0];
- jamp_sv[67] -= amp_sv[0];
- jamp_sv[70] += amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- jamp_sv[114] -= amp_sv[0];
-
- // *** DIAGRAM 1215 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1215
- // (none)
-
- // Amplitude(s) for diagram number 1215
- VVV1_0( w_fp[1], w_fp[8], w_fp[72], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[17] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
- jamp_sv[27] += amp_sv[0];
- jamp_sv[29] -= amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[33] += amp_sv[0];
- jamp_sv[35] -= amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[46] += amp_sv[0];
- jamp_sv[51] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[64] -= amp_sv[0];
- jamp_sv[70] += amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- VVV1_0( w_fp[1], w_fp[8], w_fp[60], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[23] += amp_sv[0];
- jamp_sv[29] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[33] += amp_sv[0];
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[37] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[43] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[61] += amp_sv[0];
- jamp_sv[64] -= amp_sv[0];
- jamp_sv[67] += amp_sv[0];
- jamp_sv[77] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
- VVV1_0( w_fp[1], w_fp[8], w_fp[24], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[17] += amp_sv[0];
- jamp_sv[27] -= amp_sv[0];
- jamp_sv[30] += amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[35] += amp_sv[0];
- jamp_sv[37] += amp_sv[0];
- jamp_sv[43] += amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[61] += amp_sv[0];
- jamp_sv[67] += amp_sv[0];
- jamp_sv[70] -= amp_sv[0];
- jamp_sv[77] -= amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
- jamp_sv[115] += amp_sv[0];
-
- // *** DIAGRAM 1216 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1216
- // (none)
-
- // Amplitude(s) for diagram number 1216
- FFV1_0( w_fp[3], w_fp[33], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[33], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[33], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1217 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1217
- // (none)
-
- // Amplitude(s) for diagram number 1217
- FFV1_0( w_fp[71], w_fp[33], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[54] += amp_sv[0];
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[59] += amp_sv[0];
- FFV1_0( w_fp[21], w_fp[33], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[55] -= amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- FFV1_0( w_fp[66], w_fp[33], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[54] -= amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[59] -= amp_sv[0];
-
- // *** DIAGRAM 1218 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1218
- // (none)
-
- // Amplitude(s) for diagram number 1218
- FFV1_0( w_fp[46], w_fp[2], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[46], w_fp[2], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[46], w_fp[2], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1219 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1219
- // (none)
-
- // Amplitude(s) for diagram number 1219
- FFV1_0( w_fp[46], w_fp[77], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[16] += amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[114] += amp_sv[0];
- FFV1_0( w_fp[46], w_fp[16], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[90] -= amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- FFV1_0( w_fp[46], w_fp[27], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- jamp_sv[114] -= amp_sv[0];
-
- // *** DIAGRAM 1220 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1220
- // (none)
-
- // Amplitude(s) for diagram number 1220
- VVVV1_0( w_fp[0], w_fp[73], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[0] += amp_sv[0];
- jamp_sv[2] -= amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[14] += amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[38] += amp_sv[0];
- jamp_sv[62] += amp_sv[0];
- jamp_sv[86] -= amp_sv[0];
- jamp_sv[96] -= amp_sv[0];
- jamp_sv[97] += amp_sv[0];
- jamp_sv[99] += amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
- jamp_sv[105] += amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- jamp_sv[113] -= amp_sv[0];
- jamp_sv[119] += amp_sv[0];
- VVVV3_0( w_fp[0], w_fp[73], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[0] += amp_sv[0];
- jamp_sv[2] -= amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[14] += amp_sv[0];
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[19] += amp_sv[0];
- jamp_sv[21] += amp_sv[0];
- jamp_sv[23] -= amp_sv[0];
- jamp_sv[33] -= amp_sv[0];
- jamp_sv[39] += amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[87] -= amp_sv[0];
- jamp_sv[105] += amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- jamp_sv[113] -= amp_sv[0];
- jamp_sv[119] += amp_sv[0];
- VVVV4_0( w_fp[0], w_fp[73], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[19] += amp_sv[0];
- jamp_sv[21] += amp_sv[0];
- jamp_sv[23] -= amp_sv[0];
- jamp_sv[32] += amp_sv[0];
- jamp_sv[33] -= amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[39] += amp_sv[0];
- jamp_sv[62] -= amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[86] += amp_sv[0];
- jamp_sv[87] -= amp_sv[0];
- jamp_sv[96] += amp_sv[0];
- jamp_sv[97] -= amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[101] += amp_sv[0];
- VVVV1_0( w_fp[0], w_fp[79], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[2] -= amp_sv[0];
- jamp_sv[6] += amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[12] += amp_sv[0];
- jamp_sv[38] += amp_sv[0];
- jamp_sv[56] -= amp_sv[0];
- jamp_sv[62] += amp_sv[0];
- jamp_sv[80] -= amp_sv[0];
- jamp_sv[97] += amp_sv[0];
- jamp_sv[98] -= amp_sv[0];
- jamp_sv[99] += amp_sv[0];
- jamp_sv[100] -= amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- jamp_sv[111] += amp_sv[0];
- jamp_sv[113] -= amp_sv[0];
- jamp_sv[117] += amp_sv[0];
- VVVV3_0( w_fp[0], w_fp[79], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[2] -= amp_sv[0];
- jamp_sv[6] += amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[12] += amp_sv[0];
- jamp_sv[19] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[21] += amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[39] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- jamp_sv[111] += amp_sv[0];
- jamp_sv[113] -= amp_sv[0];
- jamp_sv[117] += amp_sv[0];
- VVVV4_0( w_fp[0], w_fp[79], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[19] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[21] += amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[39] += amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[62] -= amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[97] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- VVVV1_0( w_fp[0], w_fp[80], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[0] -= amp_sv[0];
- jamp_sv[6] += amp_sv[0];
- jamp_sv[12] += amp_sv[0];
- jamp_sv[14] -= amp_sv[0];
- jamp_sv[32] += amp_sv[0];
- jamp_sv[56] -= amp_sv[0];
- jamp_sv[80] -= amp_sv[0];
- jamp_sv[86] += amp_sv[0];
- jamp_sv[96] += amp_sv[0];
- jamp_sv[98] -= amp_sv[0];
- jamp_sv[100] -= amp_sv[0];
- jamp_sv[101] += amp_sv[0];
- jamp_sv[105] -= amp_sv[0];
- jamp_sv[111] += amp_sv[0];
- jamp_sv[117] += amp_sv[0];
- jamp_sv[119] -= amp_sv[0];
- VVVV3_0( w_fp[0], w_fp[80], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[0] -= amp_sv[0];
- jamp_sv[6] += amp_sv[0];
- jamp_sv[12] += amp_sv[0];
- jamp_sv[14] -= amp_sv[0];
- jamp_sv[18] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
- jamp_sv[33] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[87] += amp_sv[0];
- jamp_sv[105] -= amp_sv[0];
- jamp_sv[111] += amp_sv[0];
- jamp_sv[117] += amp_sv[0];
- jamp_sv[119] -= amp_sv[0];
- VVVV4_0( w_fp[0], w_fp[80], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[18] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[33] += amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[86] -= amp_sv[0];
- jamp_sv[87] += amp_sv[0];
- jamp_sv[96] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
-
- // *** DIAGRAM 1221 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1221
- VVV1P0_1( w_fp[0], w_fp[73], COUPs[0], 1.0, 0., 0., w_fp[27] );
- VVV1P0_1( w_fp[0], w_fp[79], COUPs[0], 1.0, 0., 0., w_fp[1] );
- VVV1P0_1( w_fp[0], w_fp[80], COUPs[0], 1.0, 0., 0., w_fp[16] );
-
- // Amplitude(s) for diagram number 1221
- VVV1_0( w_fp[8], w_fp[6], w_fp[27], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[0] += amp_sv[0];
- jamp_sv[2] -= amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[14] += amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[38] += amp_sv[0];
- jamp_sv[62] += amp_sv[0];
- jamp_sv[86] -= amp_sv[0];
- jamp_sv[96] -= amp_sv[0];
- jamp_sv[97] += amp_sv[0];
- jamp_sv[99] += amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
- jamp_sv[105] += amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- jamp_sv[113] -= amp_sv[0];
- jamp_sv[119] += amp_sv[0];
- VVV1_0( w_fp[8], w_fp[6], w_fp[1], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[2] -= amp_sv[0];
- jamp_sv[6] += amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[12] += amp_sv[0];
- jamp_sv[38] += amp_sv[0];
- jamp_sv[56] -= amp_sv[0];
- jamp_sv[62] += amp_sv[0];
- jamp_sv[80] -= amp_sv[0];
- jamp_sv[97] += amp_sv[0];
- jamp_sv[98] -= amp_sv[0];
- jamp_sv[99] += amp_sv[0];
- jamp_sv[100] -= amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- jamp_sv[111] += amp_sv[0];
- jamp_sv[113] -= amp_sv[0];
- jamp_sv[117] += amp_sv[0];
- VVV1_0( w_fp[8], w_fp[6], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[0] -= amp_sv[0];
- jamp_sv[6] += amp_sv[0];
- jamp_sv[12] += amp_sv[0];
- jamp_sv[14] -= amp_sv[0];
- jamp_sv[32] += amp_sv[0];
- jamp_sv[56] -= amp_sv[0];
- jamp_sv[80] -= amp_sv[0];
- jamp_sv[86] += amp_sv[0];
- jamp_sv[96] += amp_sv[0];
- jamp_sv[98] -= amp_sv[0];
- jamp_sv[100] -= amp_sv[0];
- jamp_sv[101] += amp_sv[0];
- jamp_sv[105] -= amp_sv[0];
- jamp_sv[111] += amp_sv[0];
- jamp_sv[117] += amp_sv[0];
- jamp_sv[119] -= amp_sv[0];
-
- // *** DIAGRAM 1222 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1222
- // (none)
-
- // Amplitude(s) for diagram number 1222
- VVV1_0( w_fp[73], w_fp[6], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[0] += amp_sv[0];
- jamp_sv[2] -= amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[14] += amp_sv[0];
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[19] += amp_sv[0];
- jamp_sv[21] += amp_sv[0];
- jamp_sv[23] -= amp_sv[0];
- jamp_sv[33] -= amp_sv[0];
- jamp_sv[39] += amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[87] -= amp_sv[0];
- jamp_sv[105] += amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- jamp_sv[113] -= amp_sv[0];
- jamp_sv[119] += amp_sv[0];
- VVV1_0( w_fp[79], w_fp[6], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[2] -= amp_sv[0];
- jamp_sv[6] += amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[12] += amp_sv[0];
- jamp_sv[19] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[21] += amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[39] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- jamp_sv[111] += amp_sv[0];
- jamp_sv[113] -= amp_sv[0];
- jamp_sv[117] += amp_sv[0];
- VVV1_0( w_fp[80], w_fp[6], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[0] -= amp_sv[0];
- jamp_sv[6] += amp_sv[0];
- jamp_sv[12] += amp_sv[0];
- jamp_sv[14] -= amp_sv[0];
- jamp_sv[18] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
- jamp_sv[33] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[87] += amp_sv[0];
- jamp_sv[105] -= amp_sv[0];
- jamp_sv[111] += amp_sv[0];
- jamp_sv[117] += amp_sv[0];
- jamp_sv[119] -= amp_sv[0];
-
- // *** DIAGRAM 1223 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1223
- // (none)
-
- // Amplitude(s) for diagram number 1223
- FFV1_0( w_fp[3], w_fp[47], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[47], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[47], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1224 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1224
- // (none)
-
- // Amplitude(s) for diagram number 1224
- FFV1_0( w_fp[3], w_fp[113], w_fp[73], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[96] += amp_sv[0];
- jamp_sv[97] -= amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[101] += amp_sv[0];
- FFV1_0( w_fp[3], w_fp[113], w_fp[79], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[97] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- FFV1_0( w_fp[3], w_fp[113], w_fp[80], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[96] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[100] += amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
-
- // *** DIAGRAM 1225 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1225
- // (none)
-
- // Amplitude(s) for diagram number 1225
- FFV1_0( w_fp[41], w_fp[2], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[41], w_fp[2], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[41], w_fp[2], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1226 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1226
- // (none)
-
- // Amplitude(s) for diagram number 1226
- FFV1_0( w_fp[62], w_fp[2], w_fp[73], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[32] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[62] -= amp_sv[0];
- jamp_sv[86] += amp_sv[0];
- FFV1_0( w_fp[62], w_fp[2], w_fp[79], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[62] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- FFV1_0( w_fp[62], w_fp[2], w_fp[80], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[56] += amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[86] -= amp_sv[0];
-
- // *** DIAGRAM 1227 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1227
- // (none)
-
- // Amplitude(s) for diagram number 1227
- VVVV1_0( w_fp[0], w_fp[57], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[1] += amp_sv[0];
- jamp_sv[4] -= amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[20] += amp_sv[0];
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[44] += amp_sv[0];
- jamp_sv[68] += amp_sv[0];
- jamp_sv[72] -= amp_sv[0];
- jamp_sv[73] += amp_sv[0];
- jamp_sv[75] += amp_sv[0];
- jamp_sv[77] -= amp_sv[0];
- jamp_sv[81] += amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[89] -= amp_sv[0];
- jamp_sv[95] += amp_sv[0];
- jamp_sv[110] -= amp_sv[0];
- VVVV3_0( w_fp[0], w_fp[57], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[1] += amp_sv[0];
- jamp_sv[4] -= amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[12] -= amp_sv[0];
- jamp_sv[13] += amp_sv[0];
- jamp_sv[15] += amp_sv[0];
- jamp_sv[17] -= amp_sv[0];
- jamp_sv[20] += amp_sv[0];
- jamp_sv[35] -= amp_sv[0];
- jamp_sv[45] += amp_sv[0];
- jamp_sv[69] += amp_sv[0];
- jamp_sv[81] += amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[89] -= amp_sv[0];
- jamp_sv[95] += amp_sv[0];
- jamp_sv[111] -= amp_sv[0];
- VVVV4_0( w_fp[0], w_fp[57], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[12] -= amp_sv[0];
- jamp_sv[13] += amp_sv[0];
- jamp_sv[15] += amp_sv[0];
- jamp_sv[17] -= amp_sv[0];
- jamp_sv[34] += amp_sv[0];
- jamp_sv[35] -= amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[45] += amp_sv[0];
- jamp_sv[68] -= amp_sv[0];
- jamp_sv[69] += amp_sv[0];
- jamp_sv[72] += amp_sv[0];
- jamp_sv[73] -= amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[77] += amp_sv[0];
- jamp_sv[110] += amp_sv[0];
- jamp_sv[111] -= amp_sv[0];
- VVVV1_0( w_fp[0], w_fp[81], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[4] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[18] += amp_sv[0];
- jamp_sv[44] += amp_sv[0];
- jamp_sv[58] -= amp_sv[0];
- jamp_sv[68] += amp_sv[0];
- jamp_sv[73] += amp_sv[0];
- jamp_sv[74] -= amp_sv[0];
- jamp_sv[75] += amp_sv[0];
- jamp_sv[76] -= amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[87] += amp_sv[0];
- jamp_sv[89] -= amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[104] -= amp_sv[0];
- VVVV3_0( w_fp[0], w_fp[81], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[4] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[13] += amp_sv[0];
- jamp_sv[14] -= amp_sv[0];
- jamp_sv[15] += amp_sv[0];
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[18] += amp_sv[0];
- jamp_sv[45] += amp_sv[0];
- jamp_sv[59] -= amp_sv[0];
- jamp_sv[69] += amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[87] += amp_sv[0];
- jamp_sv[89] -= amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[105] -= amp_sv[0];
- VVVV4_0( w_fp[0], w_fp[81], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[13] += amp_sv[0];
- jamp_sv[14] -= amp_sv[0];
- jamp_sv[15] += amp_sv[0];
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[45] += amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[59] -= amp_sv[0];
- jamp_sv[68] -= amp_sv[0];
- jamp_sv[69] += amp_sv[0];
- jamp_sv[73] -= amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[105] -= amp_sv[0];
- VVVV1_0( w_fp[0], w_fp[82], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[1] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
- jamp_sv[18] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[34] += amp_sv[0];
- jamp_sv[58] -= amp_sv[0];
- jamp_sv[72] += amp_sv[0];
- jamp_sv[74] -= amp_sv[0];
- jamp_sv[76] -= amp_sv[0];
- jamp_sv[77] += amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[87] += amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[95] -= amp_sv[0];
- jamp_sv[104] -= amp_sv[0];
- jamp_sv[110] += amp_sv[0];
- VVVV3_0( w_fp[0], w_fp[82], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[1] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
- jamp_sv[12] += amp_sv[0];
- jamp_sv[14] -= amp_sv[0];
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[17] += amp_sv[0];
- jamp_sv[18] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[35] += amp_sv[0];
- jamp_sv[59] -= amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[87] += amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[95] -= amp_sv[0];
- jamp_sv[105] -= amp_sv[0];
- jamp_sv[111] += amp_sv[0];
- VVVV4_0( w_fp[0], w_fp[82], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[12] += amp_sv[0];
- jamp_sv[14] -= amp_sv[0];
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[17] += amp_sv[0];
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[35] += amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[59] -= amp_sv[0];
- jamp_sv[72] -= amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[77] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[105] -= amp_sv[0];
- jamp_sv[110] -= amp_sv[0];
- jamp_sv[111] += amp_sv[0];
-
- // *** DIAGRAM 1228 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1228
- VVV1P0_1( w_fp[0], w_fp[57], COUPs[0], 1.0, 0., 0., w_fp[62] );
- VVV1P0_1( w_fp[0], w_fp[81], COUPs[0], 1.0, 0., 0., w_fp[80] );
- VVV1P0_1( w_fp[0], w_fp[82], COUPs[0], 1.0, 0., 0., w_fp[79] );
-
- // Amplitude(s) for diagram number 1228
- VVV1_0( w_fp[8], w_fp[5], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[1] += amp_sv[0];
- jamp_sv[4] -= amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[20] += amp_sv[0];
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[44] += amp_sv[0];
- jamp_sv[68] += amp_sv[0];
- jamp_sv[72] -= amp_sv[0];
- jamp_sv[73] += amp_sv[0];
- jamp_sv[75] += amp_sv[0];
- jamp_sv[77] -= amp_sv[0];
- jamp_sv[81] += amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[89] -= amp_sv[0];
- jamp_sv[95] += amp_sv[0];
- jamp_sv[110] -= amp_sv[0];
- VVV1_0( w_fp[8], w_fp[5], w_fp[80], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[4] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[18] += amp_sv[0];
- jamp_sv[44] += amp_sv[0];
- jamp_sv[58] -= amp_sv[0];
- jamp_sv[68] += amp_sv[0];
- jamp_sv[73] += amp_sv[0];
- jamp_sv[74] -= amp_sv[0];
- jamp_sv[75] += amp_sv[0];
- jamp_sv[76] -= amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[87] += amp_sv[0];
- jamp_sv[89] -= amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[104] -= amp_sv[0];
- VVV1_0( w_fp[8], w_fp[5], w_fp[79], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[1] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
- jamp_sv[18] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[34] += amp_sv[0];
- jamp_sv[58] -= amp_sv[0];
- jamp_sv[72] += amp_sv[0];
- jamp_sv[74] -= amp_sv[0];
- jamp_sv[76] -= amp_sv[0];
- jamp_sv[77] += amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[87] += amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[95] -= amp_sv[0];
- jamp_sv[104] -= amp_sv[0];
- jamp_sv[110] += amp_sv[0];
-
- // *** DIAGRAM 1229 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1229
- // (none)
-
- // Amplitude(s) for diagram number 1229
- VVV1_0( w_fp[57], w_fp[5], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[1] += amp_sv[0];
- jamp_sv[4] -= amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[12] -= amp_sv[0];
- jamp_sv[13] += amp_sv[0];
- jamp_sv[15] += amp_sv[0];
- jamp_sv[17] -= amp_sv[0];
- jamp_sv[20] += amp_sv[0];
- jamp_sv[35] -= amp_sv[0];
- jamp_sv[45] += amp_sv[0];
- jamp_sv[69] += amp_sv[0];
- jamp_sv[81] += amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[89] -= amp_sv[0];
- jamp_sv[95] += amp_sv[0];
- jamp_sv[111] -= amp_sv[0];
- VVV1_0( w_fp[81], w_fp[5], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[4] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[13] += amp_sv[0];
- jamp_sv[14] -= amp_sv[0];
- jamp_sv[15] += amp_sv[0];
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[18] += amp_sv[0];
- jamp_sv[45] += amp_sv[0];
- jamp_sv[59] -= amp_sv[0];
- jamp_sv[69] += amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[87] += amp_sv[0];
- jamp_sv[89] -= amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[105] -= amp_sv[0];
- VVV1_0( w_fp[82], w_fp[5], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[1] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
- jamp_sv[12] += amp_sv[0];
- jamp_sv[14] -= amp_sv[0];
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[17] += amp_sv[0];
- jamp_sv[18] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[35] += amp_sv[0];
- jamp_sv[59] -= amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[87] += amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[95] -= amp_sv[0];
- jamp_sv[105] -= amp_sv[0];
- jamp_sv[111] += amp_sv[0];
-
- // *** DIAGRAM 1230 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1230
- // (none)
-
- // Amplitude(s) for diagram number 1230
- FFV1_0( w_fp[3], w_fp[39], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[39], w_fp[80], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[39], w_fp[79], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1231 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1231
- // (none)
-
- // Amplitude(s) for diagram number 1231
- FFV1_0( w_fp[3], w_fp[102], w_fp[57], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[72] += amp_sv[0];
- jamp_sv[73] -= amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[77] += amp_sv[0];
- FFV1_0( w_fp[3], w_fp[102], w_fp[81], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[73] -= amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- FFV1_0( w_fp[3], w_fp[102], w_fp[82], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[72] -= amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[77] -= amp_sv[0];
-
- // *** DIAGRAM 1232 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1232
- // (none)
-
- // Amplitude(s) for diagram number 1232
- FFV1_0( w_fp[38], w_fp[2], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[38], w_fp[2], w_fp[80], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[38], w_fp[2], w_fp[79], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1233 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1233
- // (none)
-
- // Amplitude(s) for diagram number 1233
- FFV1_0( w_fp[104], w_fp[2], w_fp[57], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[34] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[68] -= amp_sv[0];
- jamp_sv[110] += amp_sv[0];
- FFV1_0( w_fp[104], w_fp[2], w_fp[81], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[68] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- FFV1_0( w_fp[104], w_fp[2], w_fp[82], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[58] += amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[110] -= amp_sv[0];
-
- // *** DIAGRAM 1234 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1234
- // (none)
-
- // Amplitude(s) for diagram number 1234
- VVVV1_0( w_fp[0], w_fp[55], w_fp[8], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[3] += amp_sv[0];
- jamp_sv[5] -= amp_sv[0];
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[22] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[46] += amp_sv[0];
- jamp_sv[48] -= amp_sv[0];
- jamp_sv[49] += amp_sv[0];
- jamp_sv[51] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[57] += amp_sv[0];
- jamp_sv[59] -= amp_sv[0];
- jamp_sv[65] -= amp_sv[0];
- jamp_sv[71] += amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- jamp_sv[116] -= amp_sv[0];
- VVVV3_0( w_fp[0], w_fp[55], w_fp[8], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[3] += amp_sv[0];
- jamp_sv[5] -= amp_sv[0];
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
- jamp_sv[9] += amp_sv[0];
- jamp_sv[11] -= amp_sv[0];
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[22] += amp_sv[0];
- jamp_sv[41] -= amp_sv[0];
- jamp_sv[47] += amp_sv[0];
- jamp_sv[57] += amp_sv[0];
- jamp_sv[59] -= amp_sv[0];
- jamp_sv[65] -= amp_sv[0];
- jamp_sv[71] += amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[117] -= amp_sv[0];
- VVVV4_0( w_fp[0], w_fp[55], w_fp[8], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
- jamp_sv[9] += amp_sv[0];
- jamp_sv[11] -= amp_sv[0];
- jamp_sv[40] += amp_sv[0];
- jamp_sv[41] -= amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[47] += amp_sv[0];
- jamp_sv[48] += amp_sv[0];
- jamp_sv[49] -= amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[53] += amp_sv[0];
- jamp_sv[92] -= amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[116] += amp_sv[0];
- jamp_sv[117] -= amp_sv[0];
- VVVV1_0( w_fp[0], w_fp[83], w_fp[8], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[5] -= amp_sv[0];
- jamp_sv[13] += amp_sv[0];
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[19] += amp_sv[0];
- jamp_sv[46] += amp_sv[0];
- jamp_sv[49] += amp_sv[0];
- jamp_sv[50] -= amp_sv[0];
- jamp_sv[51] += amp_sv[0];
- jamp_sv[52] -= amp_sv[0];
- jamp_sv[59] -= amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[65] -= amp_sv[0];
- jamp_sv[69] += amp_sv[0];
- jamp_sv[82] -= amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- jamp_sv[106] -= amp_sv[0];
- VVVV3_0( w_fp[0], w_fp[83], w_fp[8], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[5] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[9] += amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[13] += amp_sv[0];
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[19] += amp_sv[0];
- jamp_sv[47] += amp_sv[0];
- jamp_sv[59] -= amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[65] -= amp_sv[0];
- jamp_sv[69] += amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- VVVV4_0( w_fp[0], w_fp[83], w_fp[8], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[7] += amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[9] += amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[47] += amp_sv[0];
- jamp_sv[49] -= amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[92] -= amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- VVVV1_0( w_fp[0], w_fp[84], w_fp[8], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[3] -= amp_sv[0];
- jamp_sv[13] += amp_sv[0];
- jamp_sv[19] += amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[40] += amp_sv[0];
- jamp_sv[48] += amp_sv[0];
- jamp_sv[50] -= amp_sv[0];
- jamp_sv[52] -= amp_sv[0];
- jamp_sv[53] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[69] += amp_sv[0];
- jamp_sv[71] -= amp_sv[0];
- jamp_sv[82] -= amp_sv[0];
- jamp_sv[106] -= amp_sv[0];
- jamp_sv[116] += amp_sv[0];
- VVVV3_0( w_fp[0], w_fp[84], w_fp[8], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[3] -= amp_sv[0];
- jamp_sv[6] += amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[11] += amp_sv[0];
- jamp_sv[13] += amp_sv[0];
- jamp_sv[19] += amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[69] += amp_sv[0];
- jamp_sv[71] -= amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- jamp_sv[117] += amp_sv[0];
- VVVV4_0( w_fp[0], w_fp[84], w_fp[8], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[6] += amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[11] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- jamp_sv[48] -= amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- jamp_sv[116] -= amp_sv[0];
- jamp_sv[117] += amp_sv[0];
-
- // *** DIAGRAM 1235 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1235
- VVV1P0_1( w_fp[0], w_fp[55], COUPs[0], 1.0, 0., 0., w_fp[104] );
- VVV1P0_1( w_fp[0], w_fp[83], COUPs[0], 1.0, 0., 0., w_fp[82] );
- VVV1P0_1( w_fp[0], w_fp[84], COUPs[0], 1.0, 0., 0., w_fp[81] );
-
- // Amplitude(s) for diagram number 1235
- VVV1_0( w_fp[8], w_fp[4], w_fp[104], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[3] += amp_sv[0];
- jamp_sv[5] -= amp_sv[0];
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[22] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[46] += amp_sv[0];
- jamp_sv[48] -= amp_sv[0];
- jamp_sv[49] += amp_sv[0];
- jamp_sv[51] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[57] += amp_sv[0];
- jamp_sv[59] -= amp_sv[0];
- jamp_sv[65] -= amp_sv[0];
- jamp_sv[71] += amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- jamp_sv[116] -= amp_sv[0];
- VVV1_0( w_fp[8], w_fp[4], w_fp[82], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[5] -= amp_sv[0];
- jamp_sv[13] += amp_sv[0];
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[19] += amp_sv[0];
- jamp_sv[46] += amp_sv[0];
- jamp_sv[49] += amp_sv[0];
- jamp_sv[50] -= amp_sv[0];
- jamp_sv[51] += amp_sv[0];
- jamp_sv[52] -= amp_sv[0];
- jamp_sv[59] -= amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[65] -= amp_sv[0];
- jamp_sv[69] += amp_sv[0];
- jamp_sv[82] -= amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- jamp_sv[106] -= amp_sv[0];
- VVV1_0( w_fp[8], w_fp[4], w_fp[81], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[3] -= amp_sv[0];
- jamp_sv[13] += amp_sv[0];
- jamp_sv[19] += amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[40] += amp_sv[0];
- jamp_sv[48] += amp_sv[0];
- jamp_sv[50] -= amp_sv[0];
- jamp_sv[52] -= amp_sv[0];
- jamp_sv[53] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[69] += amp_sv[0];
- jamp_sv[71] -= amp_sv[0];
- jamp_sv[82] -= amp_sv[0];
- jamp_sv[106] -= amp_sv[0];
- jamp_sv[116] += amp_sv[0];
-
- // *** DIAGRAM 1236 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1236
- // (none)
-
- // Amplitude(s) for diagram number 1236
- VVV1_0( w_fp[55], w_fp[4], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[3] += amp_sv[0];
- jamp_sv[5] -= amp_sv[0];
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
- jamp_sv[9] += amp_sv[0];
- jamp_sv[11] -= amp_sv[0];
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[22] += amp_sv[0];
- jamp_sv[41] -= amp_sv[0];
- jamp_sv[47] += amp_sv[0];
- jamp_sv[57] += amp_sv[0];
- jamp_sv[59] -= amp_sv[0];
- jamp_sv[65] -= amp_sv[0];
- jamp_sv[71] += amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[117] -= amp_sv[0];
- VVV1_0( w_fp[83], w_fp[4], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[5] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[9] += amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[13] += amp_sv[0];
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[19] += amp_sv[0];
- jamp_sv[47] += amp_sv[0];
- jamp_sv[59] -= amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[65] -= amp_sv[0];
- jamp_sv[69] += amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- VVV1_0( w_fp[84], w_fp[4], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[3] -= amp_sv[0];
- jamp_sv[6] += amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[11] += amp_sv[0];
- jamp_sv[13] += amp_sv[0];
- jamp_sv[19] += amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[69] += amp_sv[0];
- jamp_sv[71] -= amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- jamp_sv[117] += amp_sv[0];
-
- // *** DIAGRAM 1237 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1237
- // (none)
-
- // Amplitude(s) for diagram number 1237
- FFV1_0( w_fp[3], w_fp[33], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[33], w_fp[82], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[33], w_fp[81], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1238 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1238
- // (none)
-
- // Amplitude(s) for diagram number 1238
- FFV1_0( w_fp[3], w_fp[114], w_fp[55], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[48] += amp_sv[0];
- jamp_sv[49] -= amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[53] += amp_sv[0];
- FFV1_0( w_fp[3], w_fp[114], w_fp[83], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[49] -= amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- FFV1_0( w_fp[3], w_fp[114], w_fp[84], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[48] -= amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
-
- // *** DIAGRAM 1239 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1239
- // (none)
-
- // Amplitude(s) for diagram number 1239
- FFV1_0( w_fp[46], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[46], w_fp[2], w_fp[82], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[46], w_fp[2], w_fp[81], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1240 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1240
- // (none)
-
- // Amplitude(s) for diagram number 1240
- FFV1_0( w_fp[99], w_fp[2], w_fp[55], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[40] += amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[92] -= amp_sv[0];
- jamp_sv[116] += amp_sv[0];
- FFV1_0( w_fp[99], w_fp[2], w_fp[83], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[92] -= amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- FFV1_0( w_fp[99], w_fp[2], w_fp[84], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[116] -= amp_sv[0];
-
- // *** COLOR CHOICE BELOW ***
- // Store the leading color flows for choice of color
- if( jamp2_sv ) // disable color choice if nullptr
-   for( int icol = 0; icol < ncolor; icol++ )
-     jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831
-
- // *** COLOR MATRIX BELOW ***
- // (This method used to be called CPPProcess::matrix_1_gg_ttxggg()?)
-
- // The color denominators (initialize all array elements, with ncolor=120)
- // [NB do keep 'static' for these constexpr arrays, see issue #283]
- static constexpr fptype2 denom[ncolor] = { 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324 }; // 1-D array[120]
-
- // The color matrix (initialize all array elements, with ncolor=120)
- // [NB do keep 'static' for these constexpr arrays, see issue #283]
- static constexpr fptype2 cf[ncolor][ncolor] = {
- { 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136 },
- { -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116 },
- { -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116 },
- { 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44 },
- { 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44 },
- { 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514 },
- { -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116 },
- { 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442 },
- { 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44 },
- { -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28 },
- { -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53 },
- { -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62 },
- { 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44 },
- { -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53 },
- { 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514 },
- { -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62 },
- { 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100 },
- { 496, -224, -80, -8,
496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10 }, - { -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28 }, - { -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62 }, - { -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62 }, - { 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10 }, - { 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10 }, - { -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, 
-62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1 }, - { -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116 }, - { 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442 }, - { 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442 }, - { -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134 }, - { -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134 }, - { -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 10, 
-80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505 }, - { 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44 }, - { -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134 }, - { -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28 }, - { 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224 }, - { 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62 }, - { 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 
1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496 }, - { -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53 }, - { 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19 }, - { -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62 }, - { 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496 }, - { -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10 }, - { -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80 }, - { 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, 
-80, -80, 10, 10, 100, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62 }, - { 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71 }, - { 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10 }, - { -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80 }, - { -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1 }, - { 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8 }, - { 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, 4096, -512, -512, 64, 64, 640, 
-512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44 }, - { -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134 }, - { -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53 }, - { 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62 }, - { 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19 }, - { 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71 }, - { 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 
1, -62, -71, -116, 442, 442, -134, -134, 505, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514 }, - { -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505 }, - { -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62 }, - { 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496 }, - { 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71 }, - { 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568 }, - { 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 
100 }, - { -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10 }, - { 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10 }, - { -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80 }, - { 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80 }, - { 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640 }, - { -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10 }, - { 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, -53, -62, 442, -116, 28, -44, -62, 
10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1 }, - { -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1 }, - { -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8 }, - { -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8 }, - { -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64 }, - { -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28 }, - { 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, 
-44, 136, -116, 514, -44, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62 }, - { -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62 }, - { 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10 }, - { -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10 }, - { -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1 }, - { -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62 }, - { 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, 28, -224, -44, 
-134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71 }, - { 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10 }, - { -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80 }, - { 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1 }, - { -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8 }, - { 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10 }, - { -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1 }, - { -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, 
-53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1 }, - { 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8 }, - { -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8 }, - { -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64 }, - { 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80 }, - { -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8 }, - { -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 
100, 10, 640, -80, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8 }, - { 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64 }, - { 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64 }, - { -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512 }, - { 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224 }, - { 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496 }, - { 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, -44, -53, 514, -62, 100, 10, -116, -44, 
136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496 }, - { -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80 }, - { -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80 }, - { 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8 }, - { 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496 }, - { 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568 }, - { -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, 
-8, 496, -224, -80, -8, 496, -80 }, - { 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640 }, - { -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8 }, - { -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64 }, - { -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80 }, - { -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8 }, - { 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8 }, - { -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, 
-53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64 }, - { 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64 }, - { -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512 }, - { 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640 }, - { -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64 }, - { -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64 }, - { -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, -116, 442, 442, -134, -134, 505, -44, -134, 28, 
-224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512 }, - { -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512 }, - { 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096 } }; // 2-D array[120][120] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
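[Editorial aside, not part of the diff.] The removed comment block above compresses three separate ideas that are easy to miss: (1) since the color matrix M is real, the quadratic form jamp†·M·jamp for jamp = A + iB reduces to AᵀMA + BᵀMB, because the imaginary cross terms iAᵀMB − iBᵀMA cancel when M is symmetric; (2) since M is symmetric, only the diagonal and upper triangle need to be visited, with a factor 2 folded into the off-diagonal terms; (3) that factor 2 and the 1/denom[icol] normalization can be baked into the matrix once at compile time via a constexpr constructor. A minimal standalone sketch of the same reduction, using a toy 2x2 matrix and plain doubles in place of the generated 120x120 data and the plugin's SIMD fptype2_sv types (all values below are illustrative only):

#include <cstdio>

// Toy stand-ins for the generated constants (NOT the real process data)
constexpr int ncolor = 2;
constexpr double cf[ncolor][ncolor] = { { 16, -2 }, { -2, 16 } };
constexpr double denom[ncolor] = { 3, 3 };

// Pre-compute a constexpr triangular color matrix properly normalized (#475):
// diagonal entries get 1/denom, off-diagonal entries get the symmetry factor 2/denom
struct TriangularNormalizedColorMatrix
{
  constexpr TriangularNormalizedColorMatrix()
    : value()
  {
    for( int icol = 0; icol < ncolor; icol++ )
    {
      value[icol][icol] = cf[icol][icol] / denom[icol];
      for( int jcol = icol + 1; jcol < ncolor; jcol++ )
        value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol];
    }
  }
  double value[ncolor][ncolor];
};
static constexpr auto cf2 = TriangularNormalizedColorMatrix();

// |M|^2 = sum_ij jamp_i^* cf_ij jamp_j / denom_i, computed as A.M.A + B.M.B
double colorSum( const double jampR[], const double jampI[] )
{
  double deltaME = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    double ztempR = cf2.value[icol][icol] * jampR[icol]; // diagonal term
    double ztempI = cf2.value[icol][icol] * jampI[icol];
    for( int jcol = icol + 1; jcol < ncolor; jcol++ ) // upper triangle only
    {
      ztempR += cf2.value[icol][jcol] * jampR[jcol];
      ztempI += cf2.value[icol][jcol] * jampI[jcol];
    }
    deltaME += jampR[icol] * ztempR + jampI[icol] * ztempI;
  }
  return deltaME;
}

int main()
{
  const double jampR[ncolor] = { 1.0, 0.5 };
  const double jampI[ncolor] = { -0.25, 2.0 };
  printf( "|M|^2 = %f\n", colorSum( jampR, jampI ) );
  return 0;
}

Note that folding the factor 2 into cf2 is only exact when cf[i][j]/denom[i] is itself symmetric (true here because denom is constant, and implicitly assumed by the removed C++ code); the removed CUDA branch instead keeps the full-matrix loop because, as the comment records, the triangular form was observed to be slower there.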
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ?
jamp_sv : &( jamp_sv[ncolor] ) ); #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------ + // --- CHANNELIDS --- + // ------------------ +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - } - // *** STORE THE RESULTS *** - - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); +#endif #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + fptype* numerators = 
nullptr; + fptype* denominators = nullptr; #endif + + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ + + // *** DIAGRAMS 1 TO 1240 *** +#ifdef MGONGPUCPP_GPUIMPL + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram8, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram9, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram10, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram11, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram12, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram13, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram14, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram15, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram16, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram17, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram18, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram19, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram20, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram21, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram22, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram23, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram24, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram25, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + 
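[Editorial aside, not part of the diff.] All the diagramN launches in this long run share a single literal argument list; the MOMENTA, JAMPS, CHANNELIDS and NUMERATORS/DENOMINATORS blocks above exist precisely so that the call sites stay uniform whether or not MGONGPU_SUPPORTS_MULTICHANNEL is compiled in, with nullptr standing in for the unused buffers. A minimal sketch of that calling convention, with a plain C++ function in place of a GPU kernel (signature and body are illustrative, not the plugin's actual diagram kernels):

#include <cassert>

typedef double fptype;

// One uniform signature for every diagram, multichannel or not: when
// MGONGPU_SUPPORTS_MULTICHANNEL is undefined, the caller passes nullptr for
// channelIds, numerators and denominators, and the callee asserts that.
void diagramN( fptype* jamps, const unsigned int* channelIds,
               fptype* numerators, fptype* denominators )
{
#ifndef MGONGPU_SUPPORTS_MULTICHANNEL
  assert( channelIds == nullptr && numerators == nullptr && denominators == nullptr );
#endif
  // ... compute this diagram's amplitude and accumulate it into the colour
  // flows (jamps); if multichannel is enabled and channelIds selects this
  // diagram, also accumulate |amp|^2 terms into numerators and denominators ...
  (void)jamps; (void)channelIds; (void)numerators; (void)denominators;
}

This uniformity is presumably what allows code generation to emit the 1240 launch statements below mechanically, with no per-diagram conditionals at the call sites.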
gpuLaunchKernelStream( diagram26, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram27, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram28, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram29, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram30, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram31, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram32, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram33, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram34, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram35, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram36, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram37, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram38, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram39, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram40, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram41, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram42, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram43, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram44, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram45, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram46, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram47, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram48, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram49, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram50, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram51, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram52, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, 
couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram53, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram54, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram55, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram56, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram57, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram58, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram59, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram60, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram61, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram62, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram63, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram64, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram65, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram66, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram67, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram68, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram69, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram70, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram71, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram72, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram73, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram74, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram75, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram76, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram77, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram78, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram79, gpublocks, 
gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram80, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram81, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram82, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram83, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram84, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram85, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram86, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram87, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram88, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram89, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram90, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram91, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram92, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram93, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram94, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram95, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram96, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram97, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram98, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram99, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram100, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram101, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram102, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram103, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram104, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram105, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + 
gpuLaunchKernelStream( diagram106, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram107, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram108, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram109, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram110, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram111, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram112, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram113, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram114, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram115, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram116, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram117, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram118, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram119, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram120, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram121, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram122, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram123, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram124, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram125, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram126, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram127, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram128, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram129, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram130, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram131, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram132, gpublocks, gputhreads, gpustream, 
wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram133, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram134, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram135, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram136, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram137, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram138, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram139, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram140, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram141, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram142, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram143, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram144, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram145, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram146, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram147, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram148, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram149, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram150, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram151, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram152, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram153, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram154, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram155, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram156, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram157, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram158, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + 
gpuLaunchKernelStream( diagram159, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram160, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram161, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram162, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram163, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram164, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram165, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram166, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram167, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram168, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram169, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram170, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram171, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram172, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram173, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram174, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram175, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram176, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram177, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram178, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram179, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram180, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram181, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram182, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram183, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram184, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram185, gpublocks, gputhreads, gpustream, 
wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram186, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram187, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram188, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram189, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram190, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram191, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram192, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram193, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram194, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram195, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram196, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram197, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram198, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram199, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram200, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram201, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram202, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram203, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram204, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram205, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram206, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram207, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram208, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram209, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram210, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram211, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + 
gpuLaunchKernelStream( diagram212, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram213, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram214, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram215, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram216, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram217, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram218, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram219, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram220, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram221, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram222, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram223, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram224, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram225, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram226, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram227, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram228, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram229, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram230, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram231, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram232, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram233, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram234, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram235, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram236, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram237, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram238, gpublocks, gputhreads, gpustream, 
wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram239, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram240, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram241, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram242, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram243, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram244, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram245, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram246, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram247, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram248, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram249, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram250, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram251, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram252, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram253, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram254, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram255, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram256, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram257, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram258, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram259, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram260, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram261, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram262, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram263, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram264, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + 
gpuLaunchKernelStream( diagram265, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram266, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram267, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram268, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram269, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram270, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram271, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram272, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram273, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram274, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram275, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram276, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram277, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram278, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram279, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram280, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram281, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram282, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram283, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram284, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram285, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram286, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram287, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram288, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram289, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram290, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram291, gpublocks, gputhreads, gpustream, 
wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram292, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram293, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram294, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram295, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram296, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram297, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram298, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram299, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram300, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram301, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram302, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram303, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram304, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram305, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram306, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram307, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram308, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram309, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram310, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram311, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram312, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram313, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram314, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram315, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram316, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram317, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram318, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram319, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram320, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram321, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram322, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram323, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram324, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram325, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram326, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram327, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram328, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram329, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram330, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram331, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram332, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram333, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram334, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram335, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram336, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram337, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram338, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram339, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram340, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram341, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram342, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram343, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram344, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram345, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram346, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram347, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram348, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram349, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram350, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram351, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram352, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram353, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram354, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram355, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram356, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram357, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram358, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram359, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram360, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram361, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram362, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram363, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram364, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram365, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram366, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram367, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram368, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram369, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram370, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram371, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram372, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram373, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram374, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram375, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram376, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram377, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram378, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram379, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram380, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram381, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram382, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram383, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram384, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram385, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram386, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram387, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram388, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram389, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram390, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram391, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram392, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram393, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram394, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram395, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram396, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram397, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram398, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram399, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram400, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram401, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram402, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram403, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram404, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram405, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram406, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram407, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram408, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram409, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram410, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram411, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram412, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram413, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram414, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram415, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram416, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram417, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram418, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram419, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram420, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram421, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram422, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram423, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram424, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram425, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram426, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram427, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram428, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram429, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram430, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram431, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram432, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram433, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram434, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram435, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram436, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram437, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram438, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram439, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram440, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram441, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram442, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram443, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram444, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram445, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram446, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram447, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram448, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram449, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram450, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram451, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram452, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram453, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram454, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram455, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram456, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram457, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram458, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram459, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram460, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram461, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram462, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram463, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram464, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram465, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram466, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram467, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram468, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram469, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram470, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram471, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram472, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram473, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram474, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram475, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram476, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram477, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram478, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram479, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram480, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram481, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram482, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram483, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram484, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram485, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram486, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram487, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram488, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram489, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram490, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram491, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram492, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram493, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram494, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram495, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram496, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram497, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram498, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram499, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram500, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram501, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram502, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram503, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram504, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram505, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram506, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram507, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram508, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram509, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram510, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram511, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram512, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram513, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram514, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram515, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram516, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram517, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram518, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram519, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram520, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram521, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram522, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram523, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram524, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram525, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram526, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram527, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram528, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram529, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram530, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram531, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram532, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram533, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram534, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram535, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram536, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram537, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram538, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram539, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram540, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram541, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram542, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram543, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram544, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram545, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram546, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram547, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram548, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram549, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram550, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram551, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram552, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram553, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram554, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram555, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram556, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram557, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram558, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram559, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram560, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram561, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram562, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram563, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram564, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram565, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram566, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram567, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram568, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram569, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram570, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram571, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram572, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram573, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram574, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram575, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram576, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram577, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram578, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram579, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram580, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram581, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram582, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram583, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram584, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram585, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram586, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram587, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram588, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram589, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram590, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram591, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram592, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram593, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram594, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram595, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram596, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram597, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram598, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram599, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram600, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram601, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram602, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram603, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram604, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram605, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram606, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram607, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram608, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram609, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram610, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram611, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram612, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram613, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram614, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram615, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram616, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram617, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram618, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram619, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram620, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram621, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram622, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram623, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram624, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram625, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram626, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram627, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram628, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram629, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram630, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram631, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram632, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram633, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram634, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram635, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram636, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram637, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram638, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram639, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram640, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram641, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram642, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram643, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram644, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram645, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram646, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram647, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram648, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram649, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram650, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram651, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram652, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram653, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram654, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram655, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram656, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram657, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram658, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram659, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram660, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram661, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram662, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram663, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram664, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram665, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram666, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram667, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram668, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram669, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram670, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram671, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram672, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram673, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram674, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram675, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram676, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram677, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram678, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram679, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram680, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram681, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram682, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram683, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram684, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram685, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram686, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram687, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram688, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram689, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram690, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram691, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram692, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram693, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram694, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram695, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram696, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram697, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram698, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram699, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram700, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram701, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram702, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram703, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram704, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram705, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram706, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram707, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram708, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram709, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram710, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram711, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram712, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram713, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram714, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram715, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram716, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram717, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram718, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram719, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram720, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram721, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram722, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram723, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram724, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram725, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram726, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram727, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram728, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram729, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram730, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram731, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram732, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram733, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram734, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram735, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram736, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram737, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram738, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram739, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram740, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram741, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram742, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram743, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram744, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram745, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram746, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram747, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram748, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram749, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram750, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram751, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram752, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram753, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram754, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram755, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram756, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram757, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram758, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram759, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram760, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram761, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram762, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram763, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram764, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram765, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram766, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram767, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram768, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram769, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram770, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram771, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram772, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram773, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram774, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram775, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram776, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram777, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram778, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram779, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram780, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram781, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram782, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram783, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram784, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram785, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram786, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram787, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram788, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram789, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram790, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram791, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram792, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram793, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram794, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram795, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram796, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram797, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram798, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram799, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram800, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram801, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram802, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram803, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram804, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram805, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram806, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram807, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram808, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram809, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram810, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram811, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram812, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram813, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram814, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram815, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram816, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram817, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram818, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram819, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram820, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram821, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram822, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram823, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram824, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram825, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram826, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram827, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram828, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram829, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram830, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram831, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram832, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram833, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram834, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram835, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram836, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram837, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram838, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram839, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram840, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram841, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram842, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram843, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram844, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram845, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram846, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram847, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+
gpuLaunchKernelStream( diagram848, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram849, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram850, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram851, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram852, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram853, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram854, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram855, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram856, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram857, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram858, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram859, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram860, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram861, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram862, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram863, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram864, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram865, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram866, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram867, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram868, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram869, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram870, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram871, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram872, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram873, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram874, gpublocks, gputhreads, gpustream, 
wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram875, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram876, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram877, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram878, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram879, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram880, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram881, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram882, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram883, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram884, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram885, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram886, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram887, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram888, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram889, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram890, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram891, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram892, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram893, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram894, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram895, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram896, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram897, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram898, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram899, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram900, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + 
gpuLaunchKernelStream( diagram901, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram902, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram903, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram904, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram905, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram906, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram907, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram908, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram909, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram910, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram911, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram912, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram913, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram914, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram915, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram916, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram917, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram918, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram919, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram920, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram921, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram922, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram923, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram924, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram925, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram926, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram927, gpublocks, gputhreads, gpustream, 
wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram928, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram929, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram930, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram931, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram932, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram933, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram934, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram935, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram936, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram937, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram938, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram939, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram940, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram941, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram942, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram943, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram944, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram945, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram946, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram947, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram948, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram949, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram950, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram951, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram952, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram953, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + 
gpuLaunchKernelStream( diagram954, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram955, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram956, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram957, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram958, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram959, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram960, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram961, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram962, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram963, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram964, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram965, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram966, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram967, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram968, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram969, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram970, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram971, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram972, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram973, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram974, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram975, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram976, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram977, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram978, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram979, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram980, gpublocks, gputhreads, gpustream, 
wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram981, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram982, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram983, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram984, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram985, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram986, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram987, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram988, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram989, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram990, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram991, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram992, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram993, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram994, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram995, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram996, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram997, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram998, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram999, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1000, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1001, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1002, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1003, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1004, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1005, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1006, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); 
+ gpuLaunchKernelStream( diagram1007, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1008, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1009, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1010, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1011, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1012, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1013, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1014, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1015, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1016, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1017, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1018, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1019, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1020, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1021, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1022, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1023, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1024, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1025, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1026, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1027, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1028, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1029, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1030, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1031, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1032, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1033, 
gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1034, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1035, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1036, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1037, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1038, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1039, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1040, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1041, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1042, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1043, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1044, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1045, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1046, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1047, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1048, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1049, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1050, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1051, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1052, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1053, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1054, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1055, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1056, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1057, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1058, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1059, gpublocks, gputhreads, gpustream, wfs, 
jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1060, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1061, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1062, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1063, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1064, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1065, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1066, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1067, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1068, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1069, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1070, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1071, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1072, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1073, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1074, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1075, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1076, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1077, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1078, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1079, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1080, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1081, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1082, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1083, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1084, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1085, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, 
denominators ); + gpuLaunchKernelStream( diagram1086, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1087, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1088, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1089, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1090, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1091, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1092, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1093, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1094, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1095, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1096, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1097, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1098, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1099, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1100, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1101, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1102, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1103, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1104, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1105, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1106, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1107, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1108, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1109, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1110, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1111, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( 
diagram1112, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1113, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1114, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1115, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1116, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1117, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1118, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1119, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1120, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1121, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1122, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1123, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1124, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1125, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1126, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1127, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1128, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1129, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1130, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1131, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1132, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1133, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1134, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1135, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1136, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1137, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1138, gpublocks, gputhreads, 
gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1139, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1140, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1141, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1142, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1143, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1144, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1145, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1146, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1147, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1148, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1149, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1150, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1151, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1152, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1153, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1154, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1155, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1156, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1157, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1158, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1159, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1160, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1161, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1162, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1163, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1164, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, 
couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1165, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1166, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1167, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1168, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1169, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1170, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1171, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1172, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1173, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1174, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1175, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1176, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1177, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1178, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1179, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1180, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1181, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1182, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1183, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1184, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1185, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1186, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1187, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1188, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1189, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram1190, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + 
gpuLaunchKernelStream( diagram1191, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1192, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1193, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1194, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1195, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1196, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1197, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1198, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1199, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1200, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1201, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1202, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1203, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1204, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1205, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1206, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1207, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1208, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1209, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1210, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1211, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1212, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1213, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1214, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1215, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1216, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1217, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1218, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1219, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1220, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1221, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1222, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1223, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1224, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1225, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1226, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1227, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1228, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1229, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1230, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1231, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1232, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1233, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1234, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1235, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1236, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1237, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1238, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1239, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1240, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+#else
+ diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel );
+ diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram8( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram9( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram10( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram11( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram12( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram13( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram14( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram15( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram16( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram17( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram18( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram19( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram20( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram21( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram22( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram23( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram24( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram25( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram26( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram27( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram28( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram29( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram30( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram31( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram32( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram33( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram34( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram35( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram36( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram37( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram38( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram39( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram40( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram41( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram42( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram43( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram44( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram45( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram46( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram47( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram48( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram49( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram50( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram51( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram52( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram53( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram54( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram55( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram56( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram57( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram58( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram59( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram60( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram61( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram62( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram63( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram64( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram65( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram66( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram67( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram68( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram69( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram70( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram71( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram72( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram73( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram74( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram75( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram76( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram77( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram78( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram79( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram80( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram81( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram82( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram83( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram84( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram85( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram86( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram87( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram88( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram89( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram90( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram91( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram92( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram93( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram94( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram95( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram96( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram97( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram98( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram99( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram100( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram101( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram102( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram103( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram104( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram105( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram106( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram107( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram108( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram109( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram110( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram111( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram112( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram113( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram114( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram115( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram116( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram117( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram118( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram119( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram120( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram121( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram122( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram123( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram124( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram125( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram126( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram127( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram128( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram129( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram130( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram131( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram132( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram133( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram134( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram135( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram136( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram137( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram138( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram139( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram140( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram141( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram142( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram143( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram144( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram145( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram146( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram147( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram148( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram149( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram150( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram151( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram152( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram153( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram154( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram155( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram156( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram157( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram158( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram159( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram160( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram161( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram162( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram163( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram164( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram165( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram166( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram167( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram168( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram169( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram170( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram171( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram172( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram173( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram174( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram175( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram176( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram177( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram178( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram179( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram180( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram181( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram182( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram183( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram184( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram185( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram186( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram187( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram188( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram189( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram190( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram191( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram192( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram193( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram194( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram195( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram196( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram197( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram198( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram199( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram200( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram201( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram202( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram203( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram204( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram205( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram206( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram207( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram208( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram209( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram210( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram211( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram212( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram213( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram214( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram215( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram216( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram217( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram218( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram219( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram220( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram221( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram222( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram223( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram224( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram225( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram226( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram227( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram228( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram229( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram230( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram231( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram232( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram233( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram234( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram235( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram236( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram237( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram238( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram239( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram240( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram241( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram242( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram243( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram244( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram245( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram246( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram247( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram248( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram249( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram250( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram251( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram252( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram253( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram254( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram255( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram256( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram257( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram258( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram259( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram260( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram261( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram262( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram263( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram264( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram265( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram266( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram267( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram268( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram269( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram270( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram271( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram272( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram273( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram274( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram275( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram276( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram277( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram278( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram279( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram280( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram281( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram282( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram283( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram284( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram285( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram286( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram287( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram288( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram289( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram290( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram291( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram292( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram293( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram294( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram295( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram296( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram297( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram298( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram299( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram300( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram301( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram302( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram303( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram304( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram305( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram306( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram307( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram308( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram309( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram310( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram311( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram312( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram313( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram314( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram315( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram316( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram317( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram318( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram319( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram320( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram321( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram322( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram323( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram324( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram325( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram326( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram327( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram328( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram329( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram330( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram331( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram332( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram333( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram334( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram335( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram336( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram337( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram338( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram339( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram340( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram341( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram342( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram343( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram344( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram345( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram346( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram347( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram348( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram349( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram350( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram351( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram352( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram353( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram354( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram355( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram356( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram357( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram358( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram359( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram360( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram361( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram362( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram363( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram364( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram365( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram366( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram367( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram368( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram369( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram370( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram371( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram372( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram373( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram374( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram375( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram376( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram377( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram378( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram379( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram380( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram381( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram382( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram383( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram384( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram385( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram386( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram387( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram388( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram389( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram390( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram391( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram392( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram393( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram394( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram395( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram396( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram397( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram398( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram399( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram400( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram401( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram402( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram403( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram404( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram405( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram406( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram407( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram408( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram409( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram410( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram411( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram412( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram413( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram414( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram415( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram416( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram417( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram418( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram419( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram420( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram421( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram422( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram423( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram424( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram425( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram426( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram427( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram428( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram429( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram430( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram431( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram432( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram433( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram434( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram435( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram436( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram437( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram438( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram439( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram440( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram441( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram442( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram443( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram444( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram445( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram446( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram447( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram448( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram449( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram450( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram451( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram452( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram453( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram454( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram455( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram456( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram457( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram458( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram459( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram460( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram461( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram462( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram463( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram464( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram465( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram466( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram467( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram468( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram469( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram470( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram471( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram472( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram473( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram474( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram475( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram476( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram477( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram478( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram479( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram480( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram481( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram482( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram483( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram484( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram485( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram486( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram487( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram488( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram489( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram490( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram491( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram492( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram493( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram494( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram495( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram496( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram497( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram498( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram499( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram500( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram501( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram502( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram503( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram504( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram505( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram506( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram507( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram508( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram509( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram510( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram511( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram512( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram513( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram514( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram515( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram516( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram517( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram518( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram519( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram520( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram521( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram522( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram523( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram524( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram525( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram526( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram527( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram528( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram529( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram530( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram531( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram532( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram533( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram534( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram535( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram536( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram537( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram538( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram539( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram540( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram541( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram542( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram543( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram544( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram545( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram546( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram547( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram548( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram549( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram550( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram551( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram552( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram553( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram554( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram555( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram556( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram557( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram558( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram559( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram560( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram561( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram562( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram563( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram564( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram565( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram566( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram567( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram568( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram569( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram570( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram571( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram572( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram573( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram574( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram575( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram576( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram577( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram578( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram579( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram580( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram581( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram582( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram583( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram584( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram585( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram586( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram587( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram588( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram589( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram590( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram591( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram592( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram593( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram594( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram595( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram596( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram597( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram598( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram599( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram600( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram601( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram602( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram603( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram604( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram605( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram606( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram607( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram608( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram609( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram610( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram611( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram612( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram613( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram614( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram615( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram616( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram617( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram618( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram619( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram620( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram621( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram622( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram623( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram624( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram625( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram626( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram627( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram628( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram629( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram630( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram631( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram632( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram633( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram634( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram635( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram636( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram637( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram638( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram639( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram640( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram641( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram642( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram643( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram644( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram645( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram646( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram647( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram648( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram649( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram650( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram651( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram652( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram653( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram654( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram655( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram656( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram657( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram658( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram659( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram660( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram661( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram662( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram663( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram664( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram665( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram666( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram667( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram668( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram669( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram670( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram671( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram672( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram673( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram674( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram675( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram676( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram677( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram678( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram679( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram680( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram681( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram682( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram683( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram684( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram685( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram686( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram687( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram688( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram689( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram690( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram691( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram692( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram693( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram694( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram695( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram696( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram697( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram698( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram699( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram700( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram701( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram702( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram703( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram704( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram705( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram706( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram707( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram708( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram709( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram710( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram711( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram712( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram713( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram714( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram715( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram716( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram717( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram718( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram719( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram720( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram721( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram722( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram723( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram724( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram725( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram726( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram727( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram728( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram729( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram730( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram731( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram732( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram733( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram734( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram735( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram736( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram737( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram738( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram739( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram740( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram741( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram742( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram743( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram744( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram745( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram746( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram747( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram748( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram749( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram750( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram751( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram752( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram753( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram754( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram755( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram756( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram757( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram758( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram759( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram760( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram761( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram762( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram763( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram764( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram765( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram766( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram767( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram768( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram769( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram770( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram771( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram772( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram773( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram774( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram775( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram776( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram777( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram778( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram779( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram780( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram781( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram782( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram783( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram784( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram785( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram786( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram787( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram788( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram789( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram790( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram791( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram792( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram793( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram794( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram795( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram796( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram797( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram798( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram799( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram800( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram801( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram802( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram803( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram804( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram805( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram806( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram807( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram808( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram809( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram810( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram811( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram812( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram813( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram814( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram815( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram816( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram817( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram818( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram819( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram820( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram821( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram822( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram823( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram824( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram825( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram826( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram827( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram828( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram829( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram830( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram831( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram832( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram833( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram834( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram835( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram836( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram837( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram838( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram839( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram840( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram841( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram842( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram843( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram844( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram845( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram846( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram847( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram848( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram849( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram850( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram851( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram852( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram853( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram854( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram855( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram856( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram857( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram858( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram859( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram860( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram861( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram862( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram863( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram864( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram865( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram866( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram867( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram868( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram869( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram870( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram871( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram872( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram873( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram874( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram875( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram876( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram877( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram878( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram879( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram880( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram881( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram882( wfs, jamps, channelIds, COUPs, numerators, denominators
); + diagram883( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram884( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram885( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram886( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram887( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram888( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram889( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram890( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram891( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram892( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram893( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram894( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram895( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram896( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram897( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram898( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram899( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram900( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram901( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram902( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram903( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram904( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram905( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram906( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram907( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram908( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram909( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram910( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram911( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram912( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram913( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram914( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram915( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram916( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram917( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram918( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram919( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram920( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram921( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram922( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram923( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram924( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram925( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram926( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram927( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram928( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram929( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram930( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram931( wfs, jamps, channelIds, COUPs, 
numerators, denominators ); + diagram932( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram933( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram934( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram935( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram936( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram937( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram938( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram939( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram940( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram941( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram942( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram943( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram944( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram945( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram946( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram947( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram948( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram949( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram950( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram951( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram952( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram953( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram954( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram955( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram956( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram957( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram958( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram959( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram960( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram961( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram962( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram963( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram964( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram965( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram966( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram967( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram968( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram969( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram970( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram971( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram972( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram973( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram974( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram975( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram976( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram977( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram978( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram979( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram980( wfs, 
jamps, channelIds, COUPs, numerators, denominators ); + diagram981( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram982( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram983( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram984( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram985( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram986( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram987( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram988( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram989( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram990( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram991( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram992( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram993( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram994( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram995( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram996( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram997( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram998( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram999( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1000( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1001( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1002( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1003( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1004( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1005( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1006( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1007( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1008( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1009( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1010( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1011( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1012( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1013( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1014( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1015( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1016( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1017( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1018( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1019( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1020( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1021( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1022( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1023( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1024( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1025( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1026( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1027( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1028( wfs, jamps, channelIds, 
COUPs, numerators, denominators ); + diagram1029( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1030( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1031( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1032( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1033( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1034( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1035( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1036( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1037( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1038( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1039( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1040( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1041( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1042( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1043( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1044( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1045( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1046( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1047( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1048( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1049( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1050( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1051( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1052( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1053( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1054( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1055( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1056( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1057( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1058( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1059( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1060( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1061( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1062( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1063( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1064( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1065( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1066( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1067( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1068( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1069( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1070( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1071( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1072( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1073( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1074( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1075( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1076( wfs, jamps, channelIds, 
COUPs, numerators, denominators ); + diagram1077( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1078( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1079( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1080( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1081( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1082( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1083( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1084( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1085( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1086( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1087( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1088( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1089( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1090( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1091( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1092( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1093( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1094( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1095( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1096( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1097( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1098( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1099( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1100( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1101( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1102( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1103( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1104( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1105( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1106( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1107( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1108( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1109( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1110( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1111( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1112( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1113( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1114( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1115( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1116( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1117( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1118( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1119( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1120( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1121( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1122( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1123( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1124( wfs, jamps, channelIds, 
COUPs, numerators, denominators ); + diagram1125( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1126( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1127( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1128( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1129( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1130( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1131( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1132( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1133( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1134( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1135( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1136( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1137( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1138( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1139( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1140( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1141( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1142( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1143( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1144( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1145( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1146( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1147( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1148( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1149( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1150( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1151( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1152( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1153( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1154( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1155( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1156( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1157( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1158( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1159( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1160( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1161( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1162( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1163( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1164( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1165( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1166( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1167( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1168( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1169( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1170( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1171( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1172( wfs, jamps, channelIds, 
COUPs, numerators, denominators ); + diagram1173( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1174( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1175( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1176( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1177( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1178( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1179( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1180( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1181( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1182( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1183( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1184( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1185( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1186( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1187( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1188( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1189( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1190( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1191( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1192( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1193( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1194( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1195( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1196( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1197( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1198( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1199( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1200( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1201( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1202( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1203( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1204( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1205( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1206( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1207( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1208( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1209( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1210( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1211( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1212( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1213( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1214( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1215( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1216( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1217( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1218( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1219( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1220( wfs, jamps, channelIds, 
COUPs, numerators, denominators ); + diagram1221( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1222( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1223( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1224( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1225( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1226( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1227( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1228( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1229( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1230( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1231( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1232( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1233( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1234( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1235( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1236( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1237( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1238( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1239( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1240( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -30383,7 +3036,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -30419,6 +3076,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -30462,6 +3123,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -30564,26 +3229,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, 
ievt0 );
       fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 );
-      G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam );
+      G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam );
     }
 #endif
   }
 
   //--------------------------------------------------------------------------
 
-#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
-  __global__ void
+#ifdef MGONGPUCPP_GPUIMPL
+  void /* clang-format off */
   sigmaKin_getGoodHel( const fptype* allmomenta,   // input: momenta[nevt*npar*4]
                        const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
                        fptype* allMEs,             // output: allMEs[nevt], |M|^2 final_avg_over_helicities
@@ -30591,25 +3256,40 @@ namespace mg5amcCpu
                        fptype* allNumerators,      // output: multichannel numerators[nevt], running_sum_over_helicities
                        fptype* allDenominators,    // output: multichannel denominators[nevt], running_sum_over_helicities
 #endif
-                       bool* isGoodHel )           // output: isGoodHel[ncomb] - device array (CUDA implementation)
-  { /* clang-format on */
-    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
+                       fptype* allJamps,           // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop)
+                       fptype* allWfs,             // tmp: wf[nwf*nw6*2*nevt]
+                       bool* isGoodHel,            // output: isGoodHel[ncomb] - host array
+                       const int nevt )            // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  { /* clang-format on */
+    const int maxtry0 = 16;
+    fptype hstMEs[maxtry0];
+    const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt < maxtry0)
+    assert( nevt >= neppV );
+    const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt < maxtry0)
+    for( int ighel = 0; ighel < dcNGoodHel; ighel++ )
+    {
+      allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt];
+      ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection
+    }
+    // Event-by-event random choice of helicity #403
+    //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] );
+    for( int ighel = 0; ighel < dcNGoodHel; ighel++ )
+    {
+      if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) )
+      {
+        const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1]
+        allselhel[ievt] = ihelF;
+        //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF );
+        break;
+      }
+    }
+    return;
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  __global__ void
+  update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity
+                 fptype* colAllJamp2s )     // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable)
+  {
+    using J_ACCESS = DeviceAccessJamp;
+    using J2_ACCESS = DeviceAccessJamp2;
+    for( int icol = 0; icol < ncolor; icol++ )
+      // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream!
+      atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) );
+  }
+#endif
+#endif
+
+  //--------------------------------------------------------------------------
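The update_jamp2s kernel above accumulates the squared colour amplitudes |jamp|^2 of a single helicity into the colAllJamp2s running sum. Because each good helicity now runs in its own CUDA/HIP stream, two streams can update the same event slot concurrently, so the addition has to be atomic. A minimal self-contained sketch of the same accumulation pattern, using plain arrays and hypothetical names (accumulateJamp2, jamps, jamp2) instead of the plugin's DeviceAccessJamp/DeviceAccessJamp2 accessors, and assuming fptype is double:

  #include <cuda_runtime.h>

  // Each per-helicity stream launches this kernel once, folding |jamp|^2 for its
  // helicity into jamp2[icol][ievt]; jamps stores (re,im) pairs per colour/event.
  // NB: atomicAdd on double requires compute capability sm_60 or newer.
  __global__ void accumulateJamp2( const double* jamps, // input: [ncolor][nevt][2] for one helicity
                                   double* jamp2,       // in/out: [ncolor][nevt], shared across streams
                                   int ncolor,
                                   int nevt )
  {
    const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
    if( ievt >= nevt ) return;
    for( int icol = 0; icol < ncolor; icol++ )
    {
      const double re = jamps[( icol * nevt + ievt ) * 2];
      const double im = jamps[( icol * nevt + ievt ) * 2 + 1];
      atomicAdd( &jamp2[icol * nevt + ievt], re * re + im * im ); // streams for different helicities may hit the same slot
    }
  }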
+#ifdef MGONGPUCPP_GPUIMPL
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  __global__ void
+  select_col( int* allselcol,                    // output: color selection[nevt]
+              const fptype* allrndcol,           // input: random numbers[nevt] for color selection
+              const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+              const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
+              const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  {
+    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
+    // SCALAR channelId for the current event (CUDA)
+    unsigned int channelId = gpu_channelId( allChannelIds );
+    // Event-by-event random choice of color #402
+    if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
+    {
+      if( channelId > mgOnGpu::nchannels )
+      {
+        printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
+        assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
+      }
+      // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...)
+      fptype_sv jamp2_sv[ncolor] = { 0 };
+      assert( allJamp2s != nullptr ); // sanity check
+      using J2_ACCESS = DeviceAccessJamp2;
+      for( int icolC = 0; icolC < ncolor; icolC++ )
+        jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC );
+      // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
+      // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
+      const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
+      if( iconfig <= 0 )
+      {
+        printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
+        assert( iconfig > 0 ); // SANITY CHECK #917
+      }
+      else if( iconfig > (int)mgOnGpu::nconfigSDE )
+      {
+        printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE );
+        assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
+      }
+      fptype targetamp[ncolor] = { 0 };
+      // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
+      for( int icolC = 0; icolC < ncolor; icolC++ )
+      {
+        if( icolC == 0 )
+          targetamp[icolC] = 0;
+        else
+          targetamp[icolC] = targetamp[icolC - 1];
+        // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1)
+        if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC];
+      }
+      //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
+      for( int icolC = 0; icolC < ncolor; icolC++ )
+      {
+        if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) )
+        {
+          allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
+          //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 );
+          break;
+        }
+      }
+    }
+    else
+    {
+      allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931)
+    }
+    return;
+  }
+#endif
+#endif
+
+  //--------------------------------------------------------------------------
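The select_col kernel above and the helicity choice in add_and_select_hel share one sampling rule: form the running sum of non-negative weights (the jamp2 of each allowed colour, or the ME of each good helicity) and return the first index whose cumulative fraction exceeds a uniform random number in [0,1). A host-side sketch of just that rule, with hypothetical names and none of the SDE channel bookkeeping:

  #include <cassert>
  #include <vector>

  // Return the first index whose cumulative weight fraction exceeds r in [0,1);
  // this is the same test applied to allrndhel and allrndcol in the kernels above.
  int selectIndex( const std::vector<double>& weights, double r )
  {
    assert( !weights.empty() );
    double total = 0;
    for( double w : weights ) total += w;
    double running = 0;
    for( size_t i = 0; i < weights.size(); i++ )
    {
      running += weights[i];
      if( r < running / total ) return (int)i;
    }
    return (int)weights.size() - 1; // guard against rounding when r is very close to 1
  }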
   // Evaluate |M|^2, part independent of incoming flavour
-  __global__ void /* clang-format off */
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+  void
   sigmaKin( const fptype* allmomenta,          // input: momenta[nevt*npar*4]
             const fptype* allcouplings,        // input: couplings[nevt*ndcoup*2]
             const fptype* allrndhel,           // input: random numbers[nevt] for helicity selection
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
             const fptype* allrndcol,           // input: random numbers[nevt] for color selection
+            const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+#endif
             fptype* allMEs,                    // output: allMEs[nevt], |M|^2 final_avg_over_helicities
+            int* allselhel,                    // output: helicity selection[nevt]
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+            int* allselcol,                    // output: color selection[nevt]
+            fptype* colAllJamp2s,              // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
+            fptype* ghelAllNumerators,         // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+            fptype* ghelAllDenominators,       // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+#endif
+            fptype* ghelAllMEs,                // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+            fptype* ghelAllJamps,              // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+            fptype* ghelAllWfs,                // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+            fptype2* ghelAllBlasTmp,           // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+            gpuBlasHandle_t* ghelBlasHandles,  // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null)
+            gpuStream_t* ghelStreams,          // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
+            const int gpublocks,               // input: cuda gpublocks
+            const int gputhreads )             // input: cuda gputhreads
+#else
+  void
+  sigmaKin( const fptype* allmomenta,          // input: momenta[nevt*npar*4]
+            const fptype* allcouplings,        // input: couplings[nevt*ndcoup*2]
+            const fptype* allrndhel,           // input: random numbers[nevt] for helicity selection
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-            const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
-            fptype* allNumerators,             // output: multichannel numerators[nevt], running_sum_over_helicities
-            fptype* allDenominators,           // output: multichannel denominators[nevt], running_sum_over_helicities
+            const fptype* allrndcol,           // input: random numbers[nevt] for color selection
+            const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
 #endif
+            fptype* allMEs,                    // output: allMEs[nevt], |M|^2 final_avg_over_helicities
             int* allselhel,                    // output: helicity selection[nevt]
-            int* allselcol                     // output: helicity selection[nevt]
-#ifndef MGONGPUCPP_GPUIMPL
-            , const int nevt                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+            int* allselcol,                    // output: color selection[nevt]
+            fptype* allNumerators,             // tmp: multichannel numerators[nevt], running_sum_over_helicities
+            fptype* allDenominators,           // tmp: multichannel denominators[nevt], running_sum_over_helicities
 #endif
-            ) /* clang-format on */
+            const int nevt                     // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+            )
+#endif /* clang-format on */
   {
     mgDebugInitialise();
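Each ghel* argument in the GPU signature above is a "super-buffer": the per-event buffers of all good helicities concatenated, so that slice ighel is recovered with a fixed stride (this is what the hAll* pointers in PART 1a below compute). A sketch of the slicing arithmetic, assuming fptype is double and mgOnGpu::nx2 == 2 for the real and imaginary parts:

  #include <cstddef>

  // Offsets into the per-helicity super-buffers; the strides mirror the hAllJamps/hAllWfs
  // pointer arithmetic in sigmaKin (names here are illustrative, not the plugin's API).
  inline double* jampSlice( double* ghelAllJamps, int ighel, int nevt, int ncolor )
  {
    return ghelAllJamps + (size_t)ighel * nevt * ncolor * 2; // layout [ighel][ncolor*2*nevt]
  }
  inline double* wfSlice( double* ghelAllWfs, int ighel, int nevt, int nwf, int nw6 )
  {
    return ghelAllWfs + (size_t)ighel * nwf * nevt * nw6 * 2; // layout [ighel][nwf*nw6*2*nevt]
  }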
@@ -30747,20 +3616,14 @@ namespace mg5amcCpu
       // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754)
       constexpr int nprocesses = 1;
       static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" );
-      constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter
+      constexpr int process_id = 1; // code generation source: standalone_cudacpp
       static_assert( process_id == 1, "Assume process_id == 1" );
     }
     // Denominators: spins, colors and identical particles
     constexpr int helcolDenominators[1] = { 1536 }; // assume nprocesses == 1 (#272 and #343)
-#ifdef MGONGPUCPP_GPUIMPL
-    // Remember: in CUDA this is a kernel for one event, in c++ this processes n events
-    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events
-#endif
-#else
+#ifndef MGONGPUCPP_GPUIMPL
     //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS]
     //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS]
     using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events
@@ -30772,17 +3635,20 @@ namespace mg5amcCpu
 #endif
 
     // Start sigmaKin_lines
-#include "GpuAbstraction.h"
 
-    // === PART 0 - INITIALISATION (before calculate_wavefunctions) ===
+    // === PART 0 - INITIALISATION (before calculate_jamps) ===
     // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event
 #ifdef MGONGPUCPP_GPUIMPL
-    allMEs[ievt] = 0;
+    const int nevt = gpublocks * gputhreads;
+    gpuMemset( allMEs, 0, nevt * sizeof( fptype ) );
+    gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    allNumerators[ievt] = 0;
-    allDenominators[ievt] = 0;
+    gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) );
+    gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) );
+    gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) );
 #endif
+    gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) );
 #else
     const int npagV = nevt / neppV;
     for( int ipagV = 0; ipagV < npagV; ++ipagV )
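PART 1a below fans the helicity loop out over CUDA/HIP streams: each good helicity is assigned its own stream, kernels within one stream stay ordered, different streams may overlap, and a single device-wide synchronisation closes the fan-out before the helicity and colour choices are made. A reduced sketch of that scheduling shape (placeholder kernel and buffer names, not the actual calculate_jamps API):

  #include <cuda_runtime.h>

  __global__ void workForOneHelicity( double* out, int nevt ) { /* per-helicity computation */ }

  // Launch one kernel per good helicity on its own stream, then wait for all of them:
  // the same fan-out/fan-in shape used by sigmaKin below.
  void launchPerHelicity( double* buffers[], cudaStream_t streams[], int nGoodHel, int blocks, int threads, int nevt )
  {
    for( int ighel = 0; ighel < nGoodHel; ighel++ )
      workForOneHelicity<<<blocks, threads, 0, streams[ighel]>>>( buffers[ighel], nevt );
    cudaDeviceSynchronize(); // do not start helicity/colour selection until all streams have completed
  }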
@@ -30808,93 +3674,63 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++
     // *** START OF PART 1a - CUDA (one event per GPU thread) ***
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++)
-    // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page
-    unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
-    if( allChannelIds != nullptr )
-    {
-      const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds)
-      const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
-      // NB: channelIds_sv is a scalar in CUDA
-      channelId = channelIds_sv;
-      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
-    }
-#endif
-    // Running sum of partial amplitudes squared for event by event color selection (#402)
-    // (for the single event processed in calculate_wavefunctions)
-    fptype_sv jamp2_sv[nParity * ncolor] = { 0 };
-    fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event)
+    // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream)
+    // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity
+    // In multichannel mode, also compute the running sums over helicities of numerators and denominators
     for( int ighel = 0; ighel < cNGoodHel; ighel++ )
     {
       const int ihel = cGoodHel[ighel];
+      fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2;
+      fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv );
+      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
+      fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
+      calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads );
 #else
-      calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv );
+      calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads );
 #endif
-      MEs_ighel[ighel] = allMEs[ievt];
-    }
-    // Event-by-event random choice of helicity #403
-    //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] );
-    for( int ighel = 0; ighel < cNGoodHel; ighel++ )
-    {
-      if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) )
-      {
-        const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1]
-        allselhel[ievt] = ihelF;
-        //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF );
-        break;
-      }
     }
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Event-by-event random choice of color #402
-    if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
+    // (1b) Then, in multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream
+    for( int
ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? 
ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -30936,7 +3772,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -30959,7 +3795,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -30968,25 +3804,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -30996,8 +3838,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -31013,11 +3857,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -31119,14 +3964,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h index 2eb1e066ff..6b99d481e4 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO 
development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include <vector> @@ -75,17 +76,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 128; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 1240; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 120; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 121; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 7; // CPPProcess::npar (nexternal was nioparticles) //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 1890; //static const int ncomb = 128; // CPPProcess::ncomb @@ -122,23 +123,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -152,34 +156,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: 
allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig.f index 523ef1948b..5058fe1f57 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig1.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig1.f index 3152176aa0..37f74434ee 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig1.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -137,14 +137,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -219,7 +219,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -290,6 +290,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -373,12 +377,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -484,6 +488,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc new file mode 100644 index 0000000000..c027c38503 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.cc @@ -0,0 +1,501 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
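+
+// NB: this new file factors the colour algebra out of the generated CPPProcess.cc: it holds the
+// process-specific colour tables (colorDenom and colorMatrix below), which are contracted against the
+// colour amplitudes "jamp" computed by calculate_jamps - via color_sum_cpu on the C++ side, and via the
+// cuBLAS/hipBLAS handles and BLAS buffers that sigmaKin receives on the GPU side.
+// A minimal sketch of that contraction, assuming the usual MG5aMC colour-sum convention (names are
+// illustrative only, not the actual kernel signatures):
+//   for( int icol = 0; icol < ncolor; icol++ )
+//   {
+//     cxtype ztemp = cxmake( 0, 0 );
+//     for( int jcol = 0; jcol < ncolor; jcol++ ) ztemp += colorMatrix[icol][jcol] * jamp[jcol];
+//     deltaME += cxreal( ztemp * cxconj( jamp[icol] ) ) / colorDenom[icol]; // |M|^2 += Re( J^+ C J ) / denom
+//   }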
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=120) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324 }; // 1-D array[120] + + // The color matrix (initialize all array elements, with ncolor=120) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136 }, + { -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116 }, + { -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116 }, + { 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 1, -8, 
-71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44 }, + { 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44 }, + { 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514 }, + { -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116 }, + { 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442 }, + { 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44 }, + { -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28 }, + { -8, 64, 568, 496, 640, -80, 64, -512, 
640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53 }, + { -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62 }, + { 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44 }, + { -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53 }, + { 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514 }, + { -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62 }, + { 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 
496, -62, -62, 496, -134, -44, -224, 28, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100 }, + { 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10 }, + { -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28 }, + { -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62 }, + { -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62 }, + { 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10 }, + { 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, 
1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10 }, + { -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1 }, + { -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116 }, + { 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442 }, + { 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442 }, + { -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134 }, + { -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, 
-62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134 }, + { -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505 }, + { 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44 }, + { -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134 }, + { -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28 }, + { 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224 }, + { 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62 }, + { 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 
28, -44, 442, -116, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496 }, + { -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53 }, + { 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19 }, + { -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62 }, + { 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496 }, + { -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10 }, + { -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, 505, -134, -134, 442, 
442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80 }, + { 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62 }, + { 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71 }, + { 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10 }, + { -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80 }, + { -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1 }, + { 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, 
-71, 568, -62, 496, 10, 100, -80, 640, 10, -80, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8 }, + { 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44 }, + { -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134 }, + { -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53 }, + { 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62 }, + { 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19 }, + { 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71 }, + { 640, 
-80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514 }, + { -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505 }, + { -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62 }, + { 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496 }, + { 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71 }, + { 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568 }, + { 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, 
-71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100 }, + { -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10 }, + { 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10 }, + { -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80 }, + { 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80 }, + { 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640 }, + { -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, 
-512, -512, 64, 64, 640, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10 }, + { 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1 }, + { -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1 }, + { -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8 }, + { -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8 }, + { -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64 }, + { -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, 514, -62, -44, -53, 10, 100, -44, 
28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28 }, + { 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62 }, + { -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62 }, + { 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10 }, + { -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10 }, + { -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1 }, + { -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62 }, + { 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 
10, -8, 1, -62, 496, 514, 505, 568, -71, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71 }, + { 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10 }, + { -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80 }, + { 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1 }, + { -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8 }, + { 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10 }, + { -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, 
1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1 }, + { -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1 }, + { 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8 }, + { -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8 }, + { -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64 }, + { 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80 }, + { -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, -80, -8, -8, 64, 64, -512, 496, -80, 
-224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8 }, + { -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8 }, + { 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64 }, + { 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64 }, + { -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512 }, + { 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224 }, + { 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 
640, -80, 568, 496 }, + { 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496 }, + { -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80 }, + { -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80 }, + { 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8 }, + { 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496 }, + { 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568 }, + { -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, 10, -80, -62, 
-71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80 }, + { 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640 }, + { -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8 }, + { -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64 }, + { -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80 }, + { -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8 }, + { 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 
10, 10, 1, 64, -8, -512, 64, -80, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8 }, + { -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64 }, + { 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64 }, + { -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512 }, + { 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640 }, + { -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64 }, + { -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, -80, 
640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64 },
+    { -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512 },
+    { -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512 },
+    { 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096 } }; // 2-D array[120][120]
+
+#ifdef MGONGPUCPP_GPUIMPL
+  // The normalized color matrix (divide each column by denom)
+  template<typename T>
+  struct NormalizedColorMatrix
+  {
+    constexpr __host__ __device__ NormalizedColorMatrix()
+      : value()
+    {
+      for( int icol = 0; icol < ncolor; icol++ )
+        for( int jcol = 0; jcol < ncolor; jcol++ )
+          value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol];
+    }
+    T value[ncolor * ncolor];
+  };
+  // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas)
+  static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor];
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  void
+  createNormalizedColorMatrix()
+  {
+    static bool first = true;
+    if( first )
+    {
+      first = false;
+      constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2;
+      gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) );
+    }
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
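+  // [Illustrative sketch, not generated code] The effect of NormalizedColorMatrix can be seen on a
+  // hypothetical 2-color example: with colorMatrix = { { 16, -2 }, { -2, 16 } } and colorDenom = { 3, 3 },
+  // the constexpr constructor fills value = { 16/3., -2/3., -2/3., 16/3. } entirely at compile time,
+  // i.e. row icol of colorMatrix divided by colorDenom[icol]; the only run-time cost is the one-off
+  // copy to device memory in createNormalizedColorMatrix() above:
+  //   constexpr NormalizedColorMatrix<double> nm2; // hypothetical instantiation with ncolor == 2
+  //   static_assert( nm2.value[1] == -2. / 3. );   // off-diagonal entry of row 0, divided by colorDenom[0]
+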
+#ifndef MGONGPUCPP_GPUIMPL
+  void
+  color_sum_cpu( fptype* allMEs,              // output: allMEs[nevt], add |M|^2 for this specific helicity
+                 const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity
+                 const int ievt0 )            // input: first event number in current C++ event page (for CUDA, ievt depends on threadid)
+  {
+    // Pre-compute a constexpr triangular color matrix properly normalized #475
+    struct TriangularNormalizedColorMatrix
+    {
+      // See https://stackoverflow.com/a/34465458
+      __host__ __device__ constexpr TriangularNormalizedColorMatrix()
+        : value()
+      {
+        for( int icol = 0; icol < ncolor; icol++ )
+        {
+          // Diagonal terms
+          value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol];
+          // Off-diagonal terms
+          for( int jcol = icol + 1; jcol < ncolor; jcol++ )
+            value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol];
+        }
+      }
+      fptype2 value[ncolor][ncolor];
+    };
+    static constexpr auto cf2 = TriangularNormalizedColorMatrix();
+    // Use the property that M is a real matrix (see #475):
+    // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA + iAMB - iBMA + BMB = AMA + BMB
+    // (the two imaginary cross terms cancel, as AMB = BMA for a symmetric M).
+    // In addition, on C++ use the property that M is symmetric (see #475),
+    // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time:
+    // we gain (not quite a factor 2...) in speed here as we only loop over the upper-triangular part of the matrix.
+    // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
+    fptype_sv deltaMEs = { 0 };
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+    fptype_sv deltaMEs_next = { 0 };
+    // Mixed mode: merge two neppV vectors into one neppV2 vector
+    fptype2_sv jampR_sv[ncolor];
+    fptype2_sv jampI_sv[ncolor];
+    for( int icol = 0; icol < ncolor; icol++ )
+    {
+      jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) );
+      jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) );
+    }
+#else
+    const cxtype_sv* jamp_sv = allJamp_sv;
+#endif
+    // Loop over icol
+    for( int icol = 0; icol < ncolor; icol++ )
+    {
+      // Diagonal terms
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+      fptype2_sv& jampRi_sv = jampR_sv[icol];
+      fptype2_sv& jampIi_sv = jampI_sv[icol];
+#else
+      fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) );
+      fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) );
+#endif
+      fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv;
+      fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv;
+      // Loop over jcol
+      for( int jcol = icol + 1; jcol < ncolor; jcol++ )
+      {
+        // Off-diagonal terms
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+        fptype2_sv& jampRj_sv = jampR_sv[jcol];
+        fptype2_sv& jampIj_sv = jampI_sv[jcol];
+#else
+        fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) );
+        fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) );
+#endif
+        ztempR_sv += cf2.value[icol][jcol] * jampRj_sv;
+        ztempI_sv += cf2.value[icol][jcol] * jampIj_sv;
+      }
+      fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+      deltaMEs += fpvsplit0( deltaMEs2 );
+      deltaMEs_next += fpvsplit1( deltaMEs2 );
+#else
+      deltaMEs += deltaMEs2;
+#endif
+    }
+    // *** STORE THE RESULTS ***
+    using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events
+    fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 );
+    // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
+    fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
+    MEs_sv += deltaMEs; // fix #435
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+    fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV );
+    fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next );
+    MEs_sv_next += deltaMEs_next;
+#endif
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
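+  // [Illustrative sketch, not generated code] A scalar, non-SIMD illustration of the triangular trick
+  // used in color_sum_cpu above, with hypothetical names. For a real symmetric M, the quadratic form
+  // sum_{i,j} J_i M_ij J_j equals sum_i M_ii J_i^2 + sum_{i<j} (2 M_ij) J_i J_j, so only the upper
+  // triangle needs to be visited once the "2*" and "/denom" factors are folded into cf2:
+  //   double quadForm( const double* J, int n ) // J = the real (or imaginary) parts of the jamps
+  //   {
+  //     double me = 0;
+  //     for( int i = 0; i < n; i++ )
+  //     {
+  //       double z = cf2.value[i][i] * J[i];  // diagonal entry, already divided by colorDenom[i]
+  //       for( int j = i + 1; j < n; j++ )
+  //         z += cf2.value[i][j] * J[j];      // off-diagonal entries already hold "2*M/denom"
+  //       me += J[i] * z;
+  //     }
+  //     return me;
+  //   }
+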
+#ifdef MGONGPUCPP_GPUIMPL
+  __global__ void
+  color_sum_kernel( fptype* allMEs,          // output: allMEs[nevt], add |M|^2 for this specific helicity
+                    const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity
+  {
+    using J_ACCESS = DeviceAccessJamp;
+    fptype jampR[ncolor];
+    fptype jampI[ncolor];
+    for( int icol = 0; icol < ncolor; icol++ )
+    {
+      cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol );
+      jampR[icol] = jamp.real();
+      jampI[icol] = jamp.imag();
+    }
+    // Loop over icol
+    fptype deltaMEs = { 0 };
+    for( int icol = 0; icol < ncolor; icol++ )
+    {
+      fptype2 ztempR = { 0 };
+      fptype2 ztempI = { 0 };
+      // Loop over jcol
+      for( int jcol = 0; jcol < ncolor; jcol++ )
+      {
+        fptype2 jampRj = jampR[jcol];
+        fptype2 jampIj = jampI[jcol];
+        ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix
+        ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix
+      }
+      deltaMEs += ztempR * jampR[icol];
+      deltaMEs += ztempI * jampI[icol];
+    }
+    // *** STORE THE RESULTS ***
+    using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events
+    // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
+    E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
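+  // [Editorial note] The "new1" striding used below (and by the cuBLAS path) stores, for one helicity,
+  // all real parts contiguously over events, then all imaginary parts; the index arithmetic can be
+  // summarized by a small helper (hypothetical, shown only to document the layout):
+  //   inline int jampIndex( int ix2, int icol, int ievt, int nevt ) // ix2: 0 = real, 1 = imaginary
+  //   {
+  //     return ix2 * ncolor * nevt + icol * nevt + ievt; // ievt runs fastest, giving coalesced access
+  //   }
+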
+#ifdef MGONGPUCPP_GPUIMPL
+#ifndef MGONGPU_HAS_NO_BLAS
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+  __global__ void
+  convertD2F_Jamps( fptype2* allJampsFpt2,   // output: jamp[ncolor*2*nevt] for one specific helicity
+                    const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity
+  {
+    const int nevt = gridDim.x * blockDim.x;
+    const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+    // NB! From a functional point of view, any striding is acceptable here, as long as all ncolor*2*nevt elements are correctly copied!
+    // NB! However, in case this helps performance, the same striding as in compute_jamps and cuBLAS is used here
+    for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ )
+      for( int icol = 0; icol < ncolor; icol++ )
+        allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding
+  }
+#endif
+#endif
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+#ifndef MGONGPU_HAS_NO_BLAS
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+  __global__ void
+  convertF2D_MEs( fptype* allMEs,             // output: allMEs[nevt] for one specific helicity
+                  const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity
+  {
+    const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+    allMEs[ievt] = allMEsFpt2[ievt];
+  }
+#endif
+#endif
+#endif
+
+  //--------------------------------------------------------------------------
+
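+  // [Illustrative sketch, not generated code] color_sum_blas below maps the colour sum onto two BLAS
+  // calls per component (real and imaginary). In plain loops, with the "new1" layout jamp[icol * nevt + ievt]
+  // and a column-major ztemp[ievt * ncolor + icol], the equivalent reference computation for one
+  // component would be (hypothetical names):
+  //   // Step 1 (gemm): ztemp(:,ievt) = NormColMat * jamp(:,ievt) for every event
+  //   for( int ievt = 0; ievt < nevt; ievt++ )
+  //     for( int icol = 0; icol < ncolor; icol++ )
+  //     {
+  //       ztemp[ievt * ncolor + icol] = 0;
+  //       for( int jcol = 0; jcol < ncolor; jcol++ )
+  //         ztemp[ievt * ncolor + icol] += normColMat[icol * ncolor + jcol] * jamp[jcol * nevt + ievt];
+  //     }
+  //   // Step 2 (strided-batched gemm of 1x1 results): ME[ievt] += jamp(:,ievt) dot ztemp(:,ievt)
+  //   for( int ievt = 0; ievt < nevt; ievt++ )
+  //     for( int icol = 0; icol < ncolor; icol++ )
+  //       mes[ievt] += jamp[icol * nevt + ievt] * ztemp[ievt * ncolor + icol];
+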
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#ifndef MGONGPU_HAS_NO_BLAS
+  void
+  color_sum_blas( fptype* allMEs,         // output: allMEs[nevt], add |M|^2 for this specific helicity
+                  const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity
+                  fptype2* allBlasTmp,    // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+                  gpuStream_t stream,     // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m)
+#else
+                  gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m)
+#endif
+                  gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+                  const int gpublocks,    // input: cuda gpublocks
+                  const int gputhreads )  // input: cuda gputhreads
+  {
+    const int nevt = gpublocks * gputhreads;
+
+    // Get the address associated with the normalized color matrix in device memory
+    static fptype2* devNormColMat = nullptr;
+    if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 );
+
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+    // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity
+    fptype2* allZtempBoth = allBlasTmp;                                  // start of first fptype2[ncolor*2*nevt] buffer
+    fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt;   // start of second fptype2[ncolor*2*nevt] buffer
+    fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer
+    // Convert jamps from double to float
+    gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps );
+    // Real and imaginary components
+    const fptype2* allJampsReal = allJampsFpt2;
+    const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt;
+#else
+    static_assert( std::is_same<fptype, fptype2>::value ); // sanity check
+    // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer
+    fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer
+    fptype2* allMEsFpt2 = allMEs;
+    // Real and imaginary components
+    const fptype2* allJampsReal = allJamps;                 // this is not a cast (the two types are identical)
+    const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical)
+#endif
+    // Real and imaginary components
+    fptype2* allZtempReal = allZtempBoth;
+    fptype2* allZtempImag = allZtempBoth + ncolor * nevt;
+
+    // Note the new striding for cuBLAS from DeviceAccessJamp:
+    // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1"
+    // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1"
+
+    // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag
+    // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp
+    fptype2 alpha1 = 1;
+    fptype2 beta1 = 0;
+    const int ncolorM = ncolor;
+    const int nevtN = nevt;
+    const int ncolorK = ncolor;
+    checkGpuBlas( gpuBlasTgemm( *pBlasHandle,
+                                GPUBLAS_OP_N, // do not transpose ColMat
+                                GPUBLAS_OP_T, // transpose JampsV (new1)
+                                ncolorM, nevtN, ncolorK,
+                                &alpha1,
+                                devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK
+                                allJampsReal, nevtN,    // JampsV is nevtN x ncolorK (new1)
+                                &beta1,
+                                allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN
+    checkGpuBlas( gpuBlasTgemm( *pBlasHandle,
+                                GPUBLAS_OP_N, // do not transpose ColMat
+                                GPUBLAS_OP_T, // transpose JampsV (new1)
+                                ncolorM, nevtN, ncolorK,
+                                &alpha1,
+                                devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK
+                                allJampsImag, nevtN,    // JampsV is nevtN x ncolorK (new1)
+                                &beta1,
+                                allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN
+
+    // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt]
+    // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME
+    // Use the gemmStridedBatched wrapper (e.g. cublasSgemmStridedBatched) to perform these batched dot products in one call
+    fptype2 alpha2 = 1;
+    fptype2 beta2 = 1;
+    checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle,
+                                              GPUBLAS_OP_N, // do not transpose JampsV (new1)
+                                              GPUBLAS_OP_N, // do not transpose Tmp
+                                              1, 1, ncolor, // result is 1x1 (dot product)
+                                              &alpha2,
+                                              allJampsReal, nevt, 1,        // allJamps is nevt x ncolor, stride 1 for each ievt column (new1)
+                                              allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column
+                                              &beta2,
+                                              allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt)
+                                              nevt ) );         // there are nevt "batches"
+    checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle,
+                                              GPUBLAS_OP_N, // do not transpose JampsV (new1)
+                                              GPUBLAS_OP_N, // do not transpose Tmp
+                                              1, 1, ncolor, // result is 1x1 (dot product)
+                                              &alpha2,
+                                              allJampsImag, nevt, 1,        // allJamps is nevt x ncolor, stride 1 for each ievt column (new1)
+                                              allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column
+                                              &beta2,
+                                              allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt)
+                                              nevt ) );         // there are nevt "batches"
+
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+    // Convert MEs from float to double
+    gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 );
+#endif
+  }
+#endif /* clang-format on */
+#endif
+
+  //--------------------------------------------------------------------------
+
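+  // [Editorial note] color_sum_gpu below is the per-helicity entry point: it dispatches at run time
+  // between the hand-written color_sum_kernel and the BLAS path above. A hypothetical caller (the
+  // names are illustrative, not the actual driver code) might look like:
+  //   gpuBlasHandle_t* pBlasHandle = useBlasColorSum ? &blasHandle : nullptr; // e.g. driven by CUDACPP_RUNTIME_BLASCOLORSUM
+  //   color_sum_gpu( allMEs, allJamps, pBlasHandle ? allBlasTmp : nullptr, stream, pBlasHandle, gpublocks, gputhreads );
+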
+#ifdef MGONGPUCPP_GPUIMPL
+  void
+  color_sum_gpu( fptype* allMEs,               // output: allMEs[nevt], add |M|^2 for this specific helicity
+                 const fptype* allJamps,       // input: jamp[ncolor*2*nevt] for one specific helicity
+                 fptype2* allBlasTmp,          // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity
+                 gpuStream_t stream,           // input: cuda stream (nullptr indicates the default stream)
+                 gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+                 const int gpublocks,          // input: cuda gpublocks
+                 const int gputhreads )        // input: cuda gputhreads
+  {
+#ifdef MGONGPU_HAS_NO_BLAS
+    assert( allBlasTmp == nullptr );  // sanity check for HASBLAS=hasNoBlas
+    assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas
+#endif
+    if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
+    {
+      assert( allBlasTmp == nullptr );
+      gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps );
+    }
+#ifndef MGONGPU_HAS_NO_BLAS
+    else
+    {
+      assert( allBlasTmp != nullptr );
+      color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads );
+    }
+#endif
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+} // end namespace
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.h
new file mode 120000
index 0000000000..24b0157011
--- /dev/null
+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/color_sum.h
@@ -0,0 +1 @@
+../color_sum.h
\ No newline at end of file
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/diagram_boilerplate.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/diagram_boilerplate.h
new file mode 120000
index 0000000000..e657b15c20
--- /dev/null
+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/diagram_boilerplate.h
@@ -0,0 +1 @@
+../diagram_boilerplate.h
\ No newline at end of file
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/diagrams.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/diagrams.h
new file mode 100644
index 0000000000..1ff425b7f7
--- /dev/null
+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/diagrams.h
@@ -0,0 +1,49496 @@
+// Copyright (C) 2020-2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin.
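+
+// [Editorial overview, inferred from the generated code itself rather than from its documentation]
+// This header defines one __global__ kernel per Feynman diagram of the P1_gg_ttxggg subprocess
+// (diagram1 ... diagram1240). Each kernel computes the wavefunctions and amplitude(s) of its own
+// diagram and adds amp_sv[0], with the colour-flow sign of each contribution, into the jamps buffer;
+// the colour sum over the accumulated jamps (see color_sum.h) then yields |M|^2. A plausible launch
+// sequence per helicity, with hypothetical driver variable names, would be:
+//   diagram1<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel );
+//   diagram2<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
+//   // ... all remaining diagrams, which reuse the wavefunctions already stored in wfs ...
+//   color_sum_gpu( allMEs, jamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads );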
+
+/* clang-format off */
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators,           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+            const fptype* momenta,          // input: momenta[npar*4*nevtORneppV]
+            const int ihel )                // input: helicity (0 to ncomb)
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+#ifdef MGONGPUCPP_GPUIMPL
+    using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events
+#else
+    using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events
+#endif
+    // *** DIAGRAM 1 OF 1240 ***
+    // Wavefunction(s) for diagram number 1
+    vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 );
+    vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 );
+    oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 );
+    ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 );
+    vxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 );
+    vxxxxx( momenta, 0., cHel[ihel][5], +1, w_fp[5], 5 );
+    vxxxxx( momenta, 0., cHel[ihel][6], +1, w_fp[6], 6 );
+    VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[7] );
+    FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[8] );
+    VVV1P0_1( w_fp[7], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[9] );
+    VVV1P0_1( w_fp[8], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[10] );
+    // Amplitude(s) for diagram number 1
+    VVV1_0( w_fp[9], w_fp[10], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
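+  // [Editorial note, an inference from usage rather than a statement of the header's exact contents]
+  // "diagram_boilerplate.h", included at the top of every diagramN kernel, appears to bring into scope
+  // the per-kernel locals used below without further declaration: the wavefunction views w_fp[] over
+  // wfs, the amplitude accessors amp_fp/amp_sv, the jamp accessor J_ACCESS and, when
+  // MGONGPU_SUPPORTS_MULTICHANNEL is defined, channelId, numerators_sv and denominators_sv.
+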
jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 1240 *** + // Wavefunction(s) for diagram number 2 + VVV1P0_1( w_fp[8], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[11] ); + // Amplitude(s) for diagram number 2 + VVV1_0( w_fp[9], w_fp[11], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 1240 *** + // Wavefunction(s) for diagram number 3 + // (none) + // Amplitude(s) for diagram number 3 + VVVV1_0( w_fp[8], w_fp[5], w_fp[6], w_fp[9], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 
25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + VVVV3_0( w_fp[8], w_fp[5], w_fp[6], w_fp[9], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + VVVV4_0( w_fp[8], w_fp[5], w_fp[6], w_fp[9], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A 
uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 1240 *** + // Wavefunction(s) for diagram number 4 + VVV1P0_1( w_fp[7], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[12] ); + VVV1P0_1( w_fp[8], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[13] ); + // Amplitude(s) for diagram number 4 + VVV1_0( w_fp[12], w_fp[13], w_fp[6], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 1240 *** + // Wavefunction(s) for diagram number 5 + // (none) + // Amplitude(s) for diagram number 5 + VVV1_0( w_fp[12], w_fp[11], w_fp[4], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 6 OF 1240 *** + // Wavefunction(s) for diagram number 6 + // (none) + // Amplitude(s) for diagram number 6 + VVVV1_0( w_fp[8], w_fp[4], w_fp[6], w_fp[12], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + VVVV3_0( w_fp[8], w_fp[4], w_fp[6], w_fp[12], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + 
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram7( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 7 OF 1240 ***
+    // Wavefunction(s) for diagram number 7
+    VVV1P0_1( w_fp[7], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[14] );
+    // Amplitude(s) for diagram number 7
+    VVV1_0( w_fp[14], w_fp[13], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+  }
+
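+  // The per-diagram kernels share one uniform signature so that they can be dispatched through a single table;
+  // "diagram_boilerplate.h" presumably expands to the common prologue (deriving the per-event channelId, the
+  // w_fp/amp_sv accessors and the numerators_sv/denominators_sv references from the kernel arguments), but its
+  // exact contents are not shown in this diff.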
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram8( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 8 OF 1240 ***
+    // Wavefunction(s) for diagram number 8
+    // (none)
+    // Amplitude(s) for diagram number 8
+    VVV1_0( w_fp[14], w_fp[10], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram9( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 9 OF 1240 ***
+    // Wavefunction(s) for diagram number 9
+    // (none)
+    // Amplitude(s) for diagram number 9
+    VVVV1_0( w_fp[8], w_fp[4], w_fp[5], w_fp[14], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    VVVV3_0( w_fp[8], w_fp[4], w_fp[5], w_fp[14], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    VVVV4_0( w_fp[8], w_fp[4], w_fp[5], w_fp[14], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram10( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 10 OF 1240 ***
+    // Wavefunction(s) for diagram number 10
+    VVVV1P0_1( w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[15] );
+    VVVV3P0_1( w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[16] );
+    VVVV4P0_1( w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[17] );
+    // Amplitude(s) for diagram number 10
+    VVV1_0( w_fp[8], w_fp[6], w_fp[15], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[6], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[6], w_fp[17], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+  }
+
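+  // VVVV1P0_1/VVVV3P0_1/VVVV4P0_1 above build three internal gluon wavefunctions (w_fp[15..17]), one per
+  // colour structure of the four-gluon vertex, which the three subsequent VVV1_0 contractions consume separately.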
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram11( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 11 OF 1240 ***
+    // Wavefunction(s) for diagram number 11
+    VVVV1P0_1( w_fp[7], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[18] );
+    VVVV3P0_1( w_fp[7], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[19] );
+    VVVV4P0_1( w_fp[7], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[20] );
+    // Amplitude(s) for diagram number 11
+    VVV1_0( w_fp[8], w_fp[5], w_fp[18], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[5], w_fp[19], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[5], w_fp[20], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram12( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 12 OF 1240 ***
+    // Wavefunction(s) for diagram number 12
+    VVVV1P0_1( w_fp[7], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[21] );
+    VVVV3P0_1( w_fp[7], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[22] );
+    VVVV4P0_1( w_fp[7], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[23] );
+    // Amplitude(s) for diagram number 12
+    VVV1_0( w_fp[8], w_fp[4], w_fp[21], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[4], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[4], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram13( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 13 OF 1240 ***
+    // Wavefunction(s) for diagram number 13
+    VVV1P0_1( w_fp[4], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[24] );
+    // Amplitude(s) for diagram number 13
+    VVVV1_0( w_fp[7], w_fp[8], w_fp[24], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    VVVV3_0( w_fp[7], w_fp[8], w_fp[24], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    VVVV4_0( w_fp[7], w_fp[8], w_fp[24], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram14( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 14 OF 1240 ***
+    // Wavefunction(s) for diagram number 14
+    VVV1P0_1( w_fp[7], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[25] );
+    // Amplitude(s) for diagram number 14
+    VVV1_0( w_fp[24], w_fp[6], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+  }
+
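+  // Internal wavefunctions are cached across kernels: w_fp[24] computed in diagram 13 and w_fp[25] computed in
+  // diagram 14 are reused by later diagrams, which implies the diagramXXX kernels must run in diagram order.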
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram15( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 15 OF 1240 ***
+    // Wavefunction(s) for diagram number 15
+    VVV1P0_1( w_fp[7], w_fp[24], COUPs[0], 1.0, 0., 0., w_fp[26] );
+    // Amplitude(s) for diagram number 15
+    VVV1_0( w_fp[8], w_fp[6], w_fp[26], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram16( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 16 OF 1240 ***
+    // Wavefunction(s) for diagram number 16
+    // (none)
+    // Amplitude(s) for diagram number 16
+    VVV1_0( w_fp[8], w_fp[24], w_fp[14], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram17( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 17 OF 1240 ***
+    // Wavefunction(s) for diagram number 17
+    VVV1P0_1( w_fp[4], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[27] );
+    // Amplitude(s) for diagram number 17
+    VVVV1_0( w_fp[7], w_fp[8], w_fp[27], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    VVVV3_0( w_fp[7], w_fp[8], w_fp[27], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+    VVVV4_0( w_fp[7], w_fp[8], w_fp[27], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram18( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 18 OF 1240 ***
+    // Wavefunction(s) for diagram number 18
+    // (none)
+    // Amplitude(s) for diagram number 18
+    VVV1_0( w_fp[27], w_fp[5], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram19( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 19 OF 1240 ***
+    // Wavefunction(s) for diagram number 19
+    VVV1P0_1( w_fp[7], w_fp[27], COUPs[0], 1.0, 0., 0., w_fp[28] );
+    // Amplitude(s) for diagram number 19
+    VVV1_0( w_fp[8], w_fp[5], w_fp[28], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram20( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 20 OF 1240 ***
+    // Wavefunction(s) for diagram number 20
+    // (none)
+    // Amplitude(s) for diagram number 20
+    VVV1_0( w_fp[8], w_fp[27], w_fp[12], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram21( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 21 OF 1240 ***
+    // Wavefunction(s) for diagram number 21
+    VVV1P0_1( w_fp[5], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[29] );
+    // Amplitude(s) for diagram number 21
+    VVVV1_0( w_fp[7], w_fp[8], w_fp[4], w_fp[29], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    VVVV3_0( w_fp[7], w_fp[8], w_fp[4], w_fp[29], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    VVVV4_0( w_fp[7], w_fp[8], w_fp[4], w_fp[29], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram22( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 22 OF 1240 ***
+    // Wavefunction(s) for diagram number 22
+    // (none)
+    // Amplitude(s) for diagram number 22
+    VVV1_0( w_fp[4], w_fp[29], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram23( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 23 OF 1240 ***
+    // Wavefunction(s) for diagram number 23
+    // (none)
+    // Amplitude(s) for diagram number 23
+    VVV1_0( w_fp[8], w_fp[29], w_fp[9], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram24( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 24 OF 1240 ***
+    // Wavefunction(s) for diagram number 24
+    VVV1P0_1( w_fp[7], w_fp[29], COUPs[0], 1.0, 0., 0., w_fp[25] );
+    // Amplitude(s) for diagram number 24
+    VVV1_0( w_fp[8], w_fp[4], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+  }
+
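+  // Note that diagram 24 recycles the scratch slot w_fp[25] (overwriting the value written by diagram 14),
+  // another indication that the per-diagram kernels are meant to execute strictly in sequence.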
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram25( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 25 OF 1240 ***
+    // Wavefunction(s) for diagram number 25
+    VVVV1P0_1( w_fp[4], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[30] );
+    VVVV3P0_1( w_fp[4], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[31] );
+    VVVV4P0_1( w_fp[4], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[32] );
+    // Amplitude(s) for diagram number 25
+    VVV1_0( w_fp[7], w_fp[8], w_fp[30], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    VVV1_0( w_fp[7], w_fp[8], w_fp[31], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    VVV1_0( w_fp[7], w_fp[8], w_fp[32], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+  }
+
jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram26( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 26 OF 1240 *** + // Wavefunction(s) for diagram number 26 + FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[33] ); + FFV1_2( w_fp[3], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[34] ); + FFV1_1( w_fp[33], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[35] ); + // Amplitude(s) for diagram number 26 + FFV1_0( w_fp[34], w_fp[35], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram27( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 27 OF 1240 *** + // Wavefunction(s) for diagram number 27 + FFV1_1( w_fp[33], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], 
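//
// (Sketch, not part of this patch.) The +-amp_sv[0] and +-cxtype( 0, 1 ) * amp_sv[0]
// updates above implement the color decomposition: each Feynman amplitude feeds a subset
// of the ncolor color-flow amplitudes ("jamps") with a coefficient of +-1 or +-i, so
// J_ACCESS::kernelAccessIcol( jamps, icol ) must return a writable complex view of color
// flow icol for the current event. Assuming the layout implied by the declared size
// jamps[ncolor*2*nevtORneppV] (separate real and imaginary planes per color), a scalar
// C++ equivalent of one update could be (names hypothetical):
//
//   inline void addToJamp( fptype* jamps, int icol, int ievt, int nevt, const cxtype& amp )
//   {
//     jamps[( icol * 2 + 0 ) * nevt + ievt] += cxreal( amp ); // real plane
//     jamps[( icol * 2 + 1 ) * nevt + ievt] += cximag( amp ); // imaginary plane
//   }
//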
+  __global__ void
+  diagram27( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 27 OF 1240 ***
+    // Wavefunction(s) for diagram number 27
+    FFV1_1( w_fp[33], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[36] );
+    // Amplitude(s) for diagram number 27
+    FFV1_0( w_fp[34], w_fp[36], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram28( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 28 OF 1240 ***
+    // Wavefunction(s) for diagram number 28
+    FFV1P0_3( w_fp[3], w_fp[33], COUPs[1], 1.0, 0., 0., w_fp[37] );
+    // Amplitude(s) for diagram number 28
+    VVV1_0( w_fp[12], w_fp[37], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram29( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 29 OF 1240 ***
+    // Wavefunction(s) for diagram number 29
+    // (none)
+    // Amplitude(s) for diagram number 29
+    FFV1_0( w_fp[3], w_fp[36], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram30( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 30 OF 1240 ***
+    // Wavefunction(s) for diagram number 30
+    // (none)
+    // Amplitude(s) for diagram number 30
+    VVV1_0( w_fp[14], w_fp[37], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram31( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 31 OF 1240 ***
+    // Wavefunction(s) for diagram number 31
+    // (none)
+    // Amplitude(s) for diagram number 31
+    FFV1_0( w_fp[3], w_fp[35], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram32( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 32 OF 1240 ***
+    // Wavefunction(s) for diagram number 32
+    // (none)
+    // Amplitude(s) for diagram number 32
+    FFV1_0( w_fp[3], w_fp[33], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[33], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[33], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram33( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 33 OF 1240 ***
+    // Wavefunction(s) for diagram number 33
+    FFV1_2( w_fp[3], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[38] );
+    FFV1_1( w_fp[33], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[39] );
+    // Amplitude(s) for diagram number 33
+    FFV1_0( w_fp[38], w_fp[39], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram34( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 34 OF 1240 ***
+    // Wavefunction(s) for diagram number 34
+    FFV1_2( w_fp[38], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[40] );
+    // Amplitude(s) for diagram number 34
+    FFV1_0( w_fp[40], w_fp[33], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 34 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram35( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 35 OF 1240 ***
+    // Wavefunction(s) for diagram number 35
+    // (none)
+    // Amplitude(s) for diagram number 35
+    FFV1_0( w_fp[38], w_fp[33], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram36( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 36 OF 1240 ***
+    // Wavefunction(s) for diagram number 36
+    FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[41] );
+    // Amplitude(s) for diagram number 36
+    FFV1_0( w_fp[41], w_fp[39], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
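//
// (Note, not part of this patch.) The MGONGPU_SUPPORTS_MULTICHANNEL blocks accumulate the
// single-diagram-enhancement (SDE) statistics: numerators_sv adds |amp|^2 only when the
// event's channelId selects this diagram, while denominators_sv adds |amp|^2 for every
// SDE-eligible diagram (the multi-amplitude diagrams 25, 32 and 48 carry no such block).
// The assumed downstream use is a per-event channel weight applied to the matrix element:
//
//   // meSDE = me * numerators / denominators; // fraction of the SDE sum carried by the chosen channel
//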
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram37( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 37 OF 1240 ***
+    // Wavefunction(s) for diagram number 37
+    FFV1_2( w_fp[41], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[42] );
+    // Amplitude(s) for diagram number 37
+    FFV1_0( w_fp[42], w_fp[33], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 37 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram38( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 38 OF 1240 ***
+    // Wavefunction(s) for diagram number 38
+    // (none)
+    // Amplitude(s) for diagram number 38
+    FFV1_0( w_fp[41], w_fp[33], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 38 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram39( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 39 OF 1240 ***
+    // Wavefunction(s) for diagram number 39
+    // (none)
+    // Amplitude(s) for diagram number 39
+    FFV1_0( w_fp[3], w_fp[39], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 39 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram40( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 40 OF 1240 ***
+    // Wavefunction(s) for diagram number 40
+    // (none)
+    // Amplitude(s) for diagram number 40
+    FFV1_0( w_fp[34], w_fp[33], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 40 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram41( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 41 OF 1240 ***
+    // Wavefunction(s) for diagram number 41
+    // (none)
+    // Amplitude(s) for diagram number 41
+    FFV1_0( w_fp[3], w_fp[33], w_fp[25], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 41 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram42( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 42 OF 1240 ***
+    // Wavefunction(s) for diagram number 42
+    FFV1_1( w_fp[2], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[39] );
+    FFV1_1( w_fp[39], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[43] );
+    // Amplitude(s) for diagram number 42
+    FFV1_0( w_fp[34], w_fp[43], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 42 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram43( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 43 OF 1240 ***
+    // Wavefunction(s) for diagram number 43
+    FFV1_1( w_fp[39], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[44] );
+    // Amplitude(s) for diagram number 43
+    FFV1_0( w_fp[34], w_fp[44], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 43 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram44( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 44 OF 1240 ***
+    // Wavefunction(s) for diagram number 44
+    FFV1P0_3( w_fp[3], w_fp[39], COUPs[1], 1.0, 0., 0., w_fp[45] );
+    // Amplitude(s) for diagram number 44
+    VVV1_0( w_fp[9], w_fp[45], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 44 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
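//
// (Sketch, not part of this patch.) The two signatures selected by MGONGPUCPP_GPUIMPL
// differ in who resolves the couplings: on the GPU each thread receives the full
// dependent couplings array and the boilerplate presumably derives the per-event COUPs
// pointers from it, while the vectorized C++ build passes COUPs already positioned on
// the current event page. A plausible GPU-side derivation (names assumed):
//
//   const fptype* COUPs[nxcoup];
//   for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) // dependent couplings, per event
//     COUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup );
//   for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // independent couplings, constant memory
//     COUPs[ndcoup + iicoup] = cIPC + iicoup * 2;
//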
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram45( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 45 OF 1240 ***
+    // Wavefunction(s) for diagram number 45
+    // (none)
+    // Amplitude(s) for diagram number 45
+    FFV1_0( w_fp[3], w_fp[44], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 45 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram46( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 46 OF 1240 ***
+    // Wavefunction(s) for diagram number 46
+    // (none)
+    // Amplitude(s) for diagram number 46
+    VVV1_0( w_fp[14], w_fp[45], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 46 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram47( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 47 OF 1240 ***
+    // Wavefunction(s) for diagram number 47
+    // (none)
+    // Amplitude(s) for diagram number 47
+    FFV1_0( w_fp[3], w_fp[43], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 47 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram48( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 48 OF 1240 ***
+    // Wavefunction(s) for diagram number 48
+    // (none)
+    // Amplitude(s) for diagram number 48
+    FFV1_0( w_fp[3], w_fp[39], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[39], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[39], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram49( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 49 OF 1240 ***
+    // Wavefunction(s) for diagram number 49
+    FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[46] );
+    FFV1_1( w_fp[39], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[47] );
+    // Amplitude(s) for diagram number 49
+    FFV1_0( w_fp[46], w_fp[47], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 49 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram50( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 50 OF 1240 ***
+    // Wavefunction(s) for diagram number 50
+    FFV1_2( w_fp[46], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[48] );
+    // Amplitude(s) for diagram number 50
+    FFV1_0( w_fp[48], w_fp[39], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 50 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram51( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 51 OF 1240 ***
+    // Wavefunction(s) for diagram number 51
+    // (none)
+    // Amplitude(s) for diagram number 51
+    FFV1_0( w_fp[46], w_fp[39], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 51 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram52( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 52 OF 1240 ***
+    // Wavefunction(s) for diagram number 52
+    // (none)
+    // Amplitude(s) for diagram number 52
+    FFV1_0( w_fp[41], w_fp[47], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 52 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram53( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 53 OF 1240 ***
+    // Wavefunction(s) for diagram number 53
+    // (none)
+    // Amplitude(s) for diagram number 53
+    FFV1_0( w_fp[42], w_fp[39], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 53 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
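//
// (Note, not part of this patch.) The per-diagram kernels are not independent: the wfs
// scratch buffer threads wavefunctions across them. For example, w_fp[39] filled by
// diagram33 is overwritten by diagram42 and then consumed up to diagram57, so the
// kernels only compute the right amplitudes when executed in increasing diagram order on
// the same stream. A sketch of the assumed host-side launch loop (names hypothetical):
//
//   for( auto* diagramKernel : diagramKernels ) // diagram1, diagram2, ... in diagram order
//     gpuLaunchKernel( diagramKernel, gpublocks, gputhreads,
//                      wfs, jamps, channelIds, couplings, numerators, denominators );
//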
pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 54 OF 1240 *** + // Wavefunction(s) for diagram number 54 + // (none) + // Amplitude(s) for diagram number 54 + FFV1_0( w_fp[41], w_fp[39], w_fp[9], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 54 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram55( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 55 OF 1240 *** + // Wavefunction(s) for diagram number 55 + // (none) + // Amplitude(s) for diagram number 55 + FFV1_0( w_fp[3], w_fp[47], w_fp[27], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 55 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram56( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 56 OF 1240 *** + // Wavefunction(s) for diagram number 56 + // (none) + // Amplitude(s) for diagram number 56 + FFV1_0( 
w_fp[34], w_fp[39], w_fp[27], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 56 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram57( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 57 OF 1240 *** + // Wavefunction(s) for diagram number 57 + // (none) + // Amplitude(s) for diagram number 57 + FFV1_0( w_fp[3], w_fp[39], w_fp[28], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 57 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram58( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include 
"diagram_boilerplate.h" + // *** DIAGRAM 58 OF 1240 *** + // Wavefunction(s) for diagram number 58 + FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[47] ); + FFV1_1( w_fp[47], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[49] ); + // Amplitude(s) for diagram number 58 + FFV1_0( w_fp[34], w_fp[49], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 58 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram59( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 59 OF 1240 *** + // Wavefunction(s) for diagram number 59 + FFV1_1( w_fp[47], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[50] ); + // Amplitude(s) for diagram number 59 + FFV1_0( w_fp[34], w_fp[50], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 59 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram60( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 60 OF 1240 *** + // Wavefunction(s) for diagram number 60 + FFV1P0_3( w_fp[3], w_fp[47], COUPs[1], 1.0, 0., 0., w_fp[51] ); + // 
+    // Amplitude(s) for diagram number 60
+    VVV1_0( w_fp[9], w_fp[51], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 60 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram61( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 61 OF 1240 ***
+    // Wavefunction(s) for diagram number 61
+    // (none)
+    // Amplitude(s) for diagram number 61
+    FFV1_0( w_fp[3], w_fp[50], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 61 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram62( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 62 OF 1240 ***
+    // Wavefunction(s) for diagram number 62
+    // (none)
+    // Amplitude(s) for diagram number 62
+    VVV1_0( w_fp[12], w_fp[51], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 62 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram63( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 63 OF 1240 ***
+    // Wavefunction(s) for diagram number 63
+    // (none)
+    // Amplitude(s) for diagram number 63
+    FFV1_0( w_fp[3], w_fp[49], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 63 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram64( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 64 OF 1240 ***
+    // Wavefunction(s) for diagram number 64
+    // (none)
+    // Amplitude(s) for diagram number 64
+    FFV1_0( w_fp[3], w_fp[47], w_fp[15], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[47], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[47], w_fp[17], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
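[Reviewer note, not part of the patch] Diagram 64 above is typical of the multi-amplitude diagrams in this hunk: each FFV1_0 call overwrites amp_sv[0], whose value is then scattered with relative signs and factors of i into the color-flow array jamps. The J_ACCESS::kernelAccessIcol accessor is defined elsewhere in the plugin; purely as orientation, in the scalar (no-SIMD, neppV=1) case it could reduce to something like the sketch below, where the icol-th complex slot of jamps is returned by reference (the standalone typedefs and the function shape are assumptions, not the actual accessor).

#include <complex>
typedef double fptype;
typedef std::complex<double> cxtype;

// Hypothetical scalar accessor: jamps holds ncolor complex values as
// ncolor*2 consecutive fptypes; returning a reference to color flow
// icol lets "+=" and "-=" accumulate in place.
inline cxtype& kernelAccessIcol( fptype* jamps, const int icol )
{
  return reinterpret_cast<cxtype*>( jamps )[icol];
}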
+  __global__ void
+  diagram65( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 65 OF 1240 ***
+    // Wavefunction(s) for diagram number 65
+    FFV1_1( w_fp[47], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[52] );
+    // Amplitude(s) for diagram number 65
+    FFV1_0( w_fp[46], w_fp[52], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 65 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram66( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 66 OF 1240 ***
+    // Wavefunction(s) for diagram number 66
+    // (none)
+    // Amplitude(s) for diagram number 66
+    FFV1_0( w_fp[48], w_fp[47], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 66 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram67( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 67 OF 1240 ***
+    // Wavefunction(s) for diagram number 67
+    // (none)
+    // Amplitude(s) for diagram number 67
+    FFV1_0( w_fp[46], w_fp[47], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 67 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram68( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 68 OF 1240 ***
+    // Wavefunction(s) for diagram number 68
+    // (none)
+    // Amplitude(s) for diagram number 68
+    FFV1_0( w_fp[38], w_fp[52], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 68 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram69( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 69 OF 1240 ***
+    // Wavefunction(s) for diagram number 69
+    // (none)
+    // Amplitude(s) for diagram number 69
+    FFV1_0( w_fp[40], w_fp[47], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 69 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram70( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 70 OF 1240 ***
+    // Wavefunction(s) for diagram number 70
+    // (none)
+    // Amplitude(s) for diagram number 70
+    FFV1_0( w_fp[38], w_fp[47], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 70 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram71( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 71 OF 1240 ***
+    // Wavefunction(s) for diagram number 71
+    // (none)
+    // Amplitude(s) for diagram number 71
+    FFV1_0( w_fp[3], w_fp[52], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 71 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
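[Reviewer note, not part of the patch] The multichannel bookkeeping in these kernels adds the squared modulus of the single-diagram amplitude to numerators_sv (only for the sampled channel) and to denominators_sv (for every non-zero channel). cxabs2 is the plugin's squared-modulus helper; as a sketch of the intended semantics in the scalar case (an assumption for illustration, not the actual implementation):

#include <complex>
typedef double fptype;
typedef std::complex<double> cxtype;

// Squared modulus |c|^2, avoiding the sqrt implied by std::abs
inline fptype cxabs2( const cxtype& c )
{
  return c.real() * c.real() + c.imag() * c.imag();
}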
+  __global__ void
+  diagram72( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 72 OF 1240 ***
+    // Wavefunction(s) for diagram number 72
+    // (none)
+    // Amplitude(s) for diagram number 72
+    FFV1_0( w_fp[34], w_fp[47], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 72 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram73( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 73 OF 1240 ***
+    // Wavefunction(s) for diagram number 73
+    // (none)
+    // Amplitude(s) for diagram number 73
+    FFV1_0( w_fp[3], w_fp[47], w_fp[26], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 73 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram74( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 74 OF 1240 ***
+    // Wavefunction(s) for diagram number 74
+    FFV1_1( w_fp[2], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[52] );
+    FFV1_2( w_fp[46], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[7] );
+    // Amplitude(s) for diagram number 74
+    FFV1_0( w_fp[7], w_fp[52], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 74 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram75( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 75 OF 1240 ***
+    // Wavefunction(s) for diagram number 75
+    FFV1_2( w_fp[46], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[53] );
+    // Amplitude(s) for diagram number 75
+    FFV1_0( w_fp[53], w_fp[52], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 75 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram76( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 76 OF 1240 ***
+    // Wavefunction(s) for diagram number 76
+    FFV1P0_3( w_fp[46], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[54] );
+    // Amplitude(s) for diagram number 76
+    VVV1_0( w_fp[12], w_fp[54], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 76 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram77( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 77 OF 1240 ***
+    // Wavefunction(s) for diagram number 77
+    // (none)
+    // Amplitude(s) for diagram number 77
+    FFV1_0( w_fp[53], w_fp[2], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 77 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram78( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 78 OF 1240 ***
+    // Wavefunction(s) for diagram number 78
+    // (none)
+    // Amplitude(s) for diagram number 78
+    VVV1_0( w_fp[14], w_fp[54], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 78 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram79( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 79 OF 1240 ***
+    // Wavefunction(s) for diagram number 79
+    // (none)
+    // Amplitude(s) for diagram number 79
+    FFV1_0( w_fp[7], w_fp[2], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 79 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
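[Reviewer note, not part of the patch] Splitting one monolithic calculate_wavefunctions into 1240 per-diagram __global__ kernels implies that a driver must invoke them in order, since later diagrams reuse wavefunctions (w_fp) computed by earlier ones. The driver is outside this hunk; a hypothetical sketch of the GPU-side call sequence is given below (the launch configuration names gpublocks/gputhreads and the final color-sum step are assumptions).

// Hypothetical driver fragment, illustration only
#ifdef MGONGPUCPP_GPUIMPL
  diagram78<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
  diagram79<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
  diagram80<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
  // ... one launch per diagram up to diagram1240, then the color sum over jamps
#endif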
+  __global__ void
+  diagram80( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 80 OF 1240 ***
+    // Wavefunction(s) for diagram number 80
+    // (none)
+    // Amplitude(s) for diagram number 80
+    FFV1_0( w_fp[46], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[46], w_fp[2], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[46], w_fp[2], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram81( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 81 OF 1240 ***
+    // Wavefunction(s) for diagram number 81
+    // (none)
+    // Amplitude(s) for diagram number 81
+    FFV1_0( w_fp[46], w_fp[52], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 81 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram82( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 82 OF 1240 ***
+    // Wavefunction(s) for diagram number 82
+    // (none)
+    // Amplitude(s) for diagram number 82
+    FFV1_0( w_fp[48], w_fp[2], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 82 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram83( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 83 OF 1240 ***
+    // Wavefunction(s) for diagram number 83
+    // (none)
+    // Amplitude(s) for diagram number 83
+    FFV1_0( w_fp[46], w_fp[2], w_fp[25], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 83 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram84( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 84 OF 1240 ***
+    // Wavefunction(s) for diagram number 84
+    FFV1_2( w_fp[38], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[25] );
+    // Amplitude(s) for diagram number 84
+    FFV1_0( w_fp[25], w_fp[52], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 84 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram85( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 85 OF 1240 ***
+    // Wavefunction(s) for diagram number 85
+    FFV1_2( w_fp[38], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[48] );
+    // Amplitude(s) for diagram number 85
+    FFV1_0( w_fp[48], w_fp[52], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 85 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram86( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 86 OF 1240 ***
+    // Wavefunction(s) for diagram number 86
+    FFV1P0_3( w_fp[38], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[23] );
+    // Amplitude(s) for diagram number 86
+    VVV1_0( w_fp[9], w_fp[23], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 86 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram87( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 87 OF 1240 ***
+    // Wavefunction(s) for diagram number 87
+    // (none)
+    // Amplitude(s) for diagram number 87
+    FFV1_0( w_fp[48], w_fp[2], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 87 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram88( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 88 OF 1240 ***
+    // Wavefunction(s) for diagram number 88
+    // (none)
+    // Amplitude(s) for diagram number 88
+    VVV1_0( w_fp[14], w_fp[23], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 88 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram89( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 89 OF 1240 ***
+    // Wavefunction(s) for diagram number 89
+    // (none)
+    // Amplitude(s) for diagram number 89
+    FFV1_0( w_fp[25], w_fp[2], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 89 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram90( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 90 OF 1240 ***
+    // Wavefunction(s) for diagram number 90
+    // (none)
+    // Amplitude(s) for diagram number 90
+    FFV1_0( w_fp[38], w_fp[2], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[38], w_fp[2], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[38], w_fp[2], w_fp[20], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram91( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 91 OF 1240 *** + // Wavefunction(s) for diagram number 91 + // (none) + // Amplitude(s) for diagram number 91 + FFV1_0( w_fp[38], w_fp[52], w_fp[27], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 91 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram92( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including 
channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 92 OF 1240 *** + // Wavefunction(s) for diagram number 92 + // (none) + // Amplitude(s) for diagram number 92 + FFV1_0( w_fp[40], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 92 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram93( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 93 OF 1240 *** + // Wavefunction(s) for diagram number 93 + // (none) + // Amplitude(s) for diagram number 93 + FFV1_0( w_fp[38], w_fp[2], w_fp[28], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 93 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram94( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + 
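Editorial note: every diagramXXX kernel above carries the same #ifdef MGONGPUCPP_GPUIMPL split in its signature. On GPU, each thread locates its own dependent couplings inside one couplings[nevt*ndcoup*2] array for all events; in the vectorized C++ path, COUPs[nxcoup] is instead an array of pointers to the coupling pages of the current SIMD event page. A minimal sketch of the event-major indexing on the GPU side is given below; the helper name and the simple layout are assumptions for illustration only (the real cudacpp memory layout is defined elsewhere in the plugin).

  // Hypothetical helper, for illustration only (not part of this diff):
  // locate the (real, imag) pair of dependent coupling icoup for event ievt
  // in a couplings[nevt*ndcoup*2] array with a simple event-major layout.
  inline const double* coupOfEvent( const double* couplings, int ievt, int icoup, int ndcoup )
  {
    return &couplings[( ievt * ndcoup + icoup ) * 2]; // [0]=real, [1]=imag
  }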
+  __global__ void
+  diagram94( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 94 OF 1240 ***
+    // Wavefunction(s) for diagram number 94
+    FFV1_2( w_fp[41], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[28] );
+    // Amplitude(s) for diagram number 94
+    FFV1_0( w_fp[28], w_fp[52], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 94 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram95( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 95 OF 1240 ***
+    // Wavefunction(s) for diagram number 95
+    FFV1_2( w_fp[41], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[40] );
+    // Amplitude(s) for diagram number 95
+    FFV1_0( w_fp[40], w_fp[52], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 95 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram96( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 96 OF 1240 ***
+    // Wavefunction(s) for diagram number 96
+    FFV1P0_3( w_fp[41], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[20] );
+    // Amplitude(s) for diagram number 96
+    VVV1_0( w_fp[9], w_fp[20], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 96 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram97( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 97 OF 1240 ***
+    // Wavefunction(s) for diagram number 97
+    // (none)
+    // Amplitude(s) for diagram number 97
+    FFV1_0( w_fp[40], w_fp[2], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 97 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram98( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 98 OF 1240 ***
+    // Wavefunction(s) for diagram number 98
+    // (none)
+    // Amplitude(s) for diagram number 98
+    VVV1_0( w_fp[12], w_fp[20], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 98 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram99( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 99 OF 1240 ***
+    // Wavefunction(s) for diagram number 99
+    // (none)
+    // Amplitude(s) for diagram number 99
+    FFV1_0( w_fp[28], w_fp[2], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 99 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
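Editorial note: diagram_boilerplate.h itself is not part of this hunk, but every kernel includes it right after the comment about the uniform interface. In the multichannel build it presumably derives the scalar channelId and binds the numerators_sv/denominators_sv accessors used below; in the non-multichannel build the comment says it only sanity-checks the unused arguments. A self-contained sketch of that non-multichannel branch (the function name is hypothetical, for illustration only):

  #include <cassert>
  // Sketch of the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL branch: the uniform
  // kernel interface is kept, but the three extra arguments must not be used,
  // so the boilerplate asserts that they are all nullptr.
  inline void checkNoMultiChannel( const unsigned int* channelIds,
                                   const double* numerators,
                                   const double* denominators )
  {
    assert( channelIds == nullptr );
    assert( numerators == nullptr );
    assert( denominators == nullptr );
  }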
+  __global__ void
+  diagram100( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 100 OF 1240 ***
+    // Wavefunction(s) for diagram number 100
+    // (none)
+    // Amplitude(s) for diagram number 100
+    FFV1_0( w_fp[41], w_fp[2], w_fp[15], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[2], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[2], w_fp[17], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram101( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 101 OF 1240 ***
+    // Wavefunction(s) for diagram number 101
+    // (none)
+    // Amplitude(s) for diagram number 101
+    FFV1_0( w_fp[41], w_fp[52], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 101 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram102( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 102 OF 1240 ***
+    // Wavefunction(s) for diagram number 102
+    // (none)
+    // Amplitude(s) for diagram number 102
+    FFV1_0( w_fp[42], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 102 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram103( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 103 OF 1240 ***
+    // Wavefunction(s) for diagram number 103
+    // (none)
+    // Amplitude(s) for diagram number 103
+    FFV1_0( w_fp[41], w_fp[2], w_fp[26], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 103 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram104( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 104 OF 1240 ***
+    // Wavefunction(s) for diagram number 104
+    FFV1_2( w_fp[3], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[26] );
+    // Amplitude(s) for diagram number 104
+    FFV1_0( w_fp[26], w_fp[52], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 104 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram105( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 105 OF 1240 ***
+    // Wavefunction(s) for diagram number 105
+    VVV1P0_1( w_fp[24], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[42] );
+    // Amplitude(s) for diagram number 105
+    FFV1_0( w_fp[3], w_fp[52], w_fp[42], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 105 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram106( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 106 OF 1240 ***
+    // Wavefunction(s) for diagram number 106
+    FFV1_1( w_fp[2], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[17] );
+    // Amplitude(s) for diagram number 106
+    FFV1_0( w_fp[34], w_fp[17], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 106 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram107( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 107 OF 1240 ***
+    // Wavefunction(s) for diagram number 107
+    // (none)
+    // Amplitude(s) for diagram number 107
+    FFV1_0( w_fp[34], w_fp[2], w_fp[42], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 107 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram108( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 108 OF 1240 ***
+    // Wavefunction(s) for diagram number 108
+    // (none)
+    // Amplitude(s) for diagram number 108
+    FFV1_0( w_fp[3], w_fp[17], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 108 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
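Editorial note: the long runs of J_ACCESS::kernelAccessIcol updates above scatter each amplitude into several color flows of jamps, with relative signs and factors of i dictated by the color decomposition. In scalar form the bookkeeping reduces to the sketch below, which uses plain std::complex instead of the vectorized cxtype accessors (an illustrative assumption, not the plugin's actual access machinery):

  #include <complex>
  using cx = std::complex<double>;
  // Add one amplitude into the jamp of color flow icol with coefficient coeff,
  // where coeff is +1, -1, +i or -i depending on the color matrix entry;
  // e.g. the diagram 107 pattern gives jamp[64] -i*amp, jamp[65] +i*amp, etc.
  inline void addAmpToColorFlow( cx* jamp, int icol, const cx& coeff, const cx& amp )
  {
    jamp[icol] += coeff * amp;
  }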
+  __global__ void
+  diagram109( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 109 OF 1240 ***
+    // Wavefunction(s) for diagram number 109
+    // (none)
+    // Amplitude(s) for diagram number 109
+    FFV1_0( w_fp[26], w_fp[2], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 109 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram110( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 110 OF 1240 ***
+    // Wavefunction(s) for diagram number 110
+    FFV1_2( w_fp[3], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] );
+    // Amplitude(s) for diagram number 110
+    FFV1_0( w_fp[14], w_fp[52], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 110 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram111( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 111 OF 1240 ***
+    // Wavefunction(s) for diagram number 111
+    VVV1P0_1( w_fp[27], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[16] );
+    // Amplitude(s) for diagram number 111
+    FFV1_0( w_fp[3], w_fp[52], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 111 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram112( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 112 OF 1240 ***
+    // Wavefunction(s) for diagram number 112
+    FFV1_1( w_fp[2], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[15] );
+    // Amplitude(s) for diagram number 112
+    FFV1_0( w_fp[34], w_fp[15], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 112 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram113( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 113 OF 1240 ***
+    // Wavefunction(s) for diagram number 113
+    // (none)
+    // Amplitude(s) for diagram number 113
+    FFV1_0( w_fp[34], w_fp[2], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 113 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram114( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 114 OF 1240 ***
+    // Wavefunction(s) for diagram number 114
+    // (none)
+    // Amplitude(s) for diagram number 114
+    FFV1_0( w_fp[3], w_fp[15], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 114 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram115( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 115 OF 1240 ***
+    // Wavefunction(s) for diagram number 115
+    // (none)
+    // Amplitude(s) for diagram number 115
+    FFV1_0( w_fp[14], w_fp[2], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 115 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
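Editorial note: the two-line #ifdef MGONGPU_SUPPORTS_MULTICHANNEL block repeated in (almost) every kernel implements single-diagram-enhancement bookkeeping: |amp|^2 is added to the numerator only when this diagram is the selected channel, and to the denominator for every contributing channel, with channelId == 0 disabling SDE altogether. A scalar sketch of the same logic (the helper names are hypothetical; cxabs2 is assumed to mean |z|^2, consistent with its use above):

  #include <complex>
  inline double cxabs2( const std::complex<double>& z ) { return std::norm( z ); } // |z|^2
  // Per-amplitude SDE update: num gets |amp|^2 for the selected channel only,
  // den gets |amp|^2 for every single-diagram channel; channelId == 0 disables SDE.
  inline void addToMultiChannel( unsigned int channelId, unsigned int thisDiagram,
                                 const std::complex<double>& amp, double& num, double& den )
  {
    if( channelId == thisDiagram ) num += cxabs2( amp );
    if( channelId != 0 ) den += cxabs2( amp );
  }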
+  __global__ void
+  diagram116( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 116 OF 1240 ***
+    // Wavefunction(s) for diagram number 116
+    FFV1_2( w_fp[3], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] );
+    // Amplitude(s) for diagram number 116
+    FFV1_0( w_fp[12], w_fp[52], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 116 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram117( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 117 OF 1240 ***
+    // Wavefunction(s) for diagram number 117
+    VVV1P0_1( w_fp[4], w_fp[29], COUPs[0], 1.0, 0., 0., w_fp[19] );
+    // Amplitude(s) for diagram number 117
+    FFV1_0( w_fp[3], w_fp[52], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 117 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram118( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 118 OF 1240 ***
+    // Wavefunction(s) for diagram number 118
+    FFV1_1( w_fp[2], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[18] );
+    // Amplitude(s) for diagram number 118
+    FFV1_0( w_fp[34], w_fp[18], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 118 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram119( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 119 OF 1240 ***
+    // Wavefunction(s) for diagram number 119
+    // (none)
+    // Amplitude(s) for diagram number 119
+    FFV1_0( w_fp[34], w_fp[2], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 119 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram120( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 120 OF 1240 ***
+    // Wavefunction(s) for diagram number 120
+    // (none)
+    // Amplitude(s) for diagram number 120
+    FFV1_0( w_fp[3], w_fp[18], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 120 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram121( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 121 OF 1240 ***
+    // Wavefunction(s) for diagram number 121
+    // (none)
+    // Amplitude(s) for diagram number 121
+    FFV1_0( w_fp[12], w_fp[2], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 121 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram122( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 122 OF 1240 ***
+    // Wavefunction(s) for diagram number 122
+    // (none)
+    // Amplitude(s) for diagram number 122
+    FFV1_0( w_fp[3], w_fp[52], w_fp[30], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[52], w_fp[31], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[52], w_fp[32], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram123( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 123 OF 1240 ***
+    // Wavefunction(s) for diagram number 123
+    // (none)
+    // Amplitude(s) for diagram number 123
+    FFV1_0( w_fp[34], w_fp[2], w_fp[30], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[34], w_fp[2], w_fp[31], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[34], w_fp[2], w_fp[32], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram124( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all
events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 124 OF 1240 *** + // Wavefunction(s) for diagram number 124 + FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[34] ); + FFV1_2( w_fp[3], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[52] ); + FFV1_1( w_fp[34], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); + FFV1_2( w_fp[52], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[22] ); + // Amplitude(s) for diagram number 124 + FFV1_0( w_fp[22], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 124 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram125( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 125 OF 1240 *** + // Wavefunction(s) for diagram number 125 + FFV1_2( w_fp[52], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] ); + // Amplitude(s) for diagram number 125 + FFV1_0( w_fp[21], w_fp[9], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 125 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram126( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add 
helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 126 OF 1240 *** + // Wavefunction(s) for diagram number 126 + FFV1_1( w_fp[34], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[55] ); + FFV1_2( w_fp[52], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[56] ); + // Amplitude(s) for diagram number 126 + FFV1_0( w_fp[56], w_fp[55], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 126 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram127( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 127 OF 1240 *** + // Wavefunction(s) for diagram number 127 + // (none) + // Amplitude(s) for diagram number 127 + FFV1_0( w_fp[21], w_fp[55], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 127 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram128( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + 
// *** DIAGRAM 128 OF 1240 *** + // Wavefunction(s) for diagram number 128 + FFV1_1( w_fp[34], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[57] ); + // Amplitude(s) for diagram number 128 + FFV1_0( w_fp[56], w_fp[57], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 128 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram129( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 129 OF 1240 *** + // Wavefunction(s) for diagram number 129 + // (none) + // Amplitude(s) for diagram number 129 + FFV1_0( w_fp[22], w_fp[57], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 129 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram130( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 130 OF 1240 *** + // Wavefunction(s) for diagram number 130 + FFV1P0_3( w_fp[52], w_fp[34], COUPs[1], 1.0, 0., 0., w_fp[58] ); + // Amplitude(s) for diagram number 130 + VVV1_0( w_fp[24], w_fp[6], w_fp[58], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 130 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + 
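+  // Note: a minimal sketch of what "diagram_boilerplate.h" could define, inferred only from its
+  // usage in the diagramXXX kernels (the names and accessors below are assumptions, not the real header):
+  //   fptype amp_fp[2];                 // scratch buffer for the current amplitude
+  //   cxtype_sv* amp_sv = ...;          // complex (vector) view of amp_fp
+  //   unsigned int channelId = ...;     // from channelIds: per-event on GPU, scalar channelIds[0] in C++
+  //   fptype_sv& numerators_sv = ...;   // multichannel numerator slot for this event (page)
+  //   fptype_sv& denominators_sv = ...; // multichannel denominator slot for this event (page)
+  // and, #ifndef MGONGPU_SUPPORTS_MULTICHANNEL, an assert that channelIds, numerators and
+  // denominators are all nullptr, as the comment repeated in each kernel states.
+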
+  __global__ void
+  diagram131( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 131 OF 1240 ***
+    // Wavefunction(s) for diagram number 131
+    FFV1_1( w_fp[34], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[59] );
+    // Amplitude(s) for diagram number 131
+    FFV1_0( w_fp[52], w_fp[59], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 131 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram132( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 132 OF 1240 ***
+    // Wavefunction(s) for diagram number 132
+    // (none)
+    // Amplitude(s) for diagram number 132
+    FFV1_0( w_fp[52], w_fp[57], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 132 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram133( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 133 OF 1240 ***
+    // Wavefunction(s) for diagram number 133
+    // (none)
+    // Amplitude(s) for diagram number 133
+    VVV1_0( w_fp[27], w_fp[5], w_fp[58], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 133 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram134( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 134 OF 1240 ***
+    // Wavefunction(s) for diagram number 134
+    FFV1_1( w_fp[34], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[60] );
+    // Amplitude(s) for diagram number 134
+    FFV1_0( w_fp[52], w_fp[60], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 134 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram135( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 135 OF 1240 ***
+    // Wavefunction(s) for diagram number 135
+    // (none)
+    // Amplitude(s) for diagram number 135
+    FFV1_0( w_fp[52], w_fp[55], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 135 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram136( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 136 OF 1240 ***
+    // Wavefunction(s) for diagram number 136
+    // (none)
+    // Amplitude(s) for diagram number 136
+    VVV1_0( w_fp[4], w_fp[29], w_fp[58], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 136 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram137( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 137 OF 1240 ***
+    // Wavefunction(s) for diagram number 137
+    // (none)
+    // Amplitude(s) for diagram number 137
+    FFV1_0( w_fp[52], w_fp[9], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 137 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram138( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 138 OF 1240 ***
+    // Wavefunction(s) for diagram number 138
+    FFV1_1( w_fp[34], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[58] );
+    // Amplitude(s) for diagram number 138
+    FFV1_0( w_fp[52], w_fp[58], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 138 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram139( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 139 OF 1240 ***
+    // Wavefunction(s) for diagram number 139
+    // (none)
+    // Amplitude(s) for diagram number 139
+    FFV1_0( w_fp[52], w_fp[34], w_fp[30], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    FFV1_0( w_fp[52], w_fp[34], w_fp[31], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+    FFV1_0( w_fp[52], w_fp[34], w_fp[32], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram140( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 140 OF 1240 ***
+    // Wavefunction(s) for diagram number 140
+    VVV1P0_1( w_fp[1], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[61] );
+    FFV1P0_3( w_fp[3], w_fp[34], COUPs[1], 1.0, 0., 0., w_fp[62] );
+    VVV1P0_1( w_fp[61], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[63] );
+    // Amplitude(s) for diagram number 140
+    VVV1_0( w_fp[62], w_fp[63], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 140 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram141( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 141 OF 1240 ***
+    // Wavefunction(s) for diagram number 141
+    VVV1P0_1( w_fp[61], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[64] );
+    // Amplitude(s) for diagram number 141
+    VVV1_0( w_fp[62], w_fp[64], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 141 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram142( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 142 OF 1240 ***
+    // Wavefunction(s) for diagram number 142
+    // (none)
+    // Amplitude(s) for diagram number 142
+    VVVV1_0( w_fp[61], w_fp[5], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVVV3_0( w_fp[61], w_fp[5], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVVV4_0( w_fp[61], w_fp[5], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram143( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 143 OF 1240 ***
+    // Wavefunction(s) for diagram number 143
+    FFV1_2( w_fp[3], w_fp[61], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[65] );
+    // Amplitude(s) for diagram number 143
+    FFV1_0( w_fp[65], w_fp[55], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 143 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
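+  // Note: the recurring multichannel pattern in these kernels,
+  //   if( channelId == N ) numerators_sv += cxabs2( amp_sv[0] );
+  //   if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+  // adds |amp|^2 of diagram N to the numerator only when N is the selected channel, and to the
+  // denominator for every contributing diagram; the single-diagram enhancement weight per event
+  // is then numerators_sv / denominators_sv (illustrative, computed downstream, not in this file).
+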
+  __global__ void
+  diagram144( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 144 OF 1240 ***
+    // Wavefunction(s) for diagram number 144
+    // (none)
+    // Amplitude(s) for diagram number 144
+    FFV1_0( w_fp[3], w_fp[55], w_fp[64], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 144 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram145( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 145 OF 1240 ***
+    // Wavefunction(s) for diagram number 145
+    // (none)
+    // Amplitude(s) for diagram number 145
+    FFV1_0( w_fp[65], w_fp[57], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 145 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram146( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 146 OF 1240 ***
+    // Wavefunction(s) for diagram number 146
+    // (none)
+    // Amplitude(s) for diagram number 146
+    FFV1_0( w_fp[3], w_fp[57], w_fp[63], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 146 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram147( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 147 OF 1240 ***
+    // Wavefunction(s) for diagram number 147
+    FFV1_1( w_fp[34], w_fp[61], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[66] );
+    // Amplitude(s) for diagram number 147
+    FFV1_0( w_fp[38], w_fp[66], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 147 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram148( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 148 OF 1240 ***
+    // Wavefunction(s) for diagram number 148
+    FFV1P0_3( w_fp[38], w_fp[34], COUPs[1], 1.0, 0., 0., w_fp[67] );
+    // Amplitude(s) for diagram number 148
+    VVV1_0( w_fp[61], w_fp[6], w_fp[67], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 148 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram149( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 149 OF 1240 ***
+    // Wavefunction(s) for diagram number 149
+    // (none)
+    // Amplitude(s) for diagram number 149
+    FFV1_0( w_fp[38], w_fp[57], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 149 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram150( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 150 OF 1240 ***
+    // Wavefunction(s) for diagram number 150
+    // (none)
+    // Amplitude(s) for diagram number 150
+    FFV1_0( w_fp[41], w_fp[66], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 150 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram151( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 151 OF 1240 ***
+    // Wavefunction(s) for diagram number 151
+    FFV1P0_3( w_fp[41], w_fp[34], COUPs[1], 1.0, 0., 0., w_fp[68] );
+    // Amplitude(s) for diagram number 151
+    VVV1_0( w_fp[61], w_fp[5], w_fp[68], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 151 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram152( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 152 OF 1240 ***
+    // Wavefunction(s) for diagram number 152
+    // (none)
+    // Amplitude(s) for diagram number 152
+    FFV1_0( w_fp[41], w_fp[55], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 152 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram153( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 153 OF 1240 ***
+    // Wavefunction(s) for diagram number 153
+    // (none)
+    // Amplitude(s) for diagram number 153
+    FFV1_0( w_fp[3], w_fp[66], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 153 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram154( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 154 OF 1240 ***
+    // Wavefunction(s) for diagram number 154
+    // (none)
+    // Amplitude(s) for diagram number 154
+    VVV1_0( w_fp[61], w_fp[29], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 154 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram155( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 155 OF 1240 ***
+    // Wavefunction(s) for diagram number 155
+    // (none)
+    // Amplitude(s) for diagram number 155
+    FFV1_0( w_fp[3], w_fp[58], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 155 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram156( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 156 OF 1240 ***
+    // Wavefunction(s) for diagram number 156
+    VVV1P0_1( w_fp[1], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[66] );
+    VVV1P0_1( w_fp[66], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[69] );
+    // Amplitude(s) for diagram number 156
+    VVV1_0( w_fp[62], w_fp[69], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 156 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram157( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 157 OF 1240 ***
+    // Wavefunction(s) for diagram number 157
+    VVV1P0_1( w_fp[66], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[70] );
+    // Amplitude(s) for diagram number 157
+    VVV1_0( w_fp[62], w_fp[70], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 157 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram158( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) //
input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 158 OF 1240 *** + // Wavefunction(s) for diagram number 158 + // (none) + // Amplitude(s) for diagram number 158 + VVVV1_0( w_fp[66], w_fp[4], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVVV3_0( w_fp[66], w_fp[4], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVVV4_0( w_fp[66], w_fp[4], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram159( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** 
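Every diagramXXX body above includes "diagram_boilerplate.h", whose contents are outside this hunk. Purely for orientation, a minimal sketch of what such a header plausibly provides, inferred from the names the generated bodies rely on (w_fp, amp_fp/amp_sv, channelId, numerators_sv, denominators_sv) and from the nullptr sanity check mentioned in the comments; every detail below is an assumption, not the plugin's actual header:

    // Sketch only - assumed shape of diagram_boilerplate.h, not the real header.
    // (a w_fp array of per-wavefunction views into the wfs buffer would also be set up here)
    fptype amp_fp[2 * neppV];                                   // scratch for one complex amplitude on one event page
    cxtype_sv* amp_sv = reinterpret_cast<cxtype_sv*>( amp_fp ); // complex-vector view of the same scratch buffer
    #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
    const unsigned int channelId = ( channelIds ? channelIds[0] : 0 );           // 0 disables single-diagram enhancement
    fptype_sv& numerators_sv = *reinterpret_cast<fptype_sv*>( numerators );      // SDE numerator accumulator
    fptype_sv& denominators_sv = *reinterpret_cast<fptype_sv*>( denominators );  // SDE denominator accumulator
    #else
    assert( channelIds == nullptr && numerators == nullptr && denominators == nullptr ); // uniform interface, multichannel off
    #endif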
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram159( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 159 OF 1240 ***
+    // Wavefunction(s) for diagram number 159
+    FFV1_2( w_fp[3], w_fp[66], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[71] );
+    // Amplitude(s) for diagram number 159
+    FFV1_0( w_fp[71], w_fp[9], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 159 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram160( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 160 OF 1240 ***
+    // Wavefunction(s) for diagram number 160
+    // (none)
+    // Amplitude(s) for diagram number 160
+    FFV1_0( w_fp[3], w_fp[9], w_fp[70], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 160 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram161( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 161 OF 1240 ***
+    // Wavefunction(s) for diagram number 161
+    // (none)
+    // Amplitude(s) for diagram number 161
+    FFV1_0( w_fp[71], w_fp[57], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 161 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram162( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 162 OF 1240 ***
+    // Wavefunction(s) for diagram number 162
+    // (none)
+    // Amplitude(s) for diagram number 162
+    FFV1_0( w_fp[3], w_fp[57], w_fp[69], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 162 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram163( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 163 OF 1240 ***
+    // Wavefunction(s) for diagram number 163
+    FFV1_1( w_fp[34], w_fp[66], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[72] );
+    // Amplitude(s) for diagram number 163
+    FFV1_0( w_fp[46], w_fp[72], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 163 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram164( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 164 OF 1240 ***
+    // Wavefunction(s) for diagram number 164
+    FFV1P0_3( w_fp[46], w_fp[34], COUPs[1], 1.0, 0., 0., w_fp[73] );
+    // Amplitude(s) for diagram number 164
+    VVV1_0( w_fp[66], w_fp[6], w_fp[73], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 164 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram165( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 165 OF 1240 ***
+    // Wavefunction(s) for diagram number 165
+    // (none)
+    // Amplitude(s) for diagram number 165
+    FFV1_0( w_fp[46], w_fp[57], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 165 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram166( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 166 OF 1240 ***
+    // Wavefunction(s) for diagram number 166
+    // (none)
+    // Amplitude(s) for diagram number 166
+    FFV1_0( w_fp[41], w_fp[72], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 166 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram167( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 167 OF 1240 ***
+    // Wavefunction(s) for diagram number 167
+    // (none)
+    // Amplitude(s) for diagram number 167
+    VVV1_0( w_fp[66], w_fp[4], w_fp[68], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 167 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram168( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 168 OF 1240 ***
+    // Wavefunction(s) for diagram number 168
+    // (none)
+    // Amplitude(s) for diagram number 168
+    FFV1_0( w_fp[41], w_fp[9], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 168 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram169( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 169 OF 1240 ***
+    // Wavefunction(s) for diagram number 169
+    // (none)
+    // Amplitude(s) for diagram number 169
+    FFV1_0( w_fp[3], w_fp[72], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 169 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram170( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 170 OF 1240 ***
+    // Wavefunction(s) for diagram number 170
+    // (none)
+    // Amplitude(s) for diagram number 170
+    VVV1_0( w_fp[66], w_fp[27], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 170 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram171( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 171 OF 1240 ***
+    // Wavefunction(s) for diagram number 171
+    // (none)
+    // Amplitude(s) for diagram number 171
+    FFV1_0( w_fp[3], w_fp[60], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 171 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram172( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 172 OF 1240 ***
+    // Wavefunction(s) for diagram number 172
+    VVV1P0_1( w_fp[1], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[72] );
+    VVV1P0_1( w_fp[72], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[74] );
+    // Amplitude(s) for diagram number 172
+    VVV1_0( w_fp[62], w_fp[74], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 172 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
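The two multichannel lines repeated in each kernel implement single-diagram enhancement (SDE): |amp|^2 is added to numerators_sv only when the diagram is the selected channel, and to denominators_sv for every diagram (channelId != 0). Once all 1240 diagram kernels have run, the per-event channel weight is the ratio of the two accumulators. A sketch of that final combination, with a hypothetical helper name (sdeChannelWeight is not a function of this codebase):

    // Hypothetical post-processing sketch: w_c = |A_c|^2 / sum_d |A_d|^2
    inline fptype sdeChannelWeight( const fptype numerator, const fptype denominator )
    {
      return ( denominator != 0 ? numerator / denominator : 0 ); // guard: no diagram contributed to this event
    }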
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram173( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 173 OF 1240 ***
+    // Wavefunction(s) for diagram number 173
+    VVV1P0_1( w_fp[72], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[75] );
+    // Amplitude(s) for diagram number 173
+    VVV1_0( w_fp[62], w_fp[75], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 173 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram174( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 174 OF 1240 ***
+    // Wavefunction(s) for diagram number 174
+    // (none)
+    // Amplitude(s) for diagram number 174
+    VVVV1_0( w_fp[72], w_fp[4], w_fp[5], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVVV3_0( w_fp[72], w_fp[4], w_fp[5], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVVV4_0( w_fp[72], w_fp[4], w_fp[5], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram175( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 175 OF 1240 ***
+    // Wavefunction(s) for diagram number 175
+    FFV1_2( w_fp[3], w_fp[72], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[76] );
+    // Amplitude(s) for diagram number 175
+    FFV1_0( w_fp[76], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 175 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram176( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 176 OF 1240 ***
+    // Wavefunction(s) for diagram number 176
+    // (none)
+    // Amplitude(s) for diagram number 176
+    FFV1_0( w_fp[3], w_fp[9], w_fp[75], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 176 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram177( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 177 OF 1240 ***
+    // Wavefunction(s) for diagram number 177
+    // (none)
+    // Amplitude(s) for diagram number 177
+    FFV1_0( w_fp[76], w_fp[55], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 177 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram178( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 178 OF 1240 ***
+    // Wavefunction(s) for diagram number 178
+    // (none)
+    // Amplitude(s) for diagram number 178
+    FFV1_0( w_fp[3], w_fp[55], w_fp[74], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 178 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram179( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 179 OF 1240 ***
+    // Wavefunction(s) for diagram number 179
+    FFV1_1( w_fp[34], w_fp[72], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[77] );
+    // Amplitude(s) for diagram number 179
+    FFV1_0( w_fp[46], w_fp[77], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 179 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram180( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 180 OF 1240 ***
+    // Wavefunction(s) for diagram number 180
+    // (none)
+    // Amplitude(s) for diagram number 180
+    VVV1_0( w_fp[72], w_fp[5], w_fp[73], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 180 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+  }
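Each kernel folds its amplitude into the color-flow amplitudes jamps with a coefficient of +-1 or +-i (the cxtype( 0, 1 ) factors), addressed per color index through J_ACCESS::kernelAccessIcol. The accessor's real definition lives elsewhere in the plugin; purely as an illustration of the assumed layout (ncolor complex vectors per event page), one plausible shape is:

    // Illustration only - the plugin's real J_ACCESS may use a different (e.g. AOSOA) layout.
    struct J_ACCESS_sketch
    {
      static __host__ __device__ cxtype_sv& kernelAccessIcol( fptype* jamps, const int icol )
      {
        return reinterpret_cast<cxtype_sv*>( jamps )[icol]; // slot for color-flow index icol
      }
    };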
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram181( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 181 OF 1240 ***
+    // Wavefunction(s) for diagram number 181
+    // (none)
+    // Amplitude(s) for diagram number 181
+    FFV1_0( w_fp[46], w_fp[55], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 181 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram182( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 182 OF 1240 ***
+    // Wavefunction(s) for diagram number 182
+    // (none)
+    // Amplitude(s) for diagram number 182
+    FFV1_0( w_fp[38], w_fp[77], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 182 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram183( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 183 OF 1240 ***
+    // Wavefunction(s) for diagram number 183
+    // (none)
+    // Amplitude(s) for diagram number 183
+    VVV1_0( w_fp[72], w_fp[4], w_fp[67], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 183 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram184( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 184 OF 1240 ***
+    // Wavefunction(s) for diagram number 184
+    // (none)
+    // Amplitude(s) for diagram number 184
+    FFV1_0( w_fp[38], w_fp[9], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 184 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram185( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 185 OF 1240 ***
+    // Wavefunction(s) for diagram number 185
+    // (none)
+    // Amplitude(s) for diagram number 185
+    FFV1_0( w_fp[3], w_fp[77], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 185 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram186( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 186 OF 1240 ***
+    // Wavefunction(s) for diagram number 186
+    // (none)
+    // Amplitude(s) for diagram number 186
+    VVV1_0( w_fp[72], w_fp[24], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 186 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram187( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 187 OF 1240 ***
+    // Wavefunction(s) for diagram number 187
+    // (none)
+    // Amplitude(s) for diagram number 187
+    FFV1_0( w_fp[3], w_fp[59], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 187 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram188( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 188 OF 1240 ***
+    // Wavefunction(s) for diagram number 188
+    FFV1_1( w_fp[34], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[77] );
+    // Amplitude(s) for diagram number 188
+    FFV1_0( w_fp[7], w_fp[77], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 188 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram189( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 189 OF 1240 ***
+    // Wavefunction(s) for diagram number 189
+    // (none)
+    // Amplitude(s) for diagram number 189
+    FFV1_0( w_fp[53], w_fp[77], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 189 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+  }
jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 190 OF 1240 *** + // Wavefunction(s) for diagram number 190 + FFV1_2( w_fp[46], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[78] ); + // Amplitude(s) for diagram number 190 + FFV1_0( w_fp[78], w_fp[55], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 190 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram191( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 191 OF 1240 *** + // Wavefunction(s) for diagram number 191 + // (none) + // Amplitude(s) for diagram number 191 + FFV1_0( w_fp[53], w_fp[55], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 191 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram192( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel 
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 192 OF 1240 *** + // Wavefunction(s) for diagram number 192 + // (none) + // Amplitude(s) for diagram number 192 + FFV1_0( w_fp[78], w_fp[57], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 192 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram193( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 193 OF 1240 *** + // Wavefunction(s) for diagram number 193 + // (none) + // Amplitude(s) for diagram number 193 + FFV1_0( w_fp[7], w_fp[57], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 193 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram194( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 194 OF 1240 *** + // Wavefunction(s) for diagram number 194 + // (none) + // Amplitude(s) for diagram number 194 + FFV1_0( w_fp[46], 
w_fp[77], w_fp[29], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 194 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram195( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 195 OF 1240 *** + // Wavefunction(s) for diagram number 195 + // (none) + // Amplitude(s) for diagram number 195 + VVV1_0( w_fp[1], w_fp[29], w_fp[73], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 195 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram196( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 196 OF 1240 *** + // Wavefunction(s) for diagram number 196 + // (none) + // Amplitude(s) for diagram number 196 + FFV1_0( w_fp[46], w_fp[58], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 196 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram197( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 197 OF 1240 *** + // Wavefunction(s) for diagram number 197 + // (none) + // Amplitude(s) for diagram number 197 + FFV1_0( w_fp[25], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 197 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram198( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 198 OF 1240 *** + // Wavefunction(s) for diagram number 198 + // (none) + // Amplitude(s) for diagram number 198 + FFV1_0( w_fp[48], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 198 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram199( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent 
couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 199 OF 1240 *** + // Wavefunction(s) for diagram number 199 + FFV1_2( w_fp[38], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[58] ); + // Amplitude(s) for diagram number 199 + FFV1_0( w_fp[58], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 199 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram200( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 200 OF 1240 *** + // Wavefunction(s) for diagram number 200 + // (none) + // Amplitude(s) for diagram number 200 + FFV1_0( w_fp[48], w_fp[9], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 200 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram201( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef 
MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 201 OF 1240 *** + // Wavefunction(s) for diagram number 201 + // (none) + // Amplitude(s) for diagram number 201 + FFV1_0( w_fp[58], w_fp[57], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 201 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram202( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 202 OF 1240 *** + // Wavefunction(s) for diagram number 202 + // (none) + // Amplitude(s) for diagram number 202 + FFV1_0( w_fp[25], w_fp[57], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 202 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram203( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 203 OF 1240 *** + // Wavefunction(s) for diagram number 203 + // (none) + // Amplitude(s) for diagram number 203 + FFV1_0( w_fp[38], w_fp[77], w_fp[27], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 203 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif
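+ // (Editorial note, not part of the generated code: the #ifdef MGONGPU_SUPPORTS_MULTICHANNEL blocks
+ // implement the bookkeeping for single-diagram-enhanced multichannel sampling: the squared amplitude
+ // of the event's selected channel is added to numerators_sv, while the squared amplitude of every
+ // contributing diagram is added to denominators_sv. Downstream, the matrix element is effectively
+ // reweighted by their ratio; schematically, for one event and with hypothetical scalar names:
+ //   fptype sdeWeight = numerator / denominator; // single-diagram-enhancement channel weight
+ // The actual combination is performed only after all diagram kernels have been executed.)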
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram204( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 204 OF 1240 *** + // Wavefunction(s) for diagram number 204 + // (none) + // Amplitude(s) for diagram number 204 + VVV1_0( w_fp[1], w_fp[27], w_fp[67], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 204 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram205( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 205 OF 1240 *** + // Wavefunction(s) for diagram number 205 + // (none) + // Amplitude(s) for diagram number 205 + FFV1_0( w_fp[38], w_fp[60], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 205 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram206( fptype* wfs, // 
input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 206 OF 1240 *** + // Wavefunction(s) for diagram number 206 + // (none) + // Amplitude(s) for diagram number 206 + FFV1_0( w_fp[28], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 206 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram207( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 207 OF 1240 *** + // Wavefunction(s) for diagram number 207 + // (none) + // Amplitude(s) for diagram number 207 + FFV1_0( w_fp[40], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 207 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram208( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add 
helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 208 OF 1240 *** + // Wavefunction(s) for diagram number 208 + FFV1_2( w_fp[41], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[60] ); + // Amplitude(s) for diagram number 208 + FFV1_0( w_fp[60], w_fp[9], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 208 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram209( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 209 OF 1240 *** + // Wavefunction(s) for diagram number 209 + // (none) + // Amplitude(s) for diagram number 209 + FFV1_0( w_fp[40], w_fp[9], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 209 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram210( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 210 OF 1240 *** + // Wavefunction(s) for diagram number 210 + 
// (none) + // Amplitude(s) for diagram number 210 + FFV1_0( w_fp[60], w_fp[55], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 210 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram211( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 211 OF 1240 *** + // Wavefunction(s) for diagram number 211 + // (none) + // Amplitude(s) for diagram number 211 + FFV1_0( w_fp[28], w_fp[55], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 211 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram212( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 212 OF 1240 *** + // Wavefunction(s) for diagram number 212 + // (none) + // Amplitude(s) for diagram number 212 + FFV1_0( w_fp[41], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 212 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + 
diagram213( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 213 OF 1240 *** + // Wavefunction(s) for diagram number 213 + // (none) + // Amplitude(s) for diagram number 213 + VVV1_0( w_fp[1], w_fp[24], w_fp[68], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 213 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram214( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 214 OF 1240 *** + // Wavefunction(s) for diagram number 214 + // (none) + // Amplitude(s) for diagram number 214 + FFV1_0( w_fp[41], w_fp[59], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 214 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram215( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef 
MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 215 OF 1240 *** + // Wavefunction(s) for diagram number 215 + // (none) + // Amplitude(s) for diagram number 215 + FFV1_0( w_fp[26], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 215 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram216( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 216 OF 1240 *** + // Wavefunction(s) for diagram number 216 + // (none) + // Amplitude(s) for diagram number 216 + FFV1_0( w_fp[3], w_fp[77], w_fp[42], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 216 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram217( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel 
numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 217 OF 1240 *** + // Wavefunction(s) for diagram number 217 + VVV1P0_1( w_fp[1], w_fp[24], COUPs[0], 1.0, 0., 0., w_fp[59] ); + // Amplitude(s) for diagram number 217 + VVV1_0( w_fp[62], w_fp[59], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 217 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram218( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 218 OF 1240 *** + // Wavefunction(s) for diagram number 218 + // (none) + // Amplitude(s) for diagram number 218 + VVV1_0( w_fp[62], w_fp[1], w_fp[42], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 218 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
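+ // (Editorial note, not part of the generated code: every diagramXXX kernel begins by including
+ // "diagram_boilerplate.h", which is not shown in this patch. A plausible minimal sketch of what it
+ // provides, with hypothetical details that only the real header can confirm, is:
+ //   #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ //   const unsigned int channelId = ...;   // per-event channelIds[ievt] on GPU, scalar channelIds[0] in C++
+ //   fptype_sv& numerators_sv = ...;       // event-page view of the numerators buffer
+ //   fptype_sv& denominators_sv = ...;     // event-page view of the denominators buffer
+ //   #else
+ //   assert( channelIds == nullptr && numerators == nullptr && denominators == nullptr );
+ //   #endif
+ //   fptype amp_fp[...];                   // local buffer for this diagram's amplitude
+ //   cxtype_sv* amp_sv = reinterpret_cast<cxtype_sv*>( amp_fp ); // complex view of amp_fp
+ // Note also that diagram219 below updates no numerators or denominators, presumably because its three
+ // VVVV amplitudes come from a four-point vertex, which is not used as a single-diagram-enhancement channel.)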
__global__ void + diagram219( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 219 OF 1240 *** + // Wavefunction(s) for diagram number 219 + // (none) + // Amplitude(s) for diagram number 219 + VVVV1_0( w_fp[1], w_fp[24], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0]; + VVVV3_0( w_fp[1], w_fp[24], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVVV4_0( w_fp[1], w_fp[24], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram220( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent
couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 220 OF 1240 *** + // Wavefunction(s) for diagram number 220 + // (none) + // Amplitude(s) for diagram number 220 + FFV1_0( w_fp[3], w_fp[57], w_fp[59], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 220 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram221( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 221 OF 1240 *** + // Wavefunction(s) for diagram number 221 + // (none) + // Amplitude(s) for diagram number 221 + FFV1_0( w_fp[26], w_fp[57], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 221 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram222( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // 
input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 222 OF 1240 *** + // Wavefunction(s) for diagram number 222 + // (none) + // Amplitude(s) for diagram number 222 + FFV1_0( w_fp[14], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 222 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram223( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 223 OF 1240 *** + // Wavefunction(s) for diagram number 223 + // (none) + // Amplitude(s) for diagram number 223 + FFV1_0( w_fp[3], w_fp[77], w_fp[16], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 223 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram224( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three 
+  __global__ void
+  diagram224( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 224 OF 1240 ***
+    // Wavefunction(s) for diagram number 224
+    VVV1P0_1( w_fp[1], w_fp[27], COUPs[0], 1.0, 0., 0., w_fp[68] );
+    // Amplitude(s) for diagram number 224
+    VVV1_0( w_fp[62], w_fp[68], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 224 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram225( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 225 OF 1240 ***
+    // Wavefunction(s) for diagram number 225
+    // (none)
+    // Amplitude(s) for diagram number 225
+    VVV1_0( w_fp[62], w_fp[1], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 225 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram226( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 226 OF 1240 ***
+    // Wavefunction(s) for diagram number 226
+    // (none)
+    // Amplitude(s) for diagram number 226
+    VVVV1_0( w_fp[1], w_fp[27], w_fp[5], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVVV3_0( w_fp[1], w_fp[27], w_fp[5], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVVV4_0( w_fp[1], w_fp[27], w_fp[5], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
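Diagram 226 is the first four-gluon-vertex diagram in this hunk: the VVVV1_0 / VVVV3_0 / VVVV4_0 calls evaluate one amplitude per color structure of the vertex, and each feeds a different set of jamps entries with ±i weights. Note that these multi-amplitude vertex diagrams carry no numerators/denominators block, presumably because they are not valid single-diagram-enhancement channels. The jamps accumulated by all 1240 diagrams are color-ordered amplitudes; in the usual MG5aMC decomposition the squared matrix element is assembled afterwards (outside this hunk) as the quadratic form

  |M|^2 = \sum_{i,j=1}^{n_\mathrm{color}} J_i^{*} \, C_{ij} \, J_j

where J_i is the i-th jamps entry for the event and C_{ij} is the constant color matrix.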
+  __global__ void
+  diagram227( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 227 OF 1240 ***
+    // Wavefunction(s) for diagram number 227
+    // (none)
+    // Amplitude(s) for diagram number 227
+    FFV1_0( w_fp[3], w_fp[55], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 227 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram228( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 228 OF 1240 ***
+    // Wavefunction(s) for diagram number 228
+    // (none)
+    // Amplitude(s) for diagram number 228
+    FFV1_0( w_fp[14], w_fp[55], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 228 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram229( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 229 OF 1240 ***
+    // Wavefunction(s) for diagram number 229
+    // (none)
+    // Amplitude(s) for diagram number 229
+    FFV1_0( w_fp[12], w_fp[77], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 229 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram230( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 230 OF 1240 ***
+    // Wavefunction(s) for diagram number 230
+    // (none)
+    // Amplitude(s) for diagram number 230
+    FFV1_0( w_fp[3], w_fp[77], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 230 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram231( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 231 OF 1240 ***
+    // Wavefunction(s) for diagram number 231
+    VVV1P0_1( w_fp[1], w_fp[29], COUPs[0], 1.0, 0., 0., w_fp[67] );
+    // Amplitude(s) for diagram number 231
+    VVV1_0( w_fp[62], w_fp[67], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 231 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram232( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 232 OF 1240 ***
+    // Wavefunction(s) for diagram number 232
+    // (none)
+    // Amplitude(s) for diagram number 232
+    VVV1_0( w_fp[62], w_fp[1], w_fp[19], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 232 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram233( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 233 OF 1240 ***
+    // Wavefunction(s) for diagram number 233
+    // (none)
+    // Amplitude(s) for diagram number 233
+    VVVV1_0( w_fp[1], w_fp[4], w_fp[29], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVVV3_0( w_fp[1], w_fp[4], w_fp[29], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVVV4_0( w_fp[1], w_fp[4], w_fp[29], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram234( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 234 OF 1240 ***
+    // Wavefunction(s) for diagram number 234
+    // (none)
+    // Amplitude(s) for diagram number 234
+    FFV1_0( w_fp[3], w_fp[9], w_fp[67], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 234 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram235( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 235 OF 1240 ***
+    // Wavefunction(s) for diagram number 235
+    // (none)
+    // Amplitude(s) for diagram number 235
+    FFV1_0( w_fp[12], w_fp[9], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 235 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram236( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 236 OF 1240 ***
+    // Wavefunction(s) for diagram number 236
+    VVVV1P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[73] );
+    VVVV3P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[79] );
+    VVVV4P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[80] );
+    // Amplitude(s) for diagram number 236
+    VVV1_0( w_fp[73], w_fp[6], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVV1_0( w_fp[79], w_fp[6], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVV1_0( w_fp[80], w_fp[6], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
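Diagram 236 computes the three four-gluon off-shell currents into w_fp[73], w_fp[79] and w_fp[80]; diagrams 237 and 238 list "Wavefunction(s): (none)" and simply reuse those slots. The kernels therefore share state through the wfs buffer and must run in diagram order, e.g. (hypothetical driver excerpt; the launch parameters and the fact that these are per-diagram launches are assumptions):

  // Sequential launches on the same CUDA stream execute in order, so the
  // shared wfs buffer is valid without explicit synchronization in between.
  diagram236<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators ); // fills w_fp[73], w_fp[79], w_fp[80]
  diagram237<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators ); // reuses them in FFV1_0( w_fp[3], w_fp[57], ... )
  diagram238<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators ); // reuses them in FFV1_0( w_fp[41], w_fp[34], ... )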
+  __global__ void
+  diagram237( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 237 OF 1240 ***
+    // Wavefunction(s) for diagram number 237
+    // (none)
+    // Amplitude(s) for diagram number 237
+    FFV1_0( w_fp[3], w_fp[57], w_fp[73], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[57], w_fp[79], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[57], w_fp[80], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram238( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 238 OF 1240 ***
+    // Wavefunction(s) for diagram number 238
+    // (none)
+    // Amplitude(s) for diagram number 238
+    FFV1_0( w_fp[41], w_fp[34], w_fp[73], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[34], w_fp[79], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[34], w_fp[80], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram239( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 239 OF 1240 ***
+    // Wavefunction(s) for diagram number 239
+    VVVV1P0_1( w_fp[1], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[57] );
+    VVVV3P0_1( w_fp[1], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[81] );
+    VVVV4P0_1( w_fp[1], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[82] );
+    // Amplitude(s) for diagram number 239
+    VVV1_0( w_fp[57], w_fp[5], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVV1_0( w_fp[81], w_fp[5], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVV1_0( w_fp[82], w_fp[5], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram240( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 240 OF 1240 ***
+    // Wavefunction(s) for diagram number 240
+    // (none)
+    // Amplitude(s) for diagram number 240
+    FFV1_0( w_fp[3], w_fp[55], w_fp[57], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[55], w_fp[81], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[55], w_fp[82], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram241( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 241 OF 1240 ***
+    // Wavefunction(s) for diagram number 241
+    // (none)
+    // Amplitude(s) for diagram number 241
+    FFV1_0( w_fp[38], w_fp[34], w_fp[57], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    FFV1_0( w_fp[38], w_fp[34], w_fp[81], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    FFV1_0( w_fp[38], w_fp[34], w_fp[82], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram242( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 242 OF 1240 ***
+    // Wavefunction(s) for diagram number 242
+    VVVV1P0_1( w_fp[1], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[55] );
+    VVVV3P0_1( w_fp[1], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[83] );
+    VVVV4P0_1( w_fp[1], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[84] );
+    // Amplitude(s) for diagram number 242
+    VVV1_0( w_fp[55], w_fp[4], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVV1_0( w_fp[83], w_fp[4], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVV1_0( w_fp[84], w_fp[4], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram243( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 243 OF 1240 ***
+    // Wavefunction(s) for diagram number 243
+    // (none)
+    // Amplitude(s) for diagram number 243
+    FFV1_0( w_fp[3], w_fp[9], w_fp[55], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[9], w_fp[83], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[9], w_fp[84], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram244( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 244 OF 1240 ***
+    // Wavefunction(s) for diagram number 244
+    // (none)
+    // Amplitude(s) for diagram number 244
+    FFV1_0( w_fp[46], w_fp[34], w_fp[55], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    FFV1_0( w_fp[46], w_fp[34], w_fp[83], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    FFV1_0( w_fp[46], w_fp[34], w_fp[84], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram245( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 245 OF 1240 ***
+    // Wavefunction(s) for diagram number 245
+    // (none)
+    // Amplitude(s) for diagram number 245
+    FFV1_0( w_fp[3], w_fp[77], w_fp[30], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[77], w_fp[31], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[77], w_fp[32], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram246( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 246 OF 1240 ***
+    // Wavefunction(s) for diagram number 246
+    // (none)
+    // Amplitude(s) for diagram number 246
+    VVV1_0( w_fp[1], w_fp[30], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVV1_0( w_fp[1], w_fp[31], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVV1_0( w_fp[1], w_fp[32], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram247( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 247 OF 1240 ***
+    // Wavefunction(s) for diagram number 247
+    FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[62] );
+    FFV1_1( w_fp[2], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[77] );
+    FFV1_2( w_fp[62], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[34] );
+    FFV1_1( w_fp[77], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] );
+    // Amplitude(s) for diagram number 247
+    FFV1_0( w_fp[34], w_fp[9], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 247 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
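Diagram 247 switches from the gluon-current topologies to fermion-line topologies, building internal fermion wavefunctions with massive propagators (mass cIPD[0], width cIPD[1], passed explicitly in the calls above). For readers new to the ALOHA naming used throughout, the trailing digit selects which leg of the Lorentz structure is taken off shell, with _0 meaning none (an amplitude); this legend summarizes the standard MG5aMC convention for reference:

  // ALOHA routine legend (standard MG5aMC convention):
  //   FFV1_0   : amplitude of the fermion-fermion-vector vertex, all inputs given
  //   FFV1_1   : off-shell fermion wavefunction for leg 1 (propagator with mass cIPD[0], width cIPD[1])
  //   FFV1_2   : off-shell fermion wavefunction for leg 2
  //   FFV1P0_3 : off-shell vector wavefunction for leg 3, with the zero mass and width ("P0") seen in diagram 253 below
  //   VVV1P0_1, VVVV1P0_1 / VVVV3P0_1 / VVVV4P0_1 : analogous off-shell gluon currents of the 3- and 4-gluon vertices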
+  __global__ void
+  diagram248( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 248 OF 1240 ***
+    // Wavefunction(s) for diagram number 248
+    FFV1_1( w_fp[77], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[85] );
+    // Amplitude(s) for diagram number 248
+    FFV1_0( w_fp[34], w_fp[85], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 248 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram249( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 249 OF 1240 ***
+    // Wavefunction(s) for diagram number 249
+    FFV1_2( w_fp[62], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[86] );
+    FFV1_1( w_fp[77], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[87] );
+    // Amplitude(s) for diagram number 249
+    FFV1_0( w_fp[86], w_fp[87], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 249 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram250( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 250 OF 1240 ***
+    // Wavefunction(s) for diagram number 250
+    // (none)
+    // Amplitude(s) for diagram number 250
+    FFV1_0( w_fp[86], w_fp[85], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 250 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram251( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef)
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 251 OF 1240 ***
+    // Wavefunction(s) for diagram number 251
+    FFV1_2( w_fp[62], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[88] );
+    // Amplitude(s) for diagram number 251
+    FFV1_0( w_fp[88], w_fp[87], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 251 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
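Across diagrams 247-251 each single-amplitude diagram keeps adding |amp|^2 to numerators_sv when it is the selected channel, and to denominators_sv whenever SDE is enabled. A sketch of how such accumulators are typically consumed downstream, based only on the comments in this hunk (meHelSum_sv and the exact hook point are assumptions, not this PR's code):

  // Hypothetical post-processing of the multichannel accumulators (illustration)
  if( channelId != 0 ) // 0 disables SDE, per the channelIds comment above
  {
    // |A_channel|^2 / sum_d |A_d|^2: the fraction of the ME attributed to the selected diagram
    meHelSum_sv *= numerators_sv / denominators_sv;
  }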
+    // Wavefunction(s) for diagram number 252
+    // (none)
+    // Amplitude(s) for diagram number 252
+    FFV1_0( w_fp[88], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 252 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram253( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 253 OF 1240 ***
+    // Wavefunction(s) for diagram number 253
+    FFV1P0_3( w_fp[62], w_fp[77], COUPs[1], 1.0, 0., 0., w_fp[89] );
+    // Amplitude(s) for diagram number 253
+    VVV1_0( w_fp[24], w_fp[6], w_fp[89], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 253 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram254( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 254 OF 1240 ***
+    // Wavefunction(s) for diagram number 254
+    FFV1_2( w_fp[62], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[90] );
+    // Amplitude(s) for diagram number 254
+    FFV1_0( w_fp[90], w_fp[77], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 254 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram255( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 255 OF 1240 ***
+    // Wavefunction(s) for diagram number 255
+    // (none)
+    // Amplitude(s) for diagram number 255
+    FFV1_0( w_fp[88], w_fp[77], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 255 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram256( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 256 OF 1240 ***
+    // Wavefunction(s) for diagram number 256
+    // (none)
+    // Amplitude(s) for diagram number 256
+    VVV1_0( w_fp[27], w_fp[5], w_fp[89], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 256 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+  }
+
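Every diagram kernel in this hunk repeats the same single-diagram-enhancement (SDE) bookkeeping: when a channel is selected (channelId != 0), the squared amplitude of the selected diagram feeds the numerator and the squared amplitude of every diagram feeds the denominator, so that their ratio can later reweight the matrix element for that channel. The following is a minimal standalone sketch of this logic, assuming plain scalar doubles and hypothetical per-diagram amplitudes; it is not the plugin's vectorized amp_sv/numerators_sv machinery.

#include <complex>
#include <iostream>
#include <vector>

using fptype = double;
using cxtype = std::complex<fptype>;

// |c|^2, playing the role of cxabs2 in the generated code
inline fptype cxabs2( const cxtype& c ) { return std::norm( c ); }

int main()
{
  const unsigned int channelId = 2; // hypothetical selected channel (1-based; 0 disables SDE)
  const std::vector<cxtype> amps = { { 1.0, 2.0 }, { 0.5, -1.0 }, { -2.0, 0.0 } }; // hypothetical per-diagram amplitudes
  fptype numerator = 0;
  fptype denominator = 0;
  for( size_t idiag = 0; idiag < amps.size(); idiag++ )
  {
    if( channelId == idiag + 1 ) numerator += cxabs2( amps[idiag] ); // only the selected diagram
    if( channelId != 0 ) denominator += cxabs2( amps[idiag] );       // every diagram
  }
  std::cout << "SDE channel weight = " << numerator / denominator << std::endl;
  return 0;
}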
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram257( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 257 OF 1240 ***
+    // Wavefunction(s) for diagram number 257
+    FFV1_2( w_fp[62], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[91] );
+    // Amplitude(s) for diagram number 257
+    FFV1_0( w_fp[91], w_fp[77], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 257 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram258( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 258 OF 1240 ***
+    // Wavefunction(s) for diagram number 258
+    // (none)
+    // Amplitude(s) for diagram number 258
+    FFV1_0( w_fp[86], w_fp[77], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 258 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram259( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 259 OF 1240 ***
+    // Wavefunction(s) for diagram number 259
+    // (none)
+    // Amplitude(s) for diagram number 259
+    VVV1_0( w_fp[4], w_fp[29], w_fp[89], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 259 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram260( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 260 OF 1240 ***
+    // Wavefunction(s) for diagram number 260
+    // (none)
+    // Amplitude(s) for diagram number 260
+    FFV1_0( w_fp[34], w_fp[77], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 260 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram261( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 261 OF 1240 ***
+    // Wavefunction(s) for diagram number 261
+    FFV1_2( w_fp[62], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[89] );
+    // Amplitude(s) for diagram number 261
+    FFV1_0( w_fp[89], w_fp[77], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 261 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram262( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 262 OF 1240 ***
+    // Wavefunction(s) for diagram number 262
+    // (none)
+    // Amplitude(s) for diagram number 262
+    FFV1_0( w_fp[62], w_fp[77], w_fp[30], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    FFV1_0( w_fp[62], w_fp[77], w_fp[31], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    FFV1_0( w_fp[62], w_fp[77], w_fp[32], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram263( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 263 OF 1240 ***
+    // Wavefunction(s) for diagram number 263
+    FFV1P0_3( w_fp[62], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[92] );
+    // Amplitude(s) for diagram number 263
+    VVV1_0( w_fp[92], w_fp[63], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 263 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram264( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 264 OF 1240 ***
+    // Wavefunction(s) for diagram number 264
+    // (none)
+    // Amplitude(s) for diagram number 264
+    VVV1_0( w_fp[92], w_fp[64], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 264 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram265( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 265 OF 1240 ***
+    // Wavefunction(s) for diagram number 265
+    // (none)
+    // Amplitude(s) for diagram number 265
+    VVVV1_0( w_fp[61], w_fp[5], w_fp[6], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVVV3_0( w_fp[61], w_fp[5], w_fp[6], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVVV4_0( w_fp[61], w_fp[5], w_fp[6], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram266( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 266 OF 1240 ***
+    // Wavefunction(s) for diagram number 266
+    FFV1_1( w_fp[2], w_fp[61], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[93] );
+    // Amplitude(s) for diagram number 266
+    FFV1_0( w_fp[86], w_fp[93], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 266 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram267( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 267 OF 1240 ***
+    // Wavefunction(s) for diagram number 267
+    // (none)
+    // Amplitude(s) for diagram number 267
+    FFV1_0( w_fp[86], w_fp[2], w_fp[64], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 267 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram268( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 268 OF 1240 ***
+    // Wavefunction(s) for diagram number 268
+    // (none)
+    // Amplitude(s) for diagram number 268
+    FFV1_0( w_fp[88], w_fp[93], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 268 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram269( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 269 OF 1240 ***
+    // Wavefunction(s) for diagram number 269
+    // (none)
+    // Amplitude(s) for diagram number 269
+    FFV1_0( w_fp[88], w_fp[2], w_fp[63], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 269 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram270( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 270 OF 1240 ***
+    // Wavefunction(s) for diagram number 270
+    FFV1_2( w_fp[62], w_fp[61], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[94] );
+    // Amplitude(s) for diagram number 270
+    FFV1_0( w_fp[94], w_fp[39], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 270 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram271( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 271 OF 1240 ***
+    // Wavefunction(s) for diagram number 271
+    FFV1P0_3( w_fp[62], w_fp[39], COUPs[1], 1.0, 0., 0., w_fp[95] );
+    // Amplitude(s) for diagram number 271
+    VVV1_0( w_fp[61], w_fp[6], w_fp[95], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 271 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram272( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 272 OF 1240 ***
+    // Wavefunction(s) for diagram number 272
+    // (none)
+    // Amplitude(s) for diagram number 272
+    FFV1_0( w_fp[88], w_fp[39], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 272 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram273( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 273 OF 1240 ***
+    // Wavefunction(s) for diagram number 273
+    // (none)
+    // Amplitude(s) for diagram number 273
+    FFV1_0( w_fp[94], w_fp[47], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 273 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram274( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 274 OF 1240 ***
+    // Wavefunction(s) for diagram number 274
+    FFV1P0_3( w_fp[62], w_fp[47], COUPs[1], 1.0, 0., 0., w_fp[96] );
+    // Amplitude(s) for diagram number 274
+    VVV1_0( w_fp[61], w_fp[5], w_fp[96], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 274 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram275( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 275 OF 1240 ***
+    // Wavefunction(s) for diagram number 275
+    // (none)
+    // Amplitude(s) for diagram number 275
+    FFV1_0( w_fp[86], w_fp[47], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 275 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram276( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 276 OF 1240 ***
+    // Wavefunction(s) for diagram number 276
+    // (none)
+    // Amplitude(s) for diagram number 276
+    FFV1_0( w_fp[94], w_fp[2], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 276 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram277( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 277 OF 1240 ***
+    // Wavefunction(s) for diagram number 277
+    // (none)
+    // Amplitude(s) for diagram number 277
+    VVV1_0( w_fp[61], w_fp[29], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 277 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram278( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 278 OF 1240 ***
+    // Wavefunction(s) for diagram number 278
+    // (none)
+    // Amplitude(s) for diagram number 278
+    FFV1_0( w_fp[89], w_fp[2], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 278 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram279( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 279 OF 1240 ***
+    // Wavefunction(s) for diagram number 279
+    // (none)
+    // Amplitude(s) for diagram number 279
+    VVV1_0( w_fp[92], w_fp[69], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 279 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram280( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
"diagram_boilerplate.h" + // *** DIAGRAM 280 OF 1240 *** + // Wavefunction(s) for diagram number 280 + // (none) + // Amplitude(s) for diagram number 280 + VVV1_0( w_fp[92], w_fp[70], w_fp[4], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 280 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram281( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 281 OF 1240 *** + // Wavefunction(s) for diagram number 281 + // (none) + // Amplitude(s) for diagram number 281 + VVVV1_0( w_fp[66], w_fp[4], w_fp[6], w_fp[92], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVVV3_0( w_fp[66], w_fp[4], w_fp[6], w_fp[92], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVVV4_0( w_fp[66], w_fp[4], w_fp[6], 
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram282( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 282 OF 1240 ***
+    // Wavefunction(s) for diagram number 282
+    FFV1_1( w_fp[2], w_fp[66], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[94] );
+    // Amplitude(s) for diagram number 282
+    FFV1_0( w_fp[34], w_fp[94], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 282 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram283( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 283 OF 1240 ***
+    // Wavefunction(s) for diagram number 283
+    // (none)
+    // Amplitude(s) for diagram number 283
+    FFV1_0( w_fp[34], w_fp[2], w_fp[70], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 283 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram284( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 284 OF 1240 ***
+    // Wavefunction(s) for diagram number 284
+    // (none)
+    // Amplitude(s) for diagram number 284
+    FFV1_0( w_fp[88], w_fp[94], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 284 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram285( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 285 OF 1240 ***
+    // Wavefunction(s) for diagram number 285
+    // (none)
+    // Amplitude(s) for diagram number 285
+    FFV1_0( w_fp[88], w_fp[2], w_fp[69], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 285 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram286( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 286 OF 1240 *** + // Wavefunction(s) for diagram number 286 + FFV1_2( w_fp[62], w_fp[66], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[97] ); + // Amplitude(s) for diagram number 286 + FFV1_0( w_fp[97], w_fp[33], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 286 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram287( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 287 OF 1240 *** + // Wavefunction(s) for diagram number 287 + FFV1P0_3( w_fp[62], w_fp[33], COUPs[1], 1.0, 0., 0., w_fp[98] ); + // Amplitude(s) for diagram number 287 + VVV1_0( w_fp[66], w_fp[6], w_fp[98], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 287 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + } + + 
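Every diagramXXX kernel in this hunk takes the same uniform argument list and immediately pulls in "diagram_boilerplate.h". That header is not part of this hunk, so the following self-contained C++ sketch only illustrates the contract restated in each kernel's comments: with MGONGPU_SUPPORTS_MULTICHANNEL defined the boilerplate provides channelId and the numerator/denominator accumulators, and without it the three pointers must all be nullptr. All names except the macro are hypothetical stand-ins, not the plugin's actual code.

#include <cassert>

using fptype = double;

// Hypothetical stand-in for cxabs2( amp_sv[0] ): |re + i*im|^2
inline fptype demo_cxabs2( fptype re, fptype im ) { return re * re + im * im; }

// Illustrative kernel with the same uniform interface as the diagramXXX kernels above
void demo_diagram( const unsigned int* channelIds, fptype* numerators, fptype* denominators )
{
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
  const unsigned int channelId = channelIds[0]; // SCALAR channelId[0] in the C++ case
  const fptype ampRe = 0.1, ampIm = 0.2;        // stand-in for this diagram's amp_sv[0]
  if( channelId == 282 ) numerators[0] += demo_cxabs2( ampRe, ampIm );  // this diagram's channel
  if( channelId != 0 ) denominators[0] += demo_cxabs2( ampRe, ampIm ); // SDE not disabled
#else
  // Uniform interface without multichannel support: the three pointers must be nullptr
  assert( channelIds == nullptr && numerators == nullptr && denominators == nullptr );
#endif
}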
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram288( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 288 OF 1240 ***
+ // Wavefunction(s) for diagram number 288
+ // (none)
+ // Amplitude(s) for diagram number 288
+ FFV1_0( w_fp[88], w_fp[33], w_fp[66], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 288 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram289( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 289 OF 1240 ***
+ // Wavefunction(s) for diagram number 289
+ // (none)
+ // Amplitude(s) for diagram number 289
+ FFV1_0( w_fp[97], w_fp[47], w_fp[4], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 289 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram290( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 290 OF 1240 ***
+ // Wavefunction(s) for diagram number 290
+ // (none)
+ // Amplitude(s) for diagram number 290
+ VVV1_0( w_fp[66], w_fp[4], w_fp[96], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 290 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram291( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 291 OF 1240 ***
+ // Wavefunction(s) for diagram number 291
+ // (none)
+ // Amplitude(s) for diagram number 291
+ FFV1_0( w_fp[34], w_fp[47], w_fp[66], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 291 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram292( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 292 OF 1240 ***
+ // Wavefunction(s) for diagram number 292
+ // (none)
+ // Amplitude(s) for diagram number 292
+ FFV1_0( w_fp[97], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 292 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram293( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 293 OF 1240 ***
+ // Wavefunction(s) for diagram number 293
+ // (none)
+ // Amplitude(s) for diagram number 293
+ VVV1_0( w_fp[66], w_fp[27], w_fp[92], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 293 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram294( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 294 OF 1240 ***
+ // Wavefunction(s) for diagram number 294
+ // (none)
+ // Amplitude(s) for diagram number 294
+ FFV1_0( w_fp[91], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 294 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram295( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 295 OF 1240 ***
+ // Wavefunction(s) for diagram number 295
+ // (none)
+ // Amplitude(s) for diagram number 295
+ VVV1_0( w_fp[92], w_fp[74], w_fp[5], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 295 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram296( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 296 OF 1240 ***
+ // Wavefunction(s) for diagram number 296
+ // (none)
+ // Amplitude(s) for diagram number 296
+ VVV1_0( w_fp[92], w_fp[75], w_fp[4], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 296 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram297( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 297 OF 1240 ***
+ // Wavefunction(s) for diagram number 297
+ // (none)
+ // Amplitude(s) for diagram number 297
+ VVVV1_0( w_fp[72], w_fp[4], w_fp[5], w_fp[92], COUPs[2], 1.0, &_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0];
+ VVVV3_0( w_fp[72], w_fp[4], w_fp[5], w_fp[92], COUPs[2], 1.0, &_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0];
+ VVVV4_0( w_fp[72], w_fp[4], w_fp[5], w_fp[92], COUPs[2], 1.0, &_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0];
+ }
+
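Diagram 297 above is the one four-gluon-vertex case in this hunk: the VVVV coupling is evaluated as three separate color structures (VVVV1_0, VVVV3_0 and VVVV4_0 with the same wavefunctions and COUPs[2]), and each result is scattered into a different set of color amplitudes with weight +i or -i. Below is a minimal scalar sketch of that scatter pattern, using std::complex in place of the plugin's SIMD-aware cxtype; the indices are copied from the VVVV1_0 block of diagram 297 and the function name is hypothetical.

#include <complex>

using cxdemo = std::complex<double>;

// Scatter one amplitude into selected color amplitudes with +/- i weights,
// mirroring "J_ACCESS::kernelAccessIcol( jamps, icol ) +=/-= cxtype( 0, 1 ) * amp_sv[0]".
void demoScatterVVVV1( cxdemo* jamp, const cxdemo& amp )
{
  const cxdemo I( 0., 1. );
  jamp[45] -= I * amp;
  jamp[59] += I * amp;
  jamp[69] -= I * amp;
  jamp[83] += I * amp;
  jamp[87] -= I * amp;
  jamp[89] += I * amp;
  jamp[93] -= I * amp;
  jamp[105] += I * amp;
}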
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram298( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 298 OF 1240 ***
+ // Wavefunction(s) for diagram number 298
+ FFV1_1( w_fp[2], w_fp[72], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[97] );
+ // Amplitude(s) for diagram number 298
+ FFV1_0( w_fp[34], w_fp[97], w_fp[5], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 298 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram299( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 299 OF 1240 ***
+ // Wavefunction(s) for diagram number 299
+ // (none)
+ // Amplitude(s) for diagram number 299
+ FFV1_0( w_fp[34], w_fp[2], w_fp[75], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 299 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram300( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 300 OF 1240 ***
+ // Wavefunction(s) for diagram number 300
+ // (none)
+ // Amplitude(s) for diagram number 300
+ FFV1_0( w_fp[86], w_fp[97], w_fp[4], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 300 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram301( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 301 OF 1240 ***
+ // Wavefunction(s) for diagram number 301
+ // (none)
+ // Amplitude(s) for diagram number 301
+ FFV1_0( w_fp[86], w_fp[2], w_fp[74], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 301 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram302( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 302 OF 1240 ***
+ // Wavefunction(s) for diagram number 302
+ FFV1_2( w_fp[62], w_fp[72], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] );
+ // Amplitude(s) for diagram number 302
+ FFV1_0( w_fp[99], w_fp[33], w_fp[5], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 302 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram303( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 303 OF 1240 ***
+ // Wavefunction(s) for diagram number 303
+ // (none)
+ // Amplitude(s) for diagram number 303
+ VVV1_0( w_fp[72], w_fp[5], w_fp[98], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 303 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram304( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 304 OF 1240 ***
+ // Wavefunction(s) for diagram number 304
+ // (none)
+ // Amplitude(s) for diagram number 304
+ FFV1_0( w_fp[86], w_fp[33], w_fp[72], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 304 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
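The jamps argument is documented above as a flat jamps[ncolor*2*nevtORneppV] buffer, and J_ACCESS::kernelAccessIcol( jamps, icol ) evidently yields a writable view of color amplitude icol for the current event. Below is a scalar sketch of one plausible SoA layout behind such an accessor; the real plugin accessor is templated for SIMD and GPU access, so the layout and names here are assumptions for illustration only.

#include <complex>

using fptype = double;

// Scalar sketch of an SoA accessor over a jamps[ncolor*2*nevt] buffer:
// real and imaginary parts are stored as separate nevt-long blocks per color.
struct DemoJampAccess
{
  static std::complex<fptype> load( const fptype* jamps, int icol, int ievt, int nevt )
  {
    const fptype re = jamps[( 2 * icol ) * nevt + ievt];
    const fptype im = jamps[( 2 * icol + 1 ) * nevt + ievt];
    return { re, im };
  }
  static void add( fptype* jamps, int icol, int ievt, int nevt, const std::complex<fptype>& a )
  {
    jamps[( 2 * icol ) * nevt + ievt] += a.real();
    jamps[( 2 * icol + 1 ) * nevt + ievt] += a.imag();
  }
};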
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram305( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 305 OF 1240 ***
+ // Wavefunction(s) for diagram number 305
+ // (none)
+ // Amplitude(s) for diagram number 305
+ FFV1_0( w_fp[99], w_fp[39], w_fp[4], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 305 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram306( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 306 OF 1240 ***
+ // Wavefunction(s) for diagram number 306
+ // (none)
+ // Amplitude(s) for diagram number 306
+ VVV1_0( w_fp[72], w_fp[4], w_fp[95], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 306 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram307( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 307 OF 1240 ***
+ // Wavefunction(s) for diagram number 307
+ // (none)
+ // Amplitude(s) for diagram number 307
+ FFV1_0( w_fp[34], w_fp[39], w_fp[72], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 307 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram308( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 308 OF 1240 ***
+ // Wavefunction(s) for diagram number 308
+ // (none)
+ // Amplitude(s) for diagram number 308
+ FFV1_0( w_fp[99], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 308 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram309( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 309 OF 1240 ***
+ // Wavefunction(s) for diagram number 309
+ // (none)
+ // Amplitude(s) for diagram number 309
+ VVV1_0( w_fp[72], w_fp[24], w_fp[92], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 309 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram310( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 310 OF 1240 ***
+ // Wavefunction(s) for diagram number 310
+ // (none)
+ // Amplitude(s) for diagram number 310
+ FFV1_0( w_fp[90], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 310 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram311( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 311 OF 1240 ***
+ // Wavefunction(s) for diagram number 311
+ FFV1_2( w_fp[62], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] );
+ // Amplitude(s) for diagram number 311
+ FFV1_0( w_fp[99], w_fp[35], w_fp[6], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 311 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+ }
+
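The channelId/numerators/denominators bookkeeping repeated in every kernel above implements single-diagram enhancement (SDE): per event, the selected channel's |amp|^2 goes into the numerator while every diagram's |amp|^2 goes into the denominator, and channelId == 0 disables the machinery. Below is a self-contained sketch of the ratio these accumulators are building, with a hypothetical scalar helper in place of the plugin's vectorized types.

#include <vector>

using fptype = double;

// Sketch of the single-diagram-enhancement (SDE) ratio built from the
// numerators/denominators accumulated above: the selected channel's |amp|^2
// over the sum of |amp|^2 of all contributing diagrams (hypothetical helper).
fptype demo_multichannelWeight( const std::vector<fptype>& amp2PerDiagram, unsigned int channelId )
{
  if( channelId == 0 ) return 1.; // channelId == 0 disables SDE (see the comments above)
  fptype den = 0.;
  for( fptype a2 : amp2PerDiagram ) den += a2;       // denominator: every diagram contributes
  const fptype num = amp2PerDiagram[channelId - 1];  // numerator: the selected diagram only (1-based)
  return den > 0. ? num / den : 0.;
}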
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram312( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 312 OF 1240 ***
+ // Wavefunction(s) for diagram number 312
+ // (none)
+ // Amplitude(s) for diagram number 312
+ FFV1_0( w_fp[99], w_fp[36], w_fp[5], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 312 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram313( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 313 OF 1240 ***
+ // Wavefunction(s) for diagram number 313
+ FFV1_1( w_fp[33], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[100] );
+ // Amplitude(s) for diagram number 313
+ FFV1_0( w_fp[86], w_fp[100], w_fp[6], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 313 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram314( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 314 OF 1240 ***
+ // Wavefunction(s) for diagram number 314
+ // (none)
+ // Amplitude(s) for diagram number 314
+ FFV1_0( w_fp[86], w_fp[36], w_fp[1], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 314 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram315( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 315 OF 1240 ***
+ // Wavefunction(s) for diagram number 315
+ // (none)
+ // Amplitude(s) for diagram number 315
+ FFV1_0( w_fp[88], w_fp[100], w_fp[5], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 315 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram316( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 316 OF 1240 ***
+ // Wavefunction(s) for diagram number 316
+ // (none)
+ // Amplitude(s) for diagram number 316
+ FFV1_0( w_fp[88], w_fp[35], w_fp[1], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 316 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram317( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 317 OF 1240 ***
+ // Wavefunction(s) for diagram number 317
+ // (none)
+ // Amplitude(s) for diagram number 317
+ FFV1_0( w_fp[99], w_fp[33], w_fp[29], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 317 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram318( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 318 OF 1240 ***
+ // Wavefunction(s) for diagram number 318
+ // (none)
+ // Amplitude(s) for diagram number 318
+ VVV1_0( w_fp[1], w_fp[29], w_fp[98], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 318 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram319( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 319 OF 1240 ***
+ // Wavefunction(s) for diagram number 319
+ // (none)
+ // Amplitude(s) for diagram number 319
+ FFV1_0( w_fp[89], w_fp[33], w_fp[1], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 319 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram320( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 320 OF 1240 ***
+ // Wavefunction(s) for diagram number 320
+ // (none)
+ // Amplitude(s) for diagram number 320
+ FFV1_0( w_fp[99], w_fp[43], w_fp[6], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 320 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram321( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 321 OF 1240 ***
+ // Wavefunction(s) for diagram number 321
+ // (none)
+ // Amplitude(s) for diagram number 321
+ FFV1_0( w_fp[99], w_fp[44], w_fp[4], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 321 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram322( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 322 OF 1240 ***
+ // Wavefunction(s) for diagram number 322
+ FFV1_1( w_fp[39], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[89] );
+ // Amplitude(s) for diagram number 322
+ FFV1_0( w_fp[34], w_fp[89], w_fp[6], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 322 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram323( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 323 OF 1240 ***
+ // Wavefunction(s) for diagram number 323
+ // (none)
+ // Amplitude(s) for diagram number 323
+ FFV1_0( w_fp[34], w_fp[44], w_fp[1], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 323 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram324( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 324 OF 1240 ***
+ // Wavefunction(s) for diagram number 324
+ // (none)
+ // Amplitude(s) for diagram number 324
+ FFV1_0( w_fp[88], w_fp[89], w_fp[4], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 324 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram325( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 325 OF 1240 ***
+ // Wavefunction(s) for diagram number 325
+ // (none)
+ // Amplitude(s) for diagram number 325
+ FFV1_0( w_fp[88], w_fp[43], w_fp[1], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 325 ) numerators_sv += cxabs2( amp_sv[0] );
+ if(
channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram326( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 326 OF 1240 *** + // Wavefunction(s) for diagram number 326 + // (none) + // Amplitude(s) for diagram number 326 + FFV1_0( w_fp[99], w_fp[39], w_fp[27], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 326 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram327( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 327 OF 1240 *** + // Wavefunction(s) for diagram number 327 + // (none) + // Amplitude(s) for diagram number 327 + VVV1_0( w_fp[1], w_fp[27], w_fp[95], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 327 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram328( fptype* wfs, // input/output 
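
The two-line comment repeated at the top of every diagramNNN kernel describes what "diagram_boilerplate.h" must provide, but the header itself is not part of this hunk. The following is a minimal sketch, under stated assumptions, of the kind of prologue that comment implies: only the names visible in the kernels above (channelIds, numerators, denominators, MGONGPU_SUPPORTS_MULTICHANNEL) are taken from the diff; everything else, including the scalar lookup, is an illustrative assumption, not the plugin's actual code.

#include <cassert>
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
  // multichannel enabled: derive the scalar channelId used by the kernel body
  // (assumption: a plain scalar read is shown here for simplicity)
  const unsigned int channelId = channelIds[0];
#else
  // multichannel disabled: the uniform interface still carries the three
  // pointers, so sanity-check that the caller passed nullptr for all of them
  assert( channelIds == nullptr );
  assert( numerators == nullptr );
  assert( denominators == nullptr );
#endif

The sketch deliberately omits the amp_fp/amp_sv and numerators_sv/denominators_sv definitions that the kernel bodies also rely on, since this hunk gives no basis for reconstructing them.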
+ + //-------------------------------------------------------------------------- + + __global__ void + diagram328( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 328 OF 1240 *** + // Wavefunction(s) for diagram number 328 + // (none) + // Amplitude(s) for diagram number 328 + FFV1_0( w_fp[91], w_fp[39], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 328 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram329( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 329 OF 1240 *** + // Wavefunction(s) for diagram number 329 + // (none) + // Amplitude(s) for diagram number 329 + FFV1_0( w_fp[99], w_fp[49], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 329 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram330( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 330 OF 1240 *** + // Wavefunction(s) for diagram number 330 + // (none) + // Amplitude(s) for diagram number 330 + FFV1_0( w_fp[99], w_fp[50], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 330 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram331( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 331 OF 1240 *** + // Wavefunction(s) for diagram number 331 + FFV1_1( w_fp[47], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[91] ); + // Amplitude(s) for diagram number 331 + FFV1_0( w_fp[34], w_fp[91], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 331 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + }
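
Every multichannel block above follows one pattern: the squared amplitude of the current diagram is added to denominators_sv whenever any channel is active (channelId != 0), and to numerators_sv only when channelId selects this very diagram. Read schematically over all diagrams d, the two accumulators form the usual single-diagram-enhancement (SDE) weight

  w_\text{SDE} = \frac{ |A_\text{channelId}|^2 }{ \sum_d |A_d|^2 }

where A_d is the amplitude of diagram d. This is a reading of the accumulation pattern, not a formula spelled out in the hunk itself; the division happens downstream, outside this diff.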
"diagram_boilerplate.h" + // *** DIAGRAM 332 OF 1240 *** + // Wavefunction(s) for diagram number 332 + // (none) + // Amplitude(s) for diagram number 332 + FFV1_0( w_fp[34], w_fp[50], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 332 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram333( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 333 OF 1240 *** + // Wavefunction(s) for diagram number 333 + // (none) + // Amplitude(s) for diagram number 333 + FFV1_0( w_fp[86], w_fp[91], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 333 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram334( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 334 OF 1240 *** + // Wavefunction(s) for diagram number 334 + // (none) + // Amplitude(s) for diagram number 334 + FFV1_0( w_fp[86], w_fp[49], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 334 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + 
__global__ void + diagram335( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 335 OF 1240 *** + // Wavefunction(s) for diagram number 335 + // (none) + // Amplitude(s) for diagram number 335 + FFV1_0( w_fp[99], w_fp[47], w_fp[24], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 335 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram336( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 336 OF 1240 *** + // Wavefunction(s) for diagram number 336 + // (none) + // Amplitude(s) for diagram number 336 + VVV1_0( w_fp[1], w_fp[24], w_fp[96], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 336 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram337( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to 
disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 337 OF 1240 *** + // Wavefunction(s) for diagram number 337 + // (none) + // Amplitude(s) for diagram number 337 + FFV1_0( w_fp[90], w_fp[47], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 337 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram338( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 338 OF 1240 *** + // Wavefunction(s) for diagram number 338 + // (none) + // Amplitude(s) for diagram number 338 + FFV1_0( w_fp[99], w_fp[17], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 338 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram339( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // 
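
The J_ACCESS::kernelAccessIcol calls index jamps, declared above as jamps[ncolor*2*nevtORneppV], by colour index icol. One layout consistent with that declaration is colour-major with separate real and imaginary planes per event; the helper below is a self-contained illustration under that assumption only, written with plain double to stay standalone, and is not the plugin's actual J_ACCESS accessor.

// Hypothetical accessor assuming the layout jamps[icol][ireim][ievt]
inline double& jampPart( double* jamps, int icol, int ireim, int ievt, int nevt )
{
  return jamps[( icol * 2 + ireim ) * nevt + ievt]; // ireim: 0 = Re, 1 = Im
}

Whatever the real layout, the contract visible in the kernels is the same: kernelAccessIcol returns something to which the (possibly complex, possibly vectorized) amp_sv[0] can be added or subtracted in place.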
+ + //-------------------------------------------------------------------------- + + __global__ void + diagram339( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 339 OF 1240 *** + // Wavefunction(s) for diagram number 339 + // (none) + // Amplitude(s) for diagram number 339 + FFV1_0( w_fp[99], w_fp[2], w_fp[42], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 339 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram340( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 340 OF 1240 *** + // Wavefunction(s) for diagram number 340 + // (none) + // Amplitude(s) for diagram number 340 + VVV1_0( w_fp[92], w_fp[59], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 340 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram341( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 341 OF 1240 *** + // Wavefunction(s) for diagram number 341 + // (none) + // Amplitude(s) for diagram number 341 + VVV1_0( w_fp[92], w_fp[1], w_fp[42], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 341 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram342( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 342 OF 1240 *** + // Wavefunction(s) for diagram number 342 + // (none) + // Amplitude(s) for diagram number 342 + VVVV1_0( w_fp[1], w_fp[24], w_fp[6], w_fp[92], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0]; + VVVV3_0( w_fp[1], w_fp[24], w_fp[6], w_fp[92], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVVV4_0( w_fp[1], w_fp[24], w_fp[6], w_fp[92], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0]; + }
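
Diagram 342 is the first in this hunk to use the four-gluon vertex: it is evaluated as three separate calls (VVVV1_0, VVVV3_0, VVVV4_0), one per independent colour/Lorentz structure, each overwriting amp_fp[0] before its contribution is scattered into a different set of colour flows; note also that these three calls carry no multichannel #ifdef block, unlike the single-amplitude diagrams around them. The scatter step for every amplitude in this file has the schematic form

  \text{jamp}_c \mathrel{+}= \kappa_{s,c} \, \text{amp}_s , \qquad \kappa_{s,c} \in \{ 0, \pm 1, \pm i \}

for structure s and colour flow c, which is exactly the set of coefficients (amp_sv[0], cxtype( 0, 1 ) * amp_sv[0], and their negatives) appearing in the kernels.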
+ + //-------------------------------------------------------------------------- + + __global__ void + diagram343( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 343 OF 1240 *** + // Wavefunction(s) for diagram number 343 + // (none) + // Amplitude(s) for diagram number 343 + FFV1_0( w_fp[88], w_fp[2], w_fp[59], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 343 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram344( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 344 OF 1240 *** + // Wavefunction(s) for diagram number 344 + // (none) + // Amplitude(s) for diagram number 344 + FFV1_0( w_fp[88], w_fp[17], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 344 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram345( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 345 OF 1240 *** + // Wavefunction(s) for diagram number 345 + // (none) + // Amplitude(s) for diagram number 345 + FFV1_0( w_fp[99], w_fp[15], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 345 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram346( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 346 OF 1240 *** + // Wavefunction(s) for diagram number 346 + // (none) + // Amplitude(s) for diagram number 346 + FFV1_0( w_fp[99], w_fp[2], w_fp[16], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 346 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram347( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 347 OF 1240 *** + // Wavefunction(s) for diagram number 347 + // (none) + // Amplitude(s) for diagram number 347 + VVV1_0( w_fp[92], w_fp[68], w_fp[5], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 347 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram348( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 348 OF 1240 *** + // Wavefunction(s) for diagram number 348 + // (none) + // Amplitude(s) for diagram number 348 + VVV1_0( w_fp[92], w_fp[1], w_fp[16], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 348 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0]; + }
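
Because every diagramNNN kernel shares the same signature, a driver can launch them back-to-back with an identical argument list. A minimal sketch of the GPU-side call sequence, assuming CUDA launch syntax; gpublocks, gputhreads and the buffer variables are placeholders, and the plugin's actual driver is not part of this hunk:

  diagram347<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
  diagram348<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
  diagram349<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );

On the C++ side the #else branch of each signature applies instead, so the fourth argument would be the per-event-page COUPs array rather than the all-events couplings buffer.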
J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0]; + VVVV4_0( w_fp[1], w_fp[27], w_fp[5], w_fp[92], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram350( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 350 OF 1240 *** + // Wavefunction(s) for diagram number 350 + // (none) + // Amplitude(s) for diagram number 350 + FFV1_0( w_fp[86], w_fp[2], w_fp[68], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 350 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram351( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 351 OF 1240 *** + // Wavefunction(s) for 
diagram number 351 + // (none) + // Amplitude(s) for diagram number 351 + FFV1_0( w_fp[86], w_fp[15], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 351 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram352( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 352 OF 1240 *** + // Wavefunction(s) for diagram number 352 + // (none) + // Amplitude(s) for diagram number 352 + FFV1_0( w_fp[99], w_fp[18], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 352 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram353( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 353 OF 1240 *** + // Wavefunction(s) for diagram number 353 + // (none) + // Amplitude(s) for diagram number 353 + FFV1_0( w_fp[99], w_fp[2], w_fp[19], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 353 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 65 ) += 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram354( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 354 OF 1240 *** + // Wavefunction(s) for diagram number 354 + // (none) + // Amplitude(s) for diagram number 354 + VVV1_0( w_fp[92], w_fp[67], w_fp[4], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 354 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram355( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 355 OF 1240 *** + // Wavefunction(s) for diagram number 355 + // (none) + // Amplitude(s) for diagram number 355 + VVV1_0( w_fp[92], w_fp[1], w_fp[19], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 355 
) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram356( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 356 OF 1240 *** + // Wavefunction(s) for diagram number 356 + // (none) + // Amplitude(s) for diagram number 356 + VVVV1_0( w_fp[1], w_fp[4], w_fp[29], w_fp[92], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0]; + VVVV3_0( w_fp[1], w_fp[4], w_fp[29], w_fp[92], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVVV4_0( w_fp[1], w_fp[4], w_fp[29], w_fp[92], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol(
jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram357( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 357 OF 1240 *** + // Wavefunction(s) for diagram number 357 + // (none) + // Amplitude(s) for diagram number 357 + FFV1_0( w_fp[34], w_fp[2], w_fp[67], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 357 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram358( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 358 OF 1240 *** + // Wavefunction(s) for diagram number 358 + // (none) + // Amplitude(s) for diagram number 358 + FFV1_0( w_fp[34], w_fp[18], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 358 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0]; +
J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram359( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 359 OF 1240 *** + // Wavefunction(s) for diagram number 359 + // (none) + // Amplitude(s) for diagram number 359 + VVV1_0( w_fp[73], w_fp[6], w_fp[92], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[79], w_fp[6], w_fp[92], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[80], w_fp[6], w_fp[92], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram360( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU
or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 360 OF 1240 *** + // Wavefunction(s) for diagram number 360 + // (none) + // Amplitude(s) for diagram number 360 + FFV1_0( w_fp[88], w_fp[2], w_fp[73], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0]; + FFV1_0( w_fp[88], w_fp[2], w_fp[79], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + FFV1_0( w_fp[88], w_fp[2], w_fp[80], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram361( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 361 OF 1240 *** + // Wavefunction(s) for diagram number 361 + // (none) + // Amplitude(s) for diagram number 361 + FFV1_0( w_fp[62], w_fp[47], w_fp[73], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + FFV1_0( w_fp[62], w_fp[47], w_fp[79], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps,
117 ) += amp_sv[0]; + FFV1_0( w_fp[62], w_fp[47], w_fp[80], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram362( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 362 OF 1240 *** + // Wavefunction(s) for diagram number 362 + // (none) + // Amplitude(s) for diagram number 362 + VVV1_0( w_fp[57], w_fp[5], w_fp[92], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[81], w_fp[5], w_fp[92], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[82], w_fp[5], w_fp[92], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + +
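Each generated diagramNNN kernel above follows the same pattern: it evaluates one amplitude into amp_sv[0], optionally accumulates |amp|^2 into the single-diagram-enhancement (SDE) numerators and denominators, and then scatters the amplitude, weighted by +/-1 or +/-i, into a few of the ncolor colour-flow sums in jamps. A minimal single-event, scalar C++ sketch of that accumulation logic follows; the plain std::complex type and the names accumulateDiagram/weights are illustrative stand-ins, not the plugin's SIMD/GPU types or its J_ACCESS memory layout.

#include <complex>
#include <cstddef>
#include <utility>
#include <vector>

using cx = std::complex<double>;

// Scalar stand-in for one generated diagram kernel (hypothetical helper, for
// illustration only): 'weights' lists (icol, weight) pairs that mirror the
// +=/-= statements and cxtype( 0, 1 ) factors emitted for each diagram above.
void accumulateDiagram( const cx& amp,            // this diagram's amplitude
                        unsigned int thisDiagram, // its diagram number
                        unsigned int channelId,   // selected SDE channel (0 = SDE disabled)
                        std::vector<cx>& jamp,    // the ncolor colour-flow sums
                        const std::vector<std::pair<std::size_t, cx>>& weights,
                        double& numerator,
                        double& denominator )
{
  if( channelId == thisDiagram ) numerator += std::norm( amp ); // cxabs2 analogue
  if( channelId != 0 ) denominator += std::norm( amp );
  for( const auto& [icol, w] : weights ) jamp[icol] += w * amp; // w is +/-1 or +/-i
}

int main()
{
  const cx I( 0., 1. );
  std::vector<cx> jamp( 120 ); // the jamp indices above reach 119, i.e. ncolor = 120
  double num = 0., den = 0.;
  // e.g. the colour-flow pattern of diagram 355 above, with a dummy amplitude
  accumulateDiagram( cx( 0.1, 0.2 ), 355, 355, jamp,
                     { { 33, -I }, { 35, I }, { 41, I }, { 47, -I }, { 65, I }, { 71, -I }, { 95, -I }, { 119, I } },
                     num, den );
  return 0;
}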
//-------------------------------------------------------------------------- + + __global__ void + diagram363( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 363 OF 1240 *** + // Wavefunction(s) for diagram number 363 + // (none) + // Amplitude(s) for diagram number 363 + FFV1_0( w_fp[86], w_fp[2], w_fp[57], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + FFV1_0( w_fp[86], w_fp[2], w_fp[81], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + FFV1_0( w_fp[86], w_fp[2], w_fp[82], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram364( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 364 OF 1240 *** + // Wavefunction(s) for diagram number 364 + // (none) + // Amplitude(s) for diagram number 364 + FFV1_0( w_fp[62], w_fp[39], w_fp[57], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol(
jamps, 95 ) += amp_sv[0]; + FFV1_0( w_fp[62], w_fp[39], w_fp[81], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + FFV1_0( w_fp[62], w_fp[39], w_fp[82], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram365( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 365 OF 1240 *** + // Wavefunction(s) for diagram number 365 + // (none) + // Amplitude(s) for diagram number 365 + VVV1_0( w_fp[55], w_fp[4], w_fp[92], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[83], w_fp[4], w_fp[92], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[84], w_fp[4], w_fp[92], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0]; +
J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram366( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 366 OF 1240 *** + // Wavefunction(s) for diagram number 366 + // (none) + // Amplitude(s) for diagram number 366 + FFV1_0( w_fp[34], w_fp[2], w_fp[55], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + FFV1_0( w_fp[34], w_fp[2], w_fp[83], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0]; + FFV1_0( w_fp[34], w_fp[2], w_fp[84], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram367( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 367 OF 1240 *** + // Wavefunction(s) for diagram number 367 + // (none) + // Amplitude(s) for diagram number 367 + FFV1_0( w_fp[62], w_fp[33],
w_fp[55], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + FFV1_0( w_fp[62], w_fp[33], w_fp[83], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + FFV1_0( w_fp[62], w_fp[33], w_fp[84], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram368( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 368 OF 1240 *** + // Wavefunction(s) for diagram number 368 + // (none) + // Amplitude(s) for diagram number 368 + FFV1_0( w_fp[99], w_fp[2], w_fp[30], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + FFV1_0( w_fp[99], w_fp[2], w_fp[31], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0]; + FFV1_0( w_fp[99], w_fp[2], w_fp[32], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram369( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, //
input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 369 OF 1240 *** + // Wavefunction(s) for diagram number 369 + // (none) + // Amplitude(s) for diagram number 369 + VVV1_0( w_fp[1], w_fp[30], w_fp[92], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[1], w_fp[31], w_fp[92], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[1], w_fp[32], w_fp[92], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram370( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three
pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 370 OF 1240 *** + // Wavefunction(s) for diagram number 370 + VVV1P0_1( w_fp[0], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[92] ); + FFV1_2( w_fp[3], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] ); + // Amplitude(s) for diagram number 370 + FFV1_0( w_fp[99], w_fp[9], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 370 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram371( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 371 OF 1240 *** + // Wavefunction(s) for diagram number 371 + // (none) + // Amplitude(s) for diagram number 371 + FFV1_0( w_fp[99], w_fp[85], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 371 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram372( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 372 OF 1240 *** + // Wavefunction(s) for diagram number 372 + VVV1P0_1( w_fp[92], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[62] ); + FFV1P0_3( w_fp[3],
w_fp[77], COUPs[1], 1.0, 0., 0., w_fp[34] ); + // Amplitude(s) for diagram number 372 + VVV1_0( w_fp[62], w_fp[34], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 372 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram373( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 373 OF 1240 *** + // Wavefunction(s) for diagram number 373 + // (none) + // Amplitude(s) for diagram number 373 + FFV1_0( w_fp[3], w_fp[85], w_fp[62], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 373 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram374( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the
boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 374 OF 1240 *** + // Wavefunction(s) for diagram number 374 + VVV1P0_1( w_fp[92], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[86] ); + // Amplitude(s) for diagram number 374 + VVV1_0( w_fp[86], w_fp[34], w_fp[5], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 374 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram375( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 375 OF 1240 *** + // Wavefunction(s) for diagram number 375 + // (none) + // Amplitude(s) for diagram number 375 + FFV1_0( w_fp[3], w_fp[9], w_fp[86], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 375 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram376( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel
denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 376 OF 1240 *** + // Wavefunction(s) for diagram number 376 + VVVV1P0_1( w_fp[92], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[88] ); + VVVV3P0_1( w_fp[92], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[90] ); + VVVV4P0_1( w_fp[92], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[96] ); + // Amplitude(s) for diagram number 376 + FFV1_0( w_fp[3], w_fp[77], w_fp[88], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[77], w_fp[90], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[77], w_fp[96], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram377( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when
MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 377 OF 1240 *** + // Wavefunction(s) for diagram number 377 + FFV1_1( w_fp[77], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[95] ); + // Amplitude(s) for diagram number 377 + FFV1_0( w_fp[38], w_fp[95], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 377 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram378( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 378 OF 1240 *** + // Wavefunction(s) for diagram number 378 + FFV1_2( w_fp[38], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[98] ); + // Amplitude(s) for diagram number 378 + FFV1_0( w_fp[98], w_fp[77], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 378 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram379( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 379 OF 1240 *** + // Wavefunction(s) for
diagram number 379 + // (none) + // Amplitude(s) for diagram number 379 + FFV1_0( w_fp[38], w_fp[77], w_fp[86], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 379 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram380( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 380 OF 1240 *** + // Wavefunction(s) for diagram number 380 + // (none) + // Amplitude(s) for diagram number 380 + FFV1_0( w_fp[41], w_fp[95], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 380 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram381( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 381 OF 1240 *** + // Wavefunction(s) for diagram number 381 + FFV1_2( w_fp[41], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[101] ); + // Amplitude(s) for diagram number 381 + FFV1_0( w_fp[101], w_fp[77], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 381 ) numerators_sv +=
cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram382( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 382 OF 1240 *** + // Wavefunction(s) for diagram number 382 + // (none) + // Amplitude(s) for diagram number 382 + FFV1_0( w_fp[41], w_fp[77], w_fp[62], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 382 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram383( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 383 OF 1240 *** + // Wavefunction(s) for diagram number 383 + // (none) + // Amplitude(s) for diagram number 383 + FFV1_0( w_fp[3], w_fp[95], w_fp[29], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 383 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; +
J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram384( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 384 OF 1240 *** + // Wavefunction(s) for diagram number 384 + // (none) + // Amplitude(s) for diagram number 384 + FFV1_0( w_fp[99], w_fp[77], w_fp[29], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 384 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram385( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 385 OF 1240 *** + // Wavefunction(s) for diagram number 385 + VVV1P0_1( w_fp[92], w_fp[29], COUPs[0], 1.0, 0., 0., w_fp[95] ); + // Amplitude(s) for diagram number 385 + FFV1_0( w_fp[3], w_fp[77], w_fp[95], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 385 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram386( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 386 OF 1240 *** + // Wavefunction(s) for diagram number 386 + FFV1_1( w_fp[2], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[102] ); + // Amplitude(s) for diagram number 386 + FFV1_0( w_fp[22], w_fp[102], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 386 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram387( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 387 OF 1240 *** + // Wavefunction(s) for diagram number 387 + // (none) + // Amplitude(s) for diagram number 387 + FFV1_0( w_fp[21], w_fp[102], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 387 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + 
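Each diagramNNN kernel above funnels its arguments through diagram_boilerplate.h before the diagram body runs. That header is not part of this hunk; the snippet below is only a minimal sketch, under assumptions, of the uniform-interface contract its recurring comment describes (the helper name boilerplate_channelId is hypothetical; only the assert behaviour is stated in the comments above):

    #include <cassert>
    typedef double fptype;

    // Hypothetical sketch of the sanity check performed by diagram_boilerplate.h:
    // the multichannel arguments exist in every build, but must all be nullptr
    // when the multichannel feature is compiled out.
    inline unsigned int boilerplate_channelId( const unsigned int* channelIds,
                                               const fptype* numerators,
                                               const fptype* denominators )
    {
    #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      return channelIds ? channelIds[0] : 0; // SCALAR channelId[0] in the C++ case
    #else
      assert( channelIds == nullptr && numerators == nullptr && denominators == nullptr );
      return 0; // channelId==0 disables single-diagram enhancement
    #endif
    }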
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram388( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 388 OF 1240 ***
+ // Wavefunction(s) for diagram number 388
+ FFV1P0_3( w_fp[52], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[103] );
+ // Amplitude(s) for diagram number 388
+ VVV1_0( w_fp[62], w_fp[103], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 388 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram389( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 389 OF 1240 ***
+ // Wavefunction(s) for diagram number 389
+ // (none)
+ // Amplitude(s) for diagram number 389
+ FFV1_0( w_fp[21], w_fp[2], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 389 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram390( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 390 OF 1240 ***
+ // Wavefunction(s) for diagram number 390
+ // (none)
+ // Amplitude(s) for diagram number 390
+ VVV1_0( w_fp[86], w_fp[103], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 390 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram391( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 391 OF 1240 ***
+ // Wavefunction(s) for diagram number 391
+ // (none)
+ // Amplitude(s) for diagram number 391
+ FFV1_0( w_fp[22], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 391 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram392( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 392 OF 1240 ***
+ // Wavefunction(s) for diagram number 392
+ // (none)
+ // Amplitude(s) for diagram number 392
+ FFV1_0( w_fp[52], w_fp[2], w_fp[88], COUPs[1], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+ FFV1_0( w_fp[52], w_fp[2], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0];
+ FFV1_0( w_fp[52], w_fp[2], w_fp[96], COUPs[1], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
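Diagram 392 above (like diagram 408 further down) issues several HELAS calls in one kernel, all writing the same one-slot amplitude buffer; each result is scattered into the colour-flow array before the next call overwrites it. A minimal standalone sketch of that reuse pattern, with stand-in standard-library types and illustrative index/sign tables:

    #include <complex>
    typedef std::complex<double> cxtype;

    // Sketch: accumulate one freshly computed amplitude into selected jamps with
    // coefficients +/-i, as the generated code does after each FFV1_0 call above.
    inline void addToJamps( cxtype* jamps, const int* icol, const int* sign, int n, const cxtype& amp )
    {
      for( int i = 0; i < n; i++ )
        jamps[icol[i]] += cxtype( 0, sign[i] ) * amp; // sign[i] is +1 or -1
    }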
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram393( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 393 OF 1240 ***
+ // Wavefunction(s) for diagram number 393
+ FFV1_2( w_fp[52], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[104] );
+ // Amplitude(s) for diagram number 393
+ FFV1_0( w_fp[104], w_fp[39], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 393 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram394( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 394 OF 1240 ***
+ // Wavefunction(s) for diagram number 394
+ FFV1_1( w_fp[39], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[105] );
+ // Amplitude(s) for diagram number 394
+ FFV1_0( w_fp[52], w_fp[105], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 394 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram395( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 395 OF 1240 ***
+ // Wavefunction(s) for diagram number 395
+ // (none)
+ // Amplitude(s) for diagram number 395
+ FFV1_0( w_fp[52], w_fp[39], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 395 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram396( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 396 OF 1240 ***
+ // Wavefunction(s) for diagram number 396
+ // (none)
+ // Amplitude(s) for diagram number 396
+ FFV1_0( w_fp[104], w_fp[47], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 396 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
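The recurring #ifdef MGONGPU_SUPPORTS_MULTICHANNEL block implements single-diagram enhancement: every diagram adds |amp|^2 to the denominator, while only the diagram whose number matches the event's channelId also feeds the numerator. A self-contained sketch of the arithmetic follows; cxabs2 is reproduced with the meaning its call sites above imply, and the weight() helper is an assumption about how the two sums are eventually combined:

    #include <complex>
    typedef double fptype;
    typedef std::complex<double> cxtype;

    inline fptype cxabs2( const cxtype& c ) { return c.real() * c.real() + c.imag() * c.imag(); }

    // Sketch: per-event multichannel bookkeeping accumulated across all diagrams.
    struct MultiChannelSums
    {
      fptype numerators = 0;   // |amp|^2 of the selected channel's diagram only
      fptype denominators = 0; // |amp|^2 summed over all diagrams
      void add( unsigned int channelId, unsigned int diagramId, const cxtype& amp )
      {
        if( channelId == diagramId ) numerators += cxabs2( amp );
        if( channelId != 0 ) denominators += cxabs2( amp ); // channelId==0 disables SDE
      }
      fptype weight() const { return denominators != 0 ? numerators / denominators : 1; } // hypothetical final use
    };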
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram397( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 397 OF 1240 ***
+ // Wavefunction(s) for diagram number 397
+ FFV1_1( w_fp[47], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[106] );
+ // Amplitude(s) for diagram number 397
+ FFV1_0( w_fp[52], w_fp[106], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 397 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram398( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 398 OF 1240 ***
+ // Wavefunction(s) for diagram number 398
+ // (none)
+ // Amplitude(s) for diagram number 398
+ FFV1_0( w_fp[52], w_fp[47], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 398 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram399( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 399 OF 1240 ***
+ // Wavefunction(s) for diagram number 399
+ // (none)
+ // Amplitude(s) for diagram number 399
+ FFV1_0( w_fp[104], w_fp[2], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 399 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram400( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 400 OF 1240 ***
+ // Wavefunction(s) for diagram number 400
+ // (none)
+ // Amplitude(s) for diagram number 400
+ FFV1_0( w_fp[52], w_fp[102], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 400 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram401( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 401 OF 1240 ***
+ // Wavefunction(s) for diagram number 401
+ // (none)
+ // Amplitude(s) for diagram number 401
+ FFV1_0( w_fp[52], w_fp[2], w_fp[95], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 401 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram402( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 402 OF 1240 ***
+ // Wavefunction(s) for diagram number 402
+ // (none)
+ // Amplitude(s) for diagram number 402
+ FFV1_0( w_fp[71], w_fp[102], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 402 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram403( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 403 OF 1240 ***
+ // Wavefunction(s) for diagram number 403
+ // (none)
+ // Amplitude(s) for diagram number 403
+ FFV1_0( w_fp[3], w_fp[102], w_fp[70], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 403 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram404( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 404 OF 1240 ***
+ // Wavefunction(s) for diagram number 404
+ // (none)
+ // Amplitude(s) for diagram number 404
+ FFV1_0( w_fp[99], w_fp[94], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 404 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram405( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 405 OF 1240 ***
+ // Wavefunction(s) for diagram number 405
+ // (none)
+ // Amplitude(s) for diagram number 405
+ FFV1_0( w_fp[99], w_fp[2], w_fp[70], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 405 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram406( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 406 OF 1240 ***
+ // Wavefunction(s) for diagram number 406
+ // (none)
+ // Amplitude(s) for diagram number 406
+ FFV1_0( w_fp[3], w_fp[94], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 406 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0];
+ }
+
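Each diagram contributes its amplitude to a handful of the ncolor jamps entries with coefficients +/-1 or +/-i (the cxtype( 0, 1 ) factor). The jamps are only an intermediate: downstream, outside this hunk, they are presumably folded with the constant colour matrix in the usual MadGraph way, sketched here with standard-library types:

    #include <complex>
    #include <vector>
    typedef double fptype;
    typedef std::complex<double> cxtype;

    // Sketch: |M|^2 = sum_ij conj(jamp[i]) * cf[i][j] * jamp[j], the standard
    // MadGraph colour sum over the colour flows accumulated above.
    inline fptype colorSum( const std::vector<cxtype>& jamp, const std::vector<std::vector<fptype>>& cf )
    {
      fptype me2 = 0;
      for( size_t i = 0; i < jamp.size(); i++ )
      {
        cxtype ztemp = 0;
        for( size_t j = 0; j < jamp.size(); j++ ) ztemp += cf[i][j] * jamp[j];
        me2 += ( ztemp * std::conj( jamp[i] ) ).real();
      }
      return me2;
    }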
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram407( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 407 OF 1240 ***
+ // Wavefunction(s) for diagram number 407
+ // (none)
+ // Amplitude(s) for diagram number 407
+ FFV1_0( w_fp[71], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 407 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram408( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 408 OF 1240 ***
+ // Wavefunction(s) for diagram number 408
+ // (none)
+ // Amplitude(s) for diagram number 408
+ VVVV1_0( w_fp[92], w_fp[66], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+ VVVV3_0( w_fp[92], w_fp[66], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+ VVVV4_0( w_fp[92], w_fp[66], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram409( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 409 OF 1240 ***
+ // Wavefunction(s) for diagram number 409
+ VVV1P0_1( w_fp[92], w_fp[66], COUPs[0], 1.0, 0., 0., w_fp[104] );
+ // Amplitude(s) for diagram number 409
+ VVV1_0( w_fp[8], w_fp[6], w_fp[104], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 409 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram410( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 410 OF 1240 ***
+ // Wavefunction(s) for diagram number 410
+ VVV1P0_1( w_fp[92], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[107] );
+ // Amplitude(s) for diagram number 410
+ VVV1_0( w_fp[66], w_fp[6], w_fp[107], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 410 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram411( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 411 OF 1240 ***
+ // Wavefunction(s) for diagram number 411
+ // (none)
+ // Amplitude(s) for diagram number 411
+ VVV1_0( w_fp[66], w_fp[8], w_fp[86], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 411 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+ }
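With one __global__ kernel per diagram, a host-side driver is needed to run them in sequence over the shared wavefunction and jamps buffers. No such driver appears in this hunk; the snippet below is a hypothetical sketch of a GPU-side call sequence for three of the kernels above (gpublocks and gputhreads are assumed launch parameters, not names from this patch):

    // Hypothetical host-side sketch (not part of this patch): diagram kernels
    // launched back-to-back, all reading/writing the same device buffers.
    #ifdef MGONGPUCPP_GPUIMPL
    inline void runSomeDiagrams( fptype* wfs, fptype* jamps, const unsigned int* channelIds,
                                 const fptype* couplings, fptype* numerators, fptype* denominators,
                                 int gpublocks, int gputhreads )
    {
      diagram409<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
      diagram410<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
      diagram411<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
    }
    #endif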
channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 412 OF 1240 *** + // Wavefunction(s) for diagram number 412 + // (none) + // Amplitude(s) for diagram number 412 + FFV1_0( w_fp[3], w_fp[47], w_fp[104], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 412 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram413( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 413 OF 1240 *** + // Wavefunction(s) for diagram number 413 + // (none) + // Amplitude(s) for diagram number 413 + FFV1_0( w_fp[3], w_fp[106], w_fp[66], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 413 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram414( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add 
+  __global__ void
+  diagram414( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 414 OF 1240 ***
+    // Wavefunction(s) for diagram number 414
+    // (none)
+    // Amplitude(s) for diagram number 414
+    FFV1_0( w_fp[99], w_fp[47], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 414 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram415( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 415 OF 1240 ***
+    // Wavefunction(s) for diagram number 415
+    // (none)
+    // Amplitude(s) for diagram number 415
+    FFV1_0( w_fp[41], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 415 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram416( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 416 OF 1240 ***
+    // Wavefunction(s) for diagram number 416
+    // (none)
+    // Amplitude(s) for diagram number 416
+    FFV1_0( w_fp[41], w_fp[102], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 416 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram417( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 417 OF 1240 ***
+    // Wavefunction(s) for diagram number 417
+    // (none)
+    // Amplitude(s) for diagram number 417
+    FFV1_0( w_fp[101], w_fp[2], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 417 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram418( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 418 OF 1240 ***
+    // Wavefunction(s) for diagram number 418
+    // (none)
+    // Amplitude(s) for diagram number 418
+    FFV1_0( w_fp[76], w_fp[102], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 418 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram419( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 419 OF 1240 ***
+    // Wavefunction(s) for diagram number 419
+    // (none)
+    // Amplitude(s) for diagram number 419
+    FFV1_0( w_fp[3], w_fp[102], w_fp[75], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 419 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram420( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 420 OF 1240 ***
+    // Wavefunction(s) for diagram number 420
+    // (none)
+    // Amplitude(s) for diagram number 420
+    FFV1_0( w_fp[99], w_fp[97], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 420 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram421( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 421 OF 1240 ***
+    // Wavefunction(s) for diagram number 421
+    // (none)
+    // Amplitude(s) for diagram number 421
+    FFV1_0( w_fp[99], w_fp[2], w_fp[75], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 421 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram422( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 422 OF 1240 ***
+    // Wavefunction(s) for diagram number 422
+    // (none)
+    // Amplitude(s) for diagram number 422
+    FFV1_0( w_fp[3], w_fp[97], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 422 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram423( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 423 OF 1240 ***
+    // Wavefunction(s) for diagram number 423
+    // (none)
+    // Amplitude(s) for diagram number 423
+    FFV1_0( w_fp[76], w_fp[2], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 423 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram424( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 424 OF 1240 ***
+    // Wavefunction(s) for diagram number 424
+    // (none)
+    // Amplitude(s) for diagram number 424
+    VVVV1_0( w_fp[92], w_fp[72], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    VVVV3_0( w_fp[92], w_fp[72], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+    VVVV4_0( w_fp[92], w_fp[72], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
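+  // [Editor's note, hedged] The four-gluon-vertex kernels such as diagram424 above differ from the others in two
+  // ways: they evaluate three colour structures (VVVV1_0, VVVV3_0, VVVV4_0) into the same amp_sv[0] scratch, and
+  // they add no numerators/denominators contributions, presumably because diagrams with a 4-point vertex are not
+  // mapped to a single-diagram-enhancement channel. For the other kernels the multichannel weight is built from
+  // |amp|^2 via cxabs2, which is assumed to behave like this squared-modulus helper:
+  //   inline fptype_sv cxabs2( const cxtype_sv& c ) { return cxreal( c ) * cxreal( c ) + cximag( c ) * cximag( c ); }
+  // so that, after all diagrams have run, numerators/denominators is the fraction of the squared amplitude of the
+  // single diagram selected by channelId within the sum over all contributing diagrams.
+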
+  __global__ void
+  diagram425( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 425 OF 1240 ***
+    // Wavefunction(s) for diagram number 425
+    VVV1P0_1( w_fp[92], w_fp[72], COUPs[0], 1.0, 0., 0., w_fp[104] );
+    // Amplitude(s) for diagram number 425
+    VVV1_0( w_fp[8], w_fp[5], w_fp[104], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 425 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram426( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 426 OF 1240 ***
+    // Wavefunction(s) for diagram number 426
+    // (none)
+    // Amplitude(s) for diagram number 426
+    VVV1_0( w_fp[72], w_fp[5], w_fp[107], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 426 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram427( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 427 OF 1240 ***
+    // Wavefunction(s) for diagram number 427
+    // (none)
+    // Amplitude(s) for diagram number 427
+    VVV1_0( w_fp[72], w_fp[8], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 427 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram428( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 428 OF 1240 ***
+    // Wavefunction(s) for diagram number 428
+    // (none)
+    // Amplitude(s) for diagram number 428
+    FFV1_0( w_fp[3], w_fp[39], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 428 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram429( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 429 OF 1240 ***
+    // Wavefunction(s) for diagram number 429
+    // (none)
+    // Amplitude(s) for diagram number 429
+    FFV1_0( w_fp[3], w_fp[105], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 429 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram430( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 430 OF 1240 ***
+    // Wavefunction(s) for diagram number 430
+    // (none)
+    // Amplitude(s) for diagram number 430
+    FFV1_0( w_fp[99], w_fp[39], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 430 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram431( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 431 OF 1240 ***
+    // Wavefunction(s) for diagram number 431
+    // (none)
+    // Amplitude(s) for diagram number 431
+    FFV1_0( w_fp[38], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 431 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram432( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 432 OF 1240 ***
+    // Wavefunction(s) for diagram number 432
+    // (none)
+    // Amplitude(s) for diagram number 432
+    FFV1_0( w_fp[38], w_fp[102], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 432 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram433( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 433 OF 1240 ***
+    // Wavefunction(s) for diagram number 433
+    // (none)
+    // Amplitude(s) for diagram number 433
+    FFV1_0( w_fp[98], w_fp[2], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 433 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram434( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 434 OF 1240 ***
+    // Wavefunction(s) for diagram number 434
+    VVV1P0_1( w_fp[92], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[104] );
+    // Amplitude(s) for diagram number 434
+    VVV1_0( w_fp[104], w_fp[10], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 434 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
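+  // [Editor's note, hedged] J_ACCESS::kernelAccessIcol( jamps, icol ) is assumed to return a cxtype_sv reference
+  // to colour-flow amplitude icol of the current event (or C++ event page), given the documented layout
+  // jamps[ncolor*2*nevtORneppV]. A sketch of the scalar (neppV == 1) case only, under that assumption:
+  //   static inline cxtype_sv& kernelAccessIcol( fptype* jamps, const int icol )
+  //   {
+  //     return reinterpret_cast<cxtype_sv*>( jamps )[icol]; // icol-th (re,im) pair for this event
+  //   }
+  // For SIMD event pages (neppV > 1) the real accessor presumably interleaves neppV events per component instead.
+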
#diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 435 OF 1240 *** + // Wavefunction(s) for diagram number 435 + // (none) + // Amplitude(s) for diagram number 435 + VVV1_0( w_fp[104], w_fp[11], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 435 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram436( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 436 OF 1240 *** + // Wavefunction(s) for diagram number 436 + // (none) + // Amplitude(s) for diagram number 436 + VVVV1_0( w_fp[8], w_fp[5], w_fp[6], w_fp[104], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 
74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + VVVV3_0( w_fp[8], w_fp[5], w_fp[6], w_fp[104], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + VVVV4_0( w_fp[8], w_fp[5], w_fp[6], w_fp[104], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram437( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, 
the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 437 OF 1240 *** + // Wavefunction(s) for diagram number 437 + VVV1P0_1( w_fp[1], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[108] ); + // Amplitude(s) for diagram number 437 + VVV1_0( w_fp[62], w_fp[108], w_fp[6], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 437 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram438( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 438 OF 1240 *** + // Wavefunction(s) for diagram number 438 + // (none) + // Amplitude(s) for diagram number 438 + VVV1_0( w_fp[62], w_fp[1], w_fp[11], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 438 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram439( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 439 OF 1240 *** + // Wavefunction(s) for diagram number 439 + // (none) + // Amplitude(s) for diagram number 439 + VVVV1_0( w_fp[1], w_fp[8], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + VVVV3_0( w_fp[1], w_fp[8], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + VVVV4_0( w_fp[1], w_fp[8], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram440( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 440 OF 1240 *** + // Wavefunction(s) for diagram number 440 + // (none) + // Amplitude(s) for diagram number 440 + VVV1_0( w_fp[86], w_fp[108], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 440 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram441( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef 
+ __global__ void
+ diagram441( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 441 OF 1240 ***
+ // Wavefunction(s) for diagram number 441
+ // (none)
+ // Amplitude(s) for diagram number 441
+ VVV1_0( w_fp[86], w_fp[1], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 441 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram442( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 442 OF 1240 ***
+ // Wavefunction(s) for diagram number 442
+ // (none)
+ // Amplitude(s) for diagram number 442
+ VVVV1_0( w_fp[1], w_fp[8], w_fp[5], w_fp[86], COUPs[2], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+ VVVV3_0( w_fp[1], w_fp[8], w_fp[5], w_fp[86], COUPs[2], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+ VVVV4_0( w_fp[1], w_fp[8], w_fp[5], w_fp[86], COUPs[2], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram443( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 443 OF 1240 ***
+ // Wavefunction(s) for diagram number 443
+ VVVV1P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[109] );
+ VVVV3P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[110] );
+ VVVV4P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[111] );
+ // Amplitude(s) for diagram number 443
+ VVV1_0( w_fp[8], w_fp[6], w_fp[109], COUPs[0], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+ VVV1_0( w_fp[8], w_fp[6], w_fp[110], COUPs[0], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+ VVV1_0( w_fp[8], w_fp[6], w_fp[111], COUPs[0], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram444( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 444 OF 1240 ***
+ // Wavefunction(s) for diagram number 444
+ VVVV1P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[112] );
+ VVVV3P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[113] );
+ VVVV4P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[114] );
+ // Amplitude(s) for diagram number 444
+ VVV1_0( w_fp[8], w_fp[5], w_fp[112], COUPs[0], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+ VVV1_0( w_fp[8], w_fp[5], w_fp[113], COUPs[0], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+ VVV1_0( w_fp[8], w_fp[5], w_fp[114], COUPs[0], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram445( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 445 OF 1240 ***
+ // Wavefunction(s) for diagram number 445
+ // (none)
+ // Amplitude(s) for diagram number 445
+ VVV1_0( w_fp[1], w_fp[8], w_fp[88], COUPs[0], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+ VVV1_0( w_fp[1], w_fp[8], w_fp[90], COUPs[0], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+ VVV1_0( w_fp[1], w_fp[8], w_fp[96], COUPs[0], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram446( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 446 OF 1240 ***
+ // Wavefunction(s) for diagram number 446
+ // (none)
+ // Amplitude(s) for diagram number 446
+ VVVV1_0( w_fp[92], w_fp[1], w_fp[8], w_fp[29], COUPs[2], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+ VVVV3_0( w_fp[92], w_fp[1], w_fp[8], w_fp[29], COUPs[2], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+ VVVV4_0( w_fp[92], w_fp[1], w_fp[8], w_fp[29], COUPs[2], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram447( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 447 OF 1240 ***
+ // Wavefunction(s) for diagram number 447
+ // (none)
+ // Amplitude(s) for diagram number 447
+ VVV1_0( w_fp[8], w_fp[29], w_fp[104], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 447 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram448( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 448 OF 1240 ***
+ // Wavefunction(s) for diagram number 448
+ // (none)
+ // Amplitude(s) for diagram number 448
+ VVV1_0( w_fp[1], w_fp[29], w_fp[107], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 448 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
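[Editor's note] The #ifdef MGONGPU_SUPPORTS_MULTICHANNEL block repeated in diagram440, 441, 447 and 448 above implements single-diagram enhancement (SDE) for MadEvent multichanneling: every diagram that defines a channel adds its |amplitude|^2 to the shared denominator, while only the currently selected channel also adds it to the numerator, so the multichannel weight is numerator over denominator. A condensed view of the recurring pattern, with N standing for the diagram number:

  #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
  if( channelId == N ) numerators_sv += cxabs2( amp_sv[0] );   // only the channel being enhanced
  if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); // every diagram, unless SDE is disabled (channelId == 0)
  #endif
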
+ __global__ void
+ diagram449( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 449 OF 1240 ***
+ // Wavefunction(s) for diagram number 449
+ // (none)
+ // Amplitude(s) for diagram number 449
+ VVV1_0( w_fp[1], w_fp[8], w_fp[95], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 449 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram450( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 450 OF 1240 ***
+ // Wavefunction(s) for diagram number 450
+ // (none)
+ // Amplitude(s) for diagram number 450
+ VVV1_0( w_fp[104], w_fp[45], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 450 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram451( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 451 OF 1240 ***
+ // Wavefunction(s) for diagram number 451
+ // (none)
+ // Amplitude(s) for diagram number 451
+ FFV1_0( w_fp[3], w_fp[44], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 451 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram452( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 452 OF 1240 ***
+ // Wavefunction(s) for diagram number 452
+ // (none)
+ // Amplitude(s) for diagram number 452
+ FFV1_0( w_fp[99], w_fp[89], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 452 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram453( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 453 OF 1240 ***
+ // Wavefunction(s) for diagram number 453
+ // (none)
+ // Amplitude(s) for diagram number 453
+ FFV1_0( w_fp[99], w_fp[44], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 453 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram454( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 454 OF 1240 ***
+ // Wavefunction(s) for diagram number 454
+ // (none)
+ // Amplitude(s) for diagram number 454
+ FFV1_0( w_fp[3], w_fp[89], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 454 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram455( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 455 OF 1240 ***
+ // Wavefunction(s) for diagram number 455
+ // (none)
+ // Amplitude(s) for diagram number 455
+ VVV1_0( w_fp[86], w_fp[1], w_fp[45], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 455 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
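[Editor's note] Each amplitude is scattered into the color-ordered amplitudes jamps with relative signs, or with factors of the imaginary unit cxtype( 0, 1 ) where the color decomposition requires them, and J_ACCESS::kernelAccessIcol selects color flow icol for the current event (or SIMD event page). After the last diagram kernel has run, the squared matrix element is obtained by contracting jamps with the constant color matrix. The following sketch of that final contraction assumes the usual cudacpp names cf, denom, jamp_sv, ncolor and deltaMEs, which do not appear in this diff:

  // Sketch only: color sum over the accumulated color-ordered amplitudes
  for( int icol = 0; icol < ncolor; icol++ )
  {
    cxtype_sv ztemp_sv = cxzero_sv();
    for( int jcol = 0; jcol < ncolor; jcol++ )
      ztemp_sv += cf[icol][jcol] * jamp_sv[jcol]; // color matrix times jamp vector
    deltaMEs += cxreal( ztemp_sv * cxconj( jamp_sv[icol] ) ) / denom[icol];
  }
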
+ __global__ void
+ diagram456( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 456 OF 1240 ***
+ // Wavefunction(s) for diagram number 456
+ // (none)
+ // Amplitude(s) for diagram number 456
+ FFV1_0( w_fp[3], w_fp[39], w_fp[112], COUPs[1], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0];
+ FFV1_0( w_fp[3], w_fp[39], w_fp[113], COUPs[1], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+ FFV1_0( w_fp[3], w_fp[39], w_fp[114], COUPs[1], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram457( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 457 OF 1240 ***
+ // Wavefunction(s) for diagram number 457
+ // (none)
+ // Amplitude(s) for diagram number 457
+ FFV1_0( w_fp[41], w_fp[39], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 457 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram458( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 458 OF 1240 ***
+ // Wavefunction(s) for diagram number 458
+ // (none)
+ // Amplitude(s) for diagram number 458
+ FFV1_0( w_fp[41], w_fp[105], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 458 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram459( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 459 OF 1240 ***
+ // Wavefunction(s) for diagram number 459
+ // (none)
+ // Amplitude(s) for diagram number 459
+ FFV1_0( w_fp[101], w_fp[39], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 459 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram460( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 460 OF 1240 ***
+ // Wavefunction(s) for diagram number 460
+ // (none)
+ // Amplitude(s) for diagram number 460
+ VVV1_0( w_fp[104], w_fp[51], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 460 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram461( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 461 OF 1240 ***
+ // Wavefunction(s) for diagram number 461
+ // (none)
+ // Amplitude(s) for diagram number 461
+ FFV1_0( w_fp[3], w_fp[50], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 461 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram462( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 462 OF 1240 *** + // Wavefunction(s) for diagram number 462 + // (none) + // Amplitude(s) for diagram number 462 + FFV1_0( w_fp[99], w_fp[91], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 462 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram463( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 463 OF 1240 *** + // Wavefunction(s) for diagram number 463 + // (none) + // Amplitude(s) for diagram number 463 + FFV1_0( w_fp[99], w_fp[50], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 463 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram464( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 464 OF 1240 *** + // Wavefunction(s) for diagram number 464 + // (none) + // 
+  __global__ void
+  diagram464( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 464 OF 1240 ***
+    // Wavefunction(s) for diagram number 464
+    // (none)
+    // Amplitude(s) for diagram number 464
+    FFV1_0( w_fp[3], w_fp[91], w_fp[62], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 464 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram465( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 465 OF 1240 ***
+    // Wavefunction(s) for diagram number 465
+    // (none)
+    // Amplitude(s) for diagram number 465
+    VVV1_0( w_fp[62], w_fp[1], w_fp[51], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 465 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram466( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 466 OF 1240 ***
+    // Wavefunction(s) for diagram number 466
+    // (none)
+    // Amplitude(s) for diagram number 466
+    FFV1_0( w_fp[3], w_fp[47], w_fp[109], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[47], w_fp[110], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[47], w_fp[111], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram467( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 467 OF 1240 ***
+    // Wavefunction(s) for diagram number 467
+    // (none)
+    // Amplitude(s) for diagram number 467
+    FFV1_0( w_fp[38], w_fp[47], w_fp[104], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 467 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram468( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 468 OF 1240 ***
+    // Wavefunction(s) for diagram number 468
+    // (none)
+    // Amplitude(s) for diagram number 468
+    FFV1_0( w_fp[38], w_fp[106], w_fp[1], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 468 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram469( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 469 OF 1240 ***
+    // Wavefunction(s) for diagram number 469
+    // (none)
+    // Amplitude(s) for diagram number 469
+    FFV1_0( w_fp[98], w_fp[47], w_fp[1], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 469 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
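[Editor's note: the two MGONGPU_SUPPORTS_MULTICHANNEL lines repeated in each kernel implement the single-diagram-enhancement bookkeeping: every single-amplitude diagram adds |amp|^2 to the running denominator, while only the diagram matching the sampled channel adds to the numerator, and the ratio is the multichannel weight applied downstream. A minimal sketch of that arithmetic, with cxabs2 written out explicitly (the cxreal/cximag helper names are assumptions for illustration, not taken from this diff):

    // |z|^2 of a complex amplitude, as accumulated by the kernels above
    inline fptype cxabs2( const cxtype& z ) { return cxreal( z ) * cxreal( z ) + cximag( z ) * cximag( z ); }
    // After all diagram kernels have run, the weight for the sampled channel is
    // presumably the ratio of the two sums, e.g. for a scalar build:
    //   const fptype sdeWeight = numerators_sv / denominators_sv;
]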
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram470( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 470 OF 1240 ***
+    // Wavefunction(s) for diagram number 470
+    // (none)
+    // Amplitude(s) for diagram number 470
+    VVV1_0( w_fp[104], w_fp[23], w_fp[6], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 470 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram471( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 471 OF 1240 ***
+    // Wavefunction(s) for diagram number 471
+    // (none)
+    // Amplitude(s) for diagram number 471
+    FFV1_0( w_fp[48], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 471 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram472( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 472 OF 1240 ***
+    // Wavefunction(s) for diagram number 472
+    // (none)
+    // Amplitude(s) for diagram number 472
+    FFV1_0( w_fp[58], w_fp[102], w_fp[6], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 472 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram473( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 473 OF 1240 ***
+    // Wavefunction(s) for diagram number 473
+    // (none)
+    // Amplitude(s) for diagram number 473
+    FFV1_0( w_fp[48], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 473 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram474( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 474 OF 1240 ***
+    // Wavefunction(s) for diagram number 474
+    // (none)
+    // Amplitude(s) for diagram number 474
+    FFV1_0( w_fp[58], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 474 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram475( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 475 OF 1240 ***
+    // Wavefunction(s) for diagram number 475
+    // (none)
+    // Amplitude(s) for diagram number 475
+    VVV1_0( w_fp[86], w_fp[1], w_fp[23], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 475 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
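[Editor's note: each amplitude feeds a handful of color-flow amplitudes ("jamps") with weight +1, -1, +i or -i (the cxtype( 0, 1 ) factor). Once all 1240 diagram kernels have run, |M|^2 is presumably obtained by contracting the jamps with the color matrix; schematically, for one event and helicity (cf, denom, ncolor and cxconj are illustrative assumptions here, not identifiers confirmed by this diff):

    // Schematic color sum over the ncolor color flows
    fptype deltaME = 0;
    for( int icol = 0; icol < ncolor; icol++ )
    {
      cxtype ztemp = cxtype( 0, 0 );
      for( int jcol = 0; jcol < ncolor; jcol++ )
        ztemp += cf[icol][jcol] * jamp[jcol]; // color-matrix row dotted into the jamp vector
      deltaME += cxreal( ztemp * cxconj( jamp[icol] ) ) / denom[icol];
    }
]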
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram476( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 476 OF 1240 ***
+    // Wavefunction(s) for diagram number 476
+    // (none)
+    // Amplitude(s) for diagram number 476
+    FFV1_0( w_fp[38], w_fp[2], w_fp[112], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[38], w_fp[2], w_fp[113], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[38], w_fp[2], w_fp[114], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram477( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 477 OF 1240 ***
+    // Wavefunction(s) for diagram number 477
+    // (none)
+    // Amplitude(s) for diagram number 477
+    VVV1_0( w_fp[104], w_fp[20], w_fp[5], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 477 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram478( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 478 OF 1240 ***
+    // Wavefunction(s) for diagram number 478
+    // (none)
+    // Amplitude(s) for diagram number 478
+    FFV1_0( w_fp[40], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 478 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram479( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 479 OF 1240 ***
+    // Wavefunction(s) for diagram number 479
+    // (none)
+    // Amplitude(s) for diagram number 479
+    FFV1_0( w_fp[60], w_fp[102], w_fp[5], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 479 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram480( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 480 OF 1240 ***
+    // Wavefunction(s) for diagram number 480
+    // (none)
+    // Amplitude(s) for diagram number 480
+    FFV1_0( w_fp[40], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 480 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram481( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 481 OF 1240 ***
+    // Wavefunction(s) for diagram number 481
+    // (none)
+    // Amplitude(s) for diagram number 481
+    FFV1_0( w_fp[60], w_fp[2], w_fp[62], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 481 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram482( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 482 OF 1240 ***
+    // Wavefunction(s) for diagram number 482
+    // (none)
+    // Amplitude(s) for diagram number 482
+    VVV1_0( w_fp[62], w_fp[1], w_fp[20], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 482 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
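[Editor's note: diagrams with several amplitudes, such as 466, 476 and the following 483, reuse amp_sv[0] for each successive FFV1_0 call and carry no numerator/denominator block, so they contribute to the jamps but never to the channel weight. The per-diagram kernels are only meaningful when run in sequence over the same wfs/jamps buffers; a hypothetical host-side driver (gpuLaunchKernel, gpublocks/gputhreads and the buffer names are assumptions, since the actual scheduling code is outside this hunk) might look like:

    // Hypothetical sequential launch of the generated per-diagram kernels
    using DiagramKernel = void ( * )( fptype*, fptype*, const unsigned int*, const fptype*, fptype*, fptype* );
    static const DiagramKernel kernels[] = { diagram462, diagram463, diagram464 /* ... up to diagram1240 */ };
    for( DiagramKernel k : kernels )
      gpuLaunchKernel( k, gpublocks, gputhreads, wfs, jamps, channelIds, couplings, numerators, denominators );
]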
+  __global__ void
+  diagram483( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 483 OF 1240 ***
+    // Wavefunction(s) for diagram number 483
+    // (none)
+    // Amplitude(s) for diagram number 483
+    FFV1_0( w_fp[41], w_fp[2], w_fp[109], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[2], w_fp[110], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[2], w_fp[111], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram484( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 484 OF 1240 ***
+    // Wavefunction(s) for diagram number 484
+    // (none)
+    // Amplitude(s) for diagram number 484
+    FFV1_0( w_fp[3], w_fp[18], w_fp[104], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 484 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram485( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 485 OF 1240 ***
+    // Wavefunction(s) for diagram number 485
+    // (none)
+    // Amplitude(s) for diagram number 485
+    FFV1_0( w_fp[12], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 485 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram486( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 486 OF 1240 ***
+    // Wavefunction(s) for diagram number 486
+    // (none)
+    // Amplitude(s) for diagram number 486
+    FFV1_0( w_fp[3], w_fp[102], w_fp[67], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 486 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram487( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 487 OF 1240 ***
+    // Wavefunction(s) for diagram number 487
+    // (none)
+    // Amplitude(s) for diagram number 487
+    FFV1_0( w_fp[12], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 487 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
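[Editor's note: the two signature branches reflect the data layouts of the two builds: on GPU each kernel receives one global array of event-dependent couplings and must locate its own event, while the C++/SIMD build receives ready-made per-event-page pointers. A sketch of the GPU-side decoding this implies (the CD_ACCESS accessor, ndcoup and nxcoup names are assumptions borrowed for illustration, not code shown in this diff):

    #ifdef MGONGPUCPP_GPUIMPL
    // Build the per-event COUPs[] array that the helas calls index as COUPs[0], COUPs[1], ...
    const fptype* COUPs[nxcoup];
    for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
      COUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup ); // dependent couplings, this event
    #endif
]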
+  __global__ void
+  diagram488( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 488 OF 1240 ***
+    // Wavefunction(s) for diagram number 488
+    // (none)
+    // Amplitude(s) for diagram number 488
+    FFV1_0( w_fp[99], w_fp[2], w_fp[67], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 488 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram489( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 489 OF 1240 ***
+    // Wavefunction(s) for diagram number 489
+    // (none)
+    // Amplitude(s) for diagram number 489
+    FFV1_0( w_fp[99], w_fp[18], w_fp[1], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 489 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram490( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 490 OF 1240 ***
+    // Wavefunction(s) for diagram number 490
+    // (none)
+    // Amplitude(s) for diagram number 490
+    FFV1_0( w_fp[3], w_fp[102], w_fp[55], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[102], w_fp[83], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[102], w_fp[84], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram491( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 491 OF 1240 ***
+    // Wavefunction(s) for diagram number 491
+    // (none)
+    // Amplitude(s) for diagram number 491
+    FFV1_0( w_fp[99], w_fp[2], w_fp[55], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[99], w_fp[2], w_fp[83], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[99], w_fp[2], w_fp[84], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram492( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 492 OF 1240 ***
+    // Wavefunction(s) for diagram number 492
+    // (none)
+    // Amplitude(s) for diagram number 492
+    VVV1_0( w_fp[92], w_fp[55], w_fp[8], COUPs[0], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    VVV1_0( w_fp[92], w_fp[83], w_fp[8], COUPs[0], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+    VVV1_0( w_fp[92], w_fp[84], w_fp[8], COUPs[0], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram493( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 493 OF 1240 ***
+    // Wavefunction(s) for diagram number 493
+    VVV1P0_1( w_fp[0], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[92] );
+    FFV1_2( w_fp[3], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] );
+    // Amplitude(s) for diagram number 493
+    FFV1_0( w_fp[99], w_fp[87], w_fp[6], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 493 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram494( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 494 OF 1240 ***
+    // Wavefunction(s) for diagram number 494
+    // (none)
+    // Amplitude(s) for diagram number 494
+    FFV1_0( w_fp[99], w_fp[85], w_fp[4], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 494 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 495 OF 1240 *** + // Wavefunction(s) for diagram number 495 + VVV1P0_1( w_fp[92], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[102] ); + // Amplitude(s) for diagram number 495 + VVV1_0( w_fp[102], w_fp[34], w_fp[6], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 495 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram496( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 496 OF 1240 *** + // Wavefunction(s) for diagram number 496 + // (none) + // Amplitude(s) for diagram number 496 + FFV1_0( w_fp[3], w_fp[85], w_fp[102], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 496 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram497( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output 
jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 497 OF 1240 *** + // Wavefunction(s) for diagram number 497 + VVV1P0_1( w_fp[92], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[104] ); + // Amplitude(s) for diagram number 497 + VVV1_0( w_fp[104], w_fp[34], w_fp[4], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 497 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram498( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 498 OF 1240 *** + // Wavefunction(s) for diagram number 498 + // (none) + // Amplitude(s) for diagram number 498 + FFV1_0( w_fp[3], w_fp[87], w_fp[104], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 498 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + 
} + + //-------------------------------------------------------------------------- + + __global__ void + diagram499( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 499 OF 1240 *** + // Wavefunction(s) for diagram number 499 + VVVV1P0_1( w_fp[92], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[111] ); + VVVV3P0_1( w_fp[92], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[110] ); + VVVV4P0_1( w_fp[92], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[109] ); + // Amplitude(s) for diagram number 499 + FFV1_0( w_fp[3], w_fp[77], w_fp[111], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[77], w_fp[110], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[77], w_fp[109], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram500( fptype* wfs, // input/output 
wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 500 OF 1240 *** + // Wavefunction(s) for diagram number 500 + FFV1_1( w_fp[77], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[62] ); + // Amplitude(s) for diagram number 500 + FFV1_0( w_fp[46], w_fp[62], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 500 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram501( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 501 OF 1240 *** + // Wavefunction(s) for diagram number 501 + FFV1_2( w_fp[46], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[114] ); + // Amplitude(s) for diagram number 501 + FFV1_0( w_fp[114], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 501 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram502( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef 
MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 502 OF 1240 *** + // Wavefunction(s) for diagram number 502 + // (none) + // Amplitude(s) for diagram number 502 + FFV1_0( w_fp[46], w_fp[77], w_fp[104], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 502 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram503( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 503 OF 1240 *** + // Wavefunction(s) for diagram number 503 + // (none) + // Amplitude(s) for diagram number 503 + FFV1_0( w_fp[41], w_fp[62], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 503 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram504( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel 
numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 504 OF 1240 *** + // Wavefunction(s) for diagram number 504 + FFV1_2( w_fp[41], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[113] ); + // Amplitude(s) for diagram number 504 + FFV1_0( w_fp[113], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 504 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram505( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 505 OF 1240 *** + // Wavefunction(s) for diagram number 505 + // (none) + // Amplitude(s) for diagram number 505 + FFV1_0( w_fp[41], w_fp[77], w_fp[102], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 505 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram506( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, 
numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 506 OF 1240 *** + // Wavefunction(s) for diagram number 506 + // (none) + // Amplitude(s) for diagram number 506 + FFV1_0( w_fp[3], w_fp[62], w_fp[27], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 506 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram507( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 507 OF 1240 *** + // Wavefunction(s) for diagram number 507 + // (none) + // Amplitude(s) for diagram number 507 + FFV1_0( w_fp[99], w_fp[77], w_fp[27], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 507 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram508( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include 
"diagram_boilerplate.h" + // *** DIAGRAM 508 OF 1240 *** + // Wavefunction(s) for diagram number 508 + VVV1P0_1( w_fp[92], w_fp[27], COUPs[0], 1.0, 0., 0., w_fp[62] ); + // Amplitude(s) for diagram number 508 + FFV1_0( w_fp[3], w_fp[77], w_fp[62], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 508 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram509( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 509 OF 1240 *** + // Wavefunction(s) for diagram number 509 + FFV1_1( w_fp[2], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[112] ); + // Amplitude(s) for diagram number 509 + FFV1_0( w_fp[56], w_fp[112], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 509 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram510( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, 
numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 510 OF 1240 *** + // Wavefunction(s) for diagram number 510 + // (none) + // Amplitude(s) for diagram number 510 + FFV1_0( w_fp[21], w_fp[112], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 510 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram511( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 511 OF 1240 *** + // Wavefunction(s) for diagram number 511 + // (none) + // Amplitude(s) for diagram number 511 + VVV1_0( w_fp[102], w_fp[103], w_fp[6], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 511 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram512( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], 
add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 512 OF 1240 *** + // Wavefunction(s) for diagram number 512 + // (none) + // Amplitude(s) for diagram number 512 + FFV1_0( w_fp[21], w_fp[2], w_fp[102], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 512 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram513( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 513 OF 1240 *** + // Wavefunction(s) for diagram number 513 + // (none) + // Amplitude(s) for diagram number 513 + VVV1_0( w_fp[104], w_fp[103], w_fp[4], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 513 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram514( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* 
numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 514 OF 1240 *** + // Wavefunction(s) for diagram number 514 + // (none) + // Amplitude(s) for diagram number 514 + FFV1_0( w_fp[56], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 514 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram515( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 515 OF 1240 *** + // Wavefunction(s) for diagram number 515 + // (none) + // Amplitude(s) for diagram number 515 + FFV1_0( w_fp[52], w_fp[2], w_fp[111], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[52], w_fp[2], w_fp[110], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) 
+= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[52], w_fp[2], w_fp[109], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram516( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 516 OF 1240 *** + // Wavefunction(s) for diagram number 516 + FFV1_2( w_fp[52], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[86] ); + // Amplitude(s) for diagram number 516 + FFV1_0( w_fp[86], w_fp[33], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 516 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram517( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 517 OF 1240 *** + // Wavefunction(s) for diagram number 517 + FFV1_1( w_fp[33], w_fp[92], COUPs[1], 1.0, 
cIPD[0], cIPD[1], w_fp[98] ); + // Amplitude(s) for diagram number 517 + FFV1_0( w_fp[52], w_fp[98], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 517 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram518( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 518 OF 1240 *** + // Wavefunction(s) for diagram number 518 + // (none) + // Amplitude(s) for diagram number 518 + FFV1_0( w_fp[52], w_fp[33], w_fp[104], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 518 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram519( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 519 OF 1240 *** + // Wavefunction(s) for diagram number 519 + // (none) + // Amplitude(s) for diagram number 519 + FFV1_0( w_fp[86], w_fp[47], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 519 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += 
cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram520( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 520 OF 1240 *** + // Wavefunction(s) for diagram number 520 + FFV1_1( w_fp[47], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[106] ); + // Amplitude(s) for diagram number 520 + FFV1_0( w_fp[52], w_fp[106], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 520 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram521( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 521 OF 1240 *** + // Wavefunction(s) for diagram number 521 + // (none) + // Amplitude(s) for diagram number 521 + FFV1_0( w_fp[52], w_fp[47], w_fp[102], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 521 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + } + + 
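Every diagramXXX body above starts by including diagram_boilerplate.h, which is not part of this hunk. The sketch below is a hypothetical reconstruction of what such boilerplate could expand to, assuming the usual cudacpp conventions (NUM_ACCESS/DEN_ACCESS kernel accessors and an amp_sv/amp_fp amplitude buffer); the accessor names and exact layout are assumptions, not the contents of the actual header:

    // Hypothetical sketch of diagram_boilerplate.h (illustration only; assumes <cassert> is available)
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
#ifdef MGONGPUCPP_GPUIMPL
    const unsigned int channelId = channelIds[blockDim.x * blockIdx.x + threadIdx.x]; // one channelId per GPU event
#else
    const unsigned int channelId = channelIds[0];                                     // one SCALAR channelId per C++ event page
#endif
    fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );                // numerator accumulator for this event (page)
    fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );            // denominator accumulator for this event (page)
#else
    assert( channelIds == nullptr );                                                  // sanity check: multichannel disabled
    assert( numerators == nullptr );                                                  // sanity check: multichannel disabled
    assert( denominators == nullptr );                                                // sanity check: multichannel disabled
#endif
    cxtype_sv amp_sv[1];                                                              // buffer for one amplitude at a time
    fptype* amp_fp = reinterpret_cast<fptype*>( amp_sv );                             // fptype view passed to helas calls as &amp_fp[0]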
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram522( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 522 OF 1240 ***
+    // Wavefunction(s) for diagram number 522
+    // (none)
+    // Amplitude(s) for diagram number 522
+    FFV1_0( w_fp[86], w_fp[2], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 522 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram523( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 523 OF 1240 ***
+    // Wavefunction(s) for diagram number 523
+    // (none)
+    // Amplitude(s) for diagram number 523
+    FFV1_0( w_fp[52], w_fp[112], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 523 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram524( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 524 OF 1240 ***
+    // Wavefunction(s) for diagram number 524
+    // (none)
+    // Amplitude(s) for diagram number 524
+    FFV1_0( w_fp[52], w_fp[2], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 524 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram525( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 525 OF 1240 ***
+    // Wavefunction(s) for diagram number 525
+    // (none)
+    // Amplitude(s) for diagram number 525
+    FFV1_0( w_fp[65], w_fp[112], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 525 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+  }
+
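Because every diagramXXX kernel shares this exact signature, a driver can treat them uniformly. A hedged sketch of one way such a dispatch could look on the C++ backend, where __global__ expands to nothing (the typedef, table and loop below are invented for illustration and are not taken from this patch):

    // Hypothetical uniform dispatch over the per-diagram kernels (C++ backend sketch, illustration only)
    typedef void ( *DiagramKernel )( fptype* wfs, fptype* jamps, const unsigned int* channelIds, const fptype** COUPs, fptype* numerators, fptype* denominators );
    static const DiagramKernel diagramKernels[] = { diagram492, diagram493, diagram494 /* ..., diagram531, ... */ };
    for( const DiagramKernel kernel : diagramKernels )                    // identical argument list for every diagram
      kernel( wfs, jamps, channelIds, COUPs, numerators, denominators );  // one call per Feynman diagram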
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram526( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 526 OF 1240 ***
+    // Wavefunction(s) for diagram number 526
+    // (none)
+    // Amplitude(s) for diagram number 526
+    FFV1_0( w_fp[3], w_fp[112], w_fp[64], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 526 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram527( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 527 OF 1240 ***
+    // Wavefunction(s) for diagram number 527
+    // (none)
+    // Amplitude(s) for diagram number 527
+    FFV1_0( w_fp[99], w_fp[93], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 527 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram528( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 528 OF 1240 ***
+    // Wavefunction(s) for diagram number 528
+    // (none)
+    // Amplitude(s) for diagram number 528
+    FFV1_0( w_fp[99], w_fp[2], w_fp[64], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 528 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram529( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 529 OF 1240 ***
+    // Wavefunction(s) for diagram number 529
+    // (none)
+    // Amplitude(s) for diagram number 529
+    FFV1_0( w_fp[3], w_fp[93], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 529 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram530( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 530 OF 1240 ***
+    // Wavefunction(s) for diagram number 530
+    // (none)
+    // Amplitude(s) for diagram number 530
+    FFV1_0( w_fp[65], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 530 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram531( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface
for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 531 OF 1240 *** + // Wavefunction(s) for diagram number 531 + // (none) + // Amplitude(s) for diagram number 531 + VVVV1_0( w_fp[92], w_fp[61], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + VVVV3_0( w_fp[92], w_fp[61], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + VVVV4_0( w_fp[92], w_fp[61], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram532( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output 
jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 532 OF 1240 *** + // Wavefunction(s) for diagram number 532 + VVV1P0_1( w_fp[92], w_fp[61], COUPs[0], 1.0, 0., 0., w_fp[86] ); + // Amplitude(s) for diagram number 532 + VVV1_0( w_fp[8], w_fp[6], w_fp[86], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 532 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram533( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 533 OF 1240 *** + // Wavefunction(s) for diagram number 533 + VVV1P0_1( w_fp[92], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[101] ); + // Amplitude(s) for diagram number 533 + VVV1_0( w_fp[61], w_fp[6], w_fp[101], COUPs[0], 1.0, &_fp[0] ); +#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 533 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram534( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 534 OF 1240 *** + // Wavefunction(s) for diagram number 534 + // (none) + // Amplitude(s) for diagram number 534 + VVV1_0( w_fp[61], w_fp[8], w_fp[104], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 534 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram535( fptype* wfs, // 
input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 535 OF 1240 *** + // Wavefunction(s) for diagram number 535 + // (none) + // Amplitude(s) for diagram number 535 + FFV1_0( w_fp[3], w_fp[47], w_fp[86], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 535 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram536( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 536 OF 1240 *** + // Wavefunction(s) for diagram number 536 + // (none) + // Amplitude(s) for diagram number 536 + FFV1_0( w_fp[3], w_fp[106], w_fp[61], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 536 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram537( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 537 OF 1240 *** + // Wavefunction(s) for diagram number 537 + // (none) + // Amplitude(s) for diagram number 537 + FFV1_0( w_fp[99], w_fp[47], w_fp[61], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 537 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram538( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 538 OF 1240 *** + // Wavefunction(s) for diagram number 538 + // (none) + // Amplitude(s) for diagram number 538 + FFV1_0( w_fp[41], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 538 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram539( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 539 OF 1240 *** + // Wavefunction(s) for diagram number 539 + // (none) + // Amplitude(s) for diagram number 539 + FFV1_0( w_fp[41], w_fp[112], w_fp[61], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 539 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram540( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 540 OF 1240 *** + // Wavefunction(s) for diagram number 540 + // (none) + // Amplitude(s) for diagram number 540 + FFV1_0( w_fp[113], w_fp[2], w_fp[61], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 540 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) 
+= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram541( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 541 OF 1240 *** + // Wavefunction(s) for diagram number 541 + // (none) + // Amplitude(s) for diagram number 541 + FFV1_0( w_fp[76], w_fp[112], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 541 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram542( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 542 OF 1240 *** + // Wavefunction(s) for diagram number 542 + // (none) + // Amplitude(s) for diagram number 542 + FFV1_0( w_fp[3], w_fp[112], w_fp[74], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 542 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= 
cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram543( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 543 OF 1240 *** + // Wavefunction(s) for diagram number 543 + // (none) + // Amplitude(s) for diagram number 543 + FFV1_0( w_fp[99], w_fp[97], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 543 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram544( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 544 OF 1240 *** + // Wavefunction(s) for diagram number 544 + // (none) + // Amplitude(s) for diagram number 544 + FFV1_0( w_fp[99], w_fp[2], w_fp[74], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 544 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram545( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 545 OF 1240 *** + // Wavefunction(s) for diagram number 545 + // (none) + // Amplitude(s) for diagram number 545 + FFV1_0( w_fp[3], w_fp[97], w_fp[102], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 545 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram546( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 546 OF 1240 *** + // Wavefunction(s) for diagram number 546 + // (none) + // 
Amplitude(s) for diagram number 546 + FFV1_0( w_fp[76], w_fp[2], w_fp[102], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 546 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram547( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 547 OF 1240 *** + // Wavefunction(s) for diagram number 547 + // (none) + // Amplitude(s) for diagram number 547 + VVVV1_0( w_fp[92], w_fp[72], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0]; + VVVV3_0( w_fp[92], w_fp[72], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) 
-= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + VVVV4_0( w_fp[92], w_fp[72], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram548( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 548 OF 1240 *** + // Wavefunction(s) for diagram number 548 + VVV1P0_1( w_fp[92], w_fp[72], COUPs[0], 1.0, 0., 0., w_fp[86] ); + // Amplitude(s) for diagram number 548 + VVV1_0( w_fp[8], w_fp[4], w_fp[86], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 548 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram549( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 549 OF 1240 *** + // Wavefunction(s) for diagram number 549 + // (none) + // Amplitude(s) for diagram number 549 + VVV1_0( w_fp[72], w_fp[4], w_fp[101], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 549 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram550( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef 
MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 550 OF 1240 *** + // Wavefunction(s) for diagram number 550 + // (none) + // Amplitude(s) for diagram number 550 + VVV1_0( w_fp[72], w_fp[8], w_fp[102], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 550 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram551( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 551 OF 1240 *** + // Wavefunction(s) for diagram number 551 + // (none) + // Amplitude(s) for diagram number 551 + FFV1_0( w_fp[3], w_fp[33], w_fp[86], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 551 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + +
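Every kernelAccessIcol( jamps, icol ) update above adds one diagram's amplitude, with a coefficient of plus or minus 1 or i (the cxtype( 0, 1 ) factors), into color-flow amplitude icol of the current event. A standalone sketch of how such an accessor could address one event, assuming a simple jamps[icol][reim][ievt] layout consistent with the jamps[ncolor*2*nevtORneppV] size in the signatures (readJampIcol and this exact layout are assumptions, not the actual J_ACCESS implementation, which addresses SIMD event pages):

#include <complex>

using fptype = double;

// Illustrative only: read one event's color amplitude icol from a buffer laid out as
// jamps[icol][reim][ievt], matching the jamps[ncolor*2*nevtORneppV] size noted above.
std::complex<fptype> readJampIcol( const fptype* jamps, int icol, int ievt, int nevt )
{
  const fptype re = jamps[( icol * 2 + 0 ) * nevt + ievt]; // real plane of color index icol
  const fptype im = jamps[( icol * 2 + 1 ) * nevt + ievt]; // imaginary plane of color index icol
  return std::complex<fptype>( re, im );
}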
//-------------------------------------------------------------------------- + + __global__ void + diagram552( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 552 OF 1240 *** + // Wavefunction(s) for diagram number 552 + // (none) + // Amplitude(s) for diagram number 552 + FFV1_0( w_fp[3], w_fp[98], w_fp[72], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 552 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram553( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 553 OF 1240 *** + // Wavefunction(s) for diagram number 553 + // (none) + // Amplitude(s) for diagram number 553 + FFV1_0( w_fp[99], w_fp[33], w_fp[72], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 553 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram554( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output
jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 554 OF 1240 *** + // Wavefunction(s) for diagram number 554 + // (none) + // Amplitude(s) for diagram number 554 + FFV1_0( w_fp[46], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 554 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram555( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 555 OF 1240 *** + // Wavefunction(s) for diagram number 555 + // (none) + // Amplitude(s) for diagram number 555 + FFV1_0( w_fp[46], w_fp[112], w_fp[72], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 555 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + } + +
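The jamps accumulated by these kernels are not yet a matrix element: once all diagrams have run, the color-flow amplitudes must be contracted with the constant color matrix, schematically |M|^2 = sum_ij jamp_i cf_ij conj( jamp_j ). A standalone sketch of that final color sum for one event and helicity (colorSum and cf are illustrative names; the plugin's actual color sum lives outside the per-diagram kernels):

#include <complex>
#include <cstddef>
#include <vector>

using cxtype = std::complex<double>;

// Illustrative only: contract the accumulated color amplitudes with a (real, symmetric)
// color matrix cf to obtain the color-summed |M|^2 for one event and one helicity.
double colorSum( const std::vector<cxtype>& jamp, const std::vector<std::vector<double>>& cf )
{
  double me2 = 0.;
  for( std::size_t i = 0; i < jamp.size(); i++ )
    for( std::size_t j = 0; j < jamp.size(); j++ )
      me2 += ( jamp[i] * cf[i][j] * std::conj( jamp[j] ) ).real();
  return me2;
}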
//-------------------------------------------------------------------------- + + __global__ void + diagram556( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 556 OF 1240 *** + // Wavefunction(s) for diagram number 556 + // (none) + // Amplitude(s) for diagram number 556 + FFV1_0( w_fp[114], w_fp[2], w_fp[72], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 556 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram557( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 557 OF 1240 *** + // Wavefunction(s) for diagram number 557 + VVV1P0_1( w_fp[92], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[86] ); + // Amplitude(s) for diagram number 557 + VVV1_0( w_fp[86], w_fp[13], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 557 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram558( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 558 OF 1240 *** + // Wavefunction(s) for diagram number 558 + // (none) + // Amplitude(s) for diagram number 558 + VVV1_0( w_fp[86], w_fp[11], w_fp[4], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 558 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram559( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add 
helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 559 OF 1240 *** + // Wavefunction(s) for diagram number 559 + // (none) + // Amplitude(s) for diagram number 559 + VVVV1_0( w_fp[8], w_fp[4], w_fp[6], w_fp[86], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + VVVV3_0( w_fp[8], w_fp[4], w_fp[6], w_fp[86], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + VVVV4_0( w_fp[8], w_fp[4], w_fp[6], w_fp[86], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + 
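(Note on the repeated multichannel blocks: for each helicity, cxabs2( amp_sv[0] ) of the diagram matching the selected channel is added to numerators_sv, while the same quantity is added to denominators_sv for every single-channel diagram, with channelId == 0 disabling SDE altogether; diagrams whose amplitudes come only from the four-vector vertices VVVV1/VVVV3/VVVV4, such as diagram 559 above, update neither. A hedged sketch of the presumed downstream use, with fSDE a hypothetical name for the single-diagram-enhancement factor:
  // Sketch only (assumed semantics, not code from this patch):
  //   fSDE = numerators_sv / denominators_sv
  //        = |amp(selected channel)|^2 / sum over single-channel diagrams of |amp|^2
)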
__global__ void + diagram560( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 560 OF 1240 *** + // Wavefunction(s) for diagram number 560 + // (none) + // Amplitude(s) for diagram number 560 + VVV1_0( w_fp[102], w_fp[108], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 560 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram561( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 561 OF 1240 *** + // Wavefunction(s) for diagram number 561 + // (none) + // Amplitude(s) for diagram number 561 + VVV1_0( w_fp[102], w_fp[1], w_fp[11], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 561 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram562( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 562 OF 1240 *** + // Wavefunction(s) for diagram number 562 + // (none) + // Amplitude(s) for diagram number 562 + VVVV1_0( w_fp[1], w_fp[8], w_fp[6], w_fp[102], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + VVVV3_0( w_fp[1], w_fp[8], w_fp[6], w_fp[102], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; 
+ J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + VVVV4_0( w_fp[1], w_fp[8], w_fp[6], w_fp[102], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram563( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 563 OF 1240 *** + // Wavefunction(s) for diagram number 563 + // (none) + // Amplitude(s) for diagram number 563 + VVV1_0( w_fp[104], w_fp[108], w_fp[4], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 563 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; 
+ J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram564( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 564 OF 1240 *** + // Wavefunction(s) for diagram number 564 + // (none) + // Amplitude(s) for diagram number 564 + VVV1_0( w_fp[104], w_fp[1], w_fp[13], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 564 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram565( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel 
denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 565 OF 1240 *** + // Wavefunction(s) for diagram number 565 + // (none) + // Amplitude(s) for diagram number 565 + VVVV1_0( w_fp[1], w_fp[8], w_fp[4], w_fp[104], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + VVVV3_0( w_fp[1], w_fp[8], w_fp[4], w_fp[104], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + VVVV4_0( w_fp[1], w_fp[8], w_fp[4], w_fp[104], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram566( fptype* wfs, // input/output 
wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 566 OF 1240 *** + // Wavefunction(s) for diagram number 566 + VVVV1P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[105] ); + VVVV3P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[95] ); + VVVV4P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[107] ); + // Amplitude(s) for diagram number 566 + VVV1_0( w_fp[8], w_fp[6], w_fp[105], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[6], w_fp[95], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[6], w_fp[107], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram567( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 567 OF 1240 *** + // Wavefunction(s) for diagram number 567 + VVVV1P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[96] ); + VVVV3P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[90] ); + VVVV4P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[88] ); + // Amplitude(s) for diagram number 567 + VVV1_0( w_fp[8], w_fp[4], w_fp[96], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[4], w_fp[90], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[4], w_fp[88], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram568( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 568 OF 1240 *** + // Wavefunction(s) for diagram number 568 + // (none) + // Amplitude(s) for diagram number 568 + VVV1_0( w_fp[1], w_fp[8], w_fp[111], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + VVV1_0( w_fp[1], w_fp[8], w_fp[110], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + VVV1_0( w_fp[1], w_fp[8], w_fp[109], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram569( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 569 OF 1240 *** + // Wavefunction(s) for diagram number 569 + // (none) + // Amplitude(s) for diagram number 569 + VVVV1_0( w_fp[92], w_fp[1], w_fp[8], w_fp[27], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + VVVV3_0( w_fp[92], w_fp[1], w_fp[8], w_fp[27], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + VVVV4_0( w_fp[92], w_fp[1], w_fp[8], w_fp[27], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram570( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code 
asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 570 OF 1240 *** + // Wavefunction(s) for diagram number 570 + // (none) + // Amplitude(s) for diagram number 570 + VVV1_0( w_fp[8], w_fp[27], w_fp[86], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 570 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram571( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 571 OF 1240 *** + // Wavefunction(s) for diagram number 571 + // (none) + // Amplitude(s) for diagram number 571 + VVV1_0( w_fp[1], w_fp[27], w_fp[101], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 571 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram572( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 572 OF 1240 *** + // Wavefunction(s) for diagram number 572 + // (none) + // Amplitude(s) for diagram number 572 + VVV1_0( w_fp[1], w_fp[8], w_fp[62], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 572 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram573( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a 
sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 573 OF 1240 *** + // Wavefunction(s) for diagram number 573 + // (none) + // Amplitude(s) for diagram number 573 + VVV1_0( w_fp[86], w_fp[37], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 573 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram574( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 574 OF 1240 *** + // Wavefunction(s) for diagram number 574 + // (none) + // Amplitude(s) for diagram number 574 + FFV1_0( w_fp[3], w_fp[36], w_fp[86], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 574 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram575( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and
denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 575 OF 1240 *** + // Wavefunction(s) for diagram number 575 + // (none) + // Amplitude(s) for diagram number 575 + FFV1_0( w_fp[99], w_fp[100], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 575 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram576( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 576 OF 1240 *** + // Wavefunction(s) for diagram number 576 + // (none) + // Amplitude(s) for diagram number 576 + FFV1_0( w_fp[99], w_fp[36], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 576 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram577( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 577 OF 1240 *** + // Wavefunction(s) for diagram number 577 + // (none) + // Amplitude(s) for diagram number 577 + FFV1_0( w_fp[3], 
+  __global__ void
+  diagram577( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 577 OF 1240 ***
+    // Wavefunction(s) for diagram number 577
+    // (none)
+    // Amplitude(s) for diagram number 577
+    FFV1_0( w_fp[3], w_fp[100], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 577 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram578( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 578 OF 1240 ***
+    // Wavefunction(s) for diagram number 578
+    // (none)
+    // Amplitude(s) for diagram number 578
+    VVV1_0( w_fp[104], w_fp[1], w_fp[37], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 578 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram579( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 579 OF 1240 ***
+    // Wavefunction(s) for diagram number 579
+    // (none)
+    // Amplitude(s) for diagram number 579
+    FFV1_0( w_fp[3], w_fp[33], w_fp[96], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[33], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[33], w_fp[88], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram580( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 580 OF 1240 ***
+    // Wavefunction(s) for diagram number 580
+    // (none)
+    // Amplitude(s) for diagram number 580
+    FFV1_0( w_fp[41], w_fp[33], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 580 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram581( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 581 OF 1240 ***
+    // Wavefunction(s) for diagram number 581
+    // (none)
+    // Amplitude(s) for diagram number 581
+    FFV1_0( w_fp[41], w_fp[98], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 581 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram582( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 582 OF 1240 ***
+    // Wavefunction(s) for diagram number 582
+    // (none)
+    // Amplitude(s) for diagram number 582
+    FFV1_0( w_fp[113], w_fp[33], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 582 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram583( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 583 OF 1240 ***
+    // Wavefunction(s) for diagram number 583
+    // (none)
+    // Amplitude(s) for diagram number 583
+    VVV1_0( w_fp[86], w_fp[51], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 583 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
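Each kernel folds its amplitude into a handful of the colour-flow amplitudes ("jamps") with coefficients that are always +1, -1, +i or -i; the colour matrix is applied once at the end, outside these per-diagram kernels. J_ACCESS::kernelAccessIcol is expected to return a reference to colour amplitude icol for the current event inside the jamps[ncolor*2*nevtORneppV] buffer. The following is a purely hypothetical scalar sketch of such an accessor (the real one must also handle the SIMD event-page layout of the C++ build, and the actual memory layout is an assumption here):

// Hypothetical scalar sketch of the jamps accessor pattern used above.
// Assumed layout: Re/Im of colour amplitude icol stored adjacently per event,
// i.e. jamps[( icol * nevt + ievt ) * 2] = Re and [... + 1] = Im.
struct J_ACCESS_sketch
{
  static __device__ cxtype& kernelAccessIcol( fptype* jamps, const int icol )
  {
    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one GPU thread per event
    const int nevt = gridDim.x * blockDim.x;                // total events in this grid
    return *reinterpret_cast<cxtype*>( &jamps[( icol * nevt + ievt ) * 2] );
  }
};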
+  __global__ void
+  diagram584( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 584 OF 1240 ***
+    // Wavefunction(s) for diagram number 584
+    // (none)
+    // Amplitude(s) for diagram number 584
+    FFV1_0( w_fp[3], w_fp[49], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 584 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram585( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 585 OF 1240 ***
+    // Wavefunction(s) for diagram number 585
+    // (none)
+    // Amplitude(s) for diagram number 585
+    FFV1_0( w_fp[99], w_fp[91], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 585 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram586( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 586 OF 1240 ***
+    // Wavefunction(s) for diagram number 586
+    // (none)
+    // Amplitude(s) for diagram number 586
+    FFV1_0( w_fp[99], w_fp[49], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 586 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram587( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 587 OF 1240 ***
+    // Wavefunction(s) for diagram number 587
+    // (none)
+    // Amplitude(s) for diagram number 587
+    FFV1_0( w_fp[3], w_fp[91], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 587 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram588( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 588 OF 1240 ***
+    // Wavefunction(s) for diagram number 588
+    // (none)
+    // Amplitude(s) for diagram number 588
+    VVV1_0( w_fp[102], w_fp[1], w_fp[51], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 588 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram589( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 589 OF 1240 ***
+    // Wavefunction(s) for diagram number 589
+    // (none)
+    // Amplitude(s) for diagram number 589
+    FFV1_0( w_fp[3], w_fp[47], w_fp[105], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[47], w_fp[95], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[47], w_fp[107], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
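The #ifdef MGONGPUCPP_GPUIMPL split in every signature reflects the two build modes: on GPU each kernel receives the full dependent-couplings buffer and locates its own event, while in the vectorised C++ build the caller passes an array of coupling pointers already resolved for one event page. A hedged sketch of what call sites might look like follows; the launch configuration, loop bounds and per-page pointer arithmetic are assumptions, not the repository's actual driver code.

// Hypothetical call sites for one diagram kernel in the two build modes.
#ifdef MGONGPUCPP_GPUIMPL
  // GPU: one thread per event; the boilerplate derives per-event couplings from 'couplings'
  diagram589<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
#else
  // C++: loop over SIMD event pages; COUPs[] points at this page's couplings
  for( int ipagV = 0; ipagV < npagV; ++ipagV )
    diagram589( wfs, jamps, channelIds, COUPs, numerators, denominators ); // per-page offsets omitted
#endif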
+  __global__ void
+  diagram590( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 590 OF 1240 ***
+    // Wavefunction(s) for diagram number 590
+    // (none)
+    // Amplitude(s) for diagram number 590
+    FFV1_0( w_fp[46], w_fp[47], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 590 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram591( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 591 OF 1240 ***
+    // Wavefunction(s) for diagram number 591
+    // (none)
+    // Amplitude(s) for diagram number 591
+    FFV1_0( w_fp[46], w_fp[106], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 591 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram592( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 592 OF 1240 ***
+    // Wavefunction(s) for diagram number 592
+    // (none)
+    // Amplitude(s) for diagram number 592
+    FFV1_0( w_fp[114], w_fp[47], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 592 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram593( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 593 OF 1240 ***
+    // Wavefunction(s) for diagram number 593
+    // (none)
+    // Amplitude(s) for diagram number 593
+    VVV1_0( w_fp[86], w_fp[54], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 593 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram594( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 594 OF 1240 ***
+    // Wavefunction(s) for diagram number 594
+    // (none)
+    // Amplitude(s) for diagram number 594
+    FFV1_0( w_fp[53], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 594 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram595( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 595 OF 1240 ***
+    // Wavefunction(s) for diagram number 595
+    // (none)
+    // Amplitude(s) for diagram number 595
+    FFV1_0( w_fp[78], w_fp[112], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 595 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram596( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 596 OF 1240 ***
+    // Wavefunction(s) for diagram number 596
+    // (none)
+    // Amplitude(s) for diagram number 596
+    FFV1_0( w_fp[53], w_fp[112], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 596 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram597( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 597 OF 1240 ***
+    // Wavefunction(s) for diagram number 597
+    // (none)
+    // Amplitude(s) for diagram number 597
+    FFV1_0( w_fp[78], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 597 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram598( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 598 OF 1240 ***
+    // Wavefunction(s) for diagram number 598
+    // (none)
+    // Amplitude(s) for diagram number 598
+    VVV1_0( w_fp[104], w_fp[1], w_fp[54], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 598 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram599( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 599 OF 1240 ***
+    // Wavefunction(s) for diagram number 599
+    // (none)
+    // Amplitude(s) for diagram number 599
+    FFV1_0( w_fp[46], w_fp[2], w_fp[96], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[46], w_fp[2], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[46], w_fp[2], w_fp[88], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram600( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 600 OF 1240 ***
+    // Wavefunction(s) for diagram number 600
+    // (none)
+    // Amplitude(s) for diagram number 600
+    VVV1_0( w_fp[86], w_fp[20], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 600 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
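Splitting the former monolithic calculate_wavefunctions into one kernel per diagram keeps each compilation unit and its register footprint small for a 1240-diagram process; the orchestration then reduces to running the kernels in sequence for each helicity before the colour sum. The sketch below is schematic only: the driver loop, the kernel table and the names gpublocks/gputhreads are assumptions, not the repository's actual code.

// Hypothetical driver over the per-diagram kernels for one helicity (GPU build).
using diagramKernel = void ( * )( fptype*, fptype*, const unsigned int*, const fptype*, fptype*, fptype* );
static const diagramKernel diagrams[] = { /* diagram1, ..., */ diagram599, diagram600 /* , ..., diagram1240 */ };
for( auto kernel : diagrams )
  kernel<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
// ... then contract jamps with the colour matrix to obtain |M|^2 for this helicity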
+  __global__ void
+  diagram601( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 601 OF 1240 ***
+    // Wavefunction(s) for diagram number 601
+    // (none)
+    // Amplitude(s) for diagram number 601
+    FFV1_0( w_fp[28], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 601 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram602( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 602 OF 1240 ***
+    // Wavefunction(s) for diagram number 602
+    // (none)
+    // Amplitude(s) for diagram number 602
+    FFV1_0( w_fp[60], w_fp[112], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 602 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram603( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 603 OF 1240 ***
+    // Wavefunction(s) for diagram number 603
+    // (none)
+    // Amplitude(s) for diagram number 603
+    FFV1_0( w_fp[28], w_fp[112], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 603 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram604( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 604 OF 1240 ***
+    // Wavefunction(s) for diagram number 604
+    // (none)
+    // Amplitude(s) for diagram number 604
+    FFV1_0( w_fp[60], w_fp[2], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 604 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram605( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 605 OF 1240 ***
+    // Wavefunction(s) for diagram number 605
+    // (none)
+    // Amplitude(s) for diagram number 605
+    VVV1_0( w_fp[102], w_fp[1], w_fp[20], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 605 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram606( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 606 OF 1240 ***
+    // Wavefunction(s) for diagram number 606
+    // (none)
+    // Amplitude(s) for diagram number 606
+    FFV1_0( w_fp[41], w_fp[2], w_fp[105], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[2], w_fp[95], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[2], w_fp[107], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
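Once all per-diagram kernels have run, the accumulated numerators and denominators yield the single-diagram-enhancement weight that MadEvent uses to reweight the event for the chosen channel. A worked micro-example, with made-up numbers purely for illustration:

// Hypothetical numeric example of the multichannel weight after all diagrams ran.
// Suppose channelId == 601 and only three diagrams gave non-zero amplitudes:
//   |amp_600|^2 = 0.2, |amp_601|^2 = 0.5, |amp_602|^2 = 0.3
// denominators_sv = 0.2 + 0.5 + 0.3 = 1.0  (every diagram adds when channelId != 0)
// numerators_sv   = 0.5                    (only the diagram matching channelId adds)
const fptype multichannelWeight = numerators_sv / denominators_sv; // = 0.5 here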
//-------------------------------------------------------------------------- + + __global__ void + diagram607( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 607 OF 1240 *** + // Wavefunction(s) for diagram number 607 + // (none) + // Amplitude(s) for diagram number 607 + FFV1_0( w_fp[3], w_fp[15], w_fp[86], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 607 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram608( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 608 OF 1240 *** + // Wavefunction(s) for diagram number 608 + // (none) + // Amplitude(s) for diagram number 608 + FFV1_0( w_fp[14], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 608 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + 
+  __global__ void
+  diagram608( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 608 OF 1240 ***
+    // Wavefunction(s) for diagram number 608
+    // (none)
+    // Amplitude(s) for diagram number 608
+    FFV1_0( w_fp[14], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 608 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram609( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 609 OF 1240 ***
+    // Wavefunction(s) for diagram number 609
+    // (none)
+    // Amplitude(s) for diagram number 609
+    FFV1_0( w_fp[3], w_fp[112], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 609 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
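Every amplitude is folded into the colour-ordered partial amplitudes with a coefficient of +1, -1, +i or -i, so a line such as `J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];` is plain complex arithmetic. A minimal stand-alone check of that arithmetic, with std::complex standing in for the plugin's cxtype:

#include <cassert>
#include <complex>
int main()
{
  using cxtype = std::complex<double>; // stand-in for the plugin's cxtype
  const cxtype amp( 3., 4. );
  cxtype jamp( 0., 0. );
  jamp -= cxtype( 0., 1. ) * amp; // i*(3+4i) = -4+3i, so jamp becomes 4-3i
  assert( jamp == cxtype( 4., -3. ) );
  return 0;
}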
+  __global__ void
+  diagram610( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 610 OF 1240 ***
+    // Wavefunction(s) for diagram number 610
+    // (none)
+    // Amplitude(s) for diagram number 610
+    FFV1_0( w_fp[14], w_fp[112], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 610 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram611( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 611 OF 1240 ***
+    // Wavefunction(s) for diagram number 611
+    // (none)
+    // Amplitude(s) for diagram number 611
+    FFV1_0( w_fp[99], w_fp[2], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 611 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram612( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 612 OF 1240 ***
+    // Wavefunction(s) for diagram number 612
+    // (none)
+    // Amplitude(s) for diagram number 612
+    FFV1_0( w_fp[99], w_fp[15], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 612 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram613( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 613 OF 1240 ***
+    // Wavefunction(s) for diagram number 613
+    // (none)
+    // Amplitude(s) for diagram number 613
+    FFV1_0( w_fp[3], w_fp[112], w_fp[57], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[112], w_fp[81], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[112], w_fp[82], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram614( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 614 OF 1240 ***
+    // Wavefunction(s) for diagram number 614
+    // (none)
+    // Amplitude(s) for diagram number 614
+    FFV1_0( w_fp[99], w_fp[2], w_fp[57], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[99], w_fp[2], w_fp[81], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[99], w_fp[2], w_fp[82], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram615( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 615 OF 1240 ***
+    // Wavefunction(s) for diagram number 615
+    // (none)
+    // Amplitude(s) for diagram number 615
+    VVV1_0( w_fp[92], w_fp[57], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    VVV1_0( w_fp[92], w_fp[81], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    VVV1_0( w_fp[92], w_fp[82], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
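Diagrams 613, 614 and 615 differ from their neighbours in two ways: each accumulates three amplitudes, one per related internal wavefunction (w_fp[57], w_fp[81], w_fp[82], presumably the three colour structures of a four-gluon vertex computed earlier, as done explicitly for w_fp[107]/w_fp[95]/w_fp[105] in diagram622 below), and none carries the MGONGPU_SUPPORTS_MULTICHANNEL block, presumably because such multi-structure diagrams are not valid single-diagram-enhancement channels. For the diagrams that do carry the block, the two accumulators plausibly feed the usual SDE channel weight downstream:

w_{\mathrm{ch}} \;=\; \frac{|A_{\mathrm{ch}}|^2}{\sum_{d} |A_d|^2}

where the `if( channelId == NNN )` line adds this channel's squared amplitude to the numerators and the `if( channelId != 0 )` line adds every contributing diagram's squared amplitude to the denominators.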
+  __global__ void
+  diagram616( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 616 OF 1240 ***
+    // Wavefunction(s) for diagram number 616
+    VVV1P0_1( w_fp[0], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[92] );
+    FFV1_2( w_fp[3], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] );
+    // Amplitude(s) for diagram number 616
+    FFV1_0( w_fp[99], w_fp[87], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 616 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram617( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 617 OF 1240 ***
+    // Wavefunction(s) for diagram number 617
+    // (none)
+    // Amplitude(s) for diagram number 617
+    FFV1_0( w_fp[99], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 617 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
"diagram_boilerplate.h" + // *** DIAGRAM 619 OF 1240 *** + // Wavefunction(s) for diagram number 619 + // (none) + // Amplitude(s) for diagram number 619 + FFV1_0( w_fp[3], w_fp[9], w_fp[112], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 619 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram620( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 620 OF 1240 *** + // Wavefunction(s) for diagram number 620 + VVV1P0_1( w_fp[92], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[86] ); + // Amplitude(s) for diagram number 620 + VVV1_0( w_fp[86], w_fp[34], w_fp[4], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 620 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram621( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including 
+  __global__ void
+  diagram620( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 620 OF 1240 ***
+    // Wavefunction(s) for diagram number 620
+    VVV1P0_1( w_fp[92], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[86] );
+    // Amplitude(s) for diagram number 620
+    VVV1_0( w_fp[86], w_fp[34], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 620 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram621( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 621 OF 1240 ***
+    // Wavefunction(s) for diagram number 621
+    // (none)
+    // Amplitude(s) for diagram number 621
+    FFV1_0( w_fp[3], w_fp[87], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 621 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram622( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 622 OF 1240 ***
+    // Wavefunction(s) for diagram number 622
+    VVVV1P0_1( w_fp[92], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[107] );
+    VVVV3P0_1( w_fp[92], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[95] );
+    VVVV4P0_1( w_fp[92], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[105] );
+    // Amplitude(s) for diagram number 622
+    FFV1_0( w_fp[3], w_fp[77], w_fp[107], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[77], w_fp[95], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[77], w_fp[105], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram623( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 623 OF 1240 ***
+    // Wavefunction(s) for diagram number 623
+    FFV1_1( w_fp[77], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[102] );
+    // Amplitude(s) for diagram number 623
+    FFV1_0( w_fp[46], w_fp[102], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 623 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram624( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 624 OF 1240 ***
+    // Wavefunction(s) for diagram number 624
+    FFV1_2( w_fp[46], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[88] );
+    // Amplitude(s) for diagram number 624
+    FFV1_0( w_fp[88], w_fp[77], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 624 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram625( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 625 OF 1240 ***
+    // Wavefunction(s) for diagram number 625
+    // (none)
+    // Amplitude(s) for diagram number 625
+    FFV1_0( w_fp[46], w_fp[77], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 625 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
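The FFV1_1/FFV1_2 wavefunction calls in this block (diagrams 623, 624, 627, 632, 639) build off-shell fermion lines and take two extra parameters, cIPD[0] and cIPD[1], which are presumably the internal fermion's mass and width in this process. The standard propagator factor such helpers would implement is

\frac{i\,(\slashed{p} + m)}{p^2 - m^2 + i\,m\,\Gamma}, \qquad m = \mathrm{cIPD[0]}, \quad \Gamma = \mathrm{cIPD[1]},

whereas the VVV1P0_1/VVVV*P0_1 calls pass explicit zeros (0., 0.) for the mass and width of the internal massless gluon.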
+  __global__ void
+  diagram626( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 626 OF 1240 ***
+    // Wavefunction(s) for diagram number 626
+    // (none)
+    // Amplitude(s) for diagram number 626
+    FFV1_0( w_fp[38], w_fp[102], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 626 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram627( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 627 OF 1240 ***
+    // Wavefunction(s) for diagram number 627
+    FFV1_2( w_fp[38], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[90] );
+    // Amplitude(s) for diagram number 627
+    FFV1_0( w_fp[90], w_fp[77], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 627 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram628( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 628 OF 1240 ***
+    // Wavefunction(s) for diagram number 628
+    // (none)
+    // Amplitude(s) for diagram number 628
+    FFV1_0( w_fp[38], w_fp[77], w_fp[112], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 628 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram629( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 629 OF 1240 ***
+    // Wavefunction(s) for diagram number 629
+    // (none)
+    // Amplitude(s) for diagram number 629
+    FFV1_0( w_fp[3], w_fp[102], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 629 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram630( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 630 OF 1240 ***
+    // Wavefunction(s) for diagram number 630
+    // (none)
+    // Amplitude(s) for diagram number 630
+    FFV1_0( w_fp[99], w_fp[77], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 630 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram631( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 631 OF 1240 ***
+    // Wavefunction(s) for diagram number 631
+    VVV1P0_1( w_fp[92], w_fp[24], COUPs[0], 1.0, 0., 0., w_fp[102] );
+    // Amplitude(s) for diagram number 631
+    FFV1_0( w_fp[3], w_fp[77], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 631 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
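Splitting each diagram into its own __global__ kernel implies a driver that runs them in sequence over the same buffers. A hypothetical sketch of that call pattern (not the plugin's actual calling code; the grid sizes and buffer setup are assumed):

#ifdef MGONGPUCPP_GPUIMPL
// One back-to-back kernel launch per diagram, on the same stream and buffers
diagram607<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
diagram608<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
// ... through diagram1240
#else
// In the C++ build the same functions are plain calls per SIMD event page
diagram607( wfs, jamps, channelIds, COUPs, numerators, denominators );
diagram608( wfs, jamps, channelIds, COUPs, numerators, denominators );
// ... through diagram1240
#endif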
+  __global__ void
+  diagram632( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 632 OF 1240 ***
+    // Wavefunction(s) for diagram number 632
+    FFV1_1( w_fp[2], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[96] );
+    // Amplitude(s) for diagram number 632
+    FFV1_0( w_fp[56], w_fp[96], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 632 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram633( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 633 OF 1240 ***
+    // Wavefunction(s) for diagram number 633
+    // (none)
+    // Amplitude(s) for diagram number 633
+    FFV1_0( w_fp[22], w_fp[96], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 633 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram634( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 634 OF 1240 ***
+    // Wavefunction(s) for diagram number 634
+    // (none)
+    // Amplitude(s) for diagram number 634
+    VVV1_0( w_fp[112], w_fp[103], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 634 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram635( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 635 OF 1240 ***
+    // Wavefunction(s) for diagram number 635
+    // (none)
+    // Amplitude(s) for diagram number 635
+    FFV1_0( w_fp[22], w_fp[2], w_fp[112], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 635 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram636( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 636 OF 1240 ***
+    // Wavefunction(s) for diagram number 636
+    // (none)
+    // Amplitude(s) for diagram number 636
+    VVV1_0( w_fp[86], w_fp[103], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 636 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram637( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 637 OF 1240 ***
+    // Wavefunction(s) for diagram number 637
+    // (none)
+    // Amplitude(s) for diagram number 637
+    FFV1_0( w_fp[56], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 637 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
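After the last diagram kernel has run, jamps holds the ncolor colour-ordered amplitudes for one helicity; the squared matrix element is then presumably obtained by the standard MadGraph colour sum (not part of this hunk). A sketch, with hypothetical cf/denom tables standing in for the generated colour matrix:

// Hypothetical colour reduction: |M|^2 = sum_{i,j} jamp_i^* cf_ij jamp_j / denom_i
fptype deltaMEs = 0;
for( int icol = 0; icol < ncolor; icol++ )
{
  cxtype ztemp( 0, 0 );
  for( int jcol = 0; jcol < ncolor; jcol++ )
    ztemp += cf[icol][jcol] * jamp[jcol]; // row of the colour matrix times the jamp vector
  deltaMEs += cxreal( ztemp * cxconj( jamp[icol] ) ) / denom[icol];
}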
J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[52], w_fp[2], w_fp[95], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[52], w_fp[2], w_fp[105], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram639( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 639 OF 1240 *** + // Wavefunction(s) for diagram number 639 + FFV1_2( w_fp[52], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[104] ); + // Amplitude(s) for diagram number 639 + FFV1_0( w_fp[104], w_fp[33], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 639 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram640( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) 
+#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 640 OF 1240 *** + // Wavefunction(s) for diagram number 640 + FFV1_1( w_fp[33], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[114] ); + // Amplitude(s) for diagram number 640 + FFV1_0( w_fp[52], w_fp[114], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 640 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram641( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 641 OF 1240 *** + // Wavefunction(s) for diagram number 641 + // (none) + // Amplitude(s) for diagram number 641 + FFV1_0( w_fp[52], w_fp[33], w_fp[86], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 641 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram642( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page 
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 642 OF 1240 ***
+ // Wavefunction(s) for diagram number 642
+ // (none)
+ // Amplitude(s) for diagram number 642
+ FFV1_0( w_fp[104], w_fp[39], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 642 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram643( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 643 OF 1240 ***
+ // Wavefunction(s) for diagram number 643
+ FFV1_1( w_fp[39], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[106] );
+ // Amplitude(s) for diagram number 643
+ FFV1_0( w_fp[52], w_fp[106], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 643 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram644( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 644 OF 1240 ***
+ // Wavefunction(s) for diagram number 644
+ // (none)
+ // Amplitude(s) for diagram number 644
+ FFV1_0( w_fp[52], w_fp[39], w_fp[112], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 644 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram645( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 645 OF 1240 ***
+ // Wavefunction(s) for diagram number 645
+ // (none)
+ // Amplitude(s) for diagram number 645
+ FFV1_0( w_fp[104], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 645 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram646( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 646 OF 1240 ***
+ // Wavefunction(s) for diagram number 646
+ // (none)
+ // Amplitude(s) for diagram number 646
+ FFV1_0( w_fp[52], w_fp[96], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 646 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram647( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 647 OF 1240 ***
+ // Wavefunction(s) for diagram number 647
+ // (none)
+ // Amplitude(s) for diagram number 647
+ FFV1_0( w_fp[52], w_fp[2], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 647 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram648( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 648 OF 1240 ***
+ // Wavefunction(s) for diagram number 648
+ // (none)
+ // Amplitude(s) for diagram number 648
+ FFV1_0( w_fp[65], w_fp[96], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 648 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram649( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 649 OF 1240 ***
+ // Wavefunction(s) for diagram number 649
+ // (none)
+ // Amplitude(s) for diagram number 649
+ FFV1_0( w_fp[3], w_fp[96], w_fp[63], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 649 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram650( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 650 OF 1240 ***
+ // Wavefunction(s) for diagram number 650
+ // (none)
+ // Amplitude(s) for diagram number 650
+ FFV1_0( w_fp[99], w_fp[93], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 650 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram651( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 651 OF 1240 ***
+ // Wavefunction(s) for diagram number 651
+ // (none)
+ // Amplitude(s) for diagram number 651
+ FFV1_0( w_fp[99], w_fp[2], w_fp[63], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 651 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram652( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 652 OF 1240 ***
+ // Wavefunction(s) for diagram number 652
+ // (none)
+ // Amplitude(s) for diagram number 652
+ FFV1_0( w_fp[3], w_fp[93], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 652 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram653( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 653 OF 1240 ***
+ // Wavefunction(s) for diagram number 653
+ // (none)
+ // Amplitude(s) for diagram number 653
+ FFV1_0( w_fp[65], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 653 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram654( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 654 OF 1240 ***
+ // Wavefunction(s) for diagram number 654
+ // (none)
+ // Amplitude(s) for diagram number 654
+ VVVV1_0( w_fp[92], w_fp[61], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+ VVVV3_0( w_fp[92], w_fp[61], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+ VVVV4_0( w_fp[92], w_fp[61], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram655( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 655 OF 1240 ***
+ // Wavefunction(s) for diagram number 655
+ VVV1P0_1( w_fp[92], w_fp[61], COUPs[0], 1.0, 0., 0., w_fp[104] );
+ // Amplitude(s) for diagram number 655
+ VVV1_0( w_fp[8], w_fp[5], w_fp[104], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 655 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram656( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 656 OF 1240 ***
+ // Wavefunction(s) for diagram number 656
+ VVV1P0_1( w_fp[92], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[113] );
+ // Amplitude(s) for diagram number 656
+ VVV1_0( w_fp[61], w_fp[5], w_fp[113], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 656 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram657( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 657 OF 1240 ***
+ // Wavefunction(s) for diagram number 657
+ // (none)
+ // Amplitude(s) for diagram number 657
+ VVV1_0( w_fp[61], w_fp[8], w_fp[86], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 657 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram658( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 658 OF 1240 ***
+ // Wavefunction(s) for diagram number 658
+ // (none)
+ // Amplitude(s) for diagram number 658
+ FFV1_0( w_fp[3], w_fp[39], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 658 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram659( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 659 OF 1240 ***
+ // Wavefunction(s) for diagram number 659
+ // (none)
+ // Amplitude(s) for diagram number 659
+ FFV1_0( w_fp[3], w_fp[106], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 659 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram660( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 660 OF 1240 ***
+ // Wavefunction(s) for diagram number 660
+ // (none)
+ // Amplitude(s) for diagram number 660
+ FFV1_0( w_fp[99], w_fp[39], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 660 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram661( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 661 OF 1240 ***
+ // Wavefunction(s) for diagram number 661
+ // (none)
+ // Amplitude(s) for diagram number 661
+ FFV1_0( w_fp[38], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 661 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram662( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 662 OF 1240 ***
+ // Wavefunction(s) for diagram number 662
+ // (none)
+ // Amplitude(s) for diagram number 662
+ FFV1_0( w_fp[38], w_fp[96], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 662 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram663( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 663 OF 1240 *** + // Wavefunction(s) for diagram number 663 + // (none) + // Amplitude(s) for diagram number 663 + FFV1_0( w_fp[90], w_fp[2], w_fp[61], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 663 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram664( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 664 OF 1240 *** + // Wavefunction(s) for diagram number 664 + // (none) + // Amplitude(s) for diagram number 664 + FFV1_0( w_fp[71], w_fp[96], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 664 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram665( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 665 OF 1240 *** + // Wavefunction(s) for diagram 
number 665 + // (none) + // Amplitude(s) for diagram number 665 + FFV1_0( w_fp[3], w_fp[96], w_fp[69], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 665 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram666( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 666 OF 1240 *** + // Wavefunction(s) for diagram number 666 + // (none) + // Amplitude(s) for diagram number 666 + FFV1_0( w_fp[99], w_fp[94], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 666 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram667( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts 
that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 667 OF 1240 *** + // Wavefunction(s) for diagram number 667 + // (none) + // Amplitude(s) for diagram number 667 + FFV1_0( w_fp[99], w_fp[2], w_fp[69], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 667 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram668( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 668 OF 1240 *** + // Wavefunction(s) for diagram number 668 + // (none) + // Amplitude(s) for diagram number 668 + FFV1_0( w_fp[3], w_fp[94], w_fp[112], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 668 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram669( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for 
+  __global__ void
+  diagram669( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 669 OF 1240 ***
+    // Wavefunction(s) for diagram number 669
+    // (none)
+    // Amplitude(s) for diagram number 669
+    FFV1_0( w_fp[71], w_fp[2], w_fp[112], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 669 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram670( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 670 OF 1240 ***
+    // Wavefunction(s) for diagram number 670
+    // (none)
+    // Amplitude(s) for diagram number 670
+    VVVV1_0( w_fp[92], w_fp[66], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    VVVV3_0( w_fp[92], w_fp[66], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    VVVV4_0( w_fp[92], w_fp[66], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram671( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 671 OF 1240 ***
+    // Wavefunction(s) for diagram number 671
+    VVV1P0_1( w_fp[92], w_fp[66], COUPs[0], 1.0, 0., 0., w_fp[104] );
+    // Amplitude(s) for diagram number 671
+    VVV1_0( w_fp[8], w_fp[4], w_fp[104], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 671 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram672( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 672 OF 1240 ***
+    // Wavefunction(s) for diagram number 672
+    // (none)
+    // Amplitude(s) for diagram number 672
+    VVV1_0( w_fp[66], w_fp[4], w_fp[113], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 672 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram673( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 673 OF 1240 ***
+    // Wavefunction(s) for diagram number 673
+    // (none)
+    // Amplitude(s) for diagram number 673
+    VVV1_0( w_fp[66], w_fp[8], w_fp[112], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 673 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
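The two guarded lines inside each #ifdef MGONGPU_SUPPORTS_MULTICHANNEL block accumulate the single-diagram-enhancement (SDE) weight: only the selected channel's diagram feeds the numerator, while every channel-defining diagram feeds the denominator. A minimal standalone sketch of that accumulation follows; the amplitudes are invented and cxabs2 is reimplemented via std::norm, so this is an illustration of the pattern, not the generated code itself.

  #include <complex>
  #include <cstdio>
  #include <vector>
  using fptype = double;
  using cxtype = std::complex<fptype>;
  inline fptype cxabs2( const cxtype& c ) { return std::norm( c ); } // |amp|^2
  int main()
  {
    const unsigned int channelId = 2; // SDE channel chosen for this event (0 disables the enhancement)
    const std::vector<cxtype> amps = { { 1, 2 }, { 0, 3 }, { -1, 1 } }; // one amplitude per diagram (invented)
    fptype numerator = 0, denominator = 0;
    for( unsigned int idiag = 1; idiag <= amps.size(); idiag++ )
    {
      const cxtype& amp = amps[idiag - 1];
      if( channelId == idiag ) numerator += cxabs2( amp ); // only the selected channel's diagram
      if( channelId != 0 ) denominator += cxabs2( amp );   // every diagram that defines a channel
    }
    std::printf( "multichannel weight = %f\n", numerator / denominator ); // in [0,1]
    return 0;
  }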
+  __global__ void
+  diagram674( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 674 OF 1240 ***
+    // Wavefunction(s) for diagram number 674
+    // (none)
+    // Amplitude(s) for diagram number 674
+    FFV1_0( w_fp[3], w_fp[33], w_fp[104], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 674 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram675( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 675 OF 1240 ***
+    // Wavefunction(s) for diagram number 675
+    // (none)
+    // Amplitude(s) for diagram number 675
+    FFV1_0( w_fp[3], w_fp[114], w_fp[66], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 675 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram676( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 676 OF 1240 ***
+    // Wavefunction(s) for diagram number 676
+    // (none)
+    // Amplitude(s) for diagram number 676
+    FFV1_0( w_fp[99], w_fp[33], w_fp[66], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 676 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram677( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 677 OF 1240 ***
+    // Wavefunction(s) for diagram number 677
+    // (none)
+    // Amplitude(s) for diagram number 677
+    FFV1_0( w_fp[46], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 677 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram678( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 678 OF 1240 ***
+    // Wavefunction(s) for diagram number 678
+    // (none)
+    // Amplitude(s) for diagram number 678
+    FFV1_0( w_fp[46], w_fp[96], w_fp[66], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 678 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram679( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 679 OF 1240 ***
+    // Wavefunction(s) for diagram number 679
+    // (none)
+    // Amplitude(s) for diagram number 679
+    FFV1_0( w_fp[88], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 679 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram680( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 680 OF 1240 ***
+    // Wavefunction(s) for diagram number 680
+    VVV1P0_1( w_fp[92], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[104] );
+    // Amplitude(s) for diagram number 680
+    VVV1_0( w_fp[104], w_fp[13], w_fp[5], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 680 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram681( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 681 OF 1240 ***
+    // Wavefunction(s) for diagram number 681
+    // (none)
+    // Amplitude(s) for diagram number 681
+    VVV1_0( w_fp[104], w_fp[10], w_fp[4], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 681 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram682( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 682 OF 1240 ***
+    // Wavefunction(s) for diagram number 682
+    // (none)
+    // Amplitude(s) for diagram number 682
+    VVVV1_0( w_fp[8], w_fp[4], w_fp[5], w_fp[104], COUPs[2], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    VVVV3_0( w_fp[8], w_fp[4], w_fp[5], w_fp[104], COUPs[2], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    VVVV4_0( w_fp[8], w_fp[4], w_fp[5], w_fp[104], COUPs[2], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram683( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 683 OF 1240 ***
+    // Wavefunction(s) for diagram number 683
+    // (none)
+    // Amplitude(s) for diagram number 683
+    VVV1_0( w_fp[112], w_fp[108], w_fp[5], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 683 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
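Each amplitude is then scattered into a fixed subset of the color-ordered partial amplitudes (jamps), with signs and occasional factors of cxtype( 0, 1 ) (the imaginary unit) dictated by the color decomposition of the diagram. Below is a minimal standalone sketch of this accumulation pattern, with invented coefficients and a plain std::array standing in for the J_ACCESS kernel accessor.

  #include <array>
  #include <complex>
  #include <cstdio>
  using cxtype = std::complex<double>;
  int main()
  {
    constexpr int ncolor = 4;           // the real process uses O(100) color flows
    std::array<cxtype, ncolor> jamps{}; // color-ordered partial amplitudes, zero-initialized
    const cxtype amp( 0.5, -1.0 );      // one diagram's amplitude (invented value)
    const cxtype i1( 0, 1 );            // the cxtype( 0, 1 ) factor in the generated code
    // Each diagram adds +/- amp (or +/- i*amp) to a fixed subset of color flows:
    jamps[0] -= i1 * amp;
    jamps[1] += i1 * amp;
    jamps[2] += amp;
    jamps[3] -= amp;
    for( int icol = 0; icol < ncolor; icol++ )
      std::printf( "jamp[%d] = (%f, %f)\n", icol, jamps[icol].real(), jamps[icol].imag() );
    return 0;
  }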
+  __global__ void
+  diagram684( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 684 OF 1240 ***
+    // Wavefunction(s) for diagram number 684
+    // (none)
+    // Amplitude(s) for diagram number 684
+    VVV1_0( w_fp[112], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 684 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram685( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 685 OF 1240 ***
+    // Wavefunction(s) for diagram number 685
+    // (none)
+    // Amplitude(s) for diagram number 685
+    VVVV1_0( w_fp[1], w_fp[8], w_fp[5], w_fp[112], COUPs[2], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+    VVVV3_0( w_fp[1], w_fp[8], w_fp[5], w_fp[112], COUPs[2], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+    VVVV4_0( w_fp[1], w_fp[8], w_fp[5], w_fp[112], COUPs[2], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram686( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 686 OF 1240 ***
+    // Wavefunction(s) for diagram number 686
+    // (none)
+    // Amplitude(s) for diagram number 686
+    VVV1_0( w_fp[86], w_fp[108], w_fp[4], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 686 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram687( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 687 OF 1240 ***
+    // Wavefunction(s) for diagram number 687
+    // (none)
+    // Amplitude(s) for diagram number 687
+    VVV1_0( w_fp[86], w_fp[1], w_fp[13], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 687 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram688( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 688 OF 1240 ***
+    // Wavefunction(s) for diagram number 688
+    // (none)
+    // Amplitude(s) for diagram number 688
+    VVVV1_0( w_fp[1], w_fp[8], w_fp[4], w_fp[86], COUPs[2], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+    VVVV3_0( w_fp[1], w_fp[8], w_fp[4], w_fp[86], COUPs[2], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+    VVVV4_0( w_fp[1], w_fp[8], w_fp[4], w_fp[86], COUPs[2], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram689( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 689 OF 1240 ***
+    // Wavefunction(s) for diagram number 689
+    VVVV1P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[98] );
+    VVVV3P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[62] );
+    VVVV4P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[101] );
+    // Amplitude(s) for diagram number 689
+    VVV1_0( w_fp[8], w_fp[5], w_fp[98], COUPs[0], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[5], w_fp[62], COUPs[0], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[5], w_fp[101], COUPs[0], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram690( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 690 OF 1240 ***
+    // Wavefunction(s) for diagram number 690
+    VVVV1P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[109] );
+    VVVV3P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[110] );
+    VVVV4P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[111] );
+    // Amplitude(s) for diagram number 690
+    VVV1_0( w_fp[8], w_fp[4], w_fp[109], COUPs[0], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[4], w_fp[110], COUPs[0], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[4], w_fp[111], COUPs[0], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
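Diagrams 689 and 690 first build three off-shell internal wavefunctions, one per Lorentz structure of the four-vector vertex (VVVV1P0_1, VVVV3P0_1, VVVV4P0_1), and then contract each of them with the same pair of external wavefunctions via VVV1_0. The following self-contained sketch illustrates that build-once, contract-three-times pattern; the functions here are dummy stand-ins for the HELAS kernels, not the actual physics code.

  #include <array>
  #include <complex>
  #include <cstdio>
  using cxtype = std::complex<double>;
  using wf6 = std::array<cxtype, 6>; // one 6-component HELAS-style wavefunction
  // Dummy stand-in: build an off-shell current for Lorentz structure n from three inputs.
  wf6 vvvvnP0_1( const wf6& a, const wf6& b, const wf6& c, int n )
  {
    wf6 out{};
    for( size_t i = 0; i < 6; i++ ) out[i] = ( a[i] + b[i] + c[i] ) * double( n );
    return out;
  }
  // Dummy stand-in: contract three wavefunctions into one amplitude.
  cxtype vvv1_0( const wf6& a, const wf6& b, const wf6& c )
  {
    cxtype amp = 0;
    for( size_t i = 0; i < 6; i++ ) amp += a[i] * b[i] * c[i];
    return amp;
  }
  int main()
  {
    std::array<wf6, 128> w{}; // shared wavefunction buffer, like w_fp in the generated code
    w[92].fill( { 1, 0 } ); w[1].fill( { 0, 1 } ); w[4].fill( { 1, 1 } );
    w[8].fill( { 2, 0 } ); w[5].fill( { 0, 2 } );
    // Diagram-689 pattern: build one internal wavefunction per Lorentz structure...
    w[98] = vvvvnP0_1( w[92], w[1], w[4], 1 );
    w[62] = vvvvnP0_1( w[92], w[1], w[4], 3 );
    w[101] = vvvvnP0_1( w[92], w[1], w[4], 4 );
    // ...then contract each with the same external legs, summing three amplitudes.
    const cxtype amp = vvv1_0( w[8], w[5], w[98] ) + vvv1_0( w[8], w[5], w[62] ) + vvv1_0( w[8], w[5], w[101] );
    std::printf( "amp = (%f, %f)\n", amp.real(), amp.imag() );
    return 0;
  }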
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + VVV1_0( w_fp[1], w_fp[8], w_fp[95], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + VVV1_0( w_fp[1], w_fp[8], w_fp[105], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram692( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 692 OF 1240 *** 
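Each generated kernel in this hunk repeats the same two comment lines and then includes "diagram_boilerplate.h". That header is not part of this hunk, so the following is only a plausible sketch of what it expands to, assuming a one-event-per-GPU-thread mapping; the ievt computation and the scalar accumulator bindings below are assumptions, not code from this patch:

    // Plausible expansion of diagram_boilerplate.h (sketch; names are assumed)
    #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one event per GPU thread
      const unsigned int channelId = channelIds[ievt];        // 1 to #diagrams, 0 to disable SDE
      fptype& numerators_sv = numerators[ievt];               // per-event SDE numerator
      fptype& denominators_sv = denominators[ievt];           // per-event SDE denominator
    #else
      // Uniform interface without multichannel support: the unused pointers must be nullptr
      assert( channelIds == nullptr && numerators == nullptr && denominators == nullptr );
    #endif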
+  __global__ void
+  diagram692( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 692 OF 1240 ***
+    // Wavefunction(s) for diagram number 692
+    // (none)
+    // Amplitude(s) for diagram number 692
+    VVVV1_0( w_fp[92], w_fp[1], w_fp[8], w_fp[24], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    VVVV3_0( w_fp[92], w_fp[1], w_fp[8], w_fp[24], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+    VVVV4_0( w_fp[92], w_fp[1], w_fp[8], w_fp[24], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram693( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 693 OF 1240 ***
+    // Wavefunction(s) for diagram number 693
+    // (none)
+    // Amplitude(s) for diagram number 693
+    VVV1_0( w_fp[8], w_fp[24], w_fp[104], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 693 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram694( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 694 OF 1240 ***
+    // Wavefunction(s) for diagram number 694
+    // (none)
+    // Amplitude(s) for diagram number 694
+    VVV1_0( w_fp[1], w_fp[24], w_fp[113], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 694 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram695( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 695 OF 1240 ***
+    // Wavefunction(s) for diagram number 695
+    // (none)
+    // Amplitude(s) for diagram number 695
+    VVV1_0( w_fp[1], w_fp[8], w_fp[102], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 695 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
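Diagrams 693 to 695 above show the multichannel (SDE) pattern that recurs through the rest of the hunk: the squared modulus of the amplitude is added to the per-event numerator only when this diagram is the selected channel, and to the denominator for any nonzero channelId; the numerator/denominator ratio later reweights the event towards its channel. The cxabs2 helper used here is the squared modulus; a minimal sketch consistent with that usage (the exact signature in the plugin may differ, e.g. it may take a SIMD vector type):

    // Sketch: squared modulus |z|^2 = Re(z)^2 + Im(z)^2, without the sqrt of abs()
    __host__ __device__ inline fptype
    cxabs2( const cxtype& z )
    {
      return cxreal( z ) * cxreal( z ) + cximag( z ) * cximag( z );
    }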
+  __global__ void
+  diagram696( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 696 OF 1240 ***
+    // Wavefunction(s) for diagram number 696
+    // (none)
+    // Amplitude(s) for diagram number 696
+    VVV1_0( w_fp[104], w_fp[37], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 696 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram697( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 697 OF 1240 ***
+    // Wavefunction(s) for diagram number 697
+    // (none)
+    // Amplitude(s) for diagram number 697
+    FFV1_0( w_fp[3], w_fp[35], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 697 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram698( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 698 OF 1240 ***
+    // Wavefunction(s) for diagram number 698
+    // (none)
+    // Amplitude(s) for diagram number 698
+    FFV1_0( w_fp[99], w_fp[100], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 698 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram699( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 699 OF 1240 ***
+    // Wavefunction(s) for diagram number 699
+    // (none)
+    // Amplitude(s) for diagram number 699
+    FFV1_0( w_fp[99], w_fp[35], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 699 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
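In diagrams 696 to 699 the amplitudes enter the color sums with an explicit factor cxtype( 0, 1 ), i.e. +i or -i, while elsewhere the coefficients are +1 or -1: each amplitude is folded into the color-ordered partial amplitudes ("jamps") with its color-flow coefficient. J_ACCESS::kernelAccessIcol must therefore return a writable complex reference to the slot of color index icol for the current event; below is a minimal sketch under the jamps[ncolor*2*nevt] layout named in the signature comments (the plane ordering and the cxtype_ref details are assumptions, not code from this patch):

    // Sketch of a kernelAccessIcol-style accessor (layout order is an assumption)
    static __device__ inline cxtype_ref
    kernelAccessIcol( fptype* jamps, const int icol )
    {
      const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one event per GPU thread
      const int nevt = gridDim.x * blockDim.x;                // total events in the grid
      return cxtype_ref( jamps[( 2 * icol + 0 ) * nevt + ievt],   // real part plane
                         jamps[( 2 * icol + 1 ) * nevt + ievt] ); // imaginary part plane
    }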
+  __global__ void
+  diagram700( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 700 OF 1240 ***
+    // Wavefunction(s) for diagram number 700
+    // (none)
+    // Amplitude(s) for diagram number 700
+    FFV1_0( w_fp[3], w_fp[100], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 700 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram701( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 701 OF 1240 ***
+    // Wavefunction(s) for diagram number 701
+    // (none)
+    // Amplitude(s) for diagram number 701
+    VVV1_0( w_fp[86], w_fp[1], w_fp[37], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 701 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram702( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 702 OF 1240 ***
+    // Wavefunction(s) for diagram number 702
+    // (none)
+    // Amplitude(s) for diagram number 702
+    FFV1_0( w_fp[3], w_fp[33], w_fp[109], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[33], w_fp[110], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[33], w_fp[111], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram703( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 703 OF 1240 ***
+    // Wavefunction(s) for diagram number 703
+    // (none)
+    // Amplitude(s) for diagram number 703
+    FFV1_0( w_fp[38], w_fp[33], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 703 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram704( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 704 OF 1240 ***
+    // Wavefunction(s) for diagram number 704
+    // (none)
+    // Amplitude(s) for diagram number 704
+    FFV1_0( w_fp[38], w_fp[114], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 704 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram705( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 705 OF 1240 ***
+    // Wavefunction(s) for diagram number 705
+    // (none)
+    // Amplitude(s) for diagram number 705
+    FFV1_0( w_fp[90], w_fp[33], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 705 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram706( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 706 OF 1240 ***
+    // Wavefunction(s) for diagram number 706
+    // (none)
+    // Amplitude(s) for diagram number 706
+    VVV1_0( w_fp[104], w_fp[45], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 706 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram707( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 707 OF 1240 ***
+    // Wavefunction(s) for diagram number 707
+    // (none)
+    // Amplitude(s) for diagram number 707
+    FFV1_0( w_fp[3], w_fp[43], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 707 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram708( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 708 OF 1240 ***
+    // Wavefunction(s) for diagram number 708
+    // (none)
+    // Amplitude(s) for diagram number 708
+    FFV1_0( w_fp[99], w_fp[89], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 708 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram709( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 709 OF 1240 ***
+    // Wavefunction(s) for diagram number 709
+    // (none)
+    // Amplitude(s) for diagram number 709
+    FFV1_0( w_fp[99], w_fp[43], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 709 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram710( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 710 OF 1240 ***
+    // Wavefunction(s) for diagram number 710
+    // (none)
+    // Amplitude(s) for diagram number 710
+    FFV1_0( w_fp[3], w_fp[89], w_fp[112], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 710 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram711( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 711 OF 1240 ***
+    // Wavefunction(s) for diagram number 711
+    // (none)
+    // Amplitude(s) for diagram number 711
+    VVV1_0( w_fp[112], w_fp[1], w_fp[45], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 711 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram712( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 712 OF 1240 ***
+    // Wavefunction(s) for diagram number 712
+    // (none)
+    // Amplitude(s) for diagram number 712
+    FFV1_0( w_fp[3], w_fp[39], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[39], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[39], w_fp[101], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
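A pattern worth noting at this point in the hunk: only single-amplitude diagrams (693-701, 703-711, and 713 onwards) carry the #ifdef MGONGPU_SUPPORTS_MULTICHANNEL numerator/denominator block, whereas diagrams with several amplitudes (691, 692, 702, 712) only update jamps and never contribute a single-channel numerator. Since all these kernels read and write the same wavefunction and jamp buffers, they can presumably be launched back to back on one stream; a hypothetical host-side sketch follows (the launch configuration, helper macros and buffer names are illustrative, not from this patch):

    // Hypothetical sequential launches over shared device buffers
    diagram712<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
    diagram713<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
    checkGpu( gpuPeekAtLastError() ); // fail fast if a launch was invalid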
+  __global__ void
+  diagram713( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 713 OF 1240 ***
+    // Wavefunction(s) for diagram number 713
+    // (none)
+    // Amplitude(s) for diagram number 713
+    FFV1_0( w_fp[46], w_fp[39], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 713 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram714( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 714 OF 1240 ***
+    // Wavefunction(s) for diagram number 714
+    // (none)
+    // Amplitude(s) for diagram number 714
+    FFV1_0( w_fp[46], w_fp[106], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 714 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram715( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 715 OF 1240 ***
+    // Wavefunction(s) for diagram number 715
+    // (none)
+    // Amplitude(s) for diagram number 715
+    FFV1_0( w_fp[88], w_fp[39], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 715 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram716( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 716 OF 1240 ***
+    // Wavefunction(s) for diagram number 716
+    // (none)
+    // Amplitude(s) for diagram number 716
+    VVV1_0( w_fp[104], w_fp[54], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 716 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram717( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 717 OF 1240 ***
+    // Wavefunction(s) for diagram number 717
+    // (none)
+    // Amplitude(s) for diagram number 717
+    FFV1_0( w_fp[7], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 717 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram718( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 718 OF 1240 ***
+    // Wavefunction(s) for diagram number 718
+    // (none)
+    // Amplitude(s) for diagram number 718
+    FFV1_0( w_fp[78], w_fp[96], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 718 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram719( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 719 OF 1240 ***
+    // Wavefunction(s) for diagram number 719
+    // (none)
+    // Amplitude(s) for diagram number 719
+    FFV1_0( w_fp[7], w_fp[96], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 719 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram720( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 720 OF 1240 ***
+    // Wavefunction(s) for diagram number 720
+    // (none)
+    // Amplitude(s) for diagram number 720
+    FFV1_0( w_fp[78], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 720 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram721( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 721 OF 1240 ***
+    // Wavefunction(s) for diagram number 721
+    // (none)
+    // Amplitude(s) for diagram number 721
+    VVV1_0( w_fp[86], w_fp[1], w_fp[54], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 721 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram722( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 722 OF 1240 ***
+    // Wavefunction(s) for diagram number 722
+    // (none)
+    // Amplitude(s) for diagram number 722
+    FFV1_0( w_fp[46], w_fp[2], w_fp[109], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[46], w_fp[2], w_fp[110], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 )
+= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[46], w_fp[2], w_fp[111], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram723( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 723 OF 1240 *** + // Wavefunction(s) for diagram number 723 + // (none) + // Amplitude(s) for diagram number 723 + VVV1_0( w_fp[104], w_fp[23], w_fp[4], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 723 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram724( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel 
denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 724 OF 1240 *** + // Wavefunction(s) for diagram number 724 + // (none) + // Amplitude(s) for diagram number 724 + FFV1_0( w_fp[25], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 724 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram725( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 725 OF 1240 *** + // Wavefunction(s) for diagram number 725 + // (none) + // Amplitude(s) for diagram number 725 + FFV1_0( w_fp[58], w_fp[96], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 725 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram726( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr 
as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 726 OF 1240 *** + // Wavefunction(s) for diagram number 726 + // (none) + // Amplitude(s) for diagram number 726 + FFV1_0( w_fp[25], w_fp[96], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 726 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram727( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 727 OF 1240 *** + // Wavefunction(s) for diagram number 727 + // (none) + // Amplitude(s) for diagram number 727 + FFV1_0( w_fp[58], w_fp[2], w_fp[112], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 727 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram728( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 728 OF 1240 *** + // Wavefunction(s) for diagram number 728 + // (none) + // Amplitude(s) for diagram number 728 + VVV1_0( w_fp[112], w_fp[1], w_fp[23], COUPs[0], 1.0, &_fp[0] ); +#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 728 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram729( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 729 OF 1240 *** + // Wavefunction(s) for diagram number 729 + // (none) + // Amplitude(s) for diagram number 729 + FFV1_0( w_fp[38], w_fp[2], w_fp[98], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[38], w_fp[2], w_fp[62], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[38], w_fp[2], w_fp[101], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram730( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 730 OF 1240 *** + // Wavefunction(s) for diagram number 730 + // (none) + // Amplitude(s) for diagram number 730 + FFV1_0( w_fp[3], w_fp[17], w_fp[104], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 730 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram731( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 731 OF 1240 *** + // Wavefunction(s) for diagram number 
731 + // (none) + // Amplitude(s) for diagram number 731 + FFV1_0( w_fp[26], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 731 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram732( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 732 OF 1240 *** + // Wavefunction(s) for diagram number 732 + // (none) + // Amplitude(s) for diagram number 732 + FFV1_0( w_fp[3], w_fp[96], w_fp[59], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 732 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram733( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: 
multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 733 OF 1240 *** + // Wavefunction(s) for diagram number 733 + // (none) + // Amplitude(s) for diagram number 733 + FFV1_0( w_fp[26], w_fp[96], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 733 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram734( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 734 OF 1240 *** + // Wavefunction(s) for diagram number 734 + // (none) + // Amplitude(s) for diagram number 734 + FFV1_0( w_fp[99], w_fp[2], w_fp[59], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 734 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram735( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all 
events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 735 OF 1240 *** + // Wavefunction(s) for diagram number 735 + // (none) + // Amplitude(s) for diagram number 735 + FFV1_0( w_fp[99], w_fp[17], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 735 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram736( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 736 OF 1240 *** + // Wavefunction(s) for diagram number 736 + // (none) + // Amplitude(s) for diagram number 736 + FFV1_0( w_fp[3], w_fp[96], w_fp[73], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[96], w_fp[79], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[96], w_fp[80], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram737( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 737 OF 1240 *** + // Wavefunction(s) for diagram number 737 + // (none) + // Amplitude(s) for diagram number 737 + FFV1_0( w_fp[99], w_fp[2], w_fp[73], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[99], w_fp[2], w_fp[79], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[99], w_fp[2], w_fp[80], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram738( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 738 OF 1240 *** + // Wavefunction(s) for diagram number 738 + // (none) + // Amplitude(s) for diagram number 738 + VVV1_0( w_fp[92], w_fp[73], w_fp[8], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + VVV1_0( w_fp[92], w_fp[79], w_fp[8], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 
100 ) += amp_sv[0]; + VVV1_0( w_fp[92], w_fp[80], w_fp[8], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram739( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 739 OF 1240 *** + // Wavefunction(s) for diagram number 739 + FFV1_1( w_fp[77], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[92] ); + // Amplitude(s) for diagram number 739 + FFV1_0( w_fp[7], w_fp[92], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 739 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram740( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all 
three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 740 OF 1240 *** + // Wavefunction(s) for diagram number 740 + // (none) + // Amplitude(s) for diagram number 740 + FFV1_0( w_fp[53], w_fp[92], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 740 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram741( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 741 OF 1240 *** + // Wavefunction(s) for diagram number 741 + FFV1_2( w_fp[46], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] ); + // Amplitude(s) for diagram number 741 + FFV1_0( w_fp[99], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 741 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram742( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 742 OF 1240 *** + // Wavefunction(s) for diagram number 742 + // (none) + // Amplitude(s) for diagram number 742 + FFV1_0( w_fp[99], w_fp[85], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 742 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( 
jamps, 46 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram743( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 743 OF 1240 *** + // Wavefunction(s) for diagram number 743 + // (none) + // Amplitude(s) for diagram number 743 + FFV1_0( w_fp[53], w_fp[9], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 743 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram744( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 744 OF 1240 *** + // Wavefunction(s) for diagram number 744 + // (none) + // Amplitude(s) for diagram number 744 + FFV1_0( w_fp[7], w_fp[85], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 744 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram745( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** 
COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 745 OF 1240 *** + // Wavefunction(s) for diagram number 745 + // (none) + // Amplitude(s) for diagram number 745 + FFV1_0( w_fp[46], w_fp[92], w_fp[29], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 745 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram746( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 746 OF 1240 *** + // Wavefunction(s) for diagram number 746 + // (none) + // Amplitude(s) for diagram number 746 + FFV1_0( w_fp[99], w_fp[77], w_fp[29], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 746 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram747( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and 
denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 747 OF 1240 *** + // Wavefunction(s) for diagram number 747 + VVV1P0_1( w_fp[0], w_fp[29], COUPs[0], 1.0, 0., 0., w_fp[96] ); + // Amplitude(s) for diagram number 747 + FFV1_0( w_fp[46], w_fp[77], w_fp[96], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 747 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram748( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 748 OF 1240 *** + // Wavefunction(s) for diagram number 748 + // (none) + // Amplitude(s) for diagram number 748 + FFV1_0( w_fp[25], w_fp[92], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 748 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram749( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 749 OF 1240 *** + // Wavefunction(s) for diagram number 749 + // (none) + // Amplitude(s) for 
diagram number 749 + FFV1_0( w_fp[48], w_fp[92], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 749 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram750( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 750 OF 1240 *** + // Wavefunction(s) for diagram number 750 + FFV1_2( w_fp[38], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[104] ); + // Amplitude(s) for diagram number 750 + FFV1_0( w_fp[104], w_fp[87], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 750 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram751( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 751 OF 1240 *** + // Wavefunction(s) for diagram number 751 + // (none) + // Amplitude(s) for diagram number 751 + FFV1_0( w_fp[104], w_fp[85], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 751 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram752( fptype* wfs, // input/output 
+  __global__ void
+  diagram752( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 752 OF 1240 ***
+    // Wavefunction(s) for diagram number 752
+    // (none)
+    // Amplitude(s) for diagram number 752
+    FFV1_0( w_fp[48], w_fp[87], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 752 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram753( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 753 OF 1240 ***
+    // Wavefunction(s) for diagram number 753
+    // (none)
+    // Amplitude(s) for diagram number 753
+    FFV1_0( w_fp[25], w_fp[85], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 753 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram754( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 754 OF 1240 ***
+    // Wavefunction(s) for diagram number 754
+    // (none)
+    // Amplitude(s) for diagram number 754
+    FFV1_0( w_fp[38], w_fp[92], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 754 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram755( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 755 OF 1240 ***
+    // Wavefunction(s) for diagram number 755
+    // (none)
+    // Amplitude(s) for diagram number 755
+    FFV1_0( w_fp[104], w_fp[77], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 755 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram756( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 756 OF 1240 ***
+    // Wavefunction(s) for diagram number 756
+    VVV1P0_1( w_fp[0], w_fp[27], COUPs[0], 1.0, 0., 0., w_fp[101] );
+    // Amplitude(s) for diagram number 756
+    FFV1_0( w_fp[38], w_fp[77], w_fp[101], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 756 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram757( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 757 OF 1240 ***
+    // Wavefunction(s) for diagram number 757
+    // (none)
+    // Amplitude(s) for diagram number 757
+    FFV1_0( w_fp[28], w_fp[92], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 757 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
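The two lines inside each #ifdef MGONGPU_SUPPORTS_MULTICHANNEL block implement the single-diagram-enhancement bookkeeping: the numerator accumulates |amp|^2 only for the diagram selected by channelId, while the denominator accumulates |amp|^2 over every diagram (channelId == 0 disables the whole SDE machinery). A standalone C++ illustration of that accumulation, with plain doubles in place of fptype_sv and std::norm standing in for cxabs2:

    #include <complex>
    #include <cstdio>

    using cxtype = std::complex<double>;
    inline double cxabs2( const cxtype& c ) { return std::norm( c ); } // |c|^2, as cxabs2 above

    int main()
    {
      const unsigned int channelId = 2; // the diagram selected by the multichannel strategy
      const cxtype amp[3] = { { 1, 0 }, { 0, 2 }, { 1, 1 } }; // mock per-diagram amplitudes
      double numerators = 0, denominators = 0;
      for( unsigned int idiag = 1; idiag <= 3; idiag++ )
      {
        if( channelId == idiag ) numerators += cxabs2( amp[idiag - 1] ); // selected channel only
        if( channelId != 0 ) denominators += cxabs2( amp[idiag - 1] );   // all diagrams (0 disables SDE)
      }
      printf( "single-diagram enhancement weight = %f\n", numerators / denominators );
      return 0;
    }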
+  __global__ void
+  diagram758( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 758 OF 1240 ***
+    // Wavefunction(s) for diagram number 758
+    // (none)
+    // Amplitude(s) for diagram number 758
+    FFV1_0( w_fp[40], w_fp[92], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 758 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram759( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 759 OF 1240 ***
+    // Wavefunction(s) for diagram number 759
+    FFV1_2( w_fp[41], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[62] );
+    // Amplitude(s) for diagram number 759
+    FFV1_0( w_fp[62], w_fp[87], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 759 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram760( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 760 OF 1240 ***
+    // Wavefunction(s) for diagram number 760
+    // (none)
+    // Amplitude(s) for diagram number 760
+    FFV1_0( w_fp[62], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 760 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram761( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 761 OF 1240 ***
+    // Wavefunction(s) for diagram number 761
+    // (none)
+    // Amplitude(s) for diagram number 761
+    FFV1_0( w_fp[40], w_fp[87], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 761 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram762( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 762 OF 1240 ***
+    // Wavefunction(s) for diagram number 762
+    // (none)
+    // Amplitude(s) for diagram number 762
+    FFV1_0( w_fp[28], w_fp[9], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 762 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram763( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 763 OF 1240 ***
+    // Wavefunction(s) for diagram number 763
+    // (none)
+    // Amplitude(s) for diagram number 763
+    FFV1_0( w_fp[41], w_fp[92], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 763 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram764( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 764 OF 1240 ***
+    // Wavefunction(s) for diagram number 764
+    // (none)
+    // Amplitude(s) for diagram number 764
+    FFV1_0( w_fp[62], w_fp[77], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 764 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
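After the optional multichannel block, every kernel scatters its invariant amplitude into a handful of color-ordered amplitudes via J_ACCESS::kernelAccessIcol( jamps, icol ), always with coefficients +1, -1, +i or -i. A minimal standalone sketch of that bookkeeping, using plain array indexing in place of the accessor and the +-i pattern of diagram 763 above:

    #include <complex>
    #include <cstdio>

    using cxtype = std::complex<double>;

    int main()
    {
      cxtype jamp[48] = {};             // large enough for the icol values (up to 47) used in this hunk
      const cxtype amp( 0.3, -0.1 );    // mock invariant amplitude for one diagram
      jamp[24] += cxtype( 0, 1 ) * amp; // the two color flows fed by diagram 763
      jamp[26] -= cxtype( 0, 1 ) * amp;
      printf( "jamp[24] = ( %f, %f )\n", jamp[24].real(), jamp[24].imag() );
      return 0;
    }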
+  __global__ void
+  diagram765( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 765 OF 1240 ***
+    // Wavefunction(s) for diagram number 765
+    VVV1P0_1( w_fp[0], w_fp[24], COUPs[0], 1.0, 0., 0., w_fp[98] );
+    // Amplitude(s) for diagram number 765
+    FFV1_0( w_fp[41], w_fp[77], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 765 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram766( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 766 OF 1240 ***
+    // Wavefunction(s) for diagram number 766
+    // (none)
+    // Amplitude(s) for diagram number 766
+    FFV1_0( w_fp[26], w_fp[92], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 766 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram767( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 767 OF 1240 ***
+    // Wavefunction(s) for diagram number 767
+    // (none)
+    // Amplitude(s) for diagram number 767
+    FFV1_0( w_fp[3], w_fp[92], w_fp[42], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 767 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram768( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 768 OF 1240 ***
+    // Wavefunction(s) for diagram number 768
+    // (none)
+    // Amplitude(s) for diagram number 768
+    VVV1_0( w_fp[98], w_fp[34], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 768 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram769( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 769 OF 1240 ***
+    // Wavefunction(s) for diagram number 769
+    // (none)
+    // Amplitude(s) for diagram number 769
+    FFV1_0( w_fp[3], w_fp[85], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 769 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram770( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 770 OF 1240 ***
+    // Wavefunction(s) for diagram number 770
+    // (none)
+    // Amplitude(s) for diagram number 770
+    VVV1_0( w_fp[0], w_fp[34], w_fp[42], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 770 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram771( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 771 OF 1240 ***
+    // Wavefunction(s) for diagram number 771
+    // (none)
+    // Amplitude(s) for diagram number 771
+    FFV1_0( w_fp[26], w_fp[85], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 771 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
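Diagram 772 below is the first in this hunk to use the four-gluon vertex: VVVV1P0_1, VVVV3P0_1 and VVVV4P0_1 build one internal wavefunction per color structure of the vertex, and each of the three resulting FFV1_0 amplitudes is scattered into a different (overlapping) subset of jamps. Note also that, in the code as given, these three-amplitude diagrams carry no per-channel numerator/denominator block. The following standalone sketch reproduces just the first two jamp updates of each of the three pieces (coefficients copied from diagram 772; the function name is illustrative):

    #include <complex>

    using cxtype = std::complex<double>;

    // fold the three color structures of the four-gluon vertex into the color sums
    void add_vvvv_pieces( cxtype* jamp, const cxtype amp1, const cxtype amp2, const cxtype amp3 )
    {
      const cxtype I( 0, 1 );
      jamp[24] -= I * amp1; jamp[26] += I * amp1; // from the VVVV1 wavefunction (subset shown)
      jamp[28] += I * amp2; jamp[29] -= I * amp2; // from the VVVV3 wavefunction (subset shown)
      jamp[24] += I * amp3; jamp[26] -= I * amp3; // from the VVVV4 wavefunction (subset shown)
    }

    int main()
    {
      cxtype jamp[48] = {};
      add_vvvv_pieces( jamp, { 1, 0 }, { 0, 1 }, { 1, 1 } ); // mock partial amplitudes
      return 0;
    }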
+  __global__ void
+  diagram772( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 772 OF 1240 ***
+    // Wavefunction(s) for diagram number 772
+    VVVV1P0_1( w_fp[0], w_fp[24], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[85] );
+    VVVV3P0_1( w_fp[0], w_fp[24], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[112] );
+    VVVV4P0_1( w_fp[0], w_fp[24], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[111] );
+    // Amplitude(s) for diagram number 772
+    FFV1_0( w_fp[3], w_fp[77], w_fp[85], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[77], w_fp[112], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[77], w_fp[111], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram773( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 773 OF 1240 ***
+    // Wavefunction(s) for diagram number 773
+    // (none)
+    // Amplitude(s) for diagram number 773
+    FFV1_0( w_fp[14], w_fp[92], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 773 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram774( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 774 OF 1240 ***
+    // Wavefunction(s) for diagram number 774
+    // (none)
+    // Amplitude(s) for diagram number 774
+    FFV1_0( w_fp[3], w_fp[92], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 774 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram775( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 775 OF 1240 ***
+    // Wavefunction(s) for diagram number 775
+    // (none)
+    // Amplitude(s) for diagram number 775
+    VVV1_0( w_fp[101], w_fp[34], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 775 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram776( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 776 OF 1240 ***
+    // Wavefunction(s) for diagram number 776
+    // (none)
+    // Amplitude(s) for diagram number 776
+    FFV1_0( w_fp[3], w_fp[9], w_fp[101], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 776 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram777( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 777 OF 1240 ***
+    // Wavefunction(s) for diagram number 777
+    // (none)
+    // Amplitude(s) for diagram number 777
+    VVV1_0( w_fp[0], w_fp[34], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 777 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram778( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 778 OF 1240 ***
+    // Wavefunction(s) for diagram number 778
+    // (none)
+    // Amplitude(s) for diagram number 778
+    FFV1_0( w_fp[14], w_fp[9], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 778 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
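The #ifdef MGONGPUCPP_GPUIMPL in every signature above reflects two different coupling layouts: on GPU each kernel receives one flat couplings buffer for all events and each thread must locate its own event's dependent couplings, while in the C++ build the caller passes a ready-made array of COUPs pointers for the current event page. The boilerplate header is presumably where the two are reconciled; the standalone sketch below only illustrates the indexing a GPU thread might do into an event-major [ievt][idcoup][re/im] buffer, which is an assumed layout, not one confirmed by this diff:

    #include <cstdio>

    int main()
    {
      const int nevt = 4, ndcoup = 2;      // mock event and dependent-coupling counts
      double couplings[nevt * ndcoup * 2]; // flat [ievt][idcoup][re/im] buffer (assumed layout)
      for( int i = 0; i < nevt * ndcoup * 2; i++ ) couplings[i] = 0.1 * i;
      const int ievt = 3, idcoup = 1;
      const double* coup = &couplings[( ievt * ndcoup + idcoup ) * 2]; // what one thread might compute
      printf( "event %d, coupling %d: ( %f, %f )\n", ievt, idcoup, coup[0], coup[1] );
      return 0;
    }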
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram780( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 780 OF 1240 *** + // Wavefunction(s) for diagram number 780 + // (none) + // Amplitude(s) for diagram number 780 + FFV1_0( w_fp[12], w_fp[92], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 780 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram781( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 781 OF 1240 *** + // Wavefunction(s) for diagram number 781 + // (none) + // Amplitude(s) for diagram number 781 + FFV1_0( w_fp[3], w_fp[92], w_fp[19], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 781 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram782( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* 
jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 782 OF 1240 *** + // Wavefunction(s) for diagram number 782 + // (none) + // Amplitude(s) for diagram number 782 + VVV1_0( w_fp[96], w_fp[34], w_fp[4], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 782 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram783( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 783 OF 1240 *** + // Wavefunction(s) for diagram number 783 + // (none) + // Amplitude(s) for diagram number 783 + FFV1_0( w_fp[3], w_fp[87], w_fp[96], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 783 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + } + + 
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram784( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 784 OF 1240 ***
+    // Wavefunction(s) for diagram number 784
+    // (none)
+    // Amplitude(s) for diagram number 784
+    VVV1_0( w_fp[0], w_fp[34], w_fp[19], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 784 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram785( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 785 OF 1240 ***
+    // Wavefunction(s) for diagram number 785
+    // (none)
+    // Amplitude(s) for diagram number 785
+    FFV1_0( w_fp[12], w_fp[87], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 785 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram786( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 786 OF 1240 ***
+    // Wavefunction(s) for diagram number 786
+    VVVV1P0_1( w_fp[0], w_fp[4], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[87] );
+    VVVV3P0_1( w_fp[0], w_fp[4], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[34] );
+    VVVV4P0_1( w_fp[0], w_fp[4], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[86] );
+    // Amplitude(s) for diagram number 786
+    FFV1_0( w_fp[3], w_fp[77], w_fp[87], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[77], w_fp[34], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[77], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram787( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 787 OF 1240 ***
+    // Wavefunction(s) for diagram number 787
+    // (none)
+    // Amplitude(s) for diagram number 787
+    FFV1_0( w_fp[3], w_fp[92], w_fp[30], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[92], w_fp[31], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[92], w_fp[32], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram788( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 788 OF 1240 ***
+    // Wavefunction(s) for diagram number 788
+    VVV1P0_1( w_fp[0], w_fp[30], COUPs[0], 1.0, 0., 0., w_fp[92] );
+    VVV1P0_1( w_fp[0], w_fp[31], COUPs[0], 1.0, 0., 0., w_fp[88] );
+    VVV1P0_1( w_fp[0], w_fp[32], COUPs[0], 1.0, 0., 0., w_fp[106] );
+    // Amplitude(s) for diagram number 788
+    FFV1_0( w_fp[3], w_fp[77], w_fp[92], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[77], w_fp[88], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[77], w_fp[106], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram789( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 789 OF 1240 ***
+    // Wavefunction(s) for diagram number 789
+    FFV1_2( w_fp[52], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[90] );
+    // Amplitude(s) for diagram number 789
+    FFV1_0( w_fp[90], w_fp[35], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 789 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+  }
+
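For orientation (an assumption, not shown in this patch): splitting the computation into one small diagramNNN kernel per Feynman diagram suggests that the caller chains them over all 1240 diagrams for each helicity, along these lines:

  // Sketch of the presumed call sequence (names and launch syntax are assumptions)
  for( int ihel = 0; ihel < ncomb; ihel++ ) // loop over helicity combinations
  {
    // ... compute the external wavefunctions into wfs for this helicity ...
    diagram782<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
    diagram783<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
    // ... one call per diagram, up to diagram1240 ...
  }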
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram790( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 790 OF 1240 ***
+    // Wavefunction(s) for diagram number 790
+    // (none)
+    // Amplitude(s) for diagram number 790
+    FFV1_0( w_fp[90], w_fp[36], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 790 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram791( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 791 OF 1240 ***
+    // Wavefunction(s) for diagram number 791
+    FFV1_1( w_fp[33], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[114] );
+    // Amplitude(s) for diagram number 791
+    FFV1_0( w_fp[22], w_fp[114], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 791 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram792( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 792 OF 1240 ***
+    // Wavefunction(s) for diagram number 792
+    // (none)
+    // Amplitude(s) for diagram number 792
+    FFV1_0( w_fp[21], w_fp[114], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 792 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram793( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 793 OF 1240 ***
+    // Wavefunction(s) for diagram number 793
+    // (none)
+    // Amplitude(s) for diagram number 793
+    FFV1_0( w_fp[22], w_fp[36], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 793 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram794( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 794 OF 1240 ***
+    // Wavefunction(s) for diagram number 794
+    // (none)
+    // Amplitude(s) for diagram number 794
+    FFV1_0( w_fp[21], w_fp[35], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 794 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram795( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 795 OF 1240 ***
+    // Wavefunction(s) for diagram number 795
+    // (none)
+    // Amplitude(s) for diagram number 795
+    FFV1_0( w_fp[90], w_fp[33], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 795 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram796( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 796 OF 1240 ***
+    // Wavefunction(s) for diagram number 796
+    // (none)
+    // Amplitude(s) for diagram number 796
+    FFV1_0( w_fp[52], w_fp[114], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 796 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
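In the MGONGPU_SUPPORTS_MULTICHANNEL blocks above, cxabs2 presumably returns |amp|^2; every contributing diagram adds its |amp|^2 to denominators_sv, while only the diagram matching the event's channelId adds to numerators_sv. A sketch of how the accumulated values are presumably used downstream (the function name is an assumption):

  // Single-diagram-enhancement weight (sketch): reweight the summed matrix
  // element by this channel's share of the sum over per-diagram |amp|^2 values.
  inline fptype sdeWeight( const fptype numerator, const fptype denominator, const unsigned int channelId )
  {
    return ( channelId != 0 ? numerator / denominator : 1 ); // channelId == 0 disables SDE
  }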
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram797( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 797 OF 1240 ***
+    // Wavefunction(s) for diagram number 797
+    // (none)
+    // Amplitude(s) for diagram number 797
+    FFV1_0( w_fp[52], w_fp[33], w_fp[96], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 797 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram798( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 798 OF 1240 ***
+    // Wavefunction(s) for diagram number 798
+    // (none)
+    // Amplitude(s) for diagram number 798
+    FFV1_0( w_fp[90], w_fp[43], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 798 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram799( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 799 OF 1240 ***
+    // Wavefunction(s) for diagram number 799
+    // (none)
+    // Amplitude(s) for diagram number 799
+    FFV1_0( w_fp[90], w_fp[44], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 799 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram800( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 800 OF 1240 ***
+    // Wavefunction(s) for diagram number 800
+    FFV1_1( w_fp[39], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[102] );
+    // Amplitude(s) for diagram number 800
+    FFV1_0( w_fp[56], w_fp[102], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 800 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram801( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 801 OF 1240 ***
+    // Wavefunction(s) for diagram number 801
+    // (none)
+    // Amplitude(s) for diagram number 801
+    FFV1_0( w_fp[21], w_fp[102], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 801 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram802( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 802 OF 1240 ***
+    // Wavefunction(s) for diagram number 802
+    // (none)
+    // Amplitude(s) for diagram number 802
+    FFV1_0( w_fp[56], w_fp[44], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 802 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram803( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 803 OF 1240 ***
+    // Wavefunction(s) for diagram number 803
+    // (none)
+    // Amplitude(s) for diagram number 803
+    FFV1_0( w_fp[21], w_fp[43], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 803 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram804( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 804 OF 1240 ***
+    // Wavefunction(s) for diagram number 804
+    // (none)
+    // Amplitude(s) for diagram number 804
+    FFV1_0( w_fp[90], w_fp[39], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 804 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram805( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 805 OF 1240 ***
+    // Wavefunction(s) for diagram number 805
+    // (none)
+    // Amplitude(s) for diagram number 805
+    FFV1_0( w_fp[52], w_fp[102], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 805 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram806( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 806 OF 1240 ***
+    // Wavefunction(s) for diagram number 806
+    // (none)
+    // Amplitude(s) for diagram number 806
+    FFV1_0( w_fp[52], w_fp[39], w_fp[101], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 806 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram807( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 807 OF 1240 ***
+    // Wavefunction(s) for diagram number 807
+    // (none)
+    // Amplitude(s) for diagram number 807
+    FFV1_0( w_fp[90], w_fp[49], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 807 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram808( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 808 OF 1240 ***
+    // Wavefunction(s) for diagram number 808
+    // (none)
+    // Amplitude(s) for diagram number 808
+    FFV1_0( w_fp[90], w_fp[50], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 808 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram809( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 809 OF 1240 ***
+    // Wavefunction(s) for diagram number 809
+    FFV1_1( w_fp[47], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[113] );
+    // Amplitude(s) for diagram number 809
+    FFV1_0( w_fp[56], w_fp[113], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 809 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram810( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 810 OF 1240 ***
+    // Wavefunction(s) for diagram number 810
+    // (none)
+    // Amplitude(s) for diagram number 810
+    FFV1_0( w_fp[22], w_fp[113], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 810 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram811( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 811 OF 1240 ***
+    // Wavefunction(s) for diagram number 811
+    // (none)
+    // Amplitude(s) for diagram number 811
+    FFV1_0( w_fp[56], w_fp[50], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 811 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram812( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 812 OF 1240 ***
+    // Wavefunction(s) for diagram number 812
+    // (none)
+    // Amplitude(s) for diagram number 812
+    FFV1_0( w_fp[22], w_fp[49], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 812 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+  }
+
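J_ACCESS::kernelAccessIcol( jamps, icol ) presumably returns a writable complex reference to color-flow amplitude icol for the current event inside the jamps[ncolor*2*nevtORneppV] buffer. A minimal sketch assuming the simplest possible layout (the real accessor likely uses the plugin's AOSOA layout instead, and cxtype_ref is assumed to bind separate real/imaginary storage):

  // Sketch only: one thread per event, jamps laid out as [icol][real|imag][ievt]
  __device__ inline cxtype_ref kernelAccessIcol( fptype* jamps, const int icol )
  {
    const int nevt = gridDim.x * blockDim.x;                // total number of events in this grid
    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // event handled by this thread
    return cxtype_ref( jamps[( icol * 2 + 0 ) * nevt + ievt],   // real part
                       jamps[( icol * 2 + 1 ) * nevt + ievt] ); // imaginary part
  }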
//-------------------------------------------------------------------------- + + __global__ void + diagram813( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 813 OF 1240 *** + // Wavefunction(s) for diagram number 813 + // (none) + // Amplitude(s) for diagram number 813 + FFV1_0( w_fp[90], w_fp[47], w_fp[24], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 813 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram814( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 814 OF 1240 *** + // Wavefunction(s) for diagram number 814 + // (none) + // Amplitude(s) for diagram number 814 + FFV1_0( w_fp[52], w_fp[113], w_fp[24], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 814 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram815( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to 
disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 815 OF 1240 *** + // Wavefunction(s) for diagram number 815 + // (none) + // Amplitude(s) for diagram number 815 + FFV1_0( w_fp[52], w_fp[47], w_fp[98], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 815 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram816( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 816 OF 1240 *** + // Wavefunction(s) for diagram number 816 + // (none) + // Amplitude(s) for diagram number 816 + FFV1_0( w_fp[90], w_fp[17], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 816 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram817( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: 
multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 817 OF 1240 *** + // Wavefunction(s) for diagram number 817 + // (none) + // Amplitude(s) for diagram number 817 + FFV1_0( w_fp[90], w_fp[2], w_fp[42], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 817 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram818( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 818 OF 1240 *** + // Wavefunction(s) for diagram number 818 + // (none) + // Amplitude(s) for diagram number 818 + VVV1_0( w_fp[98], w_fp[103], w_fp[6], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 818 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram819( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for 
+  __global__ void
+  diagram819( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 819 OF 1240 ***
+    // Wavefunction(s) for diagram number 819
+    // (none)
+    // Amplitude(s) for diagram number 819
+    FFV1_0( w_fp[21], w_fp[2], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 819 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram820( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 820 OF 1240 ***
+    // Wavefunction(s) for diagram number 820
+    // (none)
+    // Amplitude(s) for diagram number 820
+    VVV1_0( w_fp[0], w_fp[103], w_fp[42], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 820 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram821( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 821 OF 1240 ***
+    // Wavefunction(s) for diagram number 821
+    // (none)
+    // Amplitude(s) for diagram number 821
+    FFV1_0( w_fp[21], w_fp[17], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 821 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram822( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 822 OF 1240 ***
+    // Wavefunction(s) for diagram number 822
+    // (none)
+    // Amplitude(s) for diagram number 822
+    FFV1_0( w_fp[52], w_fp[2], w_fp[85], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[52], w_fp[2], w_fp[112], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[52], w_fp[2], w_fp[111], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram823( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 823 OF 1240 ***
+    // Wavefunction(s) for diagram number 823
+    // (none)
+    // Amplitude(s) for diagram number 823
+    FFV1_0( w_fp[90], w_fp[15], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 823 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
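The two guarded lines after each single-amplitude diagram (see diagram823 above) implement the single-diagram-enhancement weights: the squared amplitude of the selected channel accumulates into numerators_sv, while every channel contributes to denominators_sv, so the per-event multichannel weight is numerators/denominators. cxabs2 is the square modulus without the square root of abs(); a scalar sketch, assuming a cxtype with cxreal/cximag accessors as elsewhere in the plugin (the generated code uses the SIMD/GPU cxtype_sv variant), would be:

    // Scalar sketch of cxabs2 (assumed shape, not the plugin's actual definition)
    inline fptype cxabs2( const cxtype& c )
    {
      return cxreal( c ) * cxreal( c ) + cximag( c ) * cximag( c ); // |c|^2, no sqrt
    }

Note that diagrams grouped from several amplitudes (e.g. diagram822 above) update only jamps and skip the numerator/denominator bookkeeping.
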
"diagram_boilerplate.h" + // *** DIAGRAM 824 OF 1240 *** + // Wavefunction(s) for diagram number 824 + // (none) + // Amplitude(s) for diagram number 824 + FFV1_0( w_fp[90], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 824 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram825( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 825 OF 1240 *** + // Wavefunction(s) for diagram number 825 + // (none) + // Amplitude(s) for diagram number 825 + VVV1_0( w_fp[101], w_fp[103], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 825 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram826( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also 
#ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 826 OF 1240 *** + // Wavefunction(s) for diagram number 826 + // (none) + // Amplitude(s) for diagram number 826 + FFV1_0( w_fp[22], w_fp[2], w_fp[101], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 826 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram827( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 827 OF 1240 *** + // Wavefunction(s) for diagram number 827 + // (none) + // Amplitude(s) for diagram number 827 + VVV1_0( w_fp[0], w_fp[103], w_fp[16], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 827 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram828( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: 
multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 828 OF 1240 *** + // Wavefunction(s) for diagram number 828 + // (none) + // Amplitude(s) for diagram number 828 + FFV1_0( w_fp[22], w_fp[15], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 828 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram829( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 829 OF 1240 *** + // Wavefunction(s) for diagram number 829 + // (none) + // Amplitude(s) for diagram number 829 + FFV1_0( w_fp[52], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[52], w_fp[2], w_fp[110], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[52], w_fp[2], w_fp[109], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 
) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram830( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 830 OF 1240 *** + // Wavefunction(s) for diagram number 830 + // (none) + // Amplitude(s) for diagram number 830 + FFV1_0( w_fp[90], w_fp[18], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 830 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram831( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 831 OF 1240 *** + // Wavefunction(s) for diagram number 831 + // (none) + // Amplitude(s) for diagram number 831 + FFV1_0( w_fp[90], w_fp[2], w_fp[19], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 831 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( 
jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram832( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 832 OF 1240 *** + // Wavefunction(s) for diagram number 832 + // (none) + // Amplitude(s) for diagram number 832 + VVV1_0( w_fp[96], w_fp[103], w_fp[4], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 832 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram833( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 833 OF 1240 *** + // Wavefunction(s) for diagram number 833 + // (none) + // Amplitude(s) for diagram number 833 + FFV1_0( w_fp[56], w_fp[2], w_fp[96], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( 
channelId == 833 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram834( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 834 OF 1240 *** + // Wavefunction(s) for diagram number 834 + // (none) + // Amplitude(s) for diagram number 834 + VVV1_0( w_fp[0], w_fp[103], w_fp[19], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 834 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram835( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 835 OF 1240 *** + // Wavefunction(s) for diagram number 835 + // 
(none) + // Amplitude(s) for diagram number 835 + FFV1_0( w_fp[56], w_fp[18], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 835 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram836( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 836 OF 1240 *** + // Wavefunction(s) for diagram number 836 + // (none) + // Amplitude(s) for diagram number 836 + FFV1_0( w_fp[52], w_fp[2], w_fp[87], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[52], w_fp[2], w_fp[34], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[52], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram837( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 837 OF 1240 *** + // Wavefunction(s) for diagram number 837 + // (none) + // Amplitude(s) for diagram number 837 + FFV1_0( w_fp[90], w_fp[2], w_fp[30], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + FFV1_0( w_fp[90], w_fp[2], w_fp[31], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + FFV1_0( w_fp[90], w_fp[2], w_fp[32], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram838( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 838 OF 1240 *** + // Wavefunction(s) for diagram number 838 + // (none) + // Amplitude(s) for diagram number 838 + FFV1_0( w_fp[52], w_fp[2], w_fp[92], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[52], w_fp[2], w_fp[88], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[52], w_fp[2], w_fp[106], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram839( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 839 OF 1240 *** + // Wavefunction(s) for diagram number 839 + VVV1P0_1( w_fp[0], w_fp[61], COUPs[0], 1.0, 0., 0., w_fp[90] ); + // Amplitude(s) for diagram number 839 + VVV1_0( w_fp[90], w_fp[10], w_fp[6], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 839 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 74 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram840( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 840 OF 1240 *** + // Wavefunction(s) for diagram number 840 + // (none) + // Amplitude(s) for diagram number 840 + VVV1_0( w_fp[90], w_fp[11], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 840 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram841( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* 
numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 841 OF 1240 *** + // Wavefunction(s) for diagram number 841 + // (none) + // Amplitude(s) for diagram number 841 + VVVV1_0( w_fp[8], w_fp[5], w_fp[6], w_fp[90], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + VVVV3_0( w_fp[8], w_fp[5], w_fp[6], w_fp[90], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + VVVV4_0( w_fp[8], w_fp[5], w_fp[6], w_fp[90], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + } + + 
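Each amplitude is scattered into a handful of color-ordered jamps with coefficients of ±1 or ±i (the cxtype( 0, 1 ) factors); the VVVV1_0/VVVV3_0/VVVV4_0 split in diagram841 above covers the three color structures of the four-gluon vertex. J_ACCESS::kernelAccessIcol hides the memory layout; a toy AOS version of the contract (the real buffers are SOA, roughly [ncolor][2][nevt], with per-event strides) might look like:

    // Toy AOS sketch of the kernelAccessIcol contract (assumed, not the plugin's SOA accessor)
    static inline cxtype& kernelAccessIcol( fptype* jamps, const int icol )
    {
      return reinterpret_cast<cxtype*>( jamps )[icol]; // reference to color amplitude icol
    }
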
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram842( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 842 OF 1240 ***
+    // Wavefunction(s) for diagram number 842
+    VVV1P0_1( w_fp[0], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[56] );
+    // Amplitude(s) for diagram number 842
+    VVV1_0( w_fp[56], w_fp[63], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 842 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram843( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 843 OF 1240 ***
+    // Wavefunction(s) for diagram number 843
+    // (none)
+    // Amplitude(s) for diagram number 843
+    VVV1_0( w_fp[56], w_fp[64], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 843 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram844( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 844 OF 1240 ***
+    // Wavefunction(s) for diagram number 844
+    // (none)
+    // Amplitude(s) for diagram number 844
+    VVVV1_0( w_fp[61], w_fp[5], w_fp[6], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    VVVV3_0( w_fp[61], w_fp[5], w_fp[6], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    VVVV4_0( w_fp[61], w_fp[5], w_fp[6], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram845( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 845 OF 1240 ***
+    // Wavefunction(s) for diagram number 845
+    // (none)
+    // Amplitude(s) for diagram number 845
+    VVV1_0( w_fp[0], w_fp[63], w_fp[11], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 845 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram846( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 846 OF 1240 ***
+    // Wavefunction(s) for diagram number 846
+    // (none)
+    // Amplitude(s) for diagram number 846
+    VVV1_0( w_fp[0], w_fp[64], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 846 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram847( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 847 OF 1240 ***
+    // Wavefunction(s) for diagram number 847
+    VVVV1P0_1( w_fp[0], w_fp[61], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[103] );
+    VVVV3P0_1( w_fp[0], w_fp[61], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[22] );
+    VVVV4P0_1( w_fp[0], w_fp[61], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[21] );
+    // Amplitude(s) for diagram number 847
+    VVV1_0( w_fp[8], w_fp[6], w_fp[103], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[6], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[6], w_fp[21], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
jamps, 115 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram848( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 848 OF 1240 *** + // Wavefunction(s) for diagram number 848 + VVVV1P0_1( w_fp[0], w_fp[61], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[105] ); + VVVV3P0_1( w_fp[0], w_fp[61], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[95] ); + VVVV4P0_1( w_fp[0], w_fp[61], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[107] ); + // Amplitude(s) for diagram number 848 + VVV1_0( w_fp[8], w_fp[5], w_fp[105], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[5], w_fp[95], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[5], w_fp[107], COUPs[0], 1.0, &_fp[0] 
); + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram849( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 849 OF 1240 *** + // Wavefunction(s) for diagram number 849 + VVVV1P0_1( w_fp[0], w_fp[8], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[115] ); + VVVV3P0_1( w_fp[0], w_fp[8], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[116] ); + VVVV4P0_1( w_fp[0], w_fp[8], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[117] ); + // Amplitude(s) for diagram number 849 + VVV1_0( w_fp[61], w_fp[6], w_fp[115], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + VVV1_0( w_fp[61], w_fp[6], w_fp[116], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + VVV1_0( w_fp[61], w_fp[6], w_fp[117], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram850( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 850 OF 1240 *** + // Wavefunction(s) for diagram number 850 + VVVV1P0_1( w_fp[0], w_fp[8], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[118] ); + VVVV3P0_1( w_fp[0], w_fp[8], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[119] ); + VVVV4P0_1( w_fp[0], w_fp[8], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[120] ); + // Amplitude(s) for diagram number 850 + VVV1_0( w_fp[61], w_fp[5], w_fp[118], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + VVV1_0( w_fp[61], w_fp[5], w_fp[119], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + VVV1_0( w_fp[61], w_fp[5], w_fp[120], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram851( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 851 OF 1240 *** + 
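+    // Note: diagram 851 below evaluates the three Lorentz structures of the
+    // four-gluon vertex (VVVV1_0, VVVV3_0 and VVVV4_0, all taking the same
+    // wavefunctions and the gggg coupling COUPs[2]) as three separate
+    // amplitudes, each accumulated into a different subset of the color
+    // flows in jamps with +/- signs.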
// Wavefunction(s) for diagram number 851 + // (none) + // Amplitude(s) for diagram number 851 + VVVV1_0( w_fp[0], w_fp[61], w_fp[8], w_fp[29], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + VVVV3_0( w_fp[0], w_fp[61], w_fp[8], w_fp[29], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + VVVV4_0( w_fp[0], w_fp[61], w_fp[8], w_fp[29], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram852( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and 
independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,    // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 852 OF 1240 ***
+    // Wavefunction(s) for diagram number 852
+    // (none)
+    // Amplitude(s) for diagram number 852
+    VVV1_0( w_fp[8], w_fp[29], w_fp[90], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 852 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram853( fptype* wfs,             // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,           // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,    // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,      // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )   // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 853 OF 1240 ***
+    // Wavefunction(s) for diagram number 853
+    // (none)
+    // Amplitude(s) for diagram number 853
+    VVV1_0( w_fp[61], w_fp[29], w_fp[56], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 853 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; +
J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram854( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 854 OF 1240 *** + // Wavefunction(s) for diagram number 854 + // (none) + // Amplitude(s) for diagram number 854 + VVV1_0( w_fp[61], w_fp[8], w_fp[96], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 854 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram855( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page 
+#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 855 OF 1240 *** + // Wavefunction(s) for diagram number 855 + // (none) + // Amplitude(s) for diagram number 855 + VVV1_0( w_fp[90], w_fp[45], w_fp[6], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 855 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram856( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 856 OF 1240 *** + // Wavefunction(s) for diagram number 856 + // (none) + // Amplitude(s) for diagram number 856 + FFV1_0( w_fp[3], w_fp[44], w_fp[90], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 856 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram857( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // 
input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 857 OF 1240 *** + // Wavefunction(s) for diagram number 857 + // (none) + // Amplitude(s) for diagram number 857 + FFV1_0( w_fp[65], w_fp[102], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 857 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram858( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 858 OF 1240 *** + // Wavefunction(s) for diagram number 858 + // (none) + // Amplitude(s) for diagram number 858 + FFV1_0( w_fp[3], w_fp[102], w_fp[64], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 858 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram859( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* 
denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 859 OF 1240 *** + // Wavefunction(s) for diagram number 859 + // (none) + // Amplitude(s) for diagram number 859 + FFV1_0( w_fp[65], w_fp[44], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 859 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram860( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 860 OF 1240 *** + // Wavefunction(s) for diagram number 860 + // (none) + // Amplitude(s) for diagram number 860 + VVV1_0( w_fp[0], w_fp[64], w_fp[45], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 860 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram861( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // 
input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 861 OF 1240 *** + // Wavefunction(s) for diagram number 861 + // (none) + // Amplitude(s) for diagram number 861 + FFV1_0( w_fp[3], w_fp[39], w_fp[105], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[39], w_fp[95], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[39], w_fp[107], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram862( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three 
pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 862 OF 1240 *** + // Wavefunction(s) for diagram number 862 + // (none) + // Amplitude(s) for diagram number 862 + FFV1_0( w_fp[41], w_fp[39], w_fp[90], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 862 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram863( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 863 OF 1240 *** + // Wavefunction(s) for diagram number 863 + // (none) + // Amplitude(s) for diagram number 863 + FFV1_0( w_fp[41], w_fp[102], w_fp[61], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 863 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram864( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 864 OF 1240 *** + // Wavefunction(s) for diagram number 864 + // (none) + // Amplitude(s) for diagram number 864 + FFV1_0( w_fp[62], w_fp[39], w_fp[61], COUPs[1], 1.0, &_fp[0] ); +#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 864 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram865( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 865 OF 1240 *** + // Wavefunction(s) for diagram number 865 + // (none) + // Amplitude(s) for diagram number 865 + VVV1_0( w_fp[90], w_fp[51], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 865 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram866( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 866 OF 1240 *** + // Wavefunction(s) for diagram number 866 + // (none) + // Amplitude(s) for diagram 
number 866 + FFV1_0( w_fp[3], w_fp[50], w_fp[90], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 866 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram867( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 867 OF 1240 *** + // Wavefunction(s) for diagram number 867 + // (none) + // Amplitude(s) for diagram number 867 + FFV1_0( w_fp[65], w_fp[113], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 867 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram868( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 868 OF 1240 *** + // Wavefunction(s) for diagram number 868 + // (none) + // Amplitude(s) for diagram number 868 + FFV1_0( w_fp[3], w_fp[113], w_fp[63], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 868 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + 
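+    // (Same pattern as in every diagram kernel: this diagram's |amp|^2 enters
+    // the single-diagram-enhancement numerator only when channelId selects
+    // diagram 868, while it enters the common denominator for any nonzero
+    // channelId, i.e. whenever SDE multichannel sampling is enabled.)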
J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram869( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 869 OF 1240 *** + // Wavefunction(s) for diagram number 869 + // (none) + // Amplitude(s) for diagram number 869 + FFV1_0( w_fp[65], w_fp[50], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 869 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram870( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 870 OF 1240 *** + // Wavefunction(s) for diagram number 870 + // (none) + // Amplitude(s) for diagram number 870 + VVV1_0( w_fp[0], w_fp[63], w_fp[51], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 870 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram871( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 871 OF 1240 *** + // Wavefunction(s) for diagram number 871 + // (none) + // Amplitude(s) for diagram number 871 + FFV1_0( w_fp[3], w_fp[47], w_fp[103], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[47], w_fp[22], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[47], w_fp[21], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + 
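+  //--------------------------------------------------------------------------
+
+  // NB: the "diagram_boilerplate.h" fragment included at the top of every
+  // diagramXXX kernel is not itself part of this diff. Judging only from the
+  // names it must bring into scope (channelId, amp_sv, the _fp alias passed
+  // as &_fp[0], numerators_sv and denominators_sv), a minimal sketch of its
+  // contents could look as follows - an illustrative assumption for readers
+  // of this diff, not the actual header:
+  //
+  //   #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+  //   // Uniform interface: without multichannel support all three pointers are nullptr
+  //   assert( channelIds == nullptr && numerators == nullptr && denominators == nullptr );
+  //   #endif
+  //   cxtype_sv amp_sv[1] = {};                          // buffer for one amplitude
+  //   fptype* _fp = reinterpret_cast<fptype*>( amp_sv ); // scalar view used as &_fp[0]
+  //   const unsigned int channelId = ( channelIds ? channelIds[0] : 0 ); // SCALAR id in C++; per-event on GPU
+  //   // ... plus numerators_sv/denominators_sv kernel-access views of numerators/denominators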
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram872( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 872 OF 1240 ***
+    // Wavefunction(s) for diagram number 872
+    // (none)
+    // Amplitude(s) for diagram number 872
+    FFV1_0( w_fp[38], w_fp[47], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 872 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram873( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 873 OF 1240 ***
+    // Wavefunction(s) for diagram number 873
+    // (none)
+    // Amplitude(s) for diagram number 873
+    FFV1_0( w_fp[38], w_fp[113], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 873 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram874( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 874 OF 1240 ***
+    // Wavefunction(s) for diagram number 874
+    // (none)
+    // Amplitude(s) for diagram number 874
+    FFV1_0( w_fp[104], w_fp[47], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 874 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram875( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 875 OF 1240 ***
+    // Wavefunction(s) for diagram number 875
+    // (none)
+    // Amplitude(s) for diagram number 875
+    VVV1_0( w_fp[90], w_fp[23], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 875 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
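[Editor's note] The twin `if( channelId == NNN ) ... / if( channelId != 0 ) ...` lines repeated in each single-amplitude diagram implement the single-diagram-enhancement weights: |amp|² enters the numerator only for the diagram selected by channelId, and the denominator for every diagram whenever enhancement is active. A condensed restatement of the pattern (names reused from the generated code; the helper itself is illustrative, not part of the plugin):

    // Illustrative restatement of the multichannel accumulation above: cxabs2
    // computes |z|^2 of a (possibly vectorized) complex amplitude.
    template<typename FP, typename CX>
    inline void addChannelWeights( unsigned int channelId, unsigned int thisDiagram,
                                   const CX& amp, FP& numerator, FP& denominator )
    {
      if( channelId == thisDiagram ) numerator += cxabs2( amp ); // selected channel only
      if( channelId != 0 ) denominator += cxabs2( amp );         // all diagrams while SDE is on
    }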
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram876( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 876 OF 1240 ***
+    // Wavefunction(s) for diagram number 876
+    // (none)
+    // Amplitude(s) for diagram number 876
+    FFV1_0( w_fp[48], w_fp[2], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 876 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram877( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 877 OF 1240 ***
+    // Wavefunction(s) for diagram number 877
+    // (none)
+    // Amplitude(s) for diagram number 877
+    FFV1_0( w_fp[104], w_fp[93], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 877 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram878( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 878 OF 1240 ***
+    // Wavefunction(s) for diagram number 878
+    // (none)
+    // Amplitude(s) for diagram number 878
+    FFV1_0( w_fp[104], w_fp[2], w_fp[64], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 878 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram879( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 879 OF 1240 ***
+    // Wavefunction(s) for diagram number 879
+    // (none)
+    // Amplitude(s) for diagram number 879
+    FFV1_0( w_fp[48], w_fp[93], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 879 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
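[Editor's note] Each amplitude is scattered into the color-flow array jamps with coefficients ±1 or ±i (the cxtype( 0, 1 ) factor). The accessor J_ACCESS::kernelAccessIcol is defined elsewhere; given the layout jamps[ncolor*2*nevtORneppV] declared in every kernel signature, its index arithmetic is presumably along these lines (an illustration under that assumption, not the plugin code, with nevtORneppV written as neppV for one C++ event page):

    // Hypothetical index arithmetic behind J_ACCESS::kernelAccessIcol: color
    // index outermost, then real/imaginary part, then the event-page stride.
    inline fptype& jampReal( fptype* jamps, int icol, int ieppV ) { return jamps[( 2 * icol ) * neppV + ieppV]; }
    inline fptype& jampImag( fptype* jamps, int icol, int ieppV ) { return jamps[( 2 * icol + 1 ) * neppV + ieppV]; }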
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram880( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 880 OF 1240 ***
+    // Wavefunction(s) for diagram number 880
+    // (none)
+    // Amplitude(s) for diagram number 880
+    VVV1_0( w_fp[0], w_fp[64], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 880 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram881( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 881 OF 1240 ***
+    // Wavefunction(s) for diagram number 881
+    // (none)
+    // Amplitude(s) for diagram number 881
+    FFV1_0( w_fp[38], w_fp[2], w_fp[105], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[38], w_fp[2], w_fp[95], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[38], w_fp[2], w_fp[107], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram882( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 882 OF 1240 ***
+    // Wavefunction(s) for diagram number 882
+    // (none)
+    // Amplitude(s) for diagram number 882
+    VVV1_0( w_fp[90], w_fp[20], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 882 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
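[Editor's note] Diagram 881 above (like 871 and 888) folds three FFV1_0 amplitudes into one kernel and carries no numerator/denominator update: a diagram split over several amplitudes does not map onto a single integration channel, so it presumably takes no part in the single-diagram enhancement. The uniform signature then lets a caller drive every kernel the same way; a hypothetical driver, assuming the C++ build where the __global__ macro expands to nothing, could look like this:

    // Hypothetical driver loop over the generated kernels (C++ backend assumed;
    // the actual orchestration in the plugin is not part of this hunk).
    inline void runAllDiagrams( fptype* wfs, fptype* jamps, const unsigned int* channelIds,
                                const fptype** COUPs, fptype* numerators, fptype* denominators )
    {
      typedef void ( *diagram_t )( fptype*, fptype*, const unsigned int*, const fptype**, fptype*, fptype* );
      static const diagram_t diagrams[] = { diagram869, diagram870, diagram871 }; // ...and so on
      for( diagram_t d : diagrams ) d( wfs, jamps, channelIds, COUPs, numerators, denominators );
    }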
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram883( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 883 OF 1240 ***
+    // Wavefunction(s) for diagram number 883
+    // (none)
+    // Amplitude(s) for diagram number 883
+    FFV1_0( w_fp[40], w_fp[2], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 883 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram884( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 884 OF 1240 ***
+    // Wavefunction(s) for diagram number 884
+    // (none)
+    // Amplitude(s) for diagram number 884
+    FFV1_0( w_fp[62], w_fp[93], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 884 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram885( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 885 OF 1240 ***
+    // Wavefunction(s) for diagram number 885
+    // (none)
+    // Amplitude(s) for diagram number 885
+    FFV1_0( w_fp[62], w_fp[2], w_fp[63], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 885 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram886( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 886 OF 1240 ***
+    // Wavefunction(s) for diagram number 886
+    // (none)
+    // Amplitude(s) for diagram number 886
+    FFV1_0( w_fp[40], w_fp[93], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 886 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram887( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 887 OF 1240 ***
+    // Wavefunction(s) for diagram number 887
+    // (none)
+    // Amplitude(s) for diagram number 887
+    VVV1_0( w_fp[0], w_fp[63], w_fp[20], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 887 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram888( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 888 OF 1240 ***
+    // Wavefunction(s) for diagram number 888
+    // (none)
+    // Amplitude(s) for diagram number 888
+    FFV1_0( w_fp[41], w_fp[2], w_fp[103], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[2], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram889( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 889 OF 1240 ***
+    // Wavefunction(s) for diagram number 889
+    // (none)
+    // Amplitude(s) for diagram number 889
+    FFV1_0( w_fp[3], w_fp[18], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 889 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
"diagram_boilerplate.h" + // *** DIAGRAM 890 OF 1240 *** + // Wavefunction(s) for diagram number 890 + // (none) + // Amplitude(s) for diagram number 890 + FFV1_0( w_fp[12], w_fp[2], w_fp[90], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 890 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram891( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 891 OF 1240 *** + // Wavefunction(s) for diagram number 891 + // (none) + // Amplitude(s) for diagram number 891 + FFV1_0( w_fp[3], w_fp[93], w_fp[96], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 891 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram892( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and 
independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 892 OF 1240 *** + // Wavefunction(s) for diagram number 892 + // (none) + // Amplitude(s) for diagram number 892 + FFV1_0( w_fp[65], w_fp[2], w_fp[96], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 892 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram893( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 893 OF 1240 *** + // Wavefunction(s) for diagram number 893 + // (none) + // Amplitude(s) for diagram number 893 + FFV1_0( w_fp[12], w_fp[93], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 893 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram894( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef 
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram894( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 894 OF 1240 ***
+    // Wavefunction(s) for diagram number 894
+    // (none)
+    // Amplitude(s) for diagram number 894
+    FFV1_0( w_fp[65], w_fp[18], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 894 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram895( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 895 OF 1240 ***
+    // Wavefunction(s) for diagram number 895
+    VVV1P0_1( w_fp[0], w_fp[66], COUPs[0], 1.0, 0., 0., w_fp[65] );
+    // Amplitude(s) for diagram number 895
+    VVV1_0( w_fp[65], w_fp[13], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 895 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram896( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 896 OF 1240 ***
+    // Wavefunction(s) for diagram number 896
+    // (none)
+    // Amplitude(s) for diagram number 896
+    VVV1_0( w_fp[65], w_fp[11], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 896 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+  }
+
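[Editor's note] Unlike its neighbours, diagram 895 has a non-empty wavefunction step: VVV1P0_1 builds the internal gluon propagator from w_fp[0] and w_fp[66] and overwrites slot w_fp[65], which diagrams 896 and 897 then reuse. The two literal zeros are the mass and width of the internal particle. The call shape, paraphrased from the call site (the argument names here are descriptive, not the ALOHA originals):

    // VVV1P0_1( V2_in, V3_in, COUP, sign, M1, W1, V1_out )
    VVV1P0_1( w_fp[0], w_fp[66], COUPs[0], 1.0, 0., 0., w_fp[65] ); // gluon: M1 = 0., W1 = 0.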
"diagram_boilerplate.h" + // *** DIAGRAM 897 OF 1240 *** + // Wavefunction(s) for diagram number 897 + // (none) + // Amplitude(s) for diagram number 897 + VVVV1_0( w_fp[8], w_fp[4], w_fp[6], w_fp[65], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + VVVV3_0( w_fp[8], w_fp[4], w_fp[6], w_fp[65], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + VVVV4_0( w_fp[8], w_fp[4], w_fp[6], w_fp[65], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram898( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for 
all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 898 OF 1240 *** + // Wavefunction(s) for diagram number 898 + // (none) + // Amplitude(s) for diagram number 898 + VVV1_0( w_fp[56], w_fp[69], w_fp[6], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 898 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram899( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 899 OF 1240 *** + // Wavefunction(s) for diagram number 899 + // (none) + // Amplitude(s) for diagram number 899 + VVV1_0( w_fp[56], w_fp[70], w_fp[4], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 899 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + 
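[Editor's note] Diagrams 897 and 900 each evaluate a four-gluon vertex, which carries three independent color structures; ALOHA therefore emits three amplitude routines (VVVV1_0, VVVV3_0, VVVV4_0) over the same four wavefunctions, and each result is scattered into jamps with its own coefficient table. Schematically (a condensed paraphrase of diagram 900 above, not new plugin code):

    // Three color structures of one four-gluon vertex, one ALOHA call each,
    // each amplitude scattered into jamps with its own sign pattern; as for all
    // multi-amplitude diagrams, there is no numerator/denominator update.
    VVVV1_0( w_fp[66], w_fp[4], w_fp[6], w_fp[56], COUPs[2], 1.0, &amp_fp[0] ); // structure 1
    // ... += / -= amp_sv[0] into its jamps subset ...
    VVVV3_0( w_fp[66], w_fp[4], w_fp[6], w_fp[56], COUPs[2], 1.0, &amp_fp[0] ); // structure 2
    // ... second sign pattern ...
    VVVV4_0( w_fp[66], w_fp[4], w_fp[6], w_fp[56], COUPs[2], 1.0, &amp_fp[0] ); // structure 3
    // ... third sign pattern ...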
J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram900( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 900 OF 1240 *** + // Wavefunction(s) for diagram number 900 + // (none) + // Amplitude(s) for diagram number 900 + VVVV1_0( w_fp[66], w_fp[4], w_fp[6], w_fp[56], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + VVVV3_0( w_fp[66], w_fp[4], w_fp[6], w_fp[56], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + VVVV4_0( w_fp[66], w_fp[4], w_fp[6], w_fp[56], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram901( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 901 OF 1240 *** + // Wavefunction(s) for diagram number 901 + // (none) + // Amplitude(s) for diagram number 901 + VVV1_0( w_fp[0], w_fp[69], w_fp[11], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 901 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0]; + 
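The `if( channelId == NNN ) numerators_sv += cxabs2( amp_sv[0] ); if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );` pairs seen in these kernels implement the single-diagram-enhancement bookkeeping: the numerator keeps only |amp|^2 of the diagram matching the selected channel, while the denominator accumulates |amp|^2 over every diagram, with channelId 0 disabling the machinery. A minimal scalar sketch of that logic follows; it uses hypothetical standalone types (std::complex on single events) rather than the plugin's SIMD/GPU event-page types, so it illustrates the arithmetic only, not the real API.

    #include <complex>
    #include <vector>

    // Scalar illustration (hypothetical rewrite) of the multichannel
    // numerator/denominator accumulation performed by the generated
    // diagramXXX kernels: channelId is 1-based, 0 disables SDE.
    double singleDiagramWeight( const std::vector<std::complex<double>>& diagramAmps,
                                unsigned int channelId )
    {
      double numerator = 0., denominator = 0.;
      for( unsigned int idiag = 1; idiag <= diagramAmps.size(); ++idiag )
      {
        const double abs2 = std::norm( diagramAmps[idiag - 1] ); // |amp|^2, as cxabs2 computes
        if( channelId == idiag ) numerator += abs2;   // only the selected channel's diagram
        if( channelId != 0 ) denominator += abs2;     // every diagram with a channel
      }
      return ( channelId != 0 ? numerator / denominator : 1. ); // multichannel weight
    }

Note that diagrams contributing several amplitudes through VVVV1/VVVV3/VVVV4 splittings (such as diagram 900 above) carry no numerator/denominator block at all, so they never enter the single-diagram weight.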
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram902( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 902 OF 1240 ***
+    // Wavefunction(s) for diagram number 902
+    // (none)
+    // Amplitude(s) for diagram number 902
+    VVV1_0( w_fp[0], w_fp[70], w_fp[13], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 902 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram903( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 903 OF 1240 ***
+    // Wavefunction(s) for diagram number 903
+    VVVV1P0_1( w_fp[0], w_fp[66], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[93] );
+    VVVV3P0_1( w_fp[0], w_fp[66], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[90] );
+    VVVV4P0_1( w_fp[0], w_fp[66], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[21] );
+    // Amplitude(s) for diagram number 903
+    VVV1_0( w_fp[8], w_fp[6], w_fp[93], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[6], w_fp[90], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[6], w_fp[21], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram904( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 904 OF 1240 ***
+    // Wavefunction(s) for diagram number 904
+    VVVV1P0_1( w_fp[0], w_fp[66], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[22] );
+    VVVV3P0_1( w_fp[0], w_fp[66], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[103] );
+    VVVV4P0_1( w_fp[0], w_fp[66], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[63] );
+    // Amplitude(s) for diagram number 904
+    VVV1_0( w_fp[8], w_fp[4], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[4], w_fp[103], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[4], w_fp[63], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram905( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 905 OF 1240 ***
+    // Wavefunction(s) for diagram number 905
+    VVVV1P0_1( w_fp[0], w_fp[8], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[107] );
+    VVVV3P0_1( w_fp[0], w_fp[8], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[95] );
+    VVVV4P0_1( w_fp[0], w_fp[8], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[105] );
+    // Amplitude(s) for diagram number 905
+    VVV1_0( w_fp[66], w_fp[6], w_fp[107], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+    VVV1_0( w_fp[66], w_fp[6], w_fp[95], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    VVV1_0( w_fp[66], w_fp[6], w_fp[105], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram906( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 906 OF 1240 ***
+    // Wavefunction(s) for diagram number 906
+    // (none)
+    // Amplitude(s) for diagram number 906
+    VVV1_0( w_fp[66], w_fp[4], w_fp[118], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+    VVV1_0( w_fp[66], w_fp[4], w_fp[119], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    VVV1_0( w_fp[66], w_fp[4], w_fp[120], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram907( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 907 OF 1240 ***
+    // Wavefunction(s) for diagram number 907
+    // (none)
+    // Amplitude(s) for diagram number 907
+    VVVV1_0( w_fp[0], w_fp[66], w_fp[8], w_fp[27], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    VVVV3_0( w_fp[0], w_fp[66], w_fp[8], w_fp[27], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    VVVV4_0( w_fp[0], w_fp[66], w_fp[8], w_fp[27], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram908( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 908 OF 1240 ***
+    // Wavefunction(s) for diagram number 908
+    // (none)
+    // Amplitude(s) for diagram number 908
+    VVV1_0( w_fp[8], w_fp[27], w_fp[65], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 908 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram909( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 909 OF 1240 ***
+    // Wavefunction(s) for diagram number 909
+    // (none)
+    // Amplitude(s) for diagram number 909
+    VVV1_0( w_fp[66], w_fp[27], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 909 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram910( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 910 OF 1240 ***
+    // Wavefunction(s) for diagram number 910
+    // (none)
+    // Amplitude(s) for diagram number 910
+    VVV1_0( w_fp[66], w_fp[8], w_fp[101], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 910 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram911( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 911 OF 1240 ***
+    // Wavefunction(s) for diagram number 911
+    // (none)
+    // Amplitude(s) for diagram number 911
+    VVV1_0( w_fp[65], w_fp[37], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 911 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram912( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 912 OF 1240 ***
+    // Wavefunction(s) for diagram number 912
+    // (none)
+    // Amplitude(s) for diagram number 912
+    FFV1_0( w_fp[3], w_fp[36], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 912 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram913( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 913 OF 1240 ***
+    // Wavefunction(s) for diagram number 913
+    // (none)
+    // Amplitude(s) for diagram number 913
+    FFV1_0( w_fp[71], w_fp[114], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 913 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram914( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 914 OF 1240 ***
+    // Wavefunction(s) for diagram number 914
+    // (none)
+    // Amplitude(s) for diagram number 914
+    FFV1_0( w_fp[3], w_fp[114], w_fp[70], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 914 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram915( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 915 OF 1240 ***
+    // Wavefunction(s) for diagram number 915
+    // (none)
+    // Amplitude(s) for diagram number 915
+    FFV1_0( w_fp[71], w_fp[36], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 915 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram916( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 916 OF 1240 ***
+    // Wavefunction(s) for diagram number 916
+    // (none)
+    // Amplitude(s) for diagram number 916
+    VVV1_0( w_fp[0], w_fp[70], w_fp[37], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 916 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram917( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 917 OF 1240 ***
+    // Wavefunction(s) for diagram number 917
+    // (none)
+    // Amplitude(s) for diagram number 917
+    FFV1_0( w_fp[3], w_fp[33], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[33], w_fp[103], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[33], w_fp[63], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram918( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 918 OF 1240 ***
+    // Wavefunction(s) for diagram number 918
+    // (none)
+    // Amplitude(s) for diagram number 918
+    FFV1_0( w_fp[41], w_fp[33], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 918 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+  }
+
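Every kernel above repeats the same two-line comment about `diagram_boilerplate.h`: the uniform diagramXXX signature always carries channelIds, numerators and denominators, and when MGONGPU_SUPPORTS_MULTICHANNEL is not defined the boilerplate asserts that all three were passed as nullptr. The contents of that header are not shown in this diff; the following is only a hypothetical sketch of such a guard (the macro name is invented for illustration), to make the described contract concrete.

    // Hypothetical illustration only: the real "diagram_boilerplate.h" is not
    // part of this diff. This mirrors the contract stated in the repeated
    // comment: with multichannel support compiled out, the uniform interface
    // still receives the three pointers, and a sanity check requires that the
    // caller passed nullptr for all of them.
    #include <cassert>
    #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
    #define CHECK_NO_MULTICHANNEL( channelIds, numerators, denominators ) \
      assert( ( channelIds ) == nullptr && ( numerators ) == nullptr && ( denominators ) == nullptr )
    #else
    #define CHECK_NO_MULTICHANNEL( channelIds, numerators, denominators ) ( (void)0 )
    #endif

Keeping one signature for both builds lets the per-diagram kernels be generated and called uniformly, at the cost of a few unused parameters in the non-multichannel configuration.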
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram919( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 919 OF 1240 ***
+    // Wavefunction(s) for diagram number 919
+    // (none)
+    // Amplitude(s) for diagram number 919
+    FFV1_0( w_fp[41], w_fp[114], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 919 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram920( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 920 OF 1240 ***
+    // Wavefunction(s) for diagram number 920
+    // (none)
+    // Amplitude(s) for diagram number 920
+    FFV1_0( w_fp[62], w_fp[33], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 920 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram921( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 921 OF 1240 ***
+    // Wavefunction(s) for diagram number 921
+    // (none)
+    // Amplitude(s) for diagram number 921
+    VVV1_0( w_fp[65], w_fp[51], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 921 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram922( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 922 OF 1240 ***
+    // Wavefunction(s) for diagram number 922
+    // (none)
+    // Amplitude(s) for diagram number 922
+    FFV1_0( w_fp[3], w_fp[49], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 922 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram923( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 923 OF 1240 ***
+    // Wavefunction(s) for diagram number 923
+    // (none)
+    // Amplitude(s) for diagram number 923
+    FFV1_0( w_fp[71], w_fp[113], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 923 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram924( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 924 OF 1240 ***
+    // Wavefunction(s) for diagram number 924
+    // (none)
+    // Amplitude(s) for diagram number 924
+    FFV1_0( w_fp[3], w_fp[113], w_fp[69], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 924 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram925( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 925 OF 1240 ***
+    // Wavefunction(s) for diagram number 925
+    // (none)
+    // Amplitude(s) for diagram number 925
+    FFV1_0( w_fp[71], w_fp[49], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 925 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram926( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 926 OF 1240 ***
+    // Wavefunction(s) for diagram number 926
+    // (none)
+    // Amplitude(s) for diagram number 926
+    VVV1_0( w_fp[0], w_fp[69], w_fp[51], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 926 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram927( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 927 OF 1240 ***
+    // Wavefunction(s) for diagram number 927
+    // (none)
+    // Amplitude(s) for diagram number 927
+    FFV1_0( w_fp[3], w_fp[47], w_fp[93], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[47], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[47], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram928( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel
denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 928 OF 1240 *** + // Wavefunction(s) for diagram number 928 + // (none) + // Amplitude(s) for diagram number 928 + FFV1_0( w_fp[46], w_fp[47], w_fp[65], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 928 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram929( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 929 OF 1240 *** + // Wavefunction(s) for diagram number 929 + // (none) + // Amplitude(s) for diagram number 929 + FFV1_0( w_fp[46], w_fp[113], w_fp[66], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 929 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram930( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all 
nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 930 OF 1240 *** + // Wavefunction(s) for diagram number 930 + // (none) + // Amplitude(s) for diagram number 930 + FFV1_0( w_fp[99], w_fp[47], w_fp[66], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 930 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram931( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 931 OF 1240 *** + // Wavefunction(s) for diagram number 931 + // (none) + // Amplitude(s) for diagram number 931 + VVV1_0( w_fp[65], w_fp[54], w_fp[6], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 931 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram932( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // 
In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 932 OF 1240 *** + // Wavefunction(s) for diagram number 932 + // (none) + // Amplitude(s) for diagram number 932 + FFV1_0( w_fp[53], w_fp[2], w_fp[65], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 932 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram933( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 933 OF 1240 *** + // Wavefunction(s) for diagram number 933 + // (none) + // Amplitude(s) for diagram number 933 + FFV1_0( w_fp[99], w_fp[94], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 933 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram934( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 934 OF 1240 *** + // Wavefunction(s) for diagram number 934 + // (none) + // Amplitude(s) for diagram number 934 + FFV1_0( 
w_fp[99], w_fp[2], w_fp[70], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 934 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram935( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 935 OF 1240 *** + // Wavefunction(s) for diagram number 935 + // (none) + // Amplitude(s) for diagram number 935 + FFV1_0( w_fp[53], w_fp[94], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 935 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram936( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 936 OF 1240 *** + // Wavefunction(s) for diagram number 936 + // (none) + // Amplitude(s) for diagram number 936 + VVV1_0( w_fp[0], w_fp[70], w_fp[54], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 936 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 
1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram937( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 937 OF 1240 *** + // Wavefunction(s) for diagram number 937 + // (none) + // Amplitude(s) for diagram number 937 + FFV1_0( w_fp[46], w_fp[2], w_fp[22], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[46], w_fp[2], w_fp[103], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[46], w_fp[2], w_fp[63], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram938( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 938 OF 1240 *** + // Wavefunction(s) for diagram number 938 + // (none) + // Amplitude(s) for diagram number 938 + VVV1_0( w_fp[65], w_fp[20], w_fp[4], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 938 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram939( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 939 OF 1240 *** + // Wavefunction(s) for diagram number 939 + // (none) + // Amplitude(s) for diagram number 939 + FFV1_0( w_fp[28], w_fp[2], w_fp[65], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 939 ) numerators_sv += cxabs2( amp_sv[0] 
); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram940( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 940 OF 1240 *** + // Wavefunction(s) for diagram number 940 + // (none) + // Amplitude(s) for diagram number 940 + FFV1_0( w_fp[62], w_fp[94], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 940 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram941( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 941 OF 1240 *** + // Wavefunction(s) for diagram number 941 + // (none) + // Amplitude(s) for diagram number 941 + FFV1_0( w_fp[62], w_fp[2], w_fp[69], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 941 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) 
-= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram942( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 942 OF 1240 *** + // Wavefunction(s) for diagram number 942 + // (none) + // Amplitude(s) for diagram number 942 + FFV1_0( w_fp[28], w_fp[94], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 942 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram943( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 943 OF 1240 *** + // Wavefunction(s) for diagram number 943 + // (none) + // Amplitude(s) for diagram number 943 + VVV1_0( w_fp[0], w_fp[69], w_fp[20], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 943 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) 
* amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram944( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 944 OF 1240 *** + // Wavefunction(s) for diagram number 944 + // (none) + // Amplitude(s) for diagram number 944 + FFV1_0( w_fp[41], w_fp[2], w_fp[93], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[41], w_fp[2], w_fp[90], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[41], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram945( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or 
SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 945 OF 1240 *** + // Wavefunction(s) for diagram number 945 + // (none) + // Amplitude(s) for diagram number 945 + FFV1_0( w_fp[3], w_fp[15], w_fp[65], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 945 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram946( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 946 OF 1240 *** + // Wavefunction(s) for diagram number 946 + // (none) + // Amplitude(s) for diagram number 946 + FFV1_0( w_fp[14], w_fp[2], w_fp[65], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 946 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram947( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 947 OF 1240 *** + // Wavefunction(s) for diagram number 947 + // (none) + // Amplitude(s) for diagram number 947 + FFV1_0( w_fp[3], w_fp[94], w_fp[101], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 947 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram948( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 948 OF 1240 *** + // Wavefunction(s) for diagram number 948 + // (none) + // Amplitude(s) for diagram number 948 + FFV1_0( w_fp[71], w_fp[2], w_fp[101], COUPs[1], 1.0, &_fp[0] ); +#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 948 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram949( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 949 OF 1240 *** + // Wavefunction(s) for diagram number 949 + // (none) + // Amplitude(s) for diagram number 949 + FFV1_0( w_fp[14], w_fp[94], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 949 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram950( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 950 OF 1240 *** + // 
Wavefunction(s) for diagram number 950 + // (none) + // Amplitude(s) for diagram number 950 + FFV1_0( w_fp[71], w_fp[15], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 950 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram951( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 951 OF 1240 *** + // Wavefunction(s) for diagram number 951 + VVV1P0_1( w_fp[0], w_fp[72], COUPs[0], 1.0, 0., 0., w_fp[71] ); + // Amplitude(s) for diagram number 951 + VVV1_0( w_fp[71], w_fp[13], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 951 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram952( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + 
fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 952 OF 1240 *** + // Wavefunction(s) for diagram number 952 + // (none) + // Amplitude(s) for diagram number 952 + VVV1_0( w_fp[71], w_fp[10], w_fp[4], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 952 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram953( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 953 OF 1240 *** + // Wavefunction(s) for diagram number 953 + // (none) + // Amplitude(s) for diagram number 953 + VVVV1_0( w_fp[8], w_fp[4], w_fp[5], w_fp[71], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + VVVV3_0( w_fp[8], w_fp[4], w_fp[5], w_fp[71], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + VVVV4_0( w_fp[8], w_fp[4], w_fp[5], w_fp[71], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram954( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 954 OF 1240 *** + // Wavefunction(s) for diagram number 954 + // (none) + // Amplitude(s) for diagram number 954 + VVV1_0( 
w_fp[56], w_fp[74], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 954 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram955( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 955 OF 1240 *** + // Wavefunction(s) for diagram number 955 + // (none) + // Amplitude(s) for diagram number 955 + VVV1_0( w_fp[56], w_fp[75], w_fp[4], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 955 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + 
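The multichannel bookkeeping repeated in every diagramXXX kernel above follows one rule: cxabs2( amp_sv[0] ) is added to the event's denominator for every diagram whenever a channel is selected ( channelId != 0 ), and to the numerator only when the diagram number matches the selected channel, so that numerator/denominator is the single-diagram-enhancement (SDE) weight. A minimal standalone sketch of just that rule follows; the std::complex stand-in for cxtype and the addDiagram helper are illustrative assumptions, not code from this patch.

// Illustrative sketch only (not part of this patch): the SDE numerator and
// denominator accumulation performed under MGONGPU_SUPPORTS_MULTICHANNEL.
#include <complex>
#include <cstdio>
using cxtype = std::complex<double>; // stand-in for the plugin's cxtype
// |amp|^2, the quantity accumulated into the numerators/denominators
inline double cxabs2( const cxtype& amp ) { return std::norm( amp ); }
// One diagram's contribution (hypothetical helper name): the numerator grows
// only when this diagram is the selected channel, the denominator grows for
// every diagram, and channelId == 0 disables the SDE weighting altogether.
void addDiagram( unsigned int channelId, unsigned int thisDiagram, const cxtype& amp, double& num, double& den )
{
  if( channelId == thisDiagram ) num += cxabs2( amp );
  if( channelId != 0 ) den += cxabs2( amp );
}
int main()
{
  double num = 0., den = 0.;
  const unsigned int channelId = 954; // pretend SDE selected channel 954
  addDiagram( channelId, 954, cxtype( 1., 2. ), num, den ); // the selected diagram
  addDiagram( channelId, 955, cxtype( 0., 1. ), num, den ); // any other diagram
  std::printf( "SDE weight = %g / %g = %g\n", num, den, num / den ); // 5 / 6
  return 0;
}

Keeping the denominator update unconditional on the diagram identity (gated only on channelId != 0) is what makes the weight a ratio of one diagram's |amp|^2 to the sum over all diagrams.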
__global__ void + diagram956( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 956 OF 1240 *** + // Wavefunction(s) for diagram number 956 + // (none) + // Amplitude(s) for diagram number 956 + VVVV1_0( w_fp[72], w_fp[4], w_fp[5], w_fp[56], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + VVVV3_0( w_fp[72], w_fp[4], w_fp[5], w_fp[56], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0]; + VVVV4_0( w_fp[72], w_fp[4], w_fp[5], w_fp[56], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram957( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 957 OF 1240 *** + // Wavefunction(s) for diagram number 957 + // (none) + // Amplitude(s) for diagram number 957 + VVV1_0( w_fp[0], w_fp[74], w_fp[10], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 957 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram958( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // 
input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 958 OF 1240 *** + // Wavefunction(s) for diagram number 958 + // (none) + // Amplitude(s) for diagram number 958 + VVV1_0( w_fp[0], w_fp[75], w_fp[13], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 958 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram959( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 959 OF 1240 *** + // Wavefunction(s) for diagram number 959 + VVVV1P0_1( w_fp[0], w_fp[72], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[94] ); + VVVV3P0_1( w_fp[0], w_fp[72], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[65] ); + VVVV4P0_1( w_fp[0], w_fp[72], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[21] ); + // Amplitude(s) for diagram number 959 + VVV1_0( w_fp[8], w_fp[5], w_fp[94], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[5], w_fp[65], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[5], w_fp[21], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram960( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 960 OF 1240 *** + // Wavefunction(s) for diagram number 960 + VVVV1P0_1( 
w_fp[0], w_fp[72], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[90] ); + VVVV3P0_1( w_fp[0], w_fp[72], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[93] ); + VVVV4P0_1( w_fp[0], w_fp[72], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[69] ); + // Amplitude(s) for diagram number 960 + VVV1_0( w_fp[8], w_fp[4], w_fp[90], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[4], w_fp[93], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[4], w_fp[69], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram961( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const 
fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 961 OF 1240 *** + // Wavefunction(s) for diagram number 961 + // (none) + // Amplitude(s) for diagram number 961 + VVV1_0( w_fp[72], w_fp[5], w_fp[107], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0]; + VVV1_0( w_fp[72], w_fp[5], w_fp[95], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0]; + VVV1_0( w_fp[72], w_fp[5], w_fp[105], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 
) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram962( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 962 OF 1240 *** + // Wavefunction(s) for diagram number 962 + // (none) + // Amplitude(s) for diagram number 962 + VVV1_0( w_fp[72], w_fp[4], w_fp[115], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0]; + VVV1_0( w_fp[72], w_fp[4], w_fp[116], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + VVV1_0( w_fp[72], w_fp[4], w_fp[117], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; 
+ J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram963( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 963 OF 1240 *** + // Wavefunction(s) for diagram number 963 + // (none) + // Amplitude(s) for diagram number 963 + VVVV1_0( w_fp[0], w_fp[72], w_fp[8], w_fp[24], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + VVVV3_0( w_fp[0], w_fp[72], w_fp[8], w_fp[24], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + VVVV4_0( w_fp[0], w_fp[72], w_fp[8], w_fp[24], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram964( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 964 OF 1240 *** + // Wavefunction(s) for diagram number 964 + // (none) + // Amplitude(s) for diagram number 964 + VVV1_0( w_fp[8], w_fp[24], w_fp[71], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 964 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram965( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 965 OF 1240 *** + // Wavefunction(s) for diagram number 965 + // (none) + // Amplitude(s) for diagram number 965 + VVV1_0( w_fp[72], w_fp[24], w_fp[56], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 965 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram966( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 966 OF 1240 *** + // Wavefunction(s) for diagram 
number 966 + // (none) + // Amplitude(s) for diagram number 966 + VVV1_0( w_fp[72], w_fp[8], w_fp[98], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 966 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram967( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 967 OF 1240 *** + // Wavefunction(s) for diagram number 967 + // (none) + // Amplitude(s) for diagram number 967 + VVV1_0( w_fp[71], w_fp[37], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 967 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram968( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR 
channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 968 OF 1240 *** + // Wavefunction(s) for diagram number 968 + // (none) + // Amplitude(s) for diagram number 968 + FFV1_0( w_fp[3], w_fp[35], w_fp[71], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 968 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram969( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 969 OF 1240 *** + // Wavefunction(s) for diagram number 969 + // (none) + // Amplitude(s) for diagram number 969 + FFV1_0( w_fp[76], w_fp[114], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 969 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram970( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + 
fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 970 OF 1240 *** + // Wavefunction(s) for diagram number 970 + // (none) + // Amplitude(s) for diagram number 970 + FFV1_0( w_fp[3], w_fp[114], w_fp[75], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 970 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram971( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 971 OF 1240 *** + // Wavefunction(s) for diagram number 971 + // (none) + // Amplitude(s) for diagram number 971 + FFV1_0( w_fp[76], w_fp[35], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 971 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram972( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and 
denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 972 OF 1240 *** + // Wavefunction(s) for diagram number 972 + // (none) + // Amplitude(s) for diagram number 972 + VVV1_0( w_fp[0], w_fp[75], w_fp[37], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 972 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram973( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 973 OF 1240 *** + // Wavefunction(s) for diagram number 973 + // (none) + // Amplitude(s) for diagram number 973 + FFV1_0( w_fp[3], w_fp[33], w_fp[90], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[33], w_fp[93], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[33], w_fp[69], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram974( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 974 OF 1240 *** + // Wavefunction(s) for diagram number 974 + // (none) + // Amplitude(s) for diagram number 974 + FFV1_0( w_fp[38], w_fp[33], w_fp[71], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 974 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram975( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 975 OF 1240 
*** + // Wavefunction(s) for diagram number 975 + // (none) + // Amplitude(s) for diagram number 975 + FFV1_0( w_fp[38], w_fp[114], w_fp[72], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 975 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram976( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 976 OF 1240 *** + // Wavefunction(s) for diagram number 976 + // (none) + // Amplitude(s) for diagram number 976 + FFV1_0( w_fp[104], w_fp[33], w_fp[72], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 976 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram977( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 977 OF 1240 *** + // Wavefunction(s) for diagram number 977 + // (none) + // Amplitude(s) for diagram number 977 + VVV1_0( w_fp[71], w_fp[45], w_fp[4], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 977 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + 
J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram978( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 978 OF 1240 *** + // Wavefunction(s) for diagram number 978 + // (none) + // Amplitude(s) for diagram number 978 + FFV1_0( w_fp[3], w_fp[43], w_fp[71], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 978 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram979( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 979 OF 1240 *** + // Wavefunction(s) for diagram number 979 + // (none) + // Amplitude(s) for diagram number 979 + FFV1_0( w_fp[76], w_fp[102], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 979 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram980( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 980 OF 1240 *** + // Wavefunction(s) for diagram number 980 + // (none) + // Amplitude(s) for diagram number 980 + FFV1_0( w_fp[3], w_fp[102], w_fp[74], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 980 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram981( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 981 OF 1240 *** + // Wavefunction(s) for diagram number 981 + // (none) + // Amplitude(s) for diagram number 981 + FFV1_0( w_fp[76], w_fp[43], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 981 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= 
cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram982( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 982 OF 1240 *** + // Wavefunction(s) for diagram number 982 + // (none) + // Amplitude(s) for diagram number 982 + VVV1_0( w_fp[0], w_fp[74], w_fp[45], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 982 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram983( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 983 OF 1240 *** + // Wavefunction(s) for diagram number 983 + // (none) + // Amplitude(s) for diagram number 983 + FFV1_0( w_fp[3], w_fp[39], w_fp[94], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[39], w_fp[65], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[39], w_fp[21], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram984( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 984 OF 1240 *** + // Wavefunction(s) for diagram number 984 + // (none) + // Amplitude(s) for diagram number 984 + FFV1_0( w_fp[46], w_fp[39], w_fp[71], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 984 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram985( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // 
output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 985 OF 1240 *** + // Wavefunction(s) for diagram number 985 + // (none) + // Amplitude(s) for diagram number 985 + FFV1_0( w_fp[46], w_fp[102], w_fp[72], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 985 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram986( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 986 OF 1240 *** + // Wavefunction(s) for diagram number 986 + // (none) + // Amplitude(s) for diagram number 986 + FFV1_0( w_fp[99], w_fp[39], w_fp[72], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 986 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram987( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent 
COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 987 OF 1240 *** + // Wavefunction(s) for diagram number 987 + // (none) + // Amplitude(s) for diagram number 987 + VVV1_0( w_fp[71], w_fp[54], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 987 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram988( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 988 OF 1240 *** + // Wavefunction(s) for diagram number 988 + // (none) + // Amplitude(s) for diagram number 988 + FFV1_0( w_fp[7], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 988 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram989( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + 
const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 989 OF 1240 *** + // Wavefunction(s) for diagram number 989 + // (none) + // Amplitude(s) for diagram number 989 + FFV1_0( w_fp[99], w_fp[97], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 989 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram990( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 990 OF 1240 *** + // Wavefunction(s) for diagram number 990 + // (none) + // Amplitude(s) for diagram number 990 + FFV1_0( w_fp[99], w_fp[2], w_fp[75], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 990 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram991( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add 
helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 991 OF 1240 *** + // Wavefunction(s) for diagram number 991 + // (none) + // Amplitude(s) for diagram number 991 + FFV1_0( w_fp[7], w_fp[97], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 991 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram992( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 992 OF 1240 *** + // Wavefunction(s) for diagram number 992 + // (none) + // Amplitude(s) for diagram number 992 + VVV1_0( w_fp[0], w_fp[75], w_fp[54], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 992 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram993( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + 
fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 993 OF 1240 *** + // Wavefunction(s) for diagram number 993 + // (none) + // Amplitude(s) for diagram number 993 + FFV1_0( w_fp[46], w_fp[2], w_fp[90], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[46], w_fp[2], w_fp[93], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[46], w_fp[2], w_fp[69], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram994( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code 
asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 994 OF 1240 *** + // Wavefunction(s) for diagram number 994 + // (none) + // Amplitude(s) for diagram number 994 + VVV1_0( w_fp[71], w_fp[23], w_fp[4], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 994 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram995( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 995 OF 1240 *** + // Wavefunction(s) for diagram number 995 + // (none) + // Amplitude(s) for diagram number 995 + FFV1_0( w_fp[25], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 995 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram996( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for 
diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 996 OF 1240 *** + // Wavefunction(s) for diagram number 996 + // (none) + // Amplitude(s) for diagram number 996 + FFV1_0( w_fp[104], w_fp[97], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 996 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram997( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 997 OF 1240 *** + // Wavefunction(s) for diagram number 997 + // (none) + // Amplitude(s) for diagram number 997 + FFV1_0( w_fp[104], w_fp[2], w_fp[74], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 997 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram998( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 998 OF 
1240 *** + // Wavefunction(s) for diagram number 998 + // (none) + // Amplitude(s) for diagram number 998 + FFV1_0( w_fp[25], w_fp[97], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 998 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram999( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 999 OF 1240 *** + // Wavefunction(s) for diagram number 999 + // (none) + // Amplitude(s) for diagram number 999 + VVV1_0( w_fp[0], w_fp[74], w_fp[23], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 999 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1000( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr 
as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1000 OF 1240 *** + // Wavefunction(s) for diagram number 1000 + // (none) + // Amplitude(s) for diagram number 1000 + FFV1_0( w_fp[38], w_fp[2], w_fp[94], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[38], w_fp[2], w_fp[65], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[38], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1001( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1001 OF 1240 *** + // Wavefunction(s) for diagram number 1001 + // (none) + // Amplitude(s) for diagram number 1001 + FFV1_0( w_fp[3], w_fp[17], w_fp[71], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1001 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv 
+= cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1002( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1002 OF 1240 *** + // Wavefunction(s) for diagram number 1002 + // (none) + // Amplitude(s) for diagram number 1002 + FFV1_0( w_fp[26], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1002 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1003( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used 
also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1003 OF 1240 *** + // Wavefunction(s) for diagram number 1003 + // (none) + // Amplitude(s) for diagram number 1003 + FFV1_0( w_fp[3], w_fp[97], w_fp[98], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1003 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1004( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1004 OF 1240 *** + // Wavefunction(s) for diagram number 1004 + // (none) + // Amplitude(s) for diagram number 1004 + FFV1_0( w_fp[76], w_fp[2], w_fp[98], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1004 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1005( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable 
SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1005 OF 1240 *** + // Wavefunction(s) for diagram number 1005 + // (none) + // Amplitude(s) for diagram number 1005 + FFV1_0( w_fp[26], w_fp[97], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1005 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1006( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1006 OF 1240 *** + // Wavefunction(s) for diagram number 1006 + // (none) + // Amplitude(s) for diagram number 1006 + FFV1_0( w_fp[76], w_fp[17], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1006 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1007( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent 
COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1007 OF 1240 *** + // Wavefunction(s) for diagram number 1007 + // (none) + // Amplitude(s) for diagram number 1007 + VVV1_0( w_fp[56], w_fp[59], w_fp[6], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1007 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1008( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1008 OF 1240 *** + // Wavefunction(s) for diagram number 1008 + // (none) + // Amplitude(s) for diagram number 1008 + VVV1_0( w_fp[56], w_fp[1], w_fp[42], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1008 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1009( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1009 OF 1240 *** + // Wavefunction(s) for diagram number 1009 + // (none) + // Amplitude(s) for diagram number 1009 + VVVV1_0( w_fp[1], w_fp[24], w_fp[6], w_fp[56], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + VVVV3_0( w_fp[1], w_fp[24], w_fp[6], w_fp[56], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + VVVV4_0( w_fp[1], w_fp[24], w_fp[6], w_fp[56], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1010( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1010 OF 1240 *** + // Wavefunction(s) for diagram number 1010 + // (none) + // Amplitude(s) for diagram number 1010 + VVV1_0( w_fp[98], w_fp[108], w_fp[6], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1010 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + } + + 
//-------------------------------------------------------------------------- + + __global__ void + diagram1011( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1011 OF 1240 *** + // Wavefunction(s) for diagram number 1011 + // (none) + // Amplitude(s) for diagram number 1011 + VVV1_0( w_fp[98], w_fp[1], w_fp[11], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1011 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1012( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1012 OF 1240 *** + // Wavefunction(s) for diagram number 1012 + // (none) + // Amplitude(s) for diagram 
number 1012 + VVVV1_0( w_fp[1], w_fp[8], w_fp[6], w_fp[98], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + VVVV3_0( w_fp[1], w_fp[8], w_fp[6], w_fp[98], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + VVVV4_0( w_fp[1], w_fp[8], w_fp[6], w_fp[98], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1013( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, 
// input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1013 OF 1240 *** + // Wavefunction(s) for diagram number 1013 + // (none) + // Amplitude(s) for diagram number 1013 + VVV1_0( w_fp[0], w_fp[108], w_fp[42], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1013 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1014( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1014 OF 1240 *** + // Wavefunction(s) for diagram number 1014 + // (none) + // Amplitude(s) for diagram number 1014 + VVV1_0( w_fp[0], w_fp[59], w_fp[11], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1014 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1015( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1015 OF 1240 *** + // Wavefunction(s) for diagram number 1015 + VVVV1P0_1( w_fp[0], w_fp[1], w_fp[8], COUPs[2], 1.0, 0., 0., w_fp[11] ); + VVVV3P0_1( w_fp[0], w_fp[1], w_fp[8], COUPs[2], 1.0, 0., 0., w_fp[42] ); + VVVV4P0_1( w_fp[0], w_fp[1], w_fp[8], COUPs[2], 1.0, 0., 0., w_fp[76] ); + // Amplitude(s) for diagram number 1015 + VVV1_0( w_fp[24], w_fp[6], w_fp[11], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + VVV1_0( w_fp[24], w_fp[6], w_fp[42], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + VVV1_0( w_fp[24], w_fp[6], w_fp[76], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1016( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1016 OF 1240 *** + // Wavefunction(s) for diagram number 1016 + VVVV1P0_1( w_fp[0], w_fp[1], w_fp[24], COUPs[2], 1.0, 0., 0., w_fp[97] ); + VVVV3P0_1( w_fp[0], w_fp[1], w_fp[24], COUPs[2], 1.0, 0., 0., w_fp[71] ); + VVVV4P0_1( w_fp[0], w_fp[1], w_fp[24], COUPs[2], 1.0, 0., 0., w_fp[21] ); + // Amplitude(s) for diagram number 1016 + VVV1_0( w_fp[8], w_fp[6], w_fp[97], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[6], w_fp[71], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[6], w_fp[21], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1017( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1017 OF 1240 *** + // Wavefunction(s) for diagram number 1017 + // (none) + // Amplitude(s) for diagram number 1017 + VVV1_0( w_fp[1], w_fp[24], w_fp[118], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + VVV1_0( w_fp[1], w_fp[24], w_fp[119], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + VVV1_0( w_fp[1], w_fp[24], w_fp[120], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1018( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef 
MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1018 OF 1240 *** + // Wavefunction(s) for diagram number 1018 + // (none) + // Amplitude(s) for diagram number 1018 + VVV1_0( w_fp[1], w_fp[8], w_fp[85], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + VVV1_0( w_fp[1], w_fp[8], w_fp[112], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + VVV1_0( w_fp[1], w_fp[8], w_fp[111], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1019( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 
to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1019 OF 1240 *** + // Wavefunction(s) for diagram number 1019 + // (none) + // Amplitude(s) for diagram number 1019 + VVV1_0( w_fp[56], w_fp[68], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1019 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1020( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1020 OF 1240 *** + // Wavefunction(s) for diagram number 1020 + // (none) + // Amplitude(s) for diagram number 1020 + VVV1_0( w_fp[56], w_fp[1], w_fp[16], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1020 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; 
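[Note on the uniform kernel interface: every diagramXXXX kernel shares one signature, so the driver can launch them uniformly whether or not multichannel is compiled in: wavefunctions and jamps, the channelIds array (GPU) or scalar channelId (C++), per-event dependent couplings (GPU) or a COUPs pointer array for one event page (C++), plus the multichannel numerators and denominators. The shared preamble lives in diagram_boilerplate.h, which is not shown in this diff; per the comment repeated in every kernel, when MGONGPU_SUPPORTS_MULTICHANNEL is not defined it must assert that the three multichannel pointers are nullptr. A guessed sketch of just that check, mirroring the kernel parameter names:]

  // Hypothetical reconstruction of one job of diagram_boilerplate.h,
  // based only on the recurring comment (assert from <cassert>):
  #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
    assert( channelIds == nullptr );   // SDE channels unavailable in this build
    assert( numerators == nullptr );   // multichannel numerators unused
    assert( denominators == nullptr ); // multichannel denominators unused
  #endif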
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1021( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1021 OF 1240 *** + // Wavefunction(s) for diagram number 1021 + // (none) + // Amplitude(s) for diagram number 1021 + VVVV1_0( w_fp[1], w_fp[27], w_fp[5], w_fp[56], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + VVVV3_0( w_fp[1], w_fp[27], w_fp[5], w_fp[56], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + 
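[Note on the multichannel blocks in kernels like diagram1019 and diagram1020 above: the two guarded lines implement single-diagram-enhancement (SDE) weights. Only the diagram whose number matches the event's channelId feeds the numerator, while every diagram feeds the denominator unless SDE is disabled by channelId == 0; the per-event multichannel weight is then the ratio of the two accumulators. Condensed from the generated code, with thisDiagram standing for the literal diagram number:]

  #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
    if( channelId == thisDiagram ) numerators_sv += cxabs2( amp_sv[0] ); // selected channel only
    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );         // all SDE-enabled diagrams
  #endif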
J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0]; + VVVV4_0( w_fp[1], w_fp[27], w_fp[5], w_fp[56], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1022( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1022 OF 1240 *** + // Wavefunction(s) for diagram number 1022 + // (none) + // Amplitude(s) for diagram number 1022 + VVV1_0( w_fp[101], w_fp[108], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1022 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + 
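[Note on the four-gluon-vertex diagrams (1009, 1012, 1021, 1024 above): unlike the VVV1_0 diagrams, a single Feynman diagram here carries three color structures, so VVVV1_0, VVVV3_0 and VVVV4_0 are each evaluated on the same four wavefunctions and scattered into jamps with their own sign patterns, and no multichannel block is generated for them. The skeleton, with the jamp updates elided and the amplitude output pointer written as &amp_fp[0] (assumed buffer name):]

  // One 4-gluon diagram = three color structures on the same legs:
  VVVV1_0( w_fp[1], w_fp[27], w_fp[5], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
  // ... scatter +/- amp_sv[0] into the jamps of the first color structure ...
  VVVV3_0( w_fp[1], w_fp[27], w_fp[5], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
  // ... second sign pattern ...
  VVVV4_0( w_fp[1], w_fp[27], w_fp[5], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
  // ... third sign pattern ...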
J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1023( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1023 OF 1240 *** + // Wavefunction(s) for diagram number 1023 + // (none) + // Amplitude(s) for diagram number 1023 + VVV1_0( w_fp[101], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1023 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1024( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as 
a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1024 OF 1240 *** + // Wavefunction(s) for diagram number 1024 + // (none) + // Amplitude(s) for diagram number 1024 + VVVV1_0( w_fp[1], w_fp[8], w_fp[5], w_fp[101], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + VVVV3_0( w_fp[1], w_fp[8], w_fp[5], w_fp[101], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + VVVV4_0( w_fp[1], w_fp[8], w_fp[5], w_fp[101], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1025( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent 
couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1025 OF 1240 *** + // Wavefunction(s) for diagram number 1025 + // (none) + // Amplitude(s) for diagram number 1025 + VVV1_0( w_fp[0], w_fp[108], w_fp[16], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1025 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1026( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1026 OF 1240 *** + // Wavefunction(s) for diagram number 1026 + // (none) + // Amplitude(s) for diagram number 1026 + VVV1_0( w_fp[0], w_fp[68], w_fp[10], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1026 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1027( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1027 OF 1240 *** + // Wavefunction(s) for diagram number 1027 + // (none) + // Amplitude(s) for diagram number 1027 + VVV1_0( w_fp[27], w_fp[5], w_fp[11], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + VVV1_0( w_fp[27], w_fp[5], w_fp[42], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; 
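[Note on internal wavefunctions: where a diagram needs fresh off-shell lines, they are computed at the top of its kernel before the amplitude calls, again one per color structure of the four-gluon vertex. As diagram 1028 just below shows, VVVV1P0_1, VVVV3P0_1 and VVVV4P0_1 each contract w_fp[0], w_fp[1] and w_fp[27] into a distinct off-shell gluon (the two trailing 0. arguments are the propagator mass and width), which subsequent VVV1_0 calls then close against the remaining legs:]

  // Restated from diagram1028 below: three internal off-shell gluons,
  // one per VVVV color structure (massless, zero-width propagator):
  VVVV1P0_1( w_fp[0], w_fp[1], w_fp[27], COUPs[2], 1.0, 0., 0., w_fp[10] );
  VVVV3P0_1( w_fp[0], w_fp[1], w_fp[27], COUPs[2], 1.0, 0., 0., w_fp[16] );
  VVVV4P0_1( w_fp[0], w_fp[1], w_fp[27], COUPs[2], 1.0, 0., 0., w_fp[111] );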
+ J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0]; + VVV1_0( w_fp[27], w_fp[5], w_fp[76], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1028( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1028 OF 1240 *** + // Wavefunction(s) for diagram number 1028 + VVVV1P0_1( w_fp[0], w_fp[1], w_fp[27], COUPs[2], 1.0, 0., 0., w_fp[10] ); + VVVV3P0_1( w_fp[0], w_fp[1], w_fp[27], COUPs[2], 1.0, 0., 0., w_fp[16] ); + VVVV4P0_1( w_fp[0], w_fp[1], w_fp[27], COUPs[2], 1.0, 0., 0., w_fp[111] ); + // Amplitude(s) for diagram number 1028 + VVV1_0( w_fp[8], w_fp[5], w_fp[10], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 95 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[5], w_fp[16], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[5], w_fp[111], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1029( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1029 OF 1240 *** + // Wavefunction(s) for diagram number 1029 + // (none) + // Amplitude(s) for diagram number 1029 + VVV1_0( w_fp[1], w_fp[27], w_fp[115], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + VVV1_0( w_fp[1], w_fp[27], w_fp[116], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + VVV1_0( w_fp[1], w_fp[27], w_fp[117], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1030( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that 
all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1030 OF 1240 *** + // Wavefunction(s) for diagram number 1030 + // (none) + // Amplitude(s) for diagram number 1030 + VVV1_0( w_fp[1], w_fp[8], w_fp[9], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + VVV1_0( w_fp[1], w_fp[8], w_fp[110], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + VVV1_0( w_fp[1], w_fp[8], w_fp[109], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1031( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent 
couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1031 OF 1240 *** + // Wavefunction(s) for diagram number 1031 + // (none) + // Amplitude(s) for diagram number 1031 + VVV1_0( w_fp[56], w_fp[67], w_fp[4], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1031 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1032( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1032 OF 1240 *** + // Wavefunction(s) for diagram number 1032 + // (none) + // Amplitude(s) for diagram number 1032 + VVV1_0( w_fp[56], w_fp[1], w_fp[19], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1032 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps,
5 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1033( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1033 OF 1240 *** + // Wavefunction(s) for diagram number 1033 + // (none) + // Amplitude(s) for diagram number 1033 + VVVV1_0( w_fp[1], w_fp[4], w_fp[29], w_fp[56], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + VVVV3_0( w_fp[1], w_fp[4], w_fp[29], w_fp[56], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + VVVV4_0( w_fp[1], w_fp[4], w_fp[29], w_fp[56], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1034( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1034 OF 1240 *** + // Wavefunction(s) for diagram number 1034 + // (none) + // Amplitude(s) for diagram number 1034 + VVV1_0( w_fp[96], w_fp[108], w_fp[4], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1034 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; 
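+ // (the MGONGPU_SUPPORTS_MULTICHANNEL block above implements single-diagram enhancement: |amp|^2 of this diagram feeds the SDE numerator only when channelId selects diagram 1034, and the denominator whenever channelId is non-zero)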
+ J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1035( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1035 OF 1240 *** + // Wavefunction(s) for diagram number 1035 + // (none) + // Amplitude(s) for diagram number 1035 + VVV1_0( w_fp[96], w_fp[1], w_fp[13], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1035 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1036( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1036 OF 1240 *** + // Wavefunction(s) for 
diagram number 1036 + // (none) + // Amplitude(s) for diagram number 1036 + VVVV1_0( w_fp[1], w_fp[8], w_fp[4], w_fp[96], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + VVVV3_0( w_fp[1], w_fp[8], w_fp[4], w_fp[96], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + VVVV4_0( w_fp[1], w_fp[8], w_fp[4], w_fp[96], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1037( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent 
COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1037 OF 1240 *** + // Wavefunction(s) for diagram number 1037 + // (none) + // Amplitude(s) for diagram number 1037 + VVV1_0( w_fp[0], w_fp[108], w_fp[19], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1037 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1038( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1038 OF 1240 *** + // Wavefunction(s) for diagram number 1038 + // (none) + // Amplitude(s) for diagram number 1038 + VVV1_0( w_fp[0], w_fp[67], w_fp[13], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1038 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + 
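// (note that channelId is the 1-based diagram number, cf. '1 to #diagrams, 0 to disable SDE' in the signature comment, so channelId == 1038 selects this diagram) +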
J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1039( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1039 OF 1240 *** + // Wavefunction(s) for diagram number 1039 + // (none) + // Amplitude(s) for diagram number 1039 + VVV1_0( w_fp[4], w_fp[29], w_fp[11], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + VVV1_0( w_fp[4], w_fp[29], w_fp[42], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + VVV1_0( w_fp[4], w_fp[29], w_fp[76], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1040( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1040 OF 1240 *** + // Wavefunction(s) for diagram number 1040 + VVVV1P0_1( w_fp[0], w_fp[1], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[76] ); + VVVV3P0_1( w_fp[0], w_fp[1], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[42] ); + VVVV4P0_1( w_fp[0], w_fp[1], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[11] ); + // Amplitude(s) for diagram number 1040 + VVV1_0( w_fp[8], w_fp[4], w_fp[76], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0]; + 
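// (same VVV contraction against the second quartic-vertex colour structure, the VVVV3P0_1 output w_fp[42]) +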
VVV1_0( w_fp[8], w_fp[4], w_fp[42], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[4], w_fp[11], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1041( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1041 OF 1240 *** + // Wavefunction(s) for diagram number 1041 + // (none) + // Amplitude(s) for diagram number 1041 + VVV1_0( w_fp[1], w_fp[29], w_fp[107], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + VVV1_0( w_fp[1], w_fp[29], w_fp[95], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + VVV1_0( w_fp[1], w_fp[29], w_fp[105], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1042( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include 
"diagram_boilerplate.h" + // *** DIAGRAM 1042 OF 1240 *** + // Wavefunction(s) for diagram number 1042 + // (none) + // Amplitude(s) for diagram number 1042 + VVV1_0( w_fp[1], w_fp[8], w_fp[87], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + VVV1_0( w_fp[1], w_fp[8], w_fp[34], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + VVV1_0( w_fp[1], w_fp[8], w_fp[86], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1043( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** 
COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1043 OF 1240 ***
+    // Wavefunction(s) for diagram number 1043
+    // (none)
+    // Amplitude(s) for diagram number 1043
+    VVVV1_0( w_fp[0], w_fp[1], w_fp[8], w_fp[30], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    VVVV3_0( w_fp[0], w_fp[1], w_fp[8], w_fp[30], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    VVVV4_0( w_fp[0], w_fp[1], w_fp[8], w_fp[30], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+    VVVV1_0( w_fp[0], w_fp[1], w_fp[8], w_fp[31], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    VVVV3_0( w_fp[0], w_fp[1], w_fp[8], w_fp[31], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    VVVV4_0( w_fp[0], w_fp[1], w_fp[8], w_fp[31], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    VVVV1_0( w_fp[0], w_fp[1], w_fp[8], w_fp[32], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+    VVVV3_0( w_fp[0], w_fp[1], w_fp[8], w_fp[32], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+    VVVV4_0( w_fp[0], w_fp[1], w_fp[8], w_fp[32], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
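Every diagramNNNN kernel in this hunk shares the uniform signature above; only the HELAS calls and jamp updates differ. The channelId, numerators_sv, denominators_sv and amp_sv names used in the bodies are set up by the included diagram_boilerplate.h, which is not part of this hunk. As orientation only, a minimal sketch of what such a header could contain, assuming the conventions stated in the parameter comments (every name not appearing in the hunk is hypothetical):

    // Hypothetical sketch only - not the diagram_boilerplate.h shipped in this PR
    #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      // Derive a scalar channelId (SCALAR channelId[0] on C++, per-event values on GPU)
      const unsigned int channelId = channelIds[0];
      // Map the raw fptype* buffers onto the vector views used by the kernel body
      fptype_sv& numerators_sv = *reinterpret_cast<fptype_sv*>( numerators );
      fptype_sv& denominators_sv = *reinterpret_cast<fptype_sv*>( denominators );
    #else
      // Uniform interface: the three multichannel pointers must be nullptr here
      assert( channelIds == nullptr );
      assert( numerators == nullptr );
      assert( denominators == nullptr );
    #endif
    // Local amplitude buffer: HELAS calls write into amp_fp, the body reads amp_sv
    // (the w_fp[] views onto wfs would be derived here as well)
    fptype amp_fp[2 * neppV];
    cxtype_sv* amp_sv = reinterpret_cast<cxtype_sv*>( amp_fp );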
+  __global__ void
+  diagram1044( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1044 OF 1240 ***
+    // Wavefunction(s) for diagram number 1044
+    // (none)
+    // Amplitude(s) for diagram number 1044
+    VVV1_0( w_fp[1], w_fp[30], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    VVV1_0( w_fp[1], w_fp[31], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    VVV1_0( w_fp[1], w_fp[32], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1045( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1045 OF 1240 ***
+    // Wavefunction(s) for diagram number 1045
+    // (none)
+    // Amplitude(s) for diagram number 1045
+    VVV1_0( w_fp[1], w_fp[8], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+    VVV1_0( w_fp[1], w_fp[8], w_fp[88], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    VVV1_0( w_fp[1], w_fp[8], w_fp[106], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1046( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1046 OF 1240 ***
+    // Wavefunction(s) for diagram number 1046
+    // (none)
+    // Amplitude(s) for diagram number 1046
+    FFV1_0( w_fp[58], w_fp[114], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1046 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1047( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1047 OF 1240 ***
+    // Wavefunction(s) for diagram number 1047
+    // (none)
+    // Amplitude(s) for diagram number 1047
+    FFV1_0( w_fp[48], w_fp[114], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1047 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1048( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1048 OF 1240 ***
+    // Wavefunction(s) for diagram number 1048
+    // (none)
+    // Amplitude(s) for diagram number 1048
+    FFV1_0( w_fp[104], w_fp[100], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1048 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1049( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1049 OF 1240 ***
+    // Wavefunction(s) for diagram number 1049
+    // (none)
+    // Amplitude(s) for diagram number 1049
+    FFV1_0( w_fp[104], w_fp[36], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1049 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
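For single-amplitude kernels such as diagrams 1046-1049 above, the multichannel bookkeeping reduces to two accumulations: |amp|^2 is added to the numerator only when this diagram is the selected channel, and to the denominator whenever single-diagram enhancement (SDE) is enabled (channelId != 0); the SDE weight applied downstream is their ratio. A self-contained toy illustration of the same accumulation pattern in plain C++ (std::complex stands in for cxtype; all values are made up):

    #include <cassert>
    #include <complex>
    #include <iostream>
    #include <vector>

    // Toy model of the SDE accumulation seen in diagram1046-1049:
    // numerators pick up |amp|^2 only for the selected channel,
    // denominators pick it up for every channel when SDE is enabled.
    int main()
    {
      const unsigned int channelId = 1047; // selected channel (0 would disable SDE)
      const std::vector<unsigned int> diagrams = { 1046, 1047, 1048, 1049 };
      const std::vector<std::complex<double>> amps = { { 0.3, 0.1 }, { 1.2, -0.4 }, { 0.2, 0.0 }, { -0.5, 0.5 } };
      double numerator = 0, denominator = 0;
      for( size_t i = 0; i < diagrams.size(); ++i )
      {
        const double abs2 = std::norm( amps[i] ); // cxabs2 equivalent: |amp|^2
        if( channelId == diagrams[i] ) numerator += abs2;
        if( channelId != 0 ) denominator += abs2;
      }
      assert( denominator > 0 );
      std::cout << "SDE weight for channel " << channelId << ": " << numerator / denominator << std::endl;
      return 0;
    }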
+  __global__ void
+  diagram1050( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1050 OF 1240 ***
+    // Wavefunction(s) for diagram number 1050
+    // (none)
+    // Amplitude(s) for diagram number 1050
+    FFV1_0( w_fp[48], w_fp[100], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1050 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1051( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1051 OF 1240 ***
+    // Wavefunction(s) for diagram number 1051
+    // (none)
+    // Amplitude(s) for diagram number 1051
+    FFV1_0( w_fp[58], w_fp[36], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1051 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1052( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1052 OF 1240 ***
+    // Wavefunction(s) for diagram number 1052
+    // (none)
+    // Amplitude(s) for diagram number 1052
+    FFV1_0( w_fp[60], w_fp[114], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1052 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1053( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1053 OF 1240 ***
+    // Wavefunction(s) for diagram number 1053
+    // (none)
+    // Amplitude(s) for diagram number 1053
+    FFV1_0( w_fp[40], w_fp[114], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1053 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1054( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1054 OF 1240 ***
+    // Wavefunction(s) for diagram number 1054
+    // (none)
+    // Amplitude(s) for diagram number 1054
+    FFV1_0( w_fp[62], w_fp[100], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1054 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1055( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1055 OF 1240 ***
+    // Wavefunction(s) for diagram number 1055
+    // (none)
+    // Amplitude(s) for diagram number 1055
+    FFV1_0( w_fp[62], w_fp[35], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1055 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1056( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1056 OF 1240 ***
+    // Wavefunction(s) for diagram number 1056
+    // (none)
+    // Amplitude(s) for diagram number 1056
+    FFV1_0( w_fp[40], w_fp[100], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1056 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1057( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1057 OF 1240 ***
+    // Wavefunction(s) for diagram number 1057
+    // (none)
+    // Amplitude(s) for diagram number 1057
+    FFV1_0( w_fp[60], w_fp[35], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1057 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1058( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1058 OF 1240 ***
+    // Wavefunction(s) for diagram number 1058
+    // (none)
+    // Amplitude(s) for diagram number 1058
+    FFV1_0( w_fp[3], w_fp[114], w_fp[67], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1058 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1059( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1059 OF 1240 ***
+    // Wavefunction(s) for diagram number 1059
+    // (none)
+    // Amplitude(s) for diagram number 1059
+    FFV1_0( w_fp[12], w_fp[114], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1059 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1060( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1060 OF 1240 ***
+    // Wavefunction(s) for diagram number 1060
+    // (none)
+    // Amplitude(s) for diagram number 1060
+    FFV1_0( w_fp[3], w_fp[100], w_fp[96], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1060 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1061( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1061 OF 1240 ***
+    // Wavefunction(s) for diagram number 1061
+    // (none)
+    // Amplitude(s) for diagram number 1061
+    VVV1_0( w_fp[96], w_fp[1], w_fp[37], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1061 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1062( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1062 OF 1240 ***
+    // Wavefunction(s) for diagram number 1062
+    // (none)
+    // Amplitude(s) for diagram number 1062
+    FFV1_0( w_fp[12], w_fp[100], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1062 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1063( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1063 OF 1240 ***
+    // Wavefunction(s) for diagram number 1063
+    // (none)
+    // Amplitude(s) for diagram number 1063
+    VVV1_0( w_fp[0], w_fp[67], w_fp[37], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1063 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1064( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1064 OF 1240 ***
+    // Wavefunction(s) for diagram number 1064
+    // (none)
+    // Amplitude(s) for diagram number 1064
+    FFV1_0( w_fp[3], w_fp[33], w_fp[76], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[33], w_fp[42], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[33], w_fp[11], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
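Diagrams 1058-1064 feed one amplitude into several color flows, some with a relative factor cxtype( 0, 1 ), i.e. the imaginary unit coming from the antisymmetric color structure of the triple-gluon vertex. The jamps accumulated here are not physical by themselves: elsewhere in the generated code (outside this hunk) they are contracted with the constant color matrix to give the squared matrix element, schematically |M|^2 = sum_ij conj(jamp_i) cf_ij jamp_j. A toy, self-contained version of that contraction with two flows (real processes of this size carry 120 flows, matching the jamp indices above; the cf entries below are made up):

    #include <array>
    #include <complex>
    #include <iostream>

    // Toy color sum: |M|^2 = sum_ij conj(jamp[i]) * cf[i][j] * jamp[j] (real part)
    int main()
    {
      using cxtype = std::complex<double>;
      const cxtype I( 0, 1 );                      // the cxtype( 0, 1 ) factor seen above
      std::array<cxtype, 2> jamp{};                // two color flows, zero-initialized
      const cxtype amp( 0.8, -0.3 );               // one amplitude feeding both flows
      jamp[0] += amp;                              // cf. "jamps, 48 ) += amp_sv[0]"
      jamp[1] -= I * amp;                          // cf. "jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0]"
      const double cf[2][2] = { { 16, -2 }, { -2, 16 } }; // illustrative color-matrix entries
      double me2 = 0;
      for( int i = 0; i < 2; ++i )
        for( int j = 0; j < 2; ++j )
          me2 += std::real( std::conj( jamp[i] ) * cf[i][j] * jamp[j] );
      std::cout << "|M|^2 (toy) = " << me2 << std::endl;
      return 0;
    }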
+  __global__ void
+  diagram1065( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1065 OF 1240 ***
+    // Wavefunction(s) for diagram number 1065
+    // (none)
+    // Amplitude(s) for diagram number 1065
+    FFV1_0( w_fp[78], w_fp[102], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1065 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1066( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1066 OF 1240 ***
+    // Wavefunction(s) for diagram number 1066
+    // (none)
+    // Amplitude(s) for diagram number 1066
+    FFV1_0( w_fp[53], w_fp[102], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1066 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1067( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1067 OF 1240 ***
+    // Wavefunction(s) for diagram number 1067
+    // (none)
+    // Amplitude(s) for diagram number 1067
+    FFV1_0( w_fp[99], w_fp[89], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1067 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1068( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1068 OF 1240 ***
+    // Wavefunction(s) for diagram number 1068
+    // (none)
+    // Amplitude(s) for diagram number 1068
+    FFV1_0( w_fp[99], w_fp[44], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1068 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1069( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1069 OF 1240 ***
+    // Wavefunction(s) for diagram number 1069
+    // (none)
+    // Amplitude(s) for diagram number 1069
+    FFV1_0( w_fp[53], w_fp[89], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1069 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1070( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1070 OF 1240 ***
+    // Wavefunction(s) for diagram number 1070
+    // (none)
+    // Amplitude(s) for diagram number 1070
+    FFV1_0( w_fp[78], w_fp[44], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1070 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1071( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1071 OF 1240 ***
+    // Wavefunction(s) for diagram number 1071
+    // (none)
+    // Amplitude(s) for diagram number 1071
+    FFV1_0( w_fp[60], w_fp[102], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1071 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1072 OF 1240 ***
+    // Wavefunction(s) for diagram number 1072
+    // (none)
+    // Amplitude(s) for diagram number 1072
+    FFV1_0( w_fp[28], w_fp[102], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1072 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1073( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1073 OF 1240 ***
+    // Wavefunction(s) for diagram number 1073
+    // (none)
+    // Amplitude(s) for diagram number 1073
+    FFV1_0( w_fp[62], w_fp[89], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1073 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1074( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1074 OF 1240 ***
+    // Wavefunction(s) for diagram number 1074
+    // (none)
+    // Amplitude(s) for diagram number 1074
+    FFV1_0( w_fp[62], w_fp[43], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1074 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1075( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1075 OF 1240 ***
+    // Wavefunction(s) for diagram number 1075
+    // (none)
+    // Amplitude(s) for diagram number 1075
+    FFV1_0( w_fp[28], w_fp[89], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1075 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1076( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1076 OF 1240 ***
+    // Wavefunction(s) for diagram number 1076
+    // (none)
+    // Amplitude(s) for diagram number 1076
+    FFV1_0( w_fp[60], w_fp[43], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1076 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1077( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1077 OF 1240 ***
+    // Wavefunction(s) for diagram number 1077
+    // (none)
+    // Amplitude(s) for diagram number 1077
+    FFV1_0( w_fp[3], w_fp[102], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1077 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1078( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1078 OF 1240 ***
+    // Wavefunction(s) for diagram number 1078
+    // (none)
+    // Amplitude(s) for diagram number 1078
+    FFV1_0( w_fp[14], w_fp[102], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1078 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
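Every diagramXXX kernel in this hunk repeats the same multichannel bookkeeping: the squared amplitude of the diagram matching the selected channel is added to the numerator, and every diagram's squared amplitude to the denominator (unless channelId is 0, which disables single-diagram enhancement). A minimal standalone C++ sketch of that accumulation follows; cxabs2 is reimplemented here and all input values are invented for illustration, so this is not the plugin's actual code.

  #include <complex>
  #include <cstdio>
  #include <vector>

  using cxtype = std::complex<double>;

  // Same role as cxabs2 in the kernels: |c|^2 without the sqrt of std::abs.
  inline double cxabs2( const cxtype& c ) { return c.real() * c.real() + c.imag() * c.imag(); }

  int main()
  {
    // Hypothetical per-diagram amplitudes for one event and one helicity.
    const std::vector<cxtype> amps = { { 0.3, -0.1 }, { 0.05, 0.2 }, { -0.4, 0.0 } };
    const unsigned int channelId = 2; // 1-based diagram choice; 0 would disable SDE
    double numerator = 0., denominator = 0.;
    for( unsigned int idiag = 1; idiag <= amps.size(); idiag++ )
    {
      const double a2 = cxabs2( amps[idiag - 1] );
      if( channelId == idiag ) numerator += a2; // only the selected channel
      if( channelId != 0 ) denominator += a2;   // every diagram (SDE enabled)
    }
    printf( "channel %u share: %f\n", channelId, numerator / denominator );
    return 0;
  }

The numerator/denominator ratio is what downstream code can use to single out one diagram's share of the full |M|^2 for this event.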
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1079( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1079 OF 1240 ***
+    // Wavefunction(s) for diagram number 1079
+    // (none)
+    // Amplitude(s) for diagram number 1079
+    FFV1_0( w_fp[3], w_fp[89], w_fp[101], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1079 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1080( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1080 OF 1240 ***
+    // Wavefunction(s) for diagram number 1080
+    // (none)
+    // Amplitude(s) for diagram number 1080
+    VVV1_0( w_fp[101], w_fp[1], w_fp[45], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1080 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1081( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1081 OF 1240 ***
+    // Wavefunction(s) for diagram number 1081
+    // (none)
+    // Amplitude(s) for diagram number 1081
+    FFV1_0( w_fp[14], w_fp[89], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1081 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1082( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1082 OF 1240 ***
+    // Wavefunction(s) for diagram number 1082
+    // (none)
+    // Amplitude(s) for diagram number 1082
+    VVV1_0( w_fp[0], w_fp[68], w_fp[45], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1082 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1083( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1083 OF 1240 ***
+    // Wavefunction(s) for diagram number 1083
+    // (none)
+    // Amplitude(s) for diagram number 1083
+    FFV1_0( w_fp[3], w_fp[39], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[39], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[39], w_fp[111], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1084( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1084 OF 1240 ***
+    // Wavefunction(s) for diagram number 1084
+    // (none)
+    // Amplitude(s) for diagram number 1084
+    FFV1_0( w_fp[78], w_fp[113], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1084 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1085( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1085 OF 1240 ***
+    // Wavefunction(s) for diagram number 1085
+    // (none)
+    // Amplitude(s) for diagram number 1085
+    FFV1_0( w_fp[7], w_fp[113], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1085 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1086( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1086 OF 1240 ***
+    // Wavefunction(s) for diagram number 1086
+    // (none)
+    // Amplitude(s) for diagram number 1086
+    FFV1_0( w_fp[99], w_fp[91], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1086 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1087( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1087 OF 1240 ***
+    // Wavefunction(s) for diagram number 1087
+    // (none)
+    // Amplitude(s) for diagram number 1087
+    FFV1_0( w_fp[99], w_fp[50], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1087 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1088( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1088 OF 1240 ***
+    // Wavefunction(s) for diagram number 1088
+    // (none)
+    // Amplitude(s) for diagram number 1088
+    FFV1_0( w_fp[7], w_fp[91], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1088 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1089( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1089 OF 1240 ***
+    // Wavefunction(s) for diagram number 1089
+    // (none)
+    // Amplitude(s) for diagram number 1089
+    FFV1_0( w_fp[78], w_fp[50], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1089 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1090( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1090 OF 1240 ***
+    // Wavefunction(s) for diagram number 1090
+    // (none)
+    // Amplitude(s) for diagram number 1090
+    FFV1_0( w_fp[58], w_fp[113], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1090 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1091( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1091 OF 1240 ***
+    // Wavefunction(s) for diagram number 1091
+    // (none)
+    // Amplitude(s) for diagram number 1091
+    FFV1_0( w_fp[25], w_fp[113], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1091 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1092( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1092 OF 1240 ***
+    // Wavefunction(s) for diagram number 1092
+    // (none)
+    // Amplitude(s) for diagram number 1092
+    FFV1_0( w_fp[104], w_fp[91], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1092 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1093( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1093 OF 1240 ***
+    // Wavefunction(s) for diagram number 1093
+    // (none)
+    // Amplitude(s) for diagram number 1093
+    FFV1_0( w_fp[104], w_fp[49], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1093 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1094( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1094 OF 1240 ***
+    // Wavefunction(s) for diagram number 1094
+    // (none)
+    // Amplitude(s) for diagram number 1094
+    FFV1_0( w_fp[25], w_fp[91], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1094 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1095( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1095 OF 1240 ***
+    // Wavefunction(s) for diagram number 1095
+    // (none)
+    // Amplitude(s) for diagram number 1095
+    FFV1_0( w_fp[58], w_fp[49], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1095 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1096( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1096 OF 1240 ***
+    // Wavefunction(s) for diagram number 1096
+    // (none)
+    // Amplitude(s) for diagram number 1096
+    FFV1_0( w_fp[3], w_fp[113], w_fp[59], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1096 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1097( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1097 OF 1240 ***
+    // Wavefunction(s) for diagram number 1097
+    // (none)
+    // Amplitude(s) for diagram number 1097
+    FFV1_0( w_fp[26], w_fp[113], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1097 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
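The other half of each kernel is the colour-flow update: the diagram's amplitude is added, with a sign and sometimes a factor i = cxtype( 0, 1 ), into the jamp entries for the colour flows it feeds. Below is a scalar stand-in for that pattern, assuming J_ACCESS::kernelAccessIcol simply yields a mutable reference to colour flow icol for the event in flight; the real accessor indexes the SOA buffer jamps[ncolor*2*nevtORneppV], so this is an illustration, not the plugin's J_ACCESS class.

  #include <complex>
  #include <cstdio>
  #include <vector>

  using cxtype = std::complex<double>;

  // Hypothetical scalar analogue of J_ACCESS::kernelAccessIcol: a mutable
  // reference to colour flow icol for the (single) event in flight.
  inline cxtype& kernelAccessIcol( std::vector<cxtype>& jamps, int icol ) { return jamps[icol]; }

  int main()
  {
    std::vector<cxtype> jamps( 120 ); // one event's colour flows (size invented)
    const cxtype amp( 0.7, -0.2 );    // one diagram's amplitude, i.e. amp_sv[0]
    // Mirrors e.g. diagram1097: jamp 96 gets +i*amp, jamp 97 gets -i*amp.
    kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp;
    kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp;
    printf( "jamp[96] = (%f, %f)\n", jamps[96].real(), jamps[96].imag() );
    return 0;
  }

The signed, sometimes imaginary coefficients are the colour-basis weights of each diagram; summing all diagrams into jamps before squaring against the colour matrix is what keeps these kernels independent of one another.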
+ //-------------------------------------------------------------------------- + + __global__ void + diagram1098( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1098 OF 1240 *** + // Wavefunction(s) for diagram number 1098 + // (none) + // Amplitude(s) for diagram number 1098 + FFV1_0( w_fp[3], w_fp[91], w_fp[98], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1098 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1099( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1099 OF 1240 *** + // Wavefunction(s) for diagram number 1099 + // (none) + // Amplitude(s) for diagram number 1099 + VVV1_0( w_fp[98], w_fp[1], w_fp[51], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1099 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 
0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1100( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1100 OF 1240 *** + // Wavefunction(s) for diagram number 1100 + // (none) + // Amplitude(s) for diagram number 1100 + FFV1_0( w_fp[26], w_fp[91], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1100 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1101( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1101 OF 1240 *** + // Wavefunction(s) for diagram number 1101 + // (none) + // Amplitude(s) for diagram number 1101 + VVV1_0( w_fp[0], w_fp[59], w_fp[51], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1101 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1102( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1102 OF 1240 *** + // Wavefunction(s) for diagram number 1102 + // (none) + // Amplitude(s) for diagram number 1102 + FFV1_0( w_fp[3], w_fp[47], w_fp[97], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[47], w_fp[71], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[47], w_fp[21], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + 
diagram1103( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1103 OF 1240 *** + // Wavefunction(s) for diagram number 1103 + // (none) + // Amplitude(s) for diagram number 1103 + FFV1_0( w_fp[99], w_fp[2], w_fp[67], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1103 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1104( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1104 OF 1240 *** + // Wavefunction(s) for diagram number 1104 + // (none) + // Amplitude(s) for diagram number 1104 + FFV1_0( w_fp[99], w_fp[18], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1104 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1105( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) 
+#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1105 OF 1240 *** + // Wavefunction(s) for diagram number 1105 + // (none) + // Amplitude(s) for diagram number 1105 + FFV1_0( w_fp[78], w_fp[2], w_fp[96], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1105 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1106( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1106 OF 1240 *** + // Wavefunction(s) for diagram number 1106 + // (none) + // Amplitude(s) for diagram number 1106 + VVV1_0( w_fp[96], w_fp[1], w_fp[54], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1106 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1107( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // 
output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1107 OF 1240 *** + // Wavefunction(s) for diagram number 1107 + // (none) + // Amplitude(s) for diagram number 1107 + FFV1_0( w_fp[78], w_fp[18], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1107 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1108( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1108 OF 1240 *** + // Wavefunction(s) for diagram number 1108 + // (none) + // Amplitude(s) for diagram number 1108 + VVV1_0( w_fp[0], w_fp[67], w_fp[54], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1108 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1109( 
fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1109 OF 1240 *** + // Wavefunction(s) for diagram number 1109 + // (none) + // Amplitude(s) for diagram number 1109 + FFV1_0( w_fp[46], w_fp[2], w_fp[76], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[46], w_fp[2], w_fp[42], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[46], w_fp[2], w_fp[11], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1110( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const 
fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1110 OF 1240 *** + // Wavefunction(s) for diagram number 1110 + // (none) + // Amplitude(s) for diagram number 1110 + FFV1_0( w_fp[104], w_fp[2], w_fp[68], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1110 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1111( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1111 OF 1240 *** + // Wavefunction(s) for diagram number 1111 + // (none) + // Amplitude(s) for diagram number 1111 + FFV1_0( w_fp[104], w_fp[15], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1111 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1112( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], 
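
[Editorial note, not part of the patch] The paired statements `if( channelId == N ) numerators_sv += cxabs2( amp_sv[0] )` and `if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] )` seen in these kernels implement single-diagram enhancement: across all 1240 diagrams, the numerator collects |amp|^2 only for the diagram matching the sampled channel, while the denominator collects |amp|^2 for every diagram, so the ratio is the fraction of the squared amplitude carried by the chosen channel. A minimal standalone sketch of that accumulation, using toy amplitudes for a hypothetical three-diagram process and a local stand-in for the plugin's cxabs2 helper:

// Standalone illustration of the multichannel (SDE) accumulation pattern.
// Build with e.g. "g++ -std=c++17 sde_sketch.cc".
#include <complex>
#include <cstdio>
#include <vector>
typedef double fptype;
typedef std::complex<fptype> cxtype;
static fptype cxabs2( const cxtype& c ) { return std::norm( c ); } // |c|^2

int main()
{
  const unsigned int channelId = 2; // sampled channel (1-based); 0 disables SDE
  const std::vector<cxtype> amps = { { 1, 2 }, { 0, 1 }, { 3, 0 } }; // toy per-diagram amplitudes
  fptype numerator = 0, denominator = 0;
  for( unsigned int idiag = 1; idiag <= amps.size(); idiag++ )
  {
    if( channelId == idiag ) numerator += cxabs2( amps[idiag - 1] ); // selected diagram only
    if( channelId != 0 ) denominator += cxabs2( amps[idiag - 1] );   // every diagram
  }
  printf( "channel weight = %f\n", numerator / denominator ); // fraction of |amp|^2 in channel 2
  return 0;
}
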
add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 1112 OF 1240 ***
+ // Wavefunction(s) for diagram number 1112
+ // (none)
+ // Amplitude(s) for diagram number 1112
+ FFV1_0( w_fp[58], w_fp[2], w_fp[101], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 1112 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram1113( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 1113 OF 1240 ***
+ // Wavefunction(s) for diagram number 1113
+ // (none)
+ // Amplitude(s) for diagram number 1113
+ VVV1_0( w_fp[101], w_fp[1], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 1113 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram1114( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype*
numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1114 OF 1240 *** + // Wavefunction(s) for diagram number 1114 + // (none) + // Amplitude(s) for diagram number 1114 + FFV1_0( w_fp[58], w_fp[15], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1114 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1115( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1115 OF 1240 *** + // Wavefunction(s) for diagram number 1115 + // (none) + // Amplitude(s) for diagram number 1115 + VVV1_0( w_fp[0], w_fp[68], w_fp[23], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1115 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1116( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // 
input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1116 OF 1240 *** + // Wavefunction(s) for diagram number 1116 + // (none) + // Amplitude(s) for diagram number 1116 + FFV1_0( w_fp[38], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[38], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[38], w_fp[2], w_fp[111], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1117( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef 
MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1117 OF 1240 *** + // Wavefunction(s) for diagram number 1117 + // (none) + // Amplitude(s) for diagram number 1117 + FFV1_0( w_fp[62], w_fp[2], w_fp[59], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1117 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1118( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1118 OF 1240 *** + // Wavefunction(s) for diagram number 1118 + // (none) + // Amplitude(s) for diagram number 1118 + FFV1_0( w_fp[62], w_fp[17], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1118 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1119( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1119 OF 1240 *** + // Wavefunction(s) for diagram number 1119 + // (none) + // 
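
[Editorial note, not part of the patch] Each jamps entry updated in these kernels is one colour-flow amplitude, and the cxtype( 0, 1 ) factors are simply colour coefficients of ±i. Once all diagrams have been accumulated, the squared matrix element follows by contracting the jamp vector with the process colour matrix, |M|^2 = sum_ij conj(jamp_i) cf_ij jamp_j. A toy sketch of that contraction for a hypothetical two-flow case (the real process has many more colour flows, and the actual matrix is emitted elsewhere by the code generator):

// Standalone illustration of the colour contraction of jamp amplitudes.
// The 2x2 values below are toy numbers, not the real colour matrix.
#include <complex>
#include <cstdio>
typedef double fptype;
typedef std::complex<fptype> cxtype;

int main()
{
  constexpr int ncolor = 2;
  const cxtype jamp[ncolor] = { { 1., 0. }, { 0., 1. } };         // toy colour-flow amplitudes
  const fptype cf[ncolor][ncolor] = { { 2., -1. }, { -1., 2. } }; // toy (symmetric) colour matrix
  fptype me2 = 0;
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ )
      me2 += std::real( std::conj( jamp[i] ) * cf[i][j] * jamp[j] );
  printf( "|M|^2 (toy) = %f\n", me2 );
  return 0;
}
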
Amplitude(s) for diagram number 1119 + FFV1_0( w_fp[60], w_fp[2], w_fp[98], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1119 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1120( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1120 OF 1240 *** + // Wavefunction(s) for diagram number 1120 + // (none) + // Amplitude(s) for diagram number 1120 + VVV1_0( w_fp[98], w_fp[1], w_fp[20], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1120 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1121( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three 
pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1121 OF 1240 *** + // Wavefunction(s) for diagram number 1121 + // (none) + // Amplitude(s) for diagram number 1121 + FFV1_0( w_fp[60], w_fp[17], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1121 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1122( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1122 OF 1240 *** + // Wavefunction(s) for diagram number 1122 + // (none) + // Amplitude(s) for diagram number 1122 + VVV1_0( w_fp[0], w_fp[59], w_fp[20], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1122 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1123( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef 
MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1123 OF 1240 *** + // Wavefunction(s) for diagram number 1123 + // (none) + // Amplitude(s) for diagram number 1123 + FFV1_0( w_fp[41], w_fp[2], w_fp[97], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[41], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[41], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1124( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1124 OF 1240 *** + // Wavefunction(s) for diagram number 1124 + VVVV1P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[21] ); + VVVV3P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., 
w_fp[71] ); + VVVV4P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[97] ); + // Amplitude(s) for diagram number 1124 + VVVV1_0( w_fp[21], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + VVVV3_0( w_fp[21], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + VVVV4_0( w_fp[21], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + VVVV1_0( w_fp[71], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + VVVV3_0( w_fp[71], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + VVVV4_0( w_fp[71], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + VVVV1_0( w_fp[97], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + VVVV3_0( w_fp[97], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) 
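
[Editorial note, not part of the patch] The three VVVVnP0_1 calls under diagram 1124 write internal wavefunctions into slots w_fp[21], w_fp[71] and w_fp[97], which the subsequent diagrams then reuse without recomputation until the slots are overwritten. The signature comment wavefunctions[nwf*2*nw6*nevtORneppV] describes the flat buffer behind those slots; the sketch below shows one plausible flat-indexing convention, with nw6 and neppV values chosen for illustration (the exact ordering in the plugin's memory-access classes may differ):

// Standalone illustration of flat indexing into a wavefunction buffer laid
// out as [nwf][nw6][2][neppV] (assumed ordering).
#include <cstddef>
#include <cstdio>

constexpr int nw6 = 6;   // complex components per wavefunction
constexpr int neppV = 4; // events per SIMD page (assumed)

// Flat index of the real (ireim = 0) or imaginary (ireim = 1) part of
// component iw6 of wavefunction slot iwf, for event ievt within the page.
inline std::size_t wfsIndex( int iwf, int iw6, int ireim, int ievt )
{
  return ( ( (std::size_t)iwf * nw6 + iw6 ) * 2 + ireim ) * neppV + ievt;
}

int main()
{
  // e.g. slot w_fp[21]: real part of component 0 for the first event in the page
  printf( "flat index = %zu\n", wfsIndex( 21, 0, 0, 0 ) );
  return 0;
}
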
+= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + VVVV4_0( w_fp[97], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1125( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1125 OF 1240 *** + // Wavefunction(s) for diagram number 1125 + VVV1P0_1( w_fp[21], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[59] ); + VVV1P0_1( w_fp[71], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[20] ); + VVV1P0_1( w_fp[97], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[60] ); + // Amplitude(s) for diagram number 1125 + VVV1_0( w_fp[8], w_fp[6], w_fp[59], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[6], w_fp[20], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[6], w_fp[60], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1126( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all 
nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1126 OF 1240 *** + // Wavefunction(s) for diagram number 1126 + VVV1P0_1( w_fp[21], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[17] ); + VVV1P0_1( w_fp[71], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[98] ); + VVV1P0_1( w_fp[97], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[111] ); + // Amplitude(s) for diagram number 1126 + VVV1_0( w_fp[8], w_fp[5], w_fp[17], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[5], w_fp[98], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[5], w_fp[111], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1127( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: 
channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1127 OF 1240 *** + // Wavefunction(s) for diagram number 1127 + // (none) + // Amplitude(s) for diagram number 1127 + VVV1_0( w_fp[21], w_fp[8], w_fp[29], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + VVV1_0( w_fp[71], w_fp[8], w_fp[29], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + VVV1_0( w_fp[97], w_fp[8], w_fp[29], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + 
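
[Editorial note, not part of the patch] Every jamp update goes through J_ACCESS::kernelAccessIcol( jamps, icol ), which hides how the current event is located inside the jamps buffer: one event per GPU thread in CUDA builds, one SIMD page of events in C++ builds. Below is a deliberately simplified single-event stand-in for that accessor; the plugin's real memory-access class also handles the fptype-pair-to-complex view of the buffer and the per-thread or per-page event offset:

// Trivial stand-in for the kernel access pattern behind J_ACCESS.
#include <complex>
typedef double fptype;
typedef std::complex<fptype> cxtype;

struct KernelAccessJamps
{
  // Return a reference to colour-flow amplitude icol for the current event;
  // real builds would add a thread- or page-dependent event offset here.
  static cxtype& kernelAccessIcol( cxtype* jamps, int icol ) { return jamps[icol]; }
};

int main()
{
  cxtype jamps[120] = {}; // zero-initialised buffer, one entry per colour flow
  // same shape as the generated statements: jamp += (+/-i) * amp
  KernelAccessJamps::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * cxtype( 0.5, 0.25 );
  return 0;
}
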
J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1128( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1128 OF 1240 *** + // Wavefunction(s) for diagram number 1128 + FFV1_2( w_fp[3], w_fp[21], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); + FFV1_2( w_fp[3], w_fp[71], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[10] ); + FFV1_2( w_fp[3], w_fp[97], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[68] ); + // Amplitude(s) for diagram number 1128 + FFV1_0( w_fp[16], w_fp[39], w_fp[6], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0]; + FFV1_0( w_fp[10], w_fp[39], w_fp[6], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + FFV1_0( w_fp[68], w_fp[39], w_fp[6], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1129( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the 
boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 1129 OF 1240 ***
+ // Wavefunction(s) for diagram number 1129
+ // (none)
+ // Amplitude(s) for diagram number 1129
+ FFV1_0( w_fp[3], w_fp[39], w_fp[17], COUPs[1], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0];
+ FFV1_0( w_fp[3], w_fp[39], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+ FFV1_0( w_fp[3], w_fp[39], w_fp[111], COUPs[1], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram1130( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 1130 OF 1240 ***
+ // Wavefunction(s) for diagram number 1130
+ // (none)
+ // Amplitude(s) for diagram number 1130
+ FFV1_0( w_fp[41], w_fp[39], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+
J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + FFV1_0( w_fp[41], w_fp[39], w_fp[71], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0]; + FFV1_0( w_fp[41], w_fp[39], w_fp[97], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1131( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1131 OF 1240 *** + // Wavefunction(s) for diagram number 1131 + // (none) + // Amplitude(s) for diagram number 1131 + FFV1_0( w_fp[16], w_fp[47], w_fp[5], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + FFV1_0( w_fp[10], w_fp[47], w_fp[5], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + FFV1_0( w_fp[68], w_fp[47], w_fp[5], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1132( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* 
+  __global__ void
+  diagram1132( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1132 OF 1240 ***
+    // Wavefunction(s) for diagram number 1132
+    // (none)
+    // Amplitude(s) for diagram number 1132
+    FFV1_0( w_fp[3], w_fp[47], w_fp[59], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[47], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[47], w_fp[60], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1133( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
"diagram_boilerplate.h" + // *** DIAGRAM 1133 OF 1240 *** + // Wavefunction(s) for diagram number 1133 + // (none) + // Amplitude(s) for diagram number 1133 + FFV1_0( w_fp[38], w_fp[47], w_fp[21], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + FFV1_0( w_fp[38], w_fp[47], w_fp[71], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0]; + FFV1_0( w_fp[38], w_fp[47], w_fp[97], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1134( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1134 OF 1240 *** + // Wavefunction(s) for diagram number 1134 + FFV1_1( w_fp[2], w_fp[21], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] ); + FFV1_1( w_fp[2], w_fp[71], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] ); + FFV1_1( w_fp[2], w_fp[97], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[71] ); + // Amplitude(s) for diagram number 1134 + FFV1_0( w_fp[38], w_fp[23], w_fp[6], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0]; + FFV1_0( w_fp[38], w_fp[21], w_fp[6], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + FFV1_0( w_fp[38], w_fp[71], w_fp[6], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1135( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output 
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1135 OF 1240 ***
+    // Wavefunction(s) for diagram number 1135
+    // (none)
+    // Amplitude(s) for diagram number 1135
+    FFV1_0( w_fp[38], w_fp[2], w_fp[17], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[38], w_fp[2], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[38], w_fp[2], w_fp[111], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1136( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1136 OF 1240 ***
+    // Wavefunction(s) for diagram number 1136
+    // (none)
+    // Amplitude(s) for diagram number 1136
+    FFV1_0( w_fp[41], w_fp[23], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[21], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[71], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
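[Editorial note] Every accumulation above goes through J_ACCESS::kernelAccessIcol( jamps, icol ), whose implementation is not part of this diff. As a reading aid, here is a hypothetical scalar stand-in, assuming an [icol][re/im][ievt] structure-of-arrays layout consistent with the signature comment "jamps[ncolor*2*nevtORneppV]"; the plugin's real accessor and layout may differ.

  #include <complex>
  using fptype = double;
  // Hypothetical stand-in for the color-amplitude accessor (layout is an assumption)
  struct JampAccessorSketch
  {
    fptype* jamps; // color amplitude buffer for all events
    int nevt;      // number of events (or SIMD lanes) in the buffer
    int ievt;      // event handled by this kernel thread
    void add( int icol, const std::complex<fptype>& amp ) const
    {
      jamps[( icol * 2 + 0 ) * nevt + ievt] += amp.real(); // real part
      jamps[( icol * 2 + 1 ) * nevt + ievt] += amp.imag(); // imaginary part
    }
  };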
+  __global__ void
+  diagram1137( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1137 OF 1240 ***
+    // Wavefunction(s) for diagram number 1137
+    // (none)
+    // Amplitude(s) for diagram number 1137
+    FFV1_0( w_fp[41], w_fp[2], w_fp[59], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[2], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[2], w_fp[60], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1138( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1138 OF 1240 ***
+    // Wavefunction(s) for diagram number 1138
+    // (none)
+    // Amplitude(s) for diagram number 1138
+    FFV1_0( w_fp[3], w_fp[23], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[21], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[71], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1139( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1139 OF 1240 ***
+    // Wavefunction(s) for diagram number 1139
+    // (none)
+    // Amplitude(s) for diagram number 1139
+    FFV1_0( w_fp[16], w_fp[2], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[10], w_fp[2], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[68], w_fp[2], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1140( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1140 OF 1240 ***
+    // Wavefunction(s) for diagram number 1140
+    VVVV1P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[68] );
+    VVVV3P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[29] );
+    VVVV4P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[10] );
+    // Amplitude(s) for diagram number 1140
+    VVVV1_0( w_fp[68], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    VVVV3_0( w_fp[68], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+    VVVV4_0( w_fp[68], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+    VVVV1_0( w_fp[29], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    VVVV3_0( w_fp[29], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    VVVV4_0( w_fp[29], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    VVVV1_0( w_fp[10], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    VVVV3_0( w_fp[10], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+    VVVV4_0( w_fp[10], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
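[Editorial note] Many accumulations in these kernels multiply the amplitude by cxtype( 0, 1 ) before adding or subtracting it, i.e. they rotate it by the imaginary unit, with opposite signs on paired color flows. A tiny self-contained illustration (std::complex<double> stands in for the plugin's cxtype):

  #include <cassert>
  #include <complex>
  using cxtype = std::complex<double>;
  int main()
  {
    const cxtype amp( 0.5, -2.0 );
    cxtype jamp( 0., 0. );
    jamp += cxtype( 0., 1. ) * amp; // adds i*amp, i.e. ( 2.0, 0.5 )
    assert( jamp == cxtype( 2.0, 0.5 ) );
    jamp -= cxtype( 0., 1. ) * amp; // the opposite-sign color flow cancels it exactly
    assert( jamp == cxtype( 0., 0. ) );
    return 0;
  }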
+  __global__ void
+  diagram1141( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1141 OF 1240 ***
+    // Wavefunction(s) for diagram number 1141
+    VVV1P0_1( w_fp[68], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[16] );
+    VVV1P0_1( w_fp[29], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[71] );
+    VVV1P0_1( w_fp[10], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[21] );
+    // Amplitude(s) for diagram number 1141
+    VVV1_0( w_fp[8], w_fp[6], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[6], w_fp[71], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[6], w_fp[21], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1142( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1142 OF 1240 ***
+    // Wavefunction(s) for diagram number 1142
+    VVV1P0_1( w_fp[68], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[23] );
+    VVV1P0_1( w_fp[29], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[60] );
+    VVV1P0_1( w_fp[10], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[20] );
+    // Amplitude(s) for diagram number 1142
+    VVV1_0( w_fp[8], w_fp[4], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[4], w_fp[60], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[4], w_fp[20], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1143( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1143 OF 1240 ***
+    // Wavefunction(s) for diagram number 1143
+    // (none)
+    // Amplitude(s) for diagram number 1143
+    VVV1_0( w_fp[68], w_fp[8], w_fp[27], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    VVV1_0( w_fp[29], w_fp[8], w_fp[27], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    VVV1_0( w_fp[10], w_fp[8], w_fp[27], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1144( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1144 OF 1240 ***
+    // Wavefunction(s) for diagram number 1144
+    FFV1_2( w_fp[3], w_fp[68], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[59] );
+    FFV1_2( w_fp[3], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[111] );
+    FFV1_2( w_fp[3], w_fp[10], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[98] );
+    // Amplitude(s) for diagram number 1144
+    FFV1_0( w_fp[59], w_fp[33], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    FFV1_0( w_fp[111], w_fp[33], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+    FFV1_0( w_fp[98], w_fp[33], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1145( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1145 OF 1240 ***
+    // Wavefunction(s) for diagram number 1145
+    // (none)
+    // Amplitude(s) for diagram number 1145
+    FFV1_0( w_fp[3], w_fp[33], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[33], w_fp[60], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[33], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1146( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1146 OF 1240 ***
+    // Wavefunction(s) for diagram number 1146
+    // (none)
+    // Amplitude(s) for diagram number 1146
+    FFV1_0( w_fp[41], w_fp[33], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[33], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[33], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1147( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1147 OF 1240 ***
+    // Wavefunction(s) for diagram number 1147
+    // (none)
+    // Amplitude(s) for diagram number 1147
+    FFV1_0( w_fp[59], w_fp[47], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    FFV1_0( w_fp[111], w_fp[47], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    FFV1_0( w_fp[98], w_fp[47], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
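[Editorial note] Since every diagramXXXX kernel shares one uniform signature, a natural way for a caller to drive them is a table of function pointers executed in sequence over the same wfs/jamps buffers. The plugin's actual driver is not part of this diff; the following is a hypothetical C++-side sketch under that assumption, with all names other than the shared signature invented for illustration.

  using fptype = double;
  // Hypothetical per-diagram kernel signature, mirroring the C++ branch above
  using DiagramFn = void ( * )( fptype* wfs, fptype* jamps,
                                const unsigned int* channelIds,
                                const fptype** COUPs,
                                fptype* numerators, fptype* denominators );
  // Hypothetical driver: run all diagrams in order, accumulating into jamps
  void runDiagramsSketch( const DiagramFn* table, int ndiagrams,
                          fptype* wfs, fptype* jamps,
                          const unsigned int* channelIds, const fptype** COUPs,
                          fptype* numerators, fptype* denominators )
  {
    for( int idiag = 0; idiag < ndiagrams; ++idiag )
      table[idiag]( wfs, jamps, channelIds, COUPs, numerators, denominators );
  }

On the GPU branch each entry would instead be launched as a kernel over all events, which is presumably why the couplings argument switches from per-page COUPs to the all-event couplings array under MGONGPUCPP_GPUIMPL.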
+  __global__ void
+  diagram1148( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1148 OF 1240 ***
+    // Wavefunction(s) for diagram number 1148
+    // (none)
+    // Amplitude(s) for diagram number 1148
+    FFV1_0( w_fp[3], w_fp[47], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[47], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[47], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1149( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1149 OF 1240 ***
+    // Wavefunction(s) for diagram number 1149
+    // (none)
+    // Amplitude(s) for diagram number 1149
+    FFV1_0( w_fp[46], w_fp[47], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+    FFV1_0( w_fp[46], w_fp[47], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0];
+    FFV1_0( w_fp[46], w_fp[47], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1150( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
#ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1150 OF 1240 *** + // Wavefunction(s) for diagram number 1150 + FFV1_1( w_fp[2], w_fp[68], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[17] ); + FFV1_1( w_fp[2], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[68] ); + FFV1_1( w_fp[2], w_fp[10], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[29] ); + // Amplitude(s) for diagram number 1150 + FFV1_0( w_fp[46], w_fp[17], w_fp[6], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0]; + FFV1_0( w_fp[46], w_fp[68], w_fp[6], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + FFV1_0( w_fp[46], w_fp[29], w_fp[6], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1151( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1151 OF 1240 *** + // Wavefunction(s) for diagram number 1151 + // (none) + // Amplitude(s) for diagram number 1151 + FFV1_0( w_fp[46], w_fp[2], w_fp[23], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[46], w_fp[2], w_fp[60], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0]; + 
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[46], w_fp[2], w_fp[20], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1152( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1152 OF 1240 ***
+    // Wavefunction(s) for diagram number 1152
+    // (none)
+    // Amplitude(s) for diagram number 1152
+    FFV1_0( w_fp[41], w_fp[17], w_fp[4], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[68], w_fp[4], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[29], w_fp[4], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1153( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1153 OF 1240 ***
+    // Wavefunction(s) for diagram number 1153
+    // (none)
+    // Amplitude(s) for diagram number 1153
+    FFV1_0( w_fp[41], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1154( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1154 OF 1240 ***
+    // Wavefunction(s) for diagram number 1154
+    // (none)
+    // Amplitude(s) for diagram number 1154
+    FFV1_0( w_fp[3], w_fp[17], w_fp[27], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[68], w_fp[27], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[29], w_fp[27], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1155( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1155 OF 1240 ***
+    // Wavefunction(s) for diagram number 1155
+    // (none)
+    // Amplitude(s) for diagram number 1155
+    FFV1_0( w_fp[59], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[111], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[98], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1156( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1156 OF 1240 ***
+    // Wavefunction(s) for diagram number 1156
+    VVVV1P0_1( w_fp[0], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[98] );
+    VVVV3P0_1( w_fp[0], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[27] );
+    VVVV4P0_1( w_fp[0], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[111] );
+    // Amplitude(s) for diagram number 1156
+    VVVV1_0( w_fp[98], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    VVVV3_0( w_fp[98], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+    VVVV4_0( w_fp[98], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0];
+    VVVV1_0( w_fp[27], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    VVVV3_0( w_fp[27], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    VVVV4_0( w_fp[27], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    VVVV1_0( w_fp[111], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0];
+    VVVV3_0( w_fp[111], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    VVVV4_0( w_fp[111], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1157( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1157 OF 1240 ***
+    // Wavefunction(s) for diagram number 1157
+    VVV1P0_1( w_fp[98], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[59] );
+    VVV1P0_1( w_fp[27], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[29] );
+    VVV1P0_1( w_fp[111], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[68] );
+    // Amplitude(s) for diagram number 1157
+    VVV1_0( w_fp[8], w_fp[5], w_fp[59], COUPs[0], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[5], w_fp[29], COUPs[0], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[5], w_fp[68], COUPs[0], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1158( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1158 OF 1240 ***
+    // Wavefunction(s) for diagram number 1158
+    VVV1P0_1( w_fp[98], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[17] );
+    VVV1P0_1( w_fp[27], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[21] );
+    VVV1P0_1( w_fp[111], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[71] );
+    // Amplitude(s) for diagram number 1158
+    VVV1_0( w_fp[8], w_fp[4], w_fp[17], COUPs[0], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[4], w_fp[21], COUPs[0], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[4], w_fp[71], COUPs[0], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1159( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1159 OF 1240 ***
+    // Wavefunction(s) for diagram number 1159
+    // (none)
+    // Amplitude(s) for diagram number 1159
+    VVV1_0( w_fp[98], w_fp[8], w_fp[24], COUPs[0], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    VVV1_0( w_fp[27], w_fp[8], w_fp[24], COUPs[0], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    VVV1_0( w_fp[111], w_fp[8], w_fp[24], COUPs[0], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0];
+  }
+
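[Editor's note, not part of the patch] Every diagramNNNN kernel in this hunk follows the same pattern: a HELAS-style vertex call (FFV1_0, VVV1_0, VVVV1_0, ...) produces one complex amplitude amp_sv[0], which is then accumulated into a handful of color flows with a weight of +1, -1, +i or -i through J_ACCESS::kernelAccessIcol. A minimal self-contained C++ sketch of that accumulation step follows, using std::complex in place of the plugin's cxtype/fptype and a plain array in place of the real jamps accessor; addToColorFlows, the array size and the numeric values are all hypothetical illustrations, not plugin API.

#include <array>
#include <complex>
#include <cstdio>
#include <initializer_list>
#include <utility>

using cxtype = std::complex<double>; // stand-in for the plugin's complex type

// Add one diagram's amplitude into the listed color flows, each with its own weight.
// In the generated kernels this corresponds to a run of J_ACCESS::kernelAccessIcol lines.
void addToColorFlows( std::array<cxtype, 120>& jamps,
                      const cxtype& amp,
                      std::initializer_list<std::pair<int, cxtype>> terms )
{
  for( const auto& [icol, weight] : terms ) jamps[icol] += weight * amp;
}

int main()
{
  std::array<cxtype, 120> jamps{}; // one running sum per color flow (size is illustrative)
  const cxtype I( 0., 1. );
  const cxtype amp( 0.3, -0.7 ); // in the real code this comes from FFV1_0/VVV1_0/VVVV1_0
  addToColorFlows( jamps, amp, { { 4, 1. }, { 5, -1. } } );  // like the plain +=/-= amp_sv[0] blocks
  addToColorFlows( jamps, amp, { { 97, I }, { 100, -I } } ); // like the cxtype( 0, 1 ) * amp_sv[0] blocks
  std::printf( "jamp[4] = ( %g, %g )\n", jamps[4].real(), jamps[4].imag() );
  return 0;
}

Judging by the jamps[ncolor*2*nevtORneppV] layout documented in the kernel signatures, the real code stores separate real and imaginary parts per event (or per event page), which is presumably why it goes through the J_ACCESS kernel accessor rather than indexing a complex array directly.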
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1160( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1160 OF 1240 ***
+    // Wavefunction(s) for diagram number 1160
+    FFV1_2( w_fp[3], w_fp[98], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] );
+    FFV1_2( w_fp[3], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[20] );
+    FFV1_2( w_fp[3], w_fp[111], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[60] );
+    // Amplitude(s) for diagram number 1160
+    FFV1_0( w_fp[16], w_fp[33], w_fp[5], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    FFV1_0( w_fp[20], w_fp[33], w_fp[5], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    FFV1_0( w_fp[60], w_fp[33], w_fp[5], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1161( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1161 OF 1240 ***
+    // Wavefunction(s) for diagram number 1161
+    // (none)
+    // Amplitude(s) for diagram number 1161
+    FFV1_0( w_fp[3], w_fp[33], w_fp[17], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[33], w_fp[21], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[33], w_fp[71], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1162( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1162 OF 1240 ***
+    // Wavefunction(s) for diagram number 1162
+    // (none)
+    // Amplitude(s) for diagram number 1162
+    FFV1_0( w_fp[38], w_fp[33], w_fp[98], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    FFV1_0( w_fp[38], w_fp[33], w_fp[27], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+    FFV1_0( w_fp[38], w_fp[33], w_fp[111], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1163( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1163 OF 1240 ***
+    // Wavefunction(s) for diagram number 1163
+    // (none)
+    // Amplitude(s) for diagram number 1163
+    FFV1_0( w_fp[16], w_fp[39], w_fp[4], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    FFV1_0( w_fp[20], w_fp[39], w_fp[4], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    FFV1_0( w_fp[60], w_fp[39], w_fp[4], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1164( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1164 OF 1240 ***
+    // Wavefunction(s) for diagram number 1164
+    // (none)
+    // Amplitude(s) for diagram number 1164
+    FFV1_0( w_fp[3], w_fp[39], w_fp[59], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[39], w_fp[29], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[39], w_fp[68], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1165( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1165 OF 1240 ***
+    // Wavefunction(s) for diagram number 1165
+    // (none)
+    // Amplitude(s) for diagram number 1165
+    FFV1_0( w_fp[46], w_fp[39], w_fp[98], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    FFV1_0( w_fp[46], w_fp[39], w_fp[27], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0];
+    FFV1_0( w_fp[46], w_fp[39], w_fp[111], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1166( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1166 OF 1240 ***
+    // Wavefunction(s) for diagram number 1166
+    FFV1_1( w_fp[2], w_fp[98], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] );
+    FFV1_1( w_fp[2], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[98] );
+    FFV1_1( w_fp[2], w_fp[111], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[27] );
+    // Amplitude(s) for diagram number 1166
+    FFV1_0( w_fp[46], w_fp[23], w_fp[5], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0];
+    FFV1_0( w_fp[46], w_fp[98], w_fp[5], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    FFV1_0( w_fp[46], w_fp[27], w_fp[5], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1167( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1167 OF 1240 ***
+    // Wavefunction(s) for diagram number 1167
+    // (none)
+    // Amplitude(s) for diagram number 1167
+    FFV1_0( w_fp[46], w_fp[2], w_fp[17], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[46], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[46], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1168( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1168 OF 1240 ***
+    // Wavefunction(s) for diagram number 1168
+    // (none)
+    // Amplitude(s) for diagram number 1168
+    FFV1_0( w_fp[38], w_fp[23], w_fp[4], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+    FFV1_0( w_fp[38], w_fp[98], w_fp[4], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    FFV1_0( w_fp[38], w_fp[27], w_fp[4], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1169( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1169 OF 1240 ***
+    // Wavefunction(s) for diagram number 1169
+    // (none)
+    // Amplitude(s) for diagram number 1169
+    FFV1_0( w_fp[38], w_fp[2], w_fp[59], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[38], w_fp[2], w_fp[29], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[38], w_fp[2], w_fp[68], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1170( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1170 OF 1240 ***
+    // Wavefunction(s) for diagram number 1170
+    // (none)
+    // Amplitude(s) for diagram number 1170
+    FFV1_0( w_fp[3], w_fp[23], w_fp[24], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[98], w_fp[24], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[27], w_fp[24], COUPs[1], 1.0, &_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1171( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1171 OF 1240 *** + // Wavefunction(s) for diagram number 1171 + // (none) + // Amplitude(s) for diagram number 1171 + FFV1_0( w_fp[16], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[20], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[60], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + 
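Every generated diagramNNNN kernel in this hunk shares the uniform signature seen above and immediately includes diagram_boilerplate.h, which is not itself part of this hunk. As a minimal sketch only, assuming no more than what the generated comments state (this is not the plugin's actual header), the branch without MGONGPU_SUPPORTS_MULTICHANNEL could reduce to the advertised sanity check:

  // diagram_boilerplate.h : hypothetical sketch, not the real plugin header
  #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
  // Multichannel SDE enabled: channelIds selects this event's single-diagram
  // channel; numerators/denominators accumulate its weight for helicity ihel
  // (details omitted here).
  #else
  // Multichannel disabled: the uniform kernel interface is kept, but the
  // three multichannel arguments must not be used, so check they are null.
  assert( channelIds == nullptr );
  assert( numerators == nullptr );
  assert( denominators == nullptr );
  #endif

In the kernel bodies, each FFV1_0/VVV1_0/VVVV*_0 call evaluates one helicity amplitude into the amp_fp buffer (read back as amp_sv[0]), and the J_ACCESS::kernelAccessIcol lines then add it into the per-color jamps accumulators, either directly or multiplied by the imaginary unit via cxtype( 0, 1 ).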
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram1172( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 1172 OF 1240 ***
+ // Wavefunction(s) for diagram number 1172
+ VVVV1P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[60] );
+ VVVV3P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[24] );
+ VVVV4P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[20] );
+ FFV1_2( w_fp[3], w_fp[60], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] );
+ FFV1_2( w_fp[3], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[27] );
+ FFV1_2( w_fp[3], w_fp[20], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[98] );
+ // Amplitude(s) for diagram number 1172
+ FFV1_0( w_fp[16], w_fp[77], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+ FFV1_0( w_fp[27], w_fp[77], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+ FFV1_0( w_fp[98], w_fp[77], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram1173( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 1173 OF 1240 ***
+ // Wavefunction(s) for diagram number 1173
+ VVV1P0_1( w_fp[60], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[23] );
+ VVV1P0_1( w_fp[24], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[68] );
+ VVV1P0_1( w_fp[20], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[29] );
+ // Amplitude(s) for diagram number 1173
+ FFV1_0( w_fp[3], w_fp[77], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0];
+ FFV1_0( w_fp[3], w_fp[77], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0];
+ FFV1_0( w_fp[3], w_fp[77], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+ J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram1174( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 1174 OF 1240 ***
+ // Wavefunction(s) for diagram number 1174
+ // (none)
+ // Amplitude(s) for diagram number 1174
+ FFV1_0( w_fp[41], w_fp[77],
w_fp[60], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + FFV1_0( w_fp[41], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + FFV1_0( w_fp[41], w_fp[77], w_fp[20], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1175( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1175 OF 1240 *** + // Wavefunction(s) for diagram number 1175 + FFV1_1( w_fp[2], w_fp[60], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[59] ); + FFV1_1( w_fp[2], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[71] ); + FFV1_1( w_fp[2], w_fp[20], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] ); + // Amplitude(s) for diagram number 1175 + FFV1_0( w_fp[52], w_fp[59], w_fp[6], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + FFV1_0( w_fp[52], w_fp[71], w_fp[6], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + FFV1_0( w_fp[52], w_fp[21], w_fp[6], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1176( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* 
couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1176 OF 1240 *** + // Wavefunction(s) for diagram number 1176 + // (none) + // Amplitude(s) for diagram number 1176 + FFV1_0( w_fp[52], w_fp[2], w_fp[23], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[52], w_fp[2], w_fp[68], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[52], w_fp[2], w_fp[29], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1177( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + 
// A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1177 OF 1240 *** + // Wavefunction(s) for diagram number 1177 + // (none) + // Amplitude(s) for diagram number 1177 + FFV1_0( w_fp[52], w_fp[47], w_fp[60], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + FFV1_0( w_fp[52], w_fp[47], w_fp[24], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + FFV1_0( w_fp[52], w_fp[47], w_fp[20], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1178( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1178 OF 1240 *** + // Wavefunction(s) for diagram number 1178 + // (none) + // Amplitude(s) for diagram number 1178 + FFV1_0( w_fp[3], w_fp[59], w_fp[72], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[71], w_fp[72], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 
60 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[21], w_fp[72], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1179( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1179 OF 1240 *** + // Wavefunction(s) for diagram number 1179 + // (none) + // Amplitude(s) for diagram number 1179 + FFV1_0( w_fp[16], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[27], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[98], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1180( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1180 OF 1240 *** + // Wavefunction(s) for diagram number 1180 + // (none) + // Amplitude(s) for diagram number 1180 + VVV1_0( w_fp[60], w_fp[72], w_fp[8], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + VVV1_0( w_fp[24], w_fp[72], w_fp[8], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0]; + VVV1_0( w_fp[20], w_fp[72], w_fp[8], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1181( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1181 OF 1240 *** + // Wavefunction(s) for diagram number 1181 + // (none) + // Amplitude(s) for diagram number 1181 + VVVV1_0( w_fp[60], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + VVVV3_0( w_fp[60], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0]; + VVVV4_0( w_fp[60], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + VVVV1_0( w_fp[24], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + VVVV3_0( w_fp[24], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) 
-= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + VVVV4_0( w_fp[24], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + VVVV1_0( w_fp[20], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + VVVV3_0( w_fp[20], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + VVVV4_0( w_fp[20], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 
) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1182( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1182 OF 1240 *** + // Wavefunction(s) for diagram number 1182 + VVV1P0_1( w_fp[60], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[72] ); + VVV1P0_1( w_fp[24], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[60] ); + VVV1P0_1( w_fp[20], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[24] ); + // Amplitude(s) for diagram number 1182 + VVV1_0( w_fp[8], w_fp[6], w_fp[72], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[6], w_fp[60], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[6], w_fp[24], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1183( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1183 OF 1240 *** + // Wavefunction(s) for diagram number 1183 + // (none) + // Amplitude(s) for diagram number 1183 + VVV1_0( w_fp[1], w_fp[8], w_fp[23], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + VVV1_0( w_fp[1], w_fp[8], w_fp[68], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + VVV1_0( w_fp[1], w_fp[8], w_fp[29], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1184( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1184 OF 1240 *** + // Wavefunction(s) for diagram number 1184 + // (none) + // Amplitude(s) for diagram number 1184 + FFV1_0( w_fp[3], w_fp[47], w_fp[72], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[47], w_fp[60], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[47], w_fp[24], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1185( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1185 OF 1240 *** + // Wavefunction(s) for diagram number 1185 + // (none) + // Amplitude(s) for diagram number 1185 + FFV1_0( w_fp[16], w_fp[47], w_fp[1], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0]; + FFV1_0( w_fp[27], w_fp[47], w_fp[1], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + FFV1_0( w_fp[98], w_fp[47], w_fp[1], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ 
void + diagram1186( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1186 OF 1240 *** + // Wavefunction(s) for diagram number 1186 + // (none) + // Amplitude(s) for diagram number 1186 + FFV1_0( w_fp[41], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[41], w_fp[2], w_fp[60], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[41], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1187( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events 
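// Illustrative sketch (not from this patch): the #ifdef MGONGPUCPP_GPUIMPL split in these
// kernel signatures gives every diagramXXXX function two coupling interfaces. On GPU the
// kernel receives one dense array with the dependent couplings of all events; in C++ it
// receives an array of pointers, one per coupling, into the current event page. Assuming
// hypothetical buffers dependentCouplings/independentCouplings (names not taken from this
// patch), a C++ caller could fill COUPs along these lines:
//   const fptype* COUPs[nxcoup];
//   for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) // alphas-dependent couplings: per event page
//     COUPs[idcoup] = &dependentCouplings[( ipagV * ndcoup + idcoup ) * 2];
//   for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // independent couplings: shared by all events
//     COUPs[ndcoup + iicoup] = &independentCouplings[iicoup * 2];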
+#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1187 OF 1240 *** + // Wavefunction(s) for diagram number 1187 + // (none) + // Amplitude(s) for diagram number 1187 + FFV1_0( w_fp[41], w_fp[59], w_fp[1], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0]; + FFV1_0( w_fp[41], w_fp[71], w_fp[1], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + FFV1_0( w_fp[41], w_fp[21], w_fp[1], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1188( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1188 OF 1240 *** + // Wavefunction(s) for diagram number 1188 + VVVV1P0_1( w_fp[0], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[21] ); + VVVV3P0_1( w_fp[0], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[71] ); + VVVV4P0_1( w_fp[0], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[59] ); + FFV1_2( w_fp[3], w_fp[21], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[24] ); + FFV1_2( w_fp[3], w_fp[71], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[60] ); + FFV1_2( w_fp[3], w_fp[59], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[72] ); + // Amplitude(s) for diagram number 1188 + FFV1_0( w_fp[24], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + FFV1_0( w_fp[60], 
w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + FFV1_0( w_fp[72], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1189( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1189 OF 1240 *** + // Wavefunction(s) for diagram number 1189 + VVV1P0_1( w_fp[21], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[98] ); + VVV1P0_1( w_fp[71], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[27] ); + VVV1P0_1( w_fp[59], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[16] ); + // Amplitude(s) for diagram number 1189 + FFV1_0( w_fp[3], w_fp[77], w_fp[98], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[77], w_fp[27], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[77], w_fp[16], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0]; + 
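// Illustrative sketch (not from this patch): each helas call such as FFV1_0 or VVV1_0
// computes one complex amplitude for the current diagram, and the kernelAccessIcol lines
// scatter it into the color-flow array jamps with weight +1, -1, +i or -i. With the
// event/SIMD indexing stripped away (plain array indexing is an assumption here;
// kernelAccessIcol also selects the event slot), the pattern reduces to:
//   cxtype jamp[ncolor] = {};         // one partial sum per color flow
//   const cxtype amp = amp_sv[0];     // amplitude of the current diagram
//   jamp[25] -= cxtype( 0, 1 ) * amp; // subtract i*amp from color flow 25
//   jamp[28] += cxtype( 0, 1 ) * amp; // add i*amp to color flow 28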
J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1190( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1190 OF 1240 *** + // Wavefunction(s) for diagram number 1190 + // (none) + // Amplitude(s) for diagram number 1190 + FFV1_0( w_fp[38], w_fp[77], w_fp[21], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + FFV1_0( w_fp[38], w_fp[77], w_fp[71], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + FFV1_0( w_fp[38], w_fp[77], w_fp[59], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1191( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 
1191 OF 1240 *** + // Wavefunction(s) for diagram number 1191 + FFV1_1( w_fp[2], w_fp[21], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[29] ); + FFV1_1( w_fp[2], w_fp[71], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[68] ); + FFV1_1( w_fp[2], w_fp[59], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] ); + // Amplitude(s) for diagram number 1191 + FFV1_0( w_fp[52], w_fp[29], w_fp[5], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + FFV1_0( w_fp[52], w_fp[68], w_fp[5], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + FFV1_0( w_fp[52], w_fp[23], w_fp[5], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1192( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1192 OF 1240 *** + // Wavefunction(s) for diagram number 1192 + // (none) + // Amplitude(s) for diagram number 1192 + FFV1_0( w_fp[52], w_fp[2], w_fp[98], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[52], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[52], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1193( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1193 OF 1240 *** + // Wavefunction(s) for diagram number 1193 + // (none) + // Amplitude(s) for diagram number 1193 + FFV1_0( w_fp[52], w_fp[39], w_fp[21], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + FFV1_0( w_fp[52], w_fp[39], w_fp[71], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + FFV1_0( w_fp[52], w_fp[39], w_fp[59], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1194( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel 
numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1194 OF 1240 *** + // Wavefunction(s) for diagram number 1194 + // (none) + // Amplitude(s) for diagram number 1194 + FFV1_0( w_fp[3], w_fp[29], w_fp[66], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[68], w_fp[66], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[23], w_fp[66], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1195( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a 
sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1195 OF 1240 *** + // Wavefunction(s) for diagram number 1195 + // (none) + // Amplitude(s) for diagram number 1195 + FFV1_0( w_fp[24], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[60], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[72], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1196( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1196 OF 1240 *** + // Wavefunction(s) for diagram number 1196 + // (none) + // Amplitude(s) for diagram number 1196 + VVV1_0( w_fp[21], w_fp[66], w_fp[8], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + VVV1_0( w_fp[71], w_fp[66], w_fp[8], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + VVV1_0( w_fp[59], w_fp[66], w_fp[8], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1197( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and 
denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1197 OF 1240 *** + // Wavefunction(s) for diagram number 1197 + // (none) + // Amplitude(s) for diagram number 1197 + VVVV1_0( w_fp[21], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0]; + VVVV3_0( w_fp[21], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + VVVV4_0( w_fp[21], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + VVVV1_0( w_fp[71], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; 
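// Illustrative sketch (not from this patch): diagram_boilerplate.h is included at the top
// of every diagramXXXX kernel so that all 1240 kernels keep one uniform signature. Going
// by the comment that precedes each include, when MGONGPU_SUPPORTS_MULTICHANNEL is not
// defined the three multichannel arguments are unused and the boilerplate can only
// sanity-check them, along the lines of:
//   #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
//   assert( channelIds == nullptr && numerators == nullptr && denominators == nullptr );
//   #endif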
+ J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + VVVV3_0( w_fp[71], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + VVVV4_0( w_fp[71], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + VVVV1_0( w_fp[59], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + 
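// Illustrative sketch (not from this patch): the four-gluon vertex carries three
// independent color/Lorentz structures, which is why each VVVV coupling enters through
// three helas variants: VVVV1_0/VVVV3_0/VVVV4_0 for amplitudes, and VVVV1P0_1/VVVV3P0_1/
// VVVV4P0_1 for internal wavefunctions, each followed by its own sign pattern in jamps.
// Schematically (V1..V4 and amp stand for the wavefunction and amplitude buffers):
//   VVVV1_0( V1, V2, V3, V4, COUPs[2], 1.0, amp ); // color structure 1
//   VVVV3_0( V1, V2, V3, V4, COUPs[2], 1.0, amp ); // color structure 3
//   VVVV4_0( V1, V2, V3, V4, COUPs[2], 1.0, amp ); // color structure 4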
J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + VVVV3_0( w_fp[59], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + VVVV4_0( w_fp[59], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1198( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1198 OF 1240 *** + // Wavefunction(s) for diagram number 1198 + VVV1P0_1( w_fp[21], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[66] ); + VVV1P0_1( w_fp[71], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[21] ); + VVV1P0_1( w_fp[59], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[71] ); + // Amplitude(s) for diagram number 1198 + VVV1_0( w_fp[8], w_fp[5], w_fp[66], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[5], w_fp[21], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[5], w_fp[71], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1199( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface 
for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1199 OF 1240 *** + // Wavefunction(s) for diagram number 1199 + // (none) + // Amplitude(s) for diagram number 1199 + VVV1_0( w_fp[1], w_fp[8], w_fp[98], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + VVV1_0( w_fp[1], w_fp[8], w_fp[27], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + VVV1_0( w_fp[1], w_fp[8], w_fp[16], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1200( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // 
input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1200 OF 1240 *** + // Wavefunction(s) for diagram number 1200 + // (none) + // Amplitude(s) for diagram number 1200 + FFV1_0( w_fp[3], w_fp[39], w_fp[66], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[39], w_fp[21], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[39], w_fp[71], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1201( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel 
numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1201 OF 1240 *** + // Wavefunction(s) for diagram number 1201 + // (none) + // Amplitude(s) for diagram number 1201 + FFV1_0( w_fp[24], w_fp[39], w_fp[1], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + FFV1_0( w_fp[60], w_fp[39], w_fp[1], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + FFV1_0( w_fp[72], w_fp[39], w_fp[1], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1202( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1202 OF 1240 *** + // Wavefunction(s) for diagram number 1202 + // (none) + // Amplitude(s) for diagram number 1202 + FFV1_0( w_fp[38], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[38], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) 
+= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[38], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1203( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1203 OF 1240 *** + // Wavefunction(s) for diagram number 1203 + // (none) + // Amplitude(s) for diagram number 1203 + FFV1_0( w_fp[38], w_fp[29], w_fp[1], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0]; + FFV1_0( w_fp[38], w_fp[68], w_fp[1], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + FFV1_0( w_fp[38], w_fp[23], w_fp[1], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1204( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* 
couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1204 OF 1240 *** + // Wavefunction(s) for diagram number 1204 + VVVV1P0_1( w_fp[0], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[23] ); + VVVV3P0_1( w_fp[0], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[68] ); + VVVV4P0_1( w_fp[0], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[29] ); + FFV1_2( w_fp[3], w_fp[23], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[71] ); + FFV1_2( w_fp[3], w_fp[68], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] ); + FFV1_2( w_fp[3], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[66] ); + // Amplitude(s) for diagram number 1204 + FFV1_0( w_fp[71], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + FFV1_0( w_fp[21], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + FFV1_0( w_fp[66], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1205( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1205 OF 1240 *** + // Wavefunction(s) for diagram number 1205 + VVV1P0_1( w_fp[23], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[72] ); + VVV1P0_1( w_fp[68], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[60] ); + VVV1P0_1( w_fp[29], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[24] ); + // Amplitude(s) for diagram number 1205 + FFV1_0( w_fp[3], w_fp[77], w_fp[72], COUPs[1], 
1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[77], w_fp[60], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1206( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1206 OF 1240 *** + // Wavefunction(s) for diagram number 1206 + // (none) + // Amplitude(s) for diagram number 1206 + FFV1_0( w_fp[46], w_fp[77], w_fp[23], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + FFV1_0( w_fp[46], w_fp[77], w_fp[68], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + FFV1_0( w_fp[46], w_fp[77], w_fp[29], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1207( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1207 OF 1240 *** + // Wavefunction(s) for diagram number 1207 + FFV1_1( w_fp[2], w_fp[23], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[77] ); + FFV1_1( w_fp[2], w_fp[68], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); + FFV1_1( w_fp[2], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[27] ); + // Amplitude(s) for diagram number 1207 + FFV1_0( w_fp[52], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + FFV1_0( w_fp[52], w_fp[16], w_fp[4], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + FFV1_0( w_fp[52], w_fp[27], w_fp[4], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1208( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + 
// A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1208 OF 1240 *** + // Wavefunction(s) for diagram number 1208 + // (none) + // Amplitude(s) for diagram number 1208 + FFV1_0( w_fp[52], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[52], w_fp[2], w_fp[60], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[52], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1209( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1209 OF 1240 *** + // Wavefunction(s) for diagram number 1209 + // (none) + // 
Amplitude(s) for diagram number 1209 + FFV1_0( w_fp[52], w_fp[33], w_fp[23], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + FFV1_0( w_fp[52], w_fp[33], w_fp[68], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + FFV1_0( w_fp[52], w_fp[33], w_fp[29], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1210( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1210 OF 1240 *** + // Wavefunction(s) for diagram number 1210 + // (none) + // Amplitude(s) for diagram number 1210 + FFV1_0( w_fp[3], w_fp[77], w_fp[61], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[16], w_fp[61], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[27], w_fp[61], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1211( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1211 OF 1240 *** + // Wavefunction(s) for diagram number 1211 + // (none) + // Amplitude(s) for diagram number 1211 + FFV1_0( w_fp[71], w_fp[2], w_fp[61], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[21], w_fp[2], w_fp[61], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[66], w_fp[2], w_fp[61], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1212( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1212 OF 1240 *** + // Wavefunction(s) for diagram number 1212 + // (none) + // Amplitude(s) for diagram number 1212 + VVV1_0( w_fp[23], w_fp[61], w_fp[8], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + VVV1_0( w_fp[68], w_fp[61], w_fp[8], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + VVV1_0( w_fp[29], w_fp[61], w_fp[8], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1213( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1213 OF 1240 *** + // Wavefunction(s) for diagram number 1213 + // (none) + // Amplitude(s) for diagram number 1213 + VVVV1_0( w_fp[23], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + VVVV3_0( w_fp[23], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + VVVV4_0( w_fp[23], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + VVVV1_0( w_fp[68], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + VVVV3_0( w_fp[68], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + VVVV4_0( w_fp[68], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + VVVV1_0( w_fp[29], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + VVVV3_0( w_fp[29], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + VVVV4_0( w_fp[29], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += 
amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1214( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1214 OF 1240 *** + // Wavefunction(s) for diagram number 1214 + VVV1P0_1( w_fp[23], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[61] ); + VVV1P0_1( w_fp[68], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[23] ); + VVV1P0_1( w_fp[29], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[68] ); + // Amplitude(s) for diagram number 1214 + VVV1_0( w_fp[8], w_fp[4], w_fp[61], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[4], w_fp[23], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[4], w_fp[68], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) 
-= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1215( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1215 OF 1240 *** + // Wavefunction(s) for diagram number 1215 + // (none) + // Amplitude(s) for diagram number 1215 + VVV1_0( w_fp[1], w_fp[8], w_fp[72], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + VVV1_0( w_fp[1], w_fp[8], w_fp[60], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + VVV1_0( w_fp[1], w_fp[8], w_fp[24], COUPs[0], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1216( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1216 OF 1240 *** + // Wavefunction(s) for diagram number 1216 + // (none) + // Amplitude(s) for diagram number 1216 + FFV1_0( w_fp[3], w_fp[33], w_fp[61], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[33], w_fp[23], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[33], w_fp[68], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1217( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1217 OF 1240 *** + // Wavefunction(s) for diagram number 1217 + // (none) + // Amplitude(s) for diagram number 1217 + FFV1_0( w_fp[71], w_fp[33], w_fp[1], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + FFV1_0( w_fp[21], w_fp[33], w_fp[1], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + FFV1_0( w_fp[66], w_fp[33], w_fp[1], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1218( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent 
COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1218 OF 1240 *** + // Wavefunction(s) for diagram number 1218 + // (none) + // Amplitude(s) for diagram number 1218 + FFV1_0( w_fp[46], w_fp[2], w_fp[61], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[46], w_fp[2], w_fp[23], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[46], w_fp[2], w_fp[68], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1219( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // 
In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1219 OF 1240 *** + // Wavefunction(s) for diagram number 1219 + // (none) + // Amplitude(s) for diagram number 1219 + FFV1_0( w_fp[46], w_fp[77], w_fp[1], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + FFV1_0( w_fp[46], w_fp[16], w_fp[1], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + FFV1_0( w_fp[46], w_fp[27], w_fp[1], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1220( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1220 OF 1240 *** + // Wavefunction(s) for diagram number 1220 + // (none) + // Amplitude(s) for diagram number 1220 + VVVV1_0( w_fp[0], w_fp[73], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + VVVV3_0( w_fp[0], w_fp[73], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1220( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1220 OF 1240 ***
+    // Wavefunction(s) for diagram number 1220
+    // (none)
+    // Amplitude(s) for diagram number 1220
+    VVVV1_0( w_fp[0], w_fp[73], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    VVVV3_0( w_fp[0], w_fp[73], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    VVVV4_0( w_fp[0], w_fp[73], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0];
+    VVVV1_0( w_fp[0], w_fp[79], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+    VVVV3_0( w_fp[0], w_fp[79], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+    VVVV4_0( w_fp[0], w_fp[79], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    VVVV1_0( w_fp[0], w_fp[80], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+    VVVV3_0( w_fp[0], w_fp[80], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+    VVVV4_0( w_fp[0], w_fp[80], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+  }
+
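+  // Illustration only (an assumption about the access pattern, not generated code): each
+  // J_ACCESS::kernelAccessIcol( jamps, icol ) call above is expected to return a reference to the
+  // complex color amplitude icol of the current event (or SIMD event page), so that every line is
+  // a signed accumulation into that amplitude, roughly
+  //   cxtype_sv& jamp96 = J_ACCESS::kernelAccessIcol( jamps, 96 ); // color amplitude 96, this event page
+  //   jamp96 -= amp_sv[0];                                         // subtract this diagram's amplitude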
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1221( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1221 OF 1240 ***
+    // Wavefunction(s) for diagram number 1221
+    VVV1P0_1( w_fp[0], w_fp[73], COUPs[0], 1.0, 0., 0., w_fp[27] );
+    VVV1P0_1( w_fp[0], w_fp[79], COUPs[0], 1.0, 0., 0., w_fp[1] );
+    VVV1P0_1( w_fp[0], w_fp[80], COUPs[0], 1.0, 0., 0., w_fp[16] );
+    // Amplitude(s) for diagram number 1221
+    VVV1_0( w_fp[8], w_fp[6], w_fp[27], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[6], w_fp[1], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[6], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1222( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1222 OF 1240 ***
+    // Wavefunction(s) for diagram number 1222
+    // (none)
+    // Amplitude(s) for diagram number 1222
+    VVV1_0( w_fp[73], w_fp[6], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    VVV1_0( w_fp[79], w_fp[6], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+    VVV1_0( w_fp[80], w_fp[6], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1223( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1223 OF 1240 ***
+    // Wavefunction(s) for diagram number 1223
+    // (none)
+    // Amplitude(s) for diagram number 1223
+    FFV1_0( w_fp[3], w_fp[47], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[47], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[47], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1224( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1224 OF 1240 ***
+    // Wavefunction(s) for diagram number 1224
+    // (none)
+    // Amplitude(s) for diagram number 1224
+    FFV1_0( w_fp[3], w_fp[113], w_fp[73], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[113], w_fp[79], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[113], w_fp[80], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+  }
+
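+  // Illustration only (a reading of the arithmetic above, not new generated code): cxtype( 0, 1 )
+  // is the imaginary unit, so the FFV1_0 accumulations in diagrams like 1223 add or subtract
+  // i * amp_sv[0]; e.g. with amp = a + i*b, the update jamp += cxtype( 0, 1 ) * amp is equivalent to
+  //   jamp.real() += -b;
+  //   jamp.imag() += +a;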
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1225( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1225 OF 1240 ***
+    // Wavefunction(s) for diagram number 1225
+    // (none)
+    // Amplitude(s) for diagram number 1225
+    FFV1_0( w_fp[41], w_fp[2], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[2], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[2], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1226( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1226 OF 1240 ***
+    // Wavefunction(s) for diagram number 1226
+    // (none)
+    // Amplitude(s) for diagram number 1226
+    FFV1_0( w_fp[62], w_fp[2], w_fp[73], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    FFV1_0( w_fp[62], w_fp[2], w_fp[79], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    FFV1_0( w_fp[62], w_fp[2], w_fp[80], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1227( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1227 OF 1240 ***
+    // Wavefunction(s) for diagram number 1227
+    // (none)
+    // Amplitude(s) for diagram number 1227
+    VVVV1_0( w_fp[0], w_fp[57], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+    VVVV3_0( w_fp[0], w_fp[57], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    VVVV4_0( w_fp[0], w_fp[57], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    VVVV1_0( w_fp[0], w_fp[81], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    VVVV3_0( w_fp[0], w_fp[81], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    VVVV4_0( w_fp[0], w_fp[81], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    VVVV1_0( w_fp[0], w_fp[82], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    VVVV3_0( w_fp[0], w_fp[82], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+    VVVV4_0( w_fp[0], w_fp[82], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+  }
+
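+  // Illustration only (an assumption about the buffer management, not stated in this diff): the
+  // w_fp array appears to act as a shared scratch pool of wavefunction slots, so a diagram may
+  // overwrite slots whose previous contents are no longer needed; diagram1228 below, for instance,
+  // recycles slots 62, 80 and 79 for the three VVV1P0_1 internal propagators it computes.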
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1228( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1228 OF 1240 ***
+    // Wavefunction(s) for diagram number 1228
+    VVV1P0_1( w_fp[0], w_fp[57], COUPs[0], 1.0, 0., 0., w_fp[62] );
+    VVV1P0_1( w_fp[0], w_fp[81], COUPs[0], 1.0, 0., 0., w_fp[80] );
+    VVV1P0_1( w_fp[0], w_fp[82], COUPs[0], 1.0, 0., 0., w_fp[79] );
+    // Amplitude(s) for diagram number 1228
+    VVV1_0( w_fp[8], w_fp[5], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[5], w_fp[80], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[5], w_fp[79], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1229( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1229 OF 1240 ***
+    // Wavefunction(s) for diagram number 1229
+    // (none)
+    // Amplitude(s) for diagram number 1229
+    VVV1_0( w_fp[57], w_fp[5], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    VVV1_0( w_fp[81], w_fp[5], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    VVV1_0( w_fp[82], w_fp[5], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+  }
+
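+  // Illustration only (a hypothetical caller, assuming a gpuLaunchKernel-style abstraction as used
+  // elsewhere in the plugin): with this uniform signature every diagram can be dispatched the same
+  // way from the host side, e.g. roughly
+  //   #ifdef MGONGPUCPP_GPUIMPL
+  //   gpuLaunchKernel( diagram1230, gpublocks, gputhreads, wfs, jamps, channelIds, couplings, numerators, denominators );
+  //   #else
+  //   diagram1230( wfs, jamps, channelIds, COUPs, numerators, denominators ); // one SIMD event page
+  //   #endif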
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1230( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1230 OF 1240 ***
+    // Wavefunction(s) for diagram number 1230
+    // (none)
+    // Amplitude(s) for diagram number 1230
+    FFV1_0( w_fp[3], w_fp[39], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[39], w_fp[80], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[39], w_fp[79], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1231( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1231 OF 1240 ***
+    // Wavefunction(s) for diagram number 1231
+    // (none)
+    // Amplitude(s) for diagram number 1231
+    FFV1_0( w_fp[3], w_fp[102], w_fp[57], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[102], w_fp[81], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[102], w_fp[82], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1232( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1232 OF 1240 ***
+    // Wavefunction(s) for diagram number 1232
+    // (none)
+    // Amplitude(s) for diagram number 1232
+    FFV1_0( w_fp[38], w_fp[2], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[38], w_fp[2], w_fp[80], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[38], w_fp[2], w_fp[79], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1233( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1233 OF 1240 ***
+    // Wavefunction(s) for diagram number 1233
+    // (none)
+    // Amplitude(s) for diagram number 1233
+    FFV1_0( w_fp[104], w_fp[2], w_fp[57], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    FFV1_0( w_fp[104], w_fp[2], w_fp[81], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    FFV1_0( w_fp[104], w_fp[2], w_fp[82], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+  }
+
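+  // Illustration only (an assumption about the multichannel path, since diagram_boilerplate.h is
+  // not shown in this diff): #ifdef MGONGPU_SUPPORTS_MULTICHANNEL the boilerplate is expected to
+  // fold each diagram into the SDE numerators/denominators, schematically
+  //   if( channelId == 1234 ) numerators_sv += cxabs2( amp_sv[0] ); // |amp|^2 of the selected channel
+  //   denominators_sv += cxabs2( amp_sv[0] );                      // |amp|^2 summed over all diagrams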
jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0]; + VVVV3_0( w_fp[0], w_fp[55], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + VVVV4_0( w_fp[0], w_fp[55], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + VVVV1_0( w_fp[0], w_fp[83], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0]; + VVVV3_0( w_fp[0], w_fp[83], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + VVVV4_0( w_fp[0], w_fp[83], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + VVVV1_0( w_fp[0], w_fp[84], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + VVVV3_0( w_fp[0], w_fp[84], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + VVVV4_0( w_fp[0], w_fp[84], w_fp[8], w_fp[4], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1235( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1235 OF 1240 *** + // Wavefunction(s) for diagram number 1235 + VVV1P0_1( w_fp[0], w_fp[55], COUPs[0], 1.0, 0., 0., w_fp[104] ); + VVV1P0_1( w_fp[0], w_fp[83], COUPs[0], 1.0, 0., 0., w_fp[82] ); + VVV1P0_1( w_fp[0], w_fp[84], COUPs[0], 1.0, 0., 0., w_fp[81] ); + // Amplitude(s) for diagram number 1235 + VVV1_0( w_fp[8], w_fp[4], w_fp[104], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) +=
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[4], w_fp[82], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[4], w_fp[81], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1236( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1236 OF 1240 *** + // Wavefunction(s) for diagram number 1236 + // (none) + // Amplitude(s) for diagram number 1236 + VVV1_0( w_fp[55], w_fp[4], w_fp[56], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; +
J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + VVV1_0( w_fp[83], w_fp[4], w_fp[56], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + VVV1_0( w_fp[84], w_fp[4], w_fp[56], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1237( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { +
// A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1237 OF 1240 *** + // Wavefunction(s) for diagram number 1237 + // (none) + // Amplitude(s) for diagram number 1237 + FFV1_0( w_fp[3], w_fp[33], w_fp[104], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[33], w_fp[82], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[33], w_fp[81], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1238( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1238 OF 1240 *** + // Wavefunction(s) for diagram number 1238 + // (none) + //
Amplitude(s) for diagram number 1238 + FFV1_0( w_fp[3], w_fp[114], w_fp[55], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[114], w_fp[83], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[114], w_fp[84], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1239( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1239 OF 1240 *** + // Wavefunction(s) for diagram number 1239 + // (none) + // Amplitude(s) for diagram number 1239 + FFV1_0( w_fp[46], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[46], w_fp[2], w_fp[82], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[46], w_fp[2], w_fp[81], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) *
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1240( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1240 OF 1240 *** + // Wavefunction(s) for diagram number 1240 + // (none) + // Amplitude(s) for diagram number 1240 + FFV1_0( w_fp[99], w_fp[2], w_fp[55], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + FFV1_0( w_fp[99], w_fp[2], w_fp[83], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + FFV1_0( w_fp[99], w_fp[2], w_fp[84], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/driver.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/driver.f index 3671cdce55..def489179c 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/driver.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true.
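Each generated diagramNNNN kernel above shares the exact same parameter list (wfs, jamps, channelIds, couplings/COUPs, numerators, denominators), even in builds without multichannel support, where the boilerplate merely asserts that the last three pointers are nullptr. The point of this uniform interface is that a driver can hold all 1240 per-diagram kernels in one table and launch them in a loop instead of through 1240 distinct call sites. The standalone C++ sketch below shows that dispatch pattern in miniature; the names (DiagramFn, diagramA, diagramB) and the toy data are illustrative, not the generated API.

#include <cstdio>

using fptype = double;
using DiagramFn = void ( * )( fptype* jamps, const fptype* couplings );

// Two stand-in "diagrams": each computes one amplitude from the couplings
// and scatters it, with signs, into the jamp slots it contributes to.
static void diagramA( fptype* jamps, const fptype* couplings )
{
  const fptype amp = couplings[0];
  jamps[3] += amp;
  jamps[5] -= amp;
}

static void diagramB( fptype* jamps, const fptype* couplings )
{
  const fptype amp = couplings[1];
  jamps[3] -= amp;
  jamps[22] += amp;
}

int main()
{
  fptype jamps[120] = { 0 }; // one accumulator per color flow
  const fptype couplings[2] = { 1.5, 0.25 };
  const DiagramFn diagrams[] = { diagramA, diagramB };
  for( DiagramFn fn : diagrams ) fn( jamps, couplings ); // uniform launch loop
  std::printf( "jamps[3]=%g jamps[5]=%g jamps[22]=%g\n", jamps[3], jamps[5], jamps[22] );
  return 0;
}

Because every entry has the same signature, adding or removing a diagram only changes the table, not the launch logic.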
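In the matrix1.f hunk below, the dense REAL*8 color matrix CF(NCOLOR,NCOLOR), initialized by thousands of DATA lines of repeating decimals, is replaced by a packed integer triangle CF(NCOLOR*(NCOLOR+1)/2) plus a common denominator, DATA DENOM/324/. The matrix is symmetric, so only one triangle needs to be stored (the new DATA blocks shrink by one entry per row: 120, 119, 118, ...), and the off-diagonal integers absorb the symmetry factor of two: the old CF(1,1)=1.264197530864197D+01 is exactly 4096/324 and is stored as 4096, while the old CF(2,1)=-1.580246913580247D+00 is -512/324 and is stored as -1024. Integer DATA is exact and far more compact than the decimal expansions it replaces. The standalone C++ sketch below shows how such a packed triangle is typically consumed when color-summing the squared amplitude; the 3x3 toy data and names are illustrative, and the real Fortran walks the triangle with its own CF_INDEX bookkeeping.

#include <complex>
#include <cstdio>

int main()
{
  const int ncolor = 3;
  // Packed upper triangle of a toy symmetric color matrix, row by row:
  // (1,1) (1,2) (1,3) (2,2) (2,3) (3,3); off-diagonal entries already doubled.
  const int cf[] = { 4096, -1024, 128, 4096, -1024, 4096 };
  const int denom = 324; // common denominator for all entries
  const std::complex<double> jamp[ncolor] = { { 1, 0 }, { 0, 1 }, { 1, 1 } };

  double me2 = 0;
  int idx = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    me2 += cf[idx++] * std::norm( jamp[i] ); // diagonal term, counted once
    for( int j = i + 1; j < ncolor; j++ )    // off-diagonal: stored factor 2 covers (i,j) and (j,i)
      me2 += cf[idx++] * ( jamp[i] * std::conj( jamp[j] ) ).real();
  }
  me2 /= denom; // divide once by the common denominator
  std::printf( "color-summed |M|^2 = %g\n", me2 );
  return 0;
}

The real part suffices for the off-diagonal terms because Re(jamp_i * conj(jamp_j)) is symmetric in i and j, which is exactly what lets the doubled integer stand in for both matrix entries.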
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/matrix1.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/matrix1.f index 07ccd4d1a4..ac98d845bd 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/matrix1.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -419,7 +419,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -462,7 +462,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(3030) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -505,9375 +506,738 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 1),I= 7, 12) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 1),I= 13, 18) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 1),I= 19, 24) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 1),I= 25, 30) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 1),I= 31, 36) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 1),I= 37, 42) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 1),I= 43, 48) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 1),I= 49, 54) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 1),I= 55, 60) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 1),I= 61, 66) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 1),I= 67, 72) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA 
(CF(I, 1),I= 73, 78) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 1),I= 79, 84) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 1),I= 85, 90) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 1),I= 91, 96) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 1),I= 97,102) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 1),I=103,108) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 1),I=109,114) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 1),I=115,120) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ + DATA DENOM/324/ + DATA (CF(I),I= 1,120) /4096,-1024,-1024,128,128,1280,-1024,128 + $ ,128,-16,-16,-160,128,-16,1280,-160,1136,992,-16,-160,-160,992 + $ ,992,-448,-1024,128,128,-16,-16,-160,128,-16,-16,2,2,20,-16,2, + $ -160,20,-142,-124,2,20,20,-124,-124,56,128,-16,-16,2,2,20,1280, + $ -160,-160,20,20,200,1136,-142,992,-124,1010,1028,-142,38,-124, + $ -106,-268,-88,-16,2,-160,20,-142,-124,-160,20,992,-124,38,-106 + $ ,992,-124,-448,56,-268,-88,1010,-268,-268,884,884,-232,2,20,20, + $ -124,-124,56,20,200,-124,1028,-106,-88,-124,-106,56,-88,884, + $ -232,1028,-88,-88,-232,-232,272/ C 1 T(1,2,5,6,7,3,4) - DATA (CF(I, 2),I= 1, 6) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 2),I= 7, 12) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 2),I= 13, 18) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 2),I= 19, 24) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 2),I= 25, 30) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 2),I= 31, 36) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 2),I= 37, 42) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 2),I= 43, 48) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 2),I= 49, 54) /-2.469135802469136D-02 - $ 
,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 2),I= 55, 60) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 2),I= 61, 66) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 2),I= 67, 72) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 2),I= 73, 78) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 2),I= 79, 84) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 2),I= 85, 90) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 2),I= 91, 96) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 2),I= 97,102) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 2),I=103,108) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 2),I=109,114) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 2),I=115,120) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ + DATA (CF(I),I=121,239) /4096,128,1280,-1024,128,128,-1024,-16, + $ -160,128,-16,-16,-160,-160,992,992,-448,128,-16,1280,-160,1136 + $ ,992,128,-1024,-16,-160,128,-16,-16,128,2,20,-16,2,2,20,20,-124 + $ ,-124,56,-16,2,-160,20,-142,-124,-16,128,2,20,-16,2,-160,1280 + $ ,20,200,-160,20,-142,38,-124,-106,-268,-88,1136,-142,992,-124 + $ ,1010,1028,2,20,20,-124,-124,56,20,200,-124,1028,-106,-88,-124, + $ -106,56,-88,884,-232,1028,-88,-88,-232,-232,272,-16,2,-160,20, + $ -142,-124,-160,20,992,-124,38,-106,992,-124,-448,56,-268,-88 + $ ,1010,-268,-268,884,884,-232/ C 1 T(1,2,5,7,6,3,4) - DATA (CF(I, 3),I= 1, 6) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 3),I= 7, 12) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 3),I= 13, 18) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 3),I= 19, 24) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 3),I= 25, 30) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - 
$ -2.469135802469136D-02/ - DATA (CF(I, 3),I= 31, 36) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 3),I= 37, 42) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 3),I= 43, 48) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 3),I= 49, 54) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 3),I= 55, 60) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 3),I= 61, 66) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 3),I= 67, 72) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 3),I= 73, 78) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 3),I= 79, 84) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 3),I= 85, 90) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 3),I= 91, 96) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 3),I= 97,102) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 3),I=103,108) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 3),I=109,114) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 3),I=115,120) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ + DATA (CF(I),I=240,357) /4096,-1024,1280,128,128,-16,1280,-160 + $ ,1136,992,-1024,128,128,-16,-16,-160,-160,-16,992,-448,-160,992 + $ ,128,-16,-1024,128,-160,-16,-16,2,-160,20,-142,-124,128,-16,-16 + $ ,2,2,20,20,2,-124,56,20,-124,-16,2,-160,20,-142,-124,-160,20 + $ ,992,-124,38,-106,992,-124,-448,56,-268,-88,1010,-268,-268,884 + $ ,884,-232,128,-16,-16,2,2,20,1280,-160,-160,20,20,200,1136,-142 + $ ,992,-124,1010,1028,-142,38,-124,-106,-268,-88,20,2,-124,56,20, + $ -124,200,20,-106,-88,-124,1028,1028,-88,-88,-232,-232,272,-124, + $ -106,56,-88,884,-232/ C 1 T(1,2,6,5,7,3,4) - DATA (CF(I, 4),I= 1, 6) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 4),I= 7, 12) /-2.469135802469136D-02, - $ 
-2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 4),I= 13, 18) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 4),I= 19, 24) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 4),I= 25, 30) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 4),I= 31, 36) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 4),I= 37, 42) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 4),I= 43, 48) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 4),I= 49, 54) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 4),I= 55, 60) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 4),I= 61, 66) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 4),I= 67, 72) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 4),I= 73, 78) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 4),I= 79, 84) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 4),I= 85, 90) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 4),I= 91, 96) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 4),I= 97,102) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 4),I=103,108) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 4),I=109,114) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 4),I=115,120) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ + DATA (CF(I),I=358,474) /4096,128,-1024,-16,-160,-160,992,992, + $ -448,128,-1024,-16,-160,128,-16,-16,128,1136,992,1280,-160,-16, + $ -160,128,-1024,-16,128,2,20,20,-124,-124,56,-16,128,2,20,-16,2 + $ 
,2,-16,-142,-124,-160,20,2,20,20,-124,-124,56,20,200,-124,1028, + $ -106,-88,-124,-106,56,-88,884,-232,1028,-88,-88,-232,-232,272, + $ -16,128,2,20,-16,2,-160,1280,20,200,-160,20,-142,38,-124,-106, + $ -268,-88,1136,-142,992,-124,1010,1028,2,-16,-142,-124,-160,20 + $ ,20,-160,38,-106,992,-124,1010,-268,-268,884,884,-232,992,-124, + $ -448,56,-268,-88/ C 1 T(1,2,6,7,5,3,4) - DATA (CF(I, 5),I= 1, 6) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 5),I= 7, 12) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 5),I= 13, 18) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 5),I= 19, 24) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 5),I= 25, 30) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 5),I= 31, 36) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 5),I= 37, 42) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 5),I= 43, 48) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 5),I= 49, 54) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 5),I= 55, 60) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 5),I= 61, 66) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 5),I= 67, 72) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 5),I= 73, 78) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 5),I= 79, 84) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 5),I= 85, 90) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 5),I= 91, 96) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 5),I= 97,102) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 5),I=103,108) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ 
,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 5),I=109,114) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 5),I=115,120) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ + DATA (CF(I),I=475,590) /4096,-1024,-16,128,1136,992,1280,-160, + $ -160,-16,992,-448,-160,992,-1024,128,128,-16,-16,-160,-16,128, + $ -160,-16,-1024,128,2,-16,-142,-124,-160,20,20,2,-124,56,20,-124 + $ ,128,-16,-16,2,2,20,2,-16,-142,-124,-160,20,20,-160,38,-106,992 + $ ,-124,1010,-268,-268,884,884,-232,992,-124,-448,56,-268,-88,20 + $ ,2,-124,56,20,-124,200,20,-106,-88,-124,1028,1028,-88,-88,-232, + $ -232,272,-124,-106,56,-88,884,-232,128,-16,-16,2,2,20,1280,-160 + $ ,-160,20,20,200,1136,-142,992,-124,1010,1028,-142,38,-124,-106, + $ -268,-88/ C 1 T(1,2,7,5,6,3,4) - DATA (CF(I, 6),I= 1, 6) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 6),I= 7, 12) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 6),I= 13, 18) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 6),I= 19, 24) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 6),I= 25, 30) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 6),I= 31, 36) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 6),I= 37, 42) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 6),I= 43, 48) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 6),I= 49, 54) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 6),I= 55, 60) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 6),I= 61, 66) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 6),I= 67, 72) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 6),I= 73, 78) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 6),I= 79, 84) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 6),I= 85, 90) 
/1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 6),I= 91, 96) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 6),I= 97,102) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 6),I=103,108) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 6),I=109,114) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 6),I=115,120) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ + DATA (CF(I),I=591,705) /4096,-160,-16,992,-448,-160,992,-16,128 + $ ,1136,992,1280,-160,128,-1024,-16,-160,128,-16,-160,-16,-16,128 + $ ,128,-1024,20,2,-124,56,20,-124,2,-16,-142,-124,-160,20,-16,128 + $ ,2,20,-16,2,20,2,-124,56,20,-124,200,20,-106,-88,-124,1028,1028 + $ ,-88,-88,-232,-232,272,-124,-106,56,-88,884,-232,2,-16,-142, + $ -124,-160,20,20,-160,38,-106,992,-124,1010,-268,-268,884,884, + $ -232,992,-124,-448,56,-268,-88,-16,128,2,20,-16,2,-160,1280,20 + $ ,200,-160,20,-142,38,-124,-106,-268,-88,1136,-142,992,-124,1010 + $ ,1028/ C 1 T(1,2,7,6,5,3,4) - DATA (CF(I, 7),I= 1, 6) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 7),I= 7, 12) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 7),I= 13, 18) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I, 7),I= 19, 24) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 7),I= 25, 30) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 7),I= 31, 36) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 7),I= 37, 42) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 7),I= 43, 48) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 7),I= 49, 54) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 7),I= 55, 60) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 7),I= 61, 66) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ 
,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 7),I= 67, 72) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 7),I= 73, 78) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 7),I= 79, 84) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 7),I= 85, 90) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 7),I= 91, 96) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 7),I= 97,102) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 7),I=103,108) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 7),I=109,114) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 7),I=115,120) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ + DATA (CF(I),I=706,819) /4096,-1024,-1024,128,128,1280,1280,-160 + $ ,128,-16,992,1136,-160,992,-16,-160,-448,992,128,-16,-16,2,2,20 + $ ,1280,-160,-160,20,20,200,1136,-142,992,-124,1010,1028,-142,38, + $ -124,-106,-268,-88,-1024,128,128,-16,-16,-160,128,-16,-16,2,2 + $ ,20,-16,2,-160,20,-142,-124,2,20,20,-124,-124,56,-160,20,-16,2, + $ -124,-142,992,-124,-448,56,-268,-88,-160,20,992,-124,38,-106, + $ -268,1010,884,-232,-268,884,20,-124,2,20,56,-124,-124,-106,56, + $ -88,884,-232,20,200,-124,1028,-106,-88,-88,1028,-232,272,-88, + $ -232/ C 1 T(1,5,2,6,7,3,4) - DATA (CF(I, 8),I= 1, 6) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 8),I= 7, 12) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 8),I= 13, 18) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 8),I= 19, 24) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I, 8),I= 25, 30) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 8),I= 31, 36) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 8),I= 37, 42) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 8),I= 43, 48) /1.753086419753086D+00, - 
$ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 8),I= 49, 54) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 8),I= 55, 60) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 8),I= 61, 66) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 8),I= 67, 72) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 8),I= 73, 78) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 8),I= 79, 84) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 8),I= 85, 90) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 8),I= 91, 96) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 8),I= 97,102) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 8),I=103,108) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 8),I=109,114) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 8),I=115,120) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ + DATA (CF(I),I=820,932) /4096,128,1280,-1024,128,-160,992,-16, + $ -160,-448,992,1280,-160,128,-16,992,1136,-16,128,2,20,-16,2, + $ -160,1280,20,200,-160,20,-142,38,-124,-106,-268,-88,1136,-142 + $ ,992,-124,1010,1028,128,-1024,-16,-160,128,-16,-16,128,2,20,-16 + $ ,2,2,20,20,-124,-124,56,-16,2,-160,20,-142,-124,20,-124,2,20,56 + $ ,-124,-124,-106,56,-88,884,-232,20,200,-124,1028,-106,-88,-88 + $ ,1028,-232,272,-88,-232,-160,20,-16,2,-124,-142,992,-124,-448 + $ ,56,-268,-88,-160,20,992,-124,38,-106,-268,1010,884,-232,-268 + $ ,884/ C 1 T(1,5,2,7,6,3,4) - DATA (CF(I, 9),I= 1, 6) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 9),I= 7, 12) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 9),I= 13, 18) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 9),I= 19, 24) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ 
- DATA (CF(I, 9),I= 25, 30) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 9),I= 31, 36) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 9),I= 37, 42) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 9),I= 43, 48) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 9),I= 49, 54) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 9),I= 55, 60) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 9),I= 61, 66) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 9),I= 67, 72) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 9),I= 73, 78) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 9),I= 79, 84) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 9),I= 85, 90) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 9),I= 91, 96) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 9),I= 97,102) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 9),I=103,108) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 9),I=109,114) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 9),I=115,120) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ + DATA (CF(I),I=933,1044) /4096,-1024,1280,128,128,-16,-1024,128, + $ -160,-16,992,-448,-160,-16,992,-160,-16,2,-160,20,-142,-124, + $ -160,20,992,-124,38,-106,992,-124,-448,56,-268,-88,1010,-268, + $ -268,884,884,-232,128,-16,-1024,128,-160,-16,-16,2,-160,20,-142 + $ ,-124,128,-16,-16,2,2,20,20,2,-124,56,20,-124,-16,2,128,-16,20 + $ ,2,1136,-142,992,-124,1010,1028,1280,-160,-160,20,20,200,38, + $ -142,-268,-88,-124,-106,-124,56,20,2,-124,20,1028,-88,-88,-232, + $ -232,272,200,20,-106,-88,-124,1028,-106,-124,884,-232,56,-88/ C 1 T(1,5,6,2,7,3,4) - DATA (CF(I, 10),I= 1, 6) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ 
,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 10),I= 7, 12) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 10),I= 13, 18) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 10),I= 19, 24) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 10),I= 25, 30) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 10),I= 31, 36) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 10),I= 37, 42) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 10),I= 43, 48) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 10),I= 49, 54) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 10),I= 55, 60) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 10),I= 61, 66) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 10),I= 67, 72) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 10),I= 73, 78) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 10),I= 79, 84) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 10),I= 85, 90) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 10),I= 91, 96) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 10),I= 97,102) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 10),I=103,108) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 10),I=109,114) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 10),I=115,120) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ + DATA (CF(I),I=1045,1155) 
/4096,128,-1024,-16,-160,128,-1024,-16 + $ ,128,1136,992,-16,128,-160,1280,2,20,20,-124,-124,56,20,200, + $ -124,1028,-106,-88,-124,-106,56,-88,884,-232,1028,-88,-88,-232, + $ -232,272,-16,-160,128,-1024,-16,128,2,20,20,-124,-124,56,-16 + $ ,128,2,20,-16,2,2,-16,-142,-124,-160,20,2,20,-16,128,2,-16,-142 + $ ,38,-124,-106,-268,-88,-160,1280,20,200,-160,20,-142,1136,1010 + $ ,1028,992,-124,-142,-124,2,-16,20,-160,1010,-268,-268,884,884, + $ -232,20,-160,38,-106,992,-124,-124,992,-268,-88,-448,56/ C 1 T(1,5,6,7,2,3,4) - DATA (CF(I, 11),I= 1, 6) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 11),I= 7, 12) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 11),I= 13, 18) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 11),I= 19, 24) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 11),I= 25, 30) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 11),I= 31, 36) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 11),I= 37, 42) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 11),I= 43, 48) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 11),I= 49, 54) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 11),I= 55, 60) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 11),I= 61, 66) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 11),I= 67, 72) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 11),I= 73, 78) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 11),I= 79, 84) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 11),I= 85, 90) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 11),I= 91, 96) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 11),I= 97,102) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ 
-2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 11),I=103,108) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 11),I=109,114) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 11),I=115,120) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ + DATA (CF(I),I=1156,1265) /4096,-1024,992,-448,-160,-16,992,-160 + $ ,128,-16,-1024,128,-160,-16,2,-16,-142,-124,-160,20,20,-160,38, + $ -106,992,-124,1010,-268,-268,884,884,-232,992,-124,-448,56,-268 + $ ,-88,-16,128,-160,-16,-1024,128,2,-16,-142,-124,-160,20,20,2, + $ -124,56,20,-124,128,-16,-16,2,2,20,-124,56,20,2,-124,20,1028, + $ -88,-88,-232,-232,272,200,20,-106,-88,-124,1028,-106,-124,884, + $ -232,56,-88,-16,2,128,-16,20,2,1136,-142,992,-124,1010,1028 + $ ,1280,-160,-160,20,20,200,38,-142,-268,-88,-124,-106/ C 1 T(1,5,7,2,6,3,4) - DATA (CF(I, 12),I= 1, 6) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 12),I= 7, 12) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 12),I= 13, 18) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 12),I= 19, 24) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 12),I= 25, 30) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 12),I= 31, 36) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 12),I= 37, 42) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 12),I= 43, 48) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 12),I= 49, 54) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 12),I= 55, 60) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 12),I= 61, 66) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 12),I= 67, 72) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 12),I= 73, 78) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 12),I= 79, 84) 
/1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 12),I= 85, 90) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 12),I= 91, 96) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 12),I= 97,102) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 12),I=103,108) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 12),I=109,114) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 12),I=115,120) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ + DATA (CF(I),I=1266,1374) /4096,1136,992,-16,128,-160,1280,-16, + $ -160,128,-1024,-16,128,20,2,-124,56,20,-124,200,20,-106,-88, + $ -124,1028,1028,-88,-88,-232,-232,272,-124,-106,56,-88,884,-232, + $ -160,-16,-16,128,128,-1024,20,2,-124,56,20,-124,2,-16,-142,-124 + $ ,-160,20,-16,128,2,20,-16,2,-142,-124,2,-16,20,-160,1010,-268, + $ -268,884,884,-232,20,-160,38,-106,992,-124,-124,992,-268,-88, + $ -448,56,2,20,-16,128,2,-16,-142,38,-124,-106,-268,-88,-160,1280 + $ ,20,200,-160,20,-142,1136,1010,1028,992,-124/ C 1 T(1,5,7,6,2,3,4) - DATA (CF(I, 13),I= 1, 6) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 13),I= 7, 12) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I, 13),I= 13, 18) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 13),I= 19, 24) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 13),I= 25, 30) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 13),I= 31, 36) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 13),I= 37, 42) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 13),I= 43, 48) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 13),I= 49, 54) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 13),I= 55, 60) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ 
-1.358024691358025D-01/ - DATA (CF(I, 13),I= 61, 66) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 13),I= 67, 72) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 13),I= 73, 78) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 13),I= 79, 84) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 13),I= 85, 90) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 13),I= 91, 96) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 13),I= 97,102) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 13),I=103,108) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 13),I=109,114) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 13),I=115,120) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ + DATA (CF(I),I=1375,1482) /4096,-1024,-1024,128,128,1280,992,-160 + $ ,-448,992,-16,-160,-16,2,128,-16,20,2,1136,-142,992,-124,1010 + $ ,1028,1280,-160,-160,20,20,200,38,-142,-268,-88,-124,-106,-160 + $ ,20,-16,2,-124,-142,992,-124,-448,56,-268,-88,-160,20,992,-124 + $ ,38,-106,-268,1010,884,-232,-268,884,-1024,128,128,-16,-16,-160 + $ ,128,-16,-16,2,2,20,-16,2,-160,20,-142,-124,2,20,20,-124,-124 + $ ,56,-124,20,56,-124,2,20,-106,-124,884,-232,56,-88,-88,1028, + $ -232,272,-88,-232,20,200,-124,1028,-106,-88/ C 1 T(1,6,2,5,7,3,4) - DATA (CF(I, 14),I= 1, 6) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 14),I= 7, 12) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 14),I= 13, 18) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 14),I= 19, 24) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 14),I= 25, 30) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 14),I= 31, 36) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 14),I= 37, 42) /-2.469135802469136D-01 - $ 
,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 14),I= 43, 48) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 14),I= 49, 54) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 14),I= 55, 60) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 14),I= 61, 66) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 14),I= 67, 72) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 14),I= 73, 78) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 14),I= 79, 84) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 14),I= 85, 90) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 14),I= 91, 96) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 14),I= 97,102) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 14),I=103,108) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 14),I=109,114) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 14),I=115,120) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ + DATA (CF(I),I=1483,1589) /4096,128,1280,-1024,128,-160,1280,992 + $ ,1136,128,-16,2,20,-16,128,2,-16,-142,38,-124,-106,-268,-88, + $ -160,1280,20,200,-160,20,-142,1136,1010,1028,992,-124,20,-124,2 + $ ,20,56,-124,-124,-106,56,-88,884,-232,20,200,-124,1028,-106,-88 + $ ,-88,1028,-232,272,-88,-232,128,-1024,-16,-160,128,-16,-16,128 + $ ,2,20,-16,2,2,20,20,-124,-124,56,-16,2,-160,20,-142,-124,20, + $ -160,-124,-142,-16,2,-124,992,-268,-88,-448,56,-268,1010,884, + $ -232,-268,884,-160,20,992,-124,38,-106/ C 1 T(1,6,2,7,5,3,4) - DATA (CF(I, 15),I= 1, 6) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I, 15),I= 7, 12) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 15),I= 13, 18) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 15),I= 19, 
24) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 15),I= 25, 30) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 15),I= 31, 36) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 15),I= 37, 42) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 15),I= 43, 48) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 15),I= 49, 54) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 15),I= 55, 60) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 15),I= 61, 66) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 15),I= 67, 72) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 15),I= 73, 78) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 15),I= 79, 84) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 15),I= 85, 90) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 15),I= 91, 96) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 15),I= 97,102) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 15),I=103,108) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 15),I=109,114) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 15),I=115,120) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ + DATA (CF(I),I=1590,1695) /4096,-1024,1280,128,-448,992,992,-160, + $ -160,-16,-160,20,-16,2,-124,-142,992,-124,-448,56,-268,-88,-160 + $ ,20,992,-124,38,-106,-268,1010,884,-232,-268,884,-16,2,128,-16 + $ ,20,2,1136,-142,992,-124,1010,1028,1280,-160,-160,20,20,200,38, + $ -142,-268,-88,-124,-106,128,-16,-1024,128,-160,-16,-16,2,-160 + $ ,20,-142,-124,128,-16,-16,2,2,20,20,2,-124,56,20,-124,56,-124, + $ -124,20,20,2,-88,1028,-232,272,-88,-232,-106,-124,884,-232,56, + $ 
-88,200,20,-106,-88,-124,1028/ C 1 T(1,6,5,2,7,3,4) - DATA (CF(I, 16),I= 1, 6) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 16),I= 7, 12) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 16),I= 13, 18) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 16),I= 19, 24) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 16),I= 25, 30) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 16),I= 31, 36) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 16),I= 37, 42) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 16),I= 43, 48) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 16),I= 49, 54) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 16),I= 55, 60) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 16),I= 61, 66) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 16),I= 67, 72) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 16),I= 73, 78) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 16),I= 79, 84) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 16),I= 85, 90) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 16),I= 91, 96) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 16),I= 97,102) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 16),I=103,108) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 16),I=109,114) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 16),I=115,120) /3.086419753086420D-02, - $ 
-2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ + DATA (CF(I),I=1696,1800) /4096,128,-1024,992,1136,-160,1280,-16 + $ ,128,20,-124,2,20,56,-124,-124,-106,56,-88,884,-232,20,200,-124 + $ ,1028,-106,-88,-88,1028,-232,272,-88,-232,2,20,-16,128,2,-16, + $ -142,38,-124,-106,-268,-88,-160,1280,20,200,-160,20,-142,1136 + $ ,1010,1028,992,-124,-16,-160,128,-1024,-16,128,2,20,20,-124, + $ -124,56,-16,128,2,20,-16,2,2,-16,-142,-124,-160,20,-124,-142,20 + $ ,-160,2,-16,-268,1010,884,-232,-268,884,-124,992,-268,-88,-448 + $ ,56,20,-160,38,-106,992,-124/ C 1 T(1,6,5,7,2,3,4) - DATA (CF(I, 17),I= 1, 6) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 17),I= 7, 12) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 17),I= 13, 18) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 17),I= 19, 24) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 17),I= 25, 30) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 17),I= 31, 36) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 17),I= 37, 42) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 17),I= 43, 48) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 17),I= 49, 54) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 17),I= 55, 60) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 17),I= 61, 66) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 17),I= 67, 72) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 17),I= 73, 78) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 17),I= 79, 84) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 17),I= 85, 90) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 17),I= 91, 96) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 17),I= 
97,102) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 17),I=103,108) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 17),I=109,114) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 17),I=115,120) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ + DATA (CF(I),I=1801,1904) /4096,-1024,-16,128,-160,-16,-1024,128, + $ -142,-124,2,-16,20,-160,1010,-268,-268,884,884,-232,20,-160,38, + $ -106,992,-124,-124,992,-268,-88,-448,56,-124,56,20,2,-124,20 + $ ,1028,-88,-88,-232,-232,272,200,20,-106,-88,-124,1028,-106,-124 + $ ,884,-232,56,-88,-16,128,-160,-16,-1024,128,2,-16,-142,-124, + $ -160,20,20,2,-124,56,20,-124,128,-16,-16,2,2,20,2,-16,20,2,128, + $ -16,-142,1136,1010,1028,992,-124,38,-142,-268,-88,-124,-106 + $ ,1280,-160,-160,20,20,200/ C 1 T(1,6,7,2,5,3,4) - DATA (CF(I, 18),I= 1, 6) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 18),I= 7, 12) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 18),I= 13, 18) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 18),I= 19, 24) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 18),I= 25, 30) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 18),I= 31, 36) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 18),I= 37, 42) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 18),I= 43, 48) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 18),I= 49, 54) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 18),I= 55, 60) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 18),I= 61, 66) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 18),I= 67, 72) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 18),I= 73, 78) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ 
-1.580246913580247D+00/ - DATA (CF(I, 18),I= 79, 84) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 18),I= 85, 90) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 18),I= 91, 96) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 18),I= 97,102) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 18),I=103,108) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 18),I=109,114) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 18),I=115,120) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ + DATA (CF(I),I=1905,2007) /4096,-160,-16,-16,128,128,-1024,-124 + $ ,56,20,2,-124,20,1028,-88,-88,-232,-232,272,200,20,-106,-88, + $ -124,1028,-106,-124,884,-232,56,-88,-142,-124,2,-16,20,-160 + $ ,1010,-268,-268,884,884,-232,20,-160,38,-106,992,-124,-124,992, + $ -268,-88,-448,56,-160,-16,-16,128,128,-1024,20,2,-124,56,20, + $ -124,2,-16,-142,-124,-160,20,-16,128,2,20,-16,2,20,2,2,-16,-16 + $ ,128,38,-142,-268,-88,-124,-106,-142,1136,1010,1028,992,-124, + $ -160,1280,20,200,-160,20/ C 1 T(1,6,7,5,2,3,4) - DATA (CF(I, 19),I= 1, 6) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 19),I= 7, 12) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 19),I= 13, 18) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 19),I= 19, 24) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 19),I= 25, 30) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 19),I= 31, 36) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 19),I= 37, 42) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 19),I= 43, 48) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 19),I= 49, 54) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 19),I= 55, 60) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ 
,8.641975308641975D-02/ - DATA (CF(I, 19),I= 61, 66) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 19),I= 67, 72) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 19),I= 73, 78) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 19),I= 79, 84) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 19),I= 85, 90) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 19),I= 91, 96) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 19),I= 97,102) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 19),I=103,108) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 19),I=109,114) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 19),I=115,120) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ + DATA (CF(I),I=2008,2109) /4096,-1024,-1024,128,128,1280,2,-16,20 + $ ,2,128,-16,-142,1136,1010,1028,992,-124,38,-142,-268,-88,-124, + $ -106,1280,-160,-160,20,20,200,20,-160,-124,-142,-16,2,-124,992, + $ -268,-88,-448,56,-268,1010,884,-232,-268,884,-160,20,992,-124 + $ ,38,-106,-124,20,56,-124,2,20,-106,-124,884,-232,56,-88,-88 + $ ,1028,-232,272,-88,-232,20,200,-124,1028,-106,-88,-1024,128,128 + $ ,-16,-16,-160,128,-16,-16,2,2,20,-16,2,-160,20,-142,-124,2,20 + $ ,20,-124,-124,56/ C 1 T(1,7,2,5,6,3,4) - DATA (CF(I, 20),I= 1, 6) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 20),I= 7, 12) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 20),I= 13, 18) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 20),I= 19, 24) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 20),I= 25, 30) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 20),I= 31, 36) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 20),I= 37, 42) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ 
+00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 20),I= 43, 48) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 20),I= 49, 54) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 20),I= 55, 60) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 20),I= 61, 66) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 20),I= 67, 72) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 20),I= 73, 78) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 20),I= 79, 84) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 20),I= 85, 90) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 20),I= 91, 96) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 20),I= 97,102) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 20),I=103,108) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 20),I=109,114) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 20),I=115,120) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ + DATA (CF(I),I=2110,2210) /4096,128,1280,-1024,128,20,2,2,-16,-16 + $ ,128,38,-142,-268,-88,-124,-106,-142,1136,1010,1028,992,-124, + $ -160,1280,20,200,-160,20,-124,20,56,-124,2,20,-106,-124,884, + $ -232,56,-88,-88,1028,-232,272,-88,-232,20,200,-124,1028,-106, + $ -88,20,-160,-124,-142,-16,2,-124,992,-268,-88,-448,56,-268,1010 + $ ,884,-232,-268,884,-160,20,992,-124,38,-106,128,-1024,-16,-160 + $ ,128,-16,-16,128,2,20,-16,2,2,20,20,-124,-124,56,-16,2,-160,20, + $ -142,-124/ C 1 T(1,7,2,6,5,3,4) - DATA (CF(I, 21),I= 1, 6) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 21),I= 7, 12) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 21),I= 13, 18) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 21),I= 19, 24) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ 
-1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 21),I= 25, 30) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 21),I= 31, 36) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 21),I= 37, 42) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 21),I= 43, 48) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 21),I= 49, 54) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 21),I= 55, 60) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 21),I= 61, 66) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 21),I= 67, 72) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 21),I= 73, 78) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 21),I= 79, 84) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 21),I= 85, 90) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 21),I= 91, 96) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 21),I= 97,102) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 21),I=103,108) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 21),I=109,114) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 21),I=115,120) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ + DATA (CF(I),I=2211,2310) /4096,-1024,1280,128,20,-160,-124,-142, + $ -16,2,-124,992,-268,-88,-448,56,-268,1010,884,-232,-268,884, + $ -160,20,992,-124,38,-106,2,-16,20,2,128,-16,-142,1136,1010,1028 + $ ,992,-124,38,-142,-268,-88,-124,-106,1280,-160,-160,20,20,200 + $ ,56,-124,-124,20,20,2,-88,1028,-232,272,-88,-232,-106,-124,884, + $ -232,56,-88,200,20,-106,-88,-124,1028,128,-16,-1024,128,-160, + $ -16,-16,2,-160,20,-142,-124,128,-16,-16,2,2,20,20,2,-124,56,20, + $ -124/ C 1 T(1,7,5,2,6,3,4) - DATA (CF(I, 22),I= 1, 6) /1.530864197530864D+00, - $ 
-2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 22),I= 7, 12) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 22),I= 13, 18) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 22),I= 19, 24) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 22),I= 25, 30) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 22),I= 31, 36) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 22),I= 37, 42) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 22),I= 43, 48) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 22),I= 49, 54) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 22),I= 55, 60) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 22),I= 61, 66) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 22),I= 67, 72) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 22),I= 73, 78) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 22),I= 79, 84) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 22),I= 85, 90) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 22),I= 91, 96) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 22),I= 97,102) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 22),I=103,108) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 22),I=109,114) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 22),I=115,120) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ + 
DATA (CF(I),I=2311,2409) /4096,128,-1024,-124,20,56,-124,2,20, + $ -106,-124,884,-232,56,-88,-88,1028,-232,272,-88,-232,20,200, + $ -124,1028,-106,-88,20,2,2,-16,-16,128,38,-142,-268,-88,-124, + $ -106,-142,1136,1010,1028,992,-124,-160,1280,20,200,-160,20,-124 + $ ,-142,20,-160,2,-16,-268,1010,884,-232,-268,884,-124,992,-268, + $ -88,-448,56,20,-160,38,-106,992,-124,-16,-160,128,-1024,-16,128 + $ ,2,20,20,-124,-124,56,-16,128,2,20,-16,2,2,-16,-142,-124,-160 + $ ,20/ C 1 T(1,7,5,6,2,3,4) - DATA (CF(I, 23),I= 1, 6) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 23),I= 7, 12) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 23),I= 13, 18) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 23),I= 19, 24) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 23),I= 25, 30) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 23),I= 31, 36) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 23),I= 37, 42) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 23),I= 43, 48) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 23),I= 49, 54) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 23),I= 55, 60) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 23),I= 61, 66) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 23),I= 67, 72) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 23),I= 73, 78) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 23),I= 79, 84) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 23),I= 85, 90) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 23),I= 91, 96) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 23),I= 97,102) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ 
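The start and end indices of the added DATA blocks are consistent with that triangular layout; a quick check (hypothetical helper, same NCOLOR = 120 assumption) against three of the ranges visible in this hunk:

def start(j, ncolor=120):
    # First packed index of row j: one past the total length of rows 1..j-1.
    return 1 + sum(ncolor - k + 1 for k in range(1, j))

assert start(8) == 820     # DATA (CF(I),I= 820, 932): row  8, 113 entries
assert start(23) == 2410   # DATA (CF(I),I=2410,2507): row 23,  98 entries
assert start(25) == 2605   # DATA (CF(I),I=2605,2700): row 25,  96 entries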
- DATA (CF(I, 23),I=103,108) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 23),I=109,114) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 23),I=115,120) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ + DATA (CF(I),I=2410,2507) /4096,-1024,-124,-142,20,-160,2,-16, + $ -268,1010,884,-232,-268,884,-124,992,-268,-88,-448,56,20,-160 + $ ,38,-106,992,-124,56,-124,-124,20,20,2,-88,1028,-232,272,-88, + $ -232,-106,-124,884,-232,56,-88,200,20,-106,-88,-124,1028,2,-16 + $ ,20,2,128,-16,-142,1136,1010,1028,992,-124,38,-142,-268,-88, + $ -124,-106,1280,-160,-160,20,20,200,-16,128,-160,-16,-1024,128,2 + $ ,-16,-142,-124,-160,20,20,2,-124,56,20,-124,128,-16,-16,2,2,20/ C 1 T(1,7,6,2,5,3,4) - DATA (CF(I, 24),I= 1, 6) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 24),I= 7, 12) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 24),I= 13, 18) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 24),I= 19, 24) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 24),I= 25, 30) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 24),I= 31, 36) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 24),I= 37, 42) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 24),I= 43, 48) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 24),I= 49, 54) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 24),I= 55, 60) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 24),I= 61, 66) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 24),I= 67, 72) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 24),I= 73, 78) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 24),I= 79, 84) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ 
-1.635802469135803D-01/ - DATA (CF(I, 24),I= 85, 90) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 24),I= 91, 96) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 24),I= 97,102) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 24),I=103,108) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 24),I=109,114) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 24),I=115,120) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ + DATA (CF(I),I=2508,2604) /4096,56,-124,-124,20,20,2,-88,1028, + $ -232,272,-88,-232,-106,-124,884,-232,56,-88,200,20,-106,-88, + $ -124,1028,-124,-142,20,-160,2,-16,-268,1010,884,-232,-268,884, + $ -124,992,-268,-88,-448,56,20,-160,38,-106,992,-124,20,2,2,-16, + $ -16,128,38,-142,-268,-88,-124,-106,-142,1136,1010,1028,992,-124 + $ ,-160,1280,20,200,-160,20,-160,-16,-16,128,128,-1024,20,2,-124 + $ ,56,20,-124,2,-16,-142,-124,-160,20,-16,128,2,20,-16,2/ C 1 T(1,7,6,5,2,3,4) - DATA (CF(I, 25),I= 1, 6) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 25),I= 7, 12) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 25),I= 13, 18) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 25),I= 19, 24) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 25),I= 25, 30) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 25),I= 31, 36) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 25),I= 37, 42) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 25),I= 43, 48) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 25),I= 49, 54) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 25),I= 55, 60) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 25),I= 61, 66) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ 
+00/ - DATA (CF(I, 25),I= 67, 72) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 25),I= 73, 78) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 25),I= 79, 84) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 25),I= 85, 90) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 25),I= 91, 96) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 25),I= 97,102) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 25),I=103,108) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 25),I=109,114) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 25),I=115,120) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ + DATA (CF(I),I=2605,2700) /4096,-1024,-1024,128,128,1280,-1024 + $ ,128,128,-16,-16,-160,128,-16,1280,-160,1136,992,-16,-160,-160 + $ ,992,992,-448,1280,-160,-160,20,20,200,128,-16,-16,2,2,20,992, + $ -124,1136,-142,1028,1010,-124,-106,-142,38,-88,-268,-160,20,992 + $ ,-124,38,-106,-16,2,-160,20,-142,-124,-448,56,992,-124,-88,-268 + $ ,-268,884,1010,-268,-232,884,20,200,-124,1028,-106,-88,2,20,20, + $ -124,-124,56,56,-88,-124,-106,-232,884,-88,-232,1028,-88,272, + $ -232/ C 1 T(2,1,5,6,7,3,4) - DATA (CF(I, 26),I= 1, 6) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 26),I= 7, 12) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 26),I= 13, 18) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 26),I= 19, 24) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 26),I= 25, 30) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 26),I= 31, 36) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 26),I= 37, 42) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 26),I= 43, 48) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ 
-2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 26),I= 49, 54) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 26),I= 55, 60) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 26),I= 61, 66) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 26),I= 67, 72) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 26),I= 73, 78) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 26),I= 79, 84) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 26),I= 85, 90) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 26),I= 91, 96) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 26),I= 97,102) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 26),I=103,108) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 26),I=109,114) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 26),I=115,120) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ + DATA (CF(I),I=2701,2795) /4096,128,1280,-1024,128,128,-1024,-16, + $ -160,128,-16,-16,-160,-160,992,992,-448,128,-16,1280,-160,1136 + $ ,992,-160,1280,20,200,-160,20,-16,128,2,20,-16,2,-124,-106,-142 + $ ,38,-88,-268,992,-124,1136,-142,1028,1010,20,200,-124,1028,-106 + $ ,-88,2,20,20,-124,-124,56,56,-88,-124,-106,-232,884,-88,-232 + $ ,1028,-88,272,-232,-160,20,992,-124,38,-106,-16,2,-160,20,-142, + $ -124,-448,56,992,-124,-88,-268,-268,884,1010,-268,-232,884/ C 1 T(2,1,5,7,6,3,4) - DATA (CF(I, 27),I= 1, 6) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 27),I= 7, 12) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 27),I= 13, 18) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 27),I= 19, 24) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 27),I= 25, 30) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ 
-1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 27),I= 31, 36) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 27),I= 37, 42) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 27),I= 43, 48) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 27),I= 49, 54) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 27),I= 55, 60) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 27),I= 61, 66) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 27),I= 67, 72) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 27),I= 73, 78) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 27),I= 79, 84) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 27),I= 85, 90) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 27),I= 91, 96) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 27),I= 97,102) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 27),I=103,108) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 27),I=109,114) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 27),I=115,120) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ + DATA (CF(I),I=2796,2889) /4096,-1024,1280,128,128,-16,1280,-160 + $ ,1136,992,-1024,128,128,-16,-16,-160,-160,-16,992,-448,-160,992 + $ ,-160,20,992,-124,38,-106,-16,2,-160,20,-142,-124,-448,56,992, + $ -124,-88,-268,-268,884,1010,-268,-232,884,1280,-160,-160,20,20 + $ ,200,128,-16,-16,2,2,20,992,-124,1136,-142,1028,1010,-124,-106, + $ -142,38,-88,-268,200,20,-106,-88,-124,1028,20,2,-124,56,20,-124 + $ ,-88,-232,1028,-88,272,-232,56,-88,-124,-106,-232,884/ C 1 T(2,1,6,5,7,3,4) - DATA (CF(I, 28),I= 1, 6) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 28),I= 7, 12) /3.086419753086420D-03 - $ 
,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 28),I= 13, 18) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 28),I= 19, 24) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 28),I= 25, 30) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 28),I= 31, 36) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 28),I= 37, 42) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 28),I= 43, 48) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 28),I= 49, 54) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 28),I= 55, 60) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 28),I= 61, 66) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 28),I= 67, 72) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 28),I= 73, 78) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 28),I= 79, 84) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 28),I= 85, 90) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 28),I= 91, 96) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 28),I= 97,102) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 28),I=103,108) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 28),I=109,114) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 28),I=115,120) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ + DATA (CF(I),I=2890,2982) /4096,128,-1024,-16,-160,-160,992,992, + $ -448,128,-1024,-16,-160,128,-16,-16,128,1136,992,1280,-160,20 + $ 
,200,-124,1028,-106,-88,2,20,20,-124,-124,56,56,-88,-124,-106, + $ -232,884,-88,-232,1028,-88,272,-232,-160,1280,20,200,-160,20, + $ -16,128,2,20,-16,2,-124,-106,-142,38,-88,-268,992,-124,1136, + $ -142,1028,1010,20,-160,38,-106,992,-124,2,-16,-142,-124,-160,20 + $ ,-268,884,1010,-268,-232,884,-448,56,992,-124,-88,-268/ C 1 T(2,1,6,7,5,3,4) - DATA (CF(I, 29),I= 1, 6) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 29),I= 7, 12) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 29),I= 13, 18) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 29),I= 19, 24) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 29),I= 25, 30) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 29),I= 31, 36) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 29),I= 37, 42) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 29),I= 43, 48) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 29),I= 49, 54) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 29),I= 55, 60) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 29),I= 61, 66) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 29),I= 67, 72) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 29),I= 73, 78) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 29),I= 79, 84) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 29),I= 85, 90) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 29),I= 91, 96) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 29),I= 97,102) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 29),I=103,108) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ 
,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 29),I=109,114) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 29),I=115,120) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ + DATA (CF(I),I=2983,3074) /4096,-1024,-16,128,1136,992,1280,-160, + $ -160,-16,992,-448,-160,992,-1024,128,128,-16,-16,-160,20,-160 + $ ,38,-106,992,-124,2,-16,-142,-124,-160,20,-268,884,1010,-268, + $ -232,884,-448,56,992,-124,-88,-268,200,20,-106,-88,-124,1028,20 + $ ,2,-124,56,20,-124,-88,-232,1028,-88,272,-232,56,-88,-124,-106, + $ -232,884,1280,-160,-160,20,20,200,128,-16,-16,2,2,20,992,-124 + $ ,1136,-142,1028,1010,-124,-106,-142,38,-88,-268/ C 1 T(2,1,7,5,6,3,4) - DATA (CF(I, 30),I= 1, 6) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 30),I= 7, 12) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 30),I= 13, 18) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 30),I= 19, 24) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 30),I= 25, 30) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 30),I= 31, 36) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 30),I= 37, 42) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 30),I= 43, 48) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 30),I= 49, 54) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 30),I= 55, 60) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 30),I= 61, 66) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 30),I= 67, 72) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 30),I= 73, 78) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 30),I= 79, 84) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 30),I= 85, 90) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ 
-4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 30),I= 91, 96) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 30),I= 97,102) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 30),I=103,108) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 30),I=109,114) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 30),I=115,120) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ + DATA (CF(I),I=3075,3165) /4096,-160,-16,992,-448,-160,992,-16 + $ ,128,1136,992,1280,-160,128,-1024,-16,-160,128,-16,200,20,-106, + $ -88,-124,1028,20,2,-124,56,20,-124,-88,-232,1028,-88,272,-232 + $ ,56,-88,-124,-106,-232,884,20,-160,38,-106,992,-124,2,-16,-142, + $ -124,-160,20,-268,884,1010,-268,-232,884,-448,56,992,-124,-88, + $ -268,-160,1280,20,200,-160,20,-16,128,2,20,-16,2,-124,-106,-142 + $ ,38,-88,-268,992,-124,1136,-142,1028,1010/ C 1 T(2,1,7,6,5,3,4) - DATA (CF(I, 31),I= 1, 6) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 31),I= 7, 12) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 31),I= 13, 18) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 31),I= 19, 24) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 31),I= 25, 30) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 31),I= 31, 36) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 31),I= 37, 42) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I, 31),I= 43, 48) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 31),I= 49, 54) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 31),I= 55, 60) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 31),I= 61, 66) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 31),I= 67, 72) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ 
-02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 31),I= 73, 78) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 31),I= 79, 84) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 31),I= 85, 90) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 31),I= 91, 96) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 31),I= 97,102) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 31),I=103,108) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 31),I=109,114) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 31),I=115,120) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ + DATA (CF(I),I=3166,3255) /4096,-1024,-1024,128,128,1280,1280, + $ -160,128,-16,992,1136,-160,992,-16,-160,-448,992,128,-16,-16,2 + $ ,2,20,-1024,128,128,-16,-16,-160,-160,20,-16,2,-124,-142,20, + $ -124,2,20,56,-124,992,-124,-448,56,-268,-88,-160,20,-16,2,-124, + $ -142,992,-124,-160,20,-106,38,884,-232,-268,1010,884,-268,-124, + $ -106,56,-88,884,-232,20,-124,2,20,56,-124,-124,1028,20,200,-88, + $ -106,-232,272,-88,1028,-232,-88/ C 1 T(2,5,1,6,7,3,4) - DATA (CF(I, 32),I= 1, 6) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 32),I= 7, 12) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 32),I= 13, 18) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 32),I= 19, 24) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 32),I= 25, 30) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 32),I= 31, 36) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 32),I= 37, 42) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 32),I= 43, 48) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I, 32),I= 49, 54) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - 
DATA (CF(I, 32),I= 55, 60) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 32),I= 61, 66) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 32),I= 67, 72) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 32),I= 73, 78) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 32),I= 79, 84) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 32),I= 85, 90) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 32),I= 91, 96) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 32),I= 97,102) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 32),I=103,108) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 32),I=109,114) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 32),I=115,120) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ + DATA (CF(I),I=3256,3344) /4096,128,1280,-1024,128,-160,992,-16, + $ -160,-448,992,1280,-160,128,-16,992,1136,-16,128,2,20,-16,2,128 + $ ,-1024,-16,-160,128,-16,20,-124,2,20,56,-124,-160,20,-16,2,-124 + $ ,-142,-124,-106,56,-88,884,-232,20,-124,2,20,56,-124,-124,1028 + $ ,20,200,-88,-106,-232,272,-88,1028,-232,-88,992,-124,-448,56, + $ -268,-88,-160,20,-16,2,-124,-142,992,-124,-160,20,-106,38,884, + $ -232,-268,1010,884,-268/ C 1 T(2,5,1,7,6,3,4) - DATA (CF(I, 33),I= 1, 6) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 33),I= 7, 12) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 33),I= 13, 18) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 33),I= 19, 24) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 33),I= 25, 30) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 33),I= 31, 36) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 33),I= 37, 42) 
/1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 33),I= 43, 48) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 33),I= 49, 54) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 33),I= 55, 60) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 33),I= 61, 66) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 33),I= 67, 72) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 33),I= 73, 78) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 33),I= 79, 84) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 33),I= 85, 90) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 33),I= 91, 96) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 33),I= 97,102) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 33),I=103,108) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 33),I=109,114) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 33),I=115,120) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ + DATA (CF(I),I=3345,3432) /4096,-1024,1280,128,128,-16,-1024,128, + $ -160,-16,992,-448,-160,-16,992,-160,-16,2,-160,20,-142,-124,128 + $ ,-16,-1024,128,-160,-16,-16,2,128,-16,20,2,-124,56,20,2,-124,20 + $ ,1136,-142,992,-124,1010,1028,-16,2,128,-16,20,2,-160,20,1280, + $ -160,200,20,-268,-88,38,-142,-106,-124,1028,-88,-88,-232,-232 + $ ,272,-124,56,20,2,-124,20,-106,-88,200,20,1028,-124,884,-232, + $ -106,-124,-88,56/ C 1 T(2,5,6,1,7,3,4) - DATA (CF(I, 34),I= 1, 6) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 34),I= 7, 12) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 34),I= 13, 18) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 34),I= 19, 24) /1.586419753086420D+00, - $ 
-1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 34),I= 25, 30) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 34),I= 31, 36) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 34),I= 37, 42) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 34),I= 43, 48) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 34),I= 49, 54) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 34),I= 55, 60) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 34),I= 61, 66) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 34),I= 67, 72) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 34),I= 73, 78) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 34),I= 79, 84) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 34),I= 85, 90) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 34),I= 91, 96) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 34),I= 97,102) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 34),I=103,108) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 34),I=109,114) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 34),I=115,120) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ + DATA (CF(I),I=3433,3519) /4096,128,-1024,-16,-160,128,-1024,-16 + $ ,128,1136,992,-16,128,-160,1280,2,20,20,-124,-124,56,-16,-160 + $ ,128,-1024,-16,128,2,20,-16,128,2,-16,-142,-124,2,-16,20,-160, + $ -142,38,-124,-106,-268,-88,2,20,-16,128,2,-16,20,200,-160,1280 + $ ,20,-160,1010,1028,-142,1136,-124,992,1010,-268,-268,884,884, + $ -232,-142,-124,2,-16,20,-160,38,-106,20,-160,-124,992,-268,-88, + $ -124,992,56,-448/ C 1 T(2,5,6,7,1,3,4) - DATA (CF(I, 35),I= 1, 6) /3.086419753086420D-03, - $ 
-2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 35),I= 7, 12) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 35),I= 13, 18) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 35),I= 19, 24) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 35),I= 25, 30) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 35),I= 31, 36) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 35),I= 37, 42) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 35),I= 43, 48) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 35),I= 49, 54) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 35),I= 55, 60) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 35),I= 61, 66) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 35),I= 67, 72) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 35),I= 73, 78) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 35),I= 79, 84) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 35),I= 85, 90) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 35),I= 91, 96) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 35),I= 97,102) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 35),I=103,108) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 35),I=109,114) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 35),I=115,120) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ + DATA 
(CF(I),I=3520,3605) /4096,-1024,992,-448,-160,-16,992,-160 + $ ,128,-16,-1024,128,-160,-16,2,-16,-142,-124,-160,20,-16,128, + $ -160,-16,-1024,128,-124,56,20,2,-124,20,-16,2,128,-16,20,2,1028 + $ ,-88,-88,-232,-232,272,-124,56,20,2,-124,20,-106,-88,200,20 + $ ,1028,-124,884,-232,-106,-124,-88,56,1136,-142,992,-124,1010 + $ ,1028,-16,2,128,-16,20,2,-160,20,1280,-160,200,20,-268,-88,38, + $ -142,-106,-124/ C 1 T(2,5,7,1,6,3,4) - DATA (CF(I, 36),I= 1, 6) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 36),I= 7, 12) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 36),I= 13, 18) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 36),I= 19, 24) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 36),I= 25, 30) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 36),I= 31, 36) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 36),I= 37, 42) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 36),I= 43, 48) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 36),I= 49, 54) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 36),I= 55, 60) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 36),I= 61, 66) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 36),I= 67, 72) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 36),I= 73, 78) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 36),I= 79, 84) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 36),I= 85, 90) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 36),I= 91, 96) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 36),I= 97,102) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 36),I=103,108) 
/3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 36),I=109,114) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 36),I=115,120) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ + DATA (CF(I),I=3606,3690) /4096,1136,992,-16,128,-160,1280,-16, + $ -160,128,-1024,-16,128,20,2,-124,56,20,-124,-160,-16,-16,128 + $ ,128,-1024,-142,-124,2,-16,20,-160,2,20,-16,128,2,-16,1010,-268 + $ ,-268,884,884,-232,-142,-124,2,-16,20,-160,38,-106,20,-160,-124 + $ ,992,-268,-88,-124,992,56,-448,-142,38,-124,-106,-268,-88,2,20, + $ -16,128,2,-16,20,200,-160,1280,20,-160,1010,1028,-142,1136,-124 + $ ,992/ C 1 T(2,5,7,6,1,3,4) - DATA (CF(I, 37),I= 1, 6) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 37),I= 7, 12) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 37),I= 13, 18) /1.975308641975309D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 37),I= 19, 24) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 37),I= 25, 30) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 37),I= 31, 36) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I, 37),I= 37, 42) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 37),I= 43, 48) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 37),I= 49, 54) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 37),I= 55, 60) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 37),I= 61, 66) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 37),I= 67, 72) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 37),I= 73, 78) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 37),I= 79, 84) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 37),I= 85, 90) /-2.469135802469136D-01 - $ 
,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 37),I= 91, 96) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 37),I= 97,102) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 37),I=103,108) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 37),I=109,114) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 37),I=115,120) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ + DATA (CF(I),I=3691,3774) /4096,-1024,-1024,128,128,1280,992,-160 + $ ,-448,992,-16,-160,992,-124,-448,56,-268,-88,-160,20,-16,2,-124 + $ ,-142,992,-124,-160,20,-106,38,884,-232,-268,1010,884,-268,128, + $ -16,-16,2,2,20,-1024,128,128,-16,-16,-160,-160,20,-16,2,-124, + $ -142,20,-124,2,20,56,-124,-106,-124,884,-232,56,-88,-124,20,56, + $ -124,2,20,-232,272,-88,1028,-232,-88,-124,1028,20,200,-88,-106/ C 1 T(2,6,1,5,7,3,4) - DATA (CF(I, 38),I= 1, 6) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 38),I= 7, 12) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 38),I= 13, 18) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-2.469135802469136D-01,3.086419753086420D-02/ - DATA (CF(I, 38),I= 19, 24) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 38),I= 25, 30) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 38),I= 31, 36) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 38),I= 37, 42) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 38),I= 43, 48) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 38),I= 49, 54) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 38),I= 55, 60) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 38),I= 61, 66) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 38),I= 67, 72) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ 
,1.586419753086420D+00,-3.580246913580247D-01,
-     $ -1.358024691358025D-01/
-      [DATA (CF(I, 38),I=73,120) double-precision coefficients elided]
+      DATA (CF(I),I=3775,3857) /4096,128,1280,-1024,128,-160,1280,992
+     $ ,1136,128,-16,-124,-106,56,-88,884,-232,20,-124,2,20,56,-124,
+     $ -124,1028,20,200,-88,-106,-232,272,-88,1028,-232,-88,-16,128,2
+     $ ,20,-16,2,128,-1024,-16,-160,128,-16,20,-124,2,20,56,-124,-160
+     $ ,20,-16,2,-124,-142,-124,992,-268,-88,-448,56,20,-160,-124,-142
+     $ ,-16,2,884,-232,-268,1010,884,-268,992,-124,-160,20,-106,38/
C 1 T(2,6,1,7,5,3,4)
-      [DATA (CF(I, 39),I=1,120) double-precision coefficients elided]
+      DATA (CF(I),I=3858,3939) /4096,-1024,1280,128,-448,992,992,-160,
+     $ -160,-16,1136,-142,992,-124,1010,1028,-16,2,128,-16,20,2,-160
+     $ ,20,1280,-160,200,20,-268,-88,38,-142,-106,-124,-16,2,-160,20,
+     $ -142,-124,128,-16,-1024,128,-160,-16,-16,2,128,-16,20,2,-124,56
+     $ ,20,2,-124,20,-88,1028,-232,272,-88,-232,56,-124,-124,20,20,2
+     $ ,884,-232,-106,-124,-88,56,-106,-88,200,20,1028,-124/
C 1 T(2,6,5,1,7,3,4)
-      [DATA (CF(I, 40),I=1,120) double-precision coefficients elided]
+      DATA (CF(I),I=3940,4020) /4096,128,-1024,992,1136,-160,1280,-16
+     $ ,128,-142,38,-124,-106,-268,-88,2,20,-16,128,2,-16,20,200,-160
+     $ ,1280,20,-160,1010,1028,-142,1136,-124,992,2,20,20,-124,-124,56
+     $ ,-16,-160,128,-1024,-16,128,2,20,-16,128,2,-16,-142,-124,2,-16
+     $ ,20,-160,-268,1010,884,-232,-268,884,-124,-142,20,-160,2,-16,
+     $ -268,-88,-124,992,56,-448,38,-106,20,-160,-124,992/
C 1 T(2,6,5,7,1,3,4)
-      [DATA (CF(I, 41),I=1,120) double-precision coefficients elided]
+      DATA (CF(I),I=4021,4100) /4096,-1024,-16,128,-160,-16,-1024,128
+     $ ,1028,-88,-88,-232,-232,272,-124,56,20,2,-124,20,-106,-88,200
+     $ ,20,1028,-124,884,-232,-106,-124,-88,56,2,-16,-142,-124,-160,20
+     $ ,-16,128,-160,-16,-1024,128,-124,56,20,2,-124,20,-16,2,128,-16
+     $ ,20,2,-142,1136,1010,1028,992,-124,2,-16,20,2,128,-16,-268,-88
+     $ ,38,-142,-106,-124,-160,20,1280,-160,200,20/
C 1 T(2,6,7,1,5,3,4)
-      [DATA (CF(I, 42),I=1,120) double-precision coefficients elided]
+      DATA (CF(I),I=4101,4179) /4096,-160,-16,-16,128,128,-1024,1010,
+     $ -268,-268,884,884,-232,-142,-124,2,-16,20,-160,38,-106,20,-160,
+     $ -124,992,-268,-88,-124,992,56,-448,20,2,-124,56,20,-124,-160,
+     $ -16,-16,128,128,-1024,-142,-124,2,-16,20,-160,2,20,-16,128,2,
+     $ -16,38,-142,-268,-88,-124,-106,20,2,2,-16,-16,128,1010,1028,
+     $ -142,1136,-124,992,20,200,-160,1280,20,-160/
C 1 T(2,6,7,5,1,3,4)
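The two representations above appear to encode the same rational colour matrix: every removed double is a multiple of 1/324 (for example 1.975308641975309D-01 = 64/324 = 16/81), and each new integer matches the old entry scaled by 324 on the diagonal and by 648 = 2x324 off it, i.e. the i<->j symmetry factor seems to be folded into the stored value. A minimal sketch of that consistency check, assuming this reading of the hunk (packed_entry is an illustrative helper, not part of the generated code):

from fractions import Fraction

def packed_entry(cf_ij, diagonal):
    # Assumed mapping: integer = CF(i,j) * 324 on the diagonal,
    # CF(i,j) * 648 off it (factor 2 from the symmetry of the matrix).
    scale = 324 if diagonal else 648
    q = Fraction(cf_ij).limit_denominator(1000) * scale
    assert q.denominator == 1  # every entry should come out exactly integer
    return int(q)

# Doubles that appear in the removed blocks map onto integers in the added ones:
assert packed_entry(1.264197530864197e+01, diagonal=True) == 4096
assert packed_entry(1.975308641975309e+00, diagonal=False) == 1280
assert packed_entry(-1.580246913580247e+00, diagonal=False) == -1024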
-      [DATA (CF(I, 43),I=1,120) double-precision coefficients elided]
+      DATA (CF(I),I=4180,4257) /4096,-1024,-1024,128,128,1280,-124,992
+     $ ,-268,-88,-448,56,20,-160,-124,-142,-16,2,884,-232,-268,1010
+     $ ,884,-268,992,-124,-160,20,-106,38,-106,-124,884,-232,56,-88,
+     $ -124,20,56,-124,2,20,-232,272,-88,1028,-232,-88,-124,1028,20
+     $ ,200,-88,-106,128,-16,-16,2,2,20,-1024,128,128,-16,-16,-160,
+     $ -160,20,-16,2,-124,-142,20,-124,2,20,56,-124/
C 1 T(2,7,1,5,6,3,4)
-      [DATA (CF(I, 44),I=1,120) double-precision coefficients elided]
+      DATA (CF(I),I=4258,4334) /4096,128,1280,-1024,128,-106,-124,884,
+     $ -232,56,-88,-124,20,56,-124,2,20,-232,272,-88,1028,-232,-88,
+     $ -124,1028,20,200,-88,-106,-124,992,-268,-88,-448,56,20,-160,
+     $ -124,-142,-16,2,884,-232,-268,1010,884,-268,992,-124,-160,20,
+     $ -106,38,-16,128,2,20,-16,2,128,-1024,-16,-160,128,-16,20,-124,2
+     $ ,20,56,-124,-160,20,-16,2,-124,-142/
C 1 T(2,7,1,6,5,3,4)
-      [DATA (CF(I, 45),I=1,120) double-precision coefficients elided]
+      DATA (CF(I),I=4335,4410) /4096,-1024,1280,128,-142,1136,1010
+     $ ,1028,992,-124,2,-16,20,2,128,-16,-268,-88,38,-142,-106,-124,
+     $ -160,20,1280,-160,200,20,-88,1028,-232,272,-88,-232,56,-124,
+     $ -124,20,20,2,884,-232,-106,-124,-88,56,-106,-88,200,20,1028,
+     $ -124,-16,2,-160,20,-142,-124,128,-16,-1024,128,-160,-16,-16,2
+     $ ,128,-16,20,2,-124,56,20,2,-124,20/
C 1 T(2,7,5,1,6,3,4)
-      [DATA (CF(I, 46),I=1,120) double-precision coefficients elided]
+      DATA (CF(I),I=4411,4485) /4096,128,-1024,38,-142,-268,-88,-124,
+     $ -106,20,2,2,-16,-16,128,1010,1028,-142,1136,-124,992,20,200,
+     $ -160,1280,20,-160,-268,1010,884,-232,-268,884,-124,-142,20,-160
+     $ ,2,-16,-268,-88,-124,992,56,-448,38,-106,20,-160,-124,992,2,20
+     $ ,20,-124,-124,56,-16,-160,128,-1024,-16,128,2,20,-16,128,2,-16,
+     $ -142,-124,2,-16,20,-160/
C 1 T(2,7,5,6,1,3,4)
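The added ranges also shrink by one entry per colour flow (83 values at I=3775,3857; 82 at I=3858,3939; 81 at I=3940,4020; and so on), which is exactly the row-length pattern of the upper triangle, diagonal included, of a symmetric 120x120 matrix flattened into one 1-based array. A short sketch of the implied index arithmetic, assuming that layout (row_start and flat_index are illustrative names):

N = 120  # colour flows in this matrix

def row_start(k):
    # 1-based offset of packed row k: rows 1..k-1 contribute
    # N-j+1 entries each (row j keeps only columns j..N).
    return 1 + sum(N - j + 1 for j in range(1, k))

def flat_index(i, j):
    # Packed position of CF(i,j); only i <= j is stored.
    i, j = min(i, j), max(i, j)
    return row_start(i) + (j - i)

assert row_start(38) == 3775 and row_start(39) == 3858
assert flat_index(38, 38) == 3775  # diagonal entry, the 4096 above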
-      [DATA (CF(I, 47),I=1,120) double-precision coefficients elided]
+      DATA (CF(I),I=4486,4559) /4096,-1024,-88,1028,-232,272,-88,-232
+     $ ,56,-124,-124,20,20,2,884,-232,-106,-124,-88,56,-106,-88,200,20
+     $ ,1028,-124,-142,1136,1010,1028,992,-124,2,-16,20,2,128,-16,-268
+     $ ,-88,38,-142,-106,-124,-160,20,1280,-160,200,20,2,-16,-142,-124
+     $ ,-160,20,-16,128,-160,-16,-1024,128,-124,56,20,2,-124,20,-16,2
+     $ ,128,-16,20,2/
C 1 T(2,7,6,1,5,3,4)
-      [DATA (CF(I, 48),I=1,120) double-precision coefficients elided]
+      DATA (CF(I),I=4560,4632) /4096,-268,1010,884,-232,-268,884,-124,
+     $ -142,20,-160,2,-16,-268,-88,-124,992,56,-448,38,-106,20,-160,
+     $ -124,992,38,-142,-268,-88,-124,-106,20,2,2,-16,-16,128,1010
+     $ ,1028,-142,1136,-124,992,20,200,-160,1280,20,-160,20,2,-124,56
+     $ ,20,-124,-160,-16,-16,128,128,-1024,-142,-124,2,-16,20,-160,2
+     $ ,20,-16,128,2,-16/
C 1 T(2,7,6,5,1,3,4)
-      [DATA (CF(I, 49),I=1,120) double-precision coefficients elided]
+      DATA (CF(I),I=4633,4704) /4096,-1024,-1024,128,128,1280,-1024
+     $ ,128,128,-16,-16,-160,128,-16,1280,-160,1136,992,-16,-160,-160
+     $ ,992,992,-448,992,-124,-160,20,-106,38,-448,56,992,-124,-88,
+     $ -268,-16,2,-160,20,-142,-124,884,-268,-232,884,1010,-268,-124
+     $ ,1028,20,200,-88,-106,56,-88,-124,-106,-232,884,2,20,20,-124,
+     $ -124,56,-232,-88,272,-232,1028,-88/
C 1 T(5,1,2,6,7,3,4)
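With the factor 2 folded into the off-diagonal entries, the colour sum can be evaluated in a single pass over the packed triangle rather than over the full square matrix. A hedged sketch of that contraction (the 1/324 normalisation is inferred from the integer entries above, and color_sum/jamp are illustrative names; the generated kernels may organise this differently):

def color_sum(cf_packed, jamp):
    # ME = sum over i <= j of CF_packed * Re(jamp[i] * conj(jamp[j])) / 324,
    # where CF_packed already carries the factor 2 for i < j.
    me, pos = 0.0, 0
    n = len(jamp)
    for i in range(n):
        for j in range(i, n):
            me += cf_packed[pos] * (jamp[i] * jamp[j].conjugate()).real
            pos += 1
    return me / 324.0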
-      [DATA (CF(I, 50),I=1,120) double-precision coefficients elided]
+      DATA (CF(I),I=4705,4775) /4096,128,1280,-1024,128,128,-1024,-16,
+     $ -160,128,-16,-16,-160,-160,992,992,-448,128,-16,1280,-160,1136
+     $ ,992,-124,1028,20,200,-88,-106,56,-88,-124,-106,-232,884,2,20
+     $ ,20,-124,-124,56,-232,-88,272,-232,1028,-88,992,-124,-160,20,
+     $ -106,38,-448,56,992,-124,-88,-268,-16,2,-160,20,-142,-124,884,
+     $ -268,-232,884,1010,-268/
C 1 T(5,1,2,7,6,3,4)
-      [DATA (CF(I, 51),I=1,120) double-precision coefficients elided]
+      DATA (CF(I),I=4776,4845) /4096,-1024,1280,128,128,-16,1280,-160
+     $ ,1136,992,-1024,128,128,-16,-16,-160,-160,-16,992,-448,-160,992
+     $ ,-160,20,1280,-160,200,20,992,-124,1136,-142,1028,1010,128,-16,
+     $ -16,2,2,20,-106,-124,-88,-268,-142,38,-106,-88,200,20,1028,-124
+     $ ,-88,-232,1028,-88,272,-232,20,2,-124,56,20,-124,-88,56,-232
+     $ ,884,-124,-106/
C 1 T(5,1,6,2,7,3,4)
-      [DATA (CF(I, 52),I=1,120) double-precision coefficients elided]
+      DATA (CF(I),I=4846,4914) /4096,128,-1024,-16,-160,-160,992,992,
+     $ -448,128,-1024,-16,-160,128,-16,-16,128,1136,992,1280,-160,20
+     $ ,200,-160,1280,20,-160,-124,-106,-142,38,-88,-268,-16,128,2,20,
+     $ -16,2,-124,992,1028,1010,1136,-142,38,-106,20,-160,-124,992,
+     $ -268,884,1010,-268,-232,884,2,-16,-142,-124,-160,20,56,-448,-88
+     $ ,-268,992,-124/
C 1 T(5,1,6,7,2,3,4)
-      [DATA (CF(I, 53),I=1,120) double-precision coefficients elided]
+      DATA (CF(I),I=4915,4982) /4096,-1024,-16,128,1136,992,1280,-160,
+     $ -160,-16,992,-448,-160,992,-1024,128,128,-16,-16,-160,-106,-88
+     $ ,200,20,1028,-124,-88,-232,1028,-88,272,-232,20,2,-124,56,20,
+     $ -124,-88,56,-232,884,-124,-106,-160,20,1280,-160,200,20,992,
+     $ -124,1136,-142,1028,1010,128,-16,-16,2,2,20,-106,-124,-88,-268,
+     $ -142,38/
C 1 T(5,1,7,2,6,3,4)
-      [DATA (CF(I, 54),I=1,120) double-precision coefficients elided]
+      DATA (CF(I),I=4983,5049) /4096,-160,-16,992,-448,-160,992,-16
+     $ ,128,1136,992,1280,-160,128,-1024,-16,-160,128,-16,38,-106,20,
+     $ -160,-124,992,-268,884,1010,-268,-232,884,2,-16,-142,-124,-160
+     $ ,20,56,-448,-88,-268,992,-124,20,200,-160,1280,20,-160,-124,
+     $ -106,-142,38,-88,-268,-16,128,2,20,-16,2,-124,992,1028,1010
+     $ ,1136,-142/
C 1 T(5,1,7,6,2,3,4)
-      [DATA (CF(I, 55),I=1,120) double-precision coefficients elided]
+      DATA (CF(I),I=5050,5115) /4096,-1024,-1024,128,128,1280,1280,
+     $ -160,128,-16,992,1136,-160,992,-16,-160,-448,992,-448,56,992,
+     $ -124,-88,-268,992,-124,-160,20,-106,38,-160,20,-16,2,-124,-142,
+     $ -232,884,884,-268,-268,1010,56,-88,-124,-106,-232,884,-124,1028
+     $ ,20,200,-88,-106,20,-124,2,20,56,-124,272,-232,-232,-88,-88
+     $ ,1028/
C 1 T(5,2,1,6,7,3,4)
-      [DATA (CF(I, 56),I=1,30) double-precision coefficients elided]
-      DATA (CF(I, 56),I= 31, 36) /1.975308641975309D-01,
-     $ -1.580246913580247D+00,-2.469135802469136D-02,
-     $
-2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 56),I= 37, 42) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 56),I= 43, 48) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 56),I= 49, 54) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 56),I= 55, 60) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 56),I= 61, 66) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 56),I= 67, 72) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I, 56),I= 73, 78) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 56),I= 79, 84) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 56),I= 85, 90) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 56),I= 91, 96) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 56),I= 97,102) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 56),I=103,108) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 56),I=109,114) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 56),I=115,120) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ + DATA (CF(I),I=5116,5180) /4096,128,1280,-1024,128,-160,992,-16, + $ -160,-448,992,1280,-160,128,-16,992,1136,56,-88,-124,-106,-232 + $ ,884,-124,1028,20,200,-88,-106,20,-124,2,20,56,-124,272,-232, + $ -232,-88,-88,1028,-448,56,992,-124,-88,-268,992,-124,-160,20, + $ -106,38,-160,20,-16,2,-124,-142,-232,884,884,-268,-268,1010/ C 1 T(5,2,1,7,6,3,4) - DATA (CF(I, 57),I= 1, 6) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -1.635802469135803D-01/ - DATA (CF(I, 57),I= 7, 12) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 57),I= 13, 18) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 57),I= 19, 24) /-4.135802469135803D-01 - $ 
,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 57),I= 25, 30) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 57),I= 31, 36) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 57),I= 37, 42) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 57),I= 43, 48) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 57),I= 49, 54) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 57),I= 55, 60) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 57),I= 61, 66) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 57),I= 67, 72) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 57),I= 73, 78) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 57),I= 79, 84) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 57),I= 85, 90) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 57),I= 91, 96) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I, 57),I= 97,102) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 57),I=103,108) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 57),I=109,114) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 57),I=115,120) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ + DATA (CF(I),I=5181,5244) /4096,-1024,1280,128,128,-16,-1024,128, + $ -160,-16,992,-448,-160,-16,992,-160,992,-124,1136,-142,1028 + $ ,1010,-160,20,1280,-160,200,20,-16,2,128,-16,20,2,-88,-268,-106 + $ ,-124,38,-142,-88,-232,1028,-88,272,-232,-106,-88,200,20,1028, + $ -124,-124,56,20,2,-124,20,-232,884,-88,56,-106,-124/ C 1 T(5,2,6,1,7,3,4) - DATA (CF(I, 58),I= 1, 6) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00,-1.635802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 58),I= 7, 
12) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 58),I= 13, 18) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 58),I= 19, 24) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 58),I= 25, 30) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 58),I= 31, 36) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 58),I= 37, 42) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 58),I= 43, 48) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 58),I= 49, 54) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 58),I= 55, 60) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 58),I= 61, 66) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 58),I= 67, 72) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 58),I= 73, 78) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 58),I= 79, 84) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 58),I= 85, 90) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 58),I= 91, 96) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I, 58),I= 97,102) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 58),I=103,108) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 58),I=109,114) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 58),I=115,120) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ + DATA (CF(I),I=5245,5307) /4096,128,-1024,-16,-160,128,-1024,-16 + $ ,128,1136,992,-16,128,-160,1280,-124,-106,-142,38,-88,-268,20 + $ 
,200,-160,1280,20,-160,2,20,-16,128,2,-16,1028,1010,-124,992, + $ -142,1136,-268,884,1010,-268,-232,884,38,-106,20,-160,-124,992, + $ -142,-124,2,-16,20,-160,-88,-268,56,-448,-124,992/ C 1 T(5,2,6,7,1,3,4) - DATA (CF(I, 59),I= 1, 6) /3.086419753086420D-02, - $ -2.469135802469136D-01,5.864197530864197D-02, - $ -1.635802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 59),I= 7, 12) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 59),I= 13, 18) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 59),I= 19, 24) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 59),I= 25, 30) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 59),I= 31, 36) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 59),I= 37, 42) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 59),I= 43, 48) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 59),I= 49, 54) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 59),I= 55, 60) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 59),I= 61, 66) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 59),I= 67, 72) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 59),I= 73, 78) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 59),I= 79, 84) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 59),I= 85, 90) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 59),I= 91, 96) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 59),I= 97,102) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 59),I=103,108) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 59),I=109,114) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ 
-2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 59),I=115,120) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ + DATA (CF(I),I=5308,5369) /4096,-1024,992,-448,-160,-16,992,-160 + $ ,128,-16,-1024,128,-160,-16,-88,-232,1028,-88,272,-232,-106,-88 + $ ,200,20,1028,-124,-124,56,20,2,-124,20,-232,884,-88,56,-106, + $ -124,992,-124,1136,-142,1028,1010,-160,20,1280,-160,200,20,-16 + $ ,2,128,-16,20,2,-88,-268,-106,-124,38,-142/ C 1 T(5,2,7,1,6,3,4) - DATA (CF(I, 60),I= 1, 6) /3.086419753086420D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 60),I= 7, 12) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 60),I= 13, 18) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 60),I= 19, 24) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 60),I= 25, 30) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 60),I= 31, 36) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 60),I= 37, 42) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 60),I= 43, 48) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 60),I= 49, 54) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 60),I= 55, 60) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 60),I= 61, 66) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 60),I= 67, 72) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 60),I= 73, 78) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 60),I= 79, 84) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 60),I= 85, 90) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 60),I= 91, 96) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 60),I= 97,102) /-1.913580246913580D-01, - $ 
-1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 60),I=103,108) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 60),I=109,114) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 60),I=115,120) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ + DATA (CF(I),I=5370,5430) /4096,1136,992,-16,128,-160,1280,-16, + $ -160,128,-1024,-16,128,-268,884,1010,-268,-232,884,38,-106,20, + $ -160,-124,992,-142,-124,2,-16,20,-160,-88,-268,56,-448,-124,992 + $ ,-124,-106,-142,38,-88,-268,20,200,-160,1280,20,-160,2,20,-16 + $ ,128,2,-16,1028,1010,-124,992,-142,1136/ C 1 T(5,2,7,6,1,3,4) - DATA (CF(I, 61),I= 1, 6) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 61),I= 7, 12) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 61),I= 13, 18) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 61),I= 19, 24) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 61),I= 25, 30) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 61),I= 31, 36) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 61),I= 37, 42) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 61),I= 43, 48) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 61),I= 49, 54) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 61),I= 55, 60) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I, 61),I= 61, 66) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 61),I= 67, 72) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 61),I= 73, 78) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 61),I= 79, 84) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 61),I= 85, 
90) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 61),I= 91, 96) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 61),I= 97,102) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 61),I=103,108) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 61),I=109,114) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 61),I=115,120) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ + DATA (CF(I),I=5431,5490) /4096,-1024,-1024,128,128,1280,992,-160 + $ ,-448,992,-16,-160,-16,2,128,-16,20,2,-160,20,-16,2,-124,-142, + $ -1024,128,128,-16,-16,-160,-124,20,56,-124,2,20,884,-232,-106, + $ -124,-88,56,-232,272,-88,1028,-232,-88,-124,20,56,-124,2,20 + $ ,1028,-124,-88,-106,20,200/ C 1 T(5,6,1,2,7,3,4) - DATA (CF(I, 62),I= 1, 6) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 62),I= 7, 12) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 62),I= 13, 18) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 62),I= 19, 24) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 62),I= 25, 30) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 62),I= 31, 36) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 62),I= 37, 42) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 62),I= 43, 48) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 62),I= 49, 54) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 62),I= 55, 60) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 62),I= 61, 66) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 62),I= 67, 72) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 62),I= 73, 
78) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 62),I= 79, 84) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 62),I= 85, 90) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 62),I= 91, 96) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 62),I= 97,102) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 62),I=103,108) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 62),I=109,114) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 62),I=115,120) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ + DATA (CF(I),I=5491,5549) /4096,128,1280,-1024,128,-160,1280,992 + $ ,1136,128,-16,2,20,-16,128,2,-16,20,-124,2,20,56,-124,128,-1024 + $ ,-16,-160,128,-16,20,-160,-124,-142,-16,2,-268,-88,-124,992,56, + $ -448,884,-232,-268,1010,884,-268,20,-160,-124,-142,-16,2,-124 + $ ,992,-106,38,-160,20/ C 1 T(5,6,1,7,2,3,4) - DATA (CF(I, 63),I= 1, 6) /1.530864197530864D+00, - $ -1.913580246913580D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 63),I= 7, 12) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 63),I= 13, 18) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 63),I= 19, 24) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 63),I= 25, 30) /1.753086419753086D+00, - $ -2.191358024691358D-01,1.530864197530864D+00, - $ -1.913580246913580D-01,1.558641975308642D+00,1.586419753086420D - $ +00/ - DATA (CF(I, 63),I= 31, 36) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 63),I= 37, 42) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 63),I= 43, 48) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 63),I= 49, 54) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I, 63),I= 55, 60) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA 
(CF(I, 63),I= 61, 66) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 63),I= 67, 72) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 63),I= 73, 78) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 63),I= 79, 84) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 63),I= 85, 90) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 63),I= 91, 96) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 63),I= 97,102) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 63),I=103,108) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 63),I=109,114) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 63),I=115,120) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ + DATA (CF(I),I=5550,5607) /4096,-1024,1280,128,-448,992,992,-160, + $ -160,-16,-160,20,-16,2,-124,-142,-16,2,128,-16,20,2,128,-16, + $ -1024,128,-160,-16,56,-124,-124,20,20,2,-232,272,-88,1028,-232, + $ -88,884,-232,-106,-124,-88,56,56,-124,-124,20,20,2,-88,-106 + $ ,1028,-124,200,20/ C 1 T(5,6,2,1,7,3,4) - DATA (CF(I, 64),I= 1, 6) /-1.913580246913580D-01, - $ -1.635802469135803D-01,8.641975308641975D-02, - $ -1.358024691358025D-01,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 64),I= 7, 12) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 64),I= 13, 18) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 64),I= 19, 24) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 64),I= 25, 30) /-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01, - $ -1.635802469135803D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 64),I= 31, 36) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 64),I= 37, 42) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 64),I= 43, 48) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA 
(CF(I, 64),I= 49, 54) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 64),I= 55, 60) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 64),I= 61, 66) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 64),I= 67, 72) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 64),I= 73, 78) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 64),I= 79, 84) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 64),I= 85, 90) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 64),I= 91, 96) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 64),I= 97,102) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 64),I=103,108) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 64),I=109,114) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 64),I=115,120) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ + DATA (CF(I),I=5608,5664) /4096,128,-1024,992,1136,-160,1280,-16 + $ ,128,20,-124,2,20,56,-124,2,20,-16,128,2,-16,-16,-160,128,-1024 + $ ,-16,128,-124,-142,20,-160,2,-16,884,-232,-268,1010,884,-268, + $ -268,-88,-124,992,56,-448,-124,-142,20,-160,2,-16,-106,38,-124 + $ ,992,20,-160/ C 1 T(5,6,2,7,1,3,4) - DATA (CF(I, 65),I= 1, 6) /1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 65),I= 7, 12) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 65),I= 13, 18) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 65),I= 19, 24) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 65),I= 25, 30) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 65),I= 31, 36) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA 
(CF(I, 65),I= 37, 42) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 65),I= 43, 48) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 65),I= 49, 54) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 65),I= 55, 60) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 65),I= 61, 66) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 65),I= 67, 72) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 65),I= 73, 78) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 65),I= 79, 84) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 65),I= 85, 90) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 65),I= 91, 96) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 65),I= 97,102) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 65),I=103,108) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 65),I=109,114) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 65),I=115,120) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ + DATA (CF(I),I=5665,5720) /4096,-1024,-16,128,-160,-16,-1024,128, + $ -142,-124,2,-16,20,-160,-124,56,20,2,-124,20,-16,128,-160,-16, + $ -1024,128,2,-16,20,2,128,-16,1010,1028,-142,1136,-124,992,-268, + $ -88,38,-142,-106,-124,2,-16,20,2,128,-16,20,-160,200,20,1280, + $ -160/ C 1 T(5,6,7,1,2,3,4) - DATA (CF(I, 66),I= 1, 6) /1.586419753086420D+00, - $ -1.358024691358025D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01 - $ ,4.197530864197531D-01/ - DATA (CF(I, 66),I= 7, 12) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 66),I= 13, 18) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 66),I= 19, 24) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 66),I= 25, 30) 
/1.558641975308642D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -3.580246913580247D-01/ - DATA (CF(I, 66),I= 31, 36) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 66),I= 37, 42) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 66),I= 43, 48) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 66),I= 49, 54) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 66),I= 55, 60) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 66),I= 61, 66) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 66),I= 67, 72) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 66),I= 73, 78) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 66),I= 79, 84) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 66),I= 85, 90) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 66),I= 91, 96) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 66),I= 97,102) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 66),I=103,108) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 66),I=109,114) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 66),I=115,120) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ + DATA (CF(I),I=5721,5775) /4096,-160,-16,-16,128,128,-1024,-124 + $ ,56,20,2,-124,20,-142,-124,2,-16,20,-160,-160,-16,-16,128,128, + $ -1024,20,2,2,-16,-16,128,-268,-88,38,-142,-106,-124,1010,1028, + $ -142,1136,-124,992,20,2,2,-16,-16,128,200,20,20,-160,-160,1280/ C 1 T(5,6,7,2,1,3,4) - DATA (CF(I, 67),I= 1, 6) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 67),I= 7, 12) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 67),I= 13, 18) /-4.135802469135803D-01, - 
$ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 67),I= 19, 24) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 67),I= 25, 30) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 67),I= 31, 36) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 67),I= 37, 42) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 67),I= 43, 48) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 67),I= 49, 54) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 67),I= 55, 60) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 67),I= 61, 66) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 67),I= 67, 72) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 67),I= 73, 78) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 67),I= 79, 84) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 67),I= 85, 90) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 67),I= 91, 96) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 67),I= 97,102) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 67),I=103,108) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 67),I=109,114) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 67),I=115,120) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ + DATA (CF(I),I=5776,5829) /4096,-1024,-1024,128,128,1280,884,-232 + $ ,-106,-124,-88,56,-232,272,-88,1028,-232,-88,-124,20,56,-124,2 + $ ,20,1028,-124,-88,-106,20,200,-16,2,128,-16,20,2,-160,20,-16,2, + $ -124,-142,-1024,128,128,-16,-16,-160,-124,20,56,-124,2,20/ C 1 T(5,7,1,2,6,3,4) - DATA (CF(I, 68),I= 1, 6) /5.864197530864197D-02, - $ 
-2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 68),I= 7, 12) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 68),I= 13, 18) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 68),I= 19, 24) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 68),I= 25, 30) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 68),I= 31, 36) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 68),I= 37, 42) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 68),I= 43, 48) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 68),I= 49, 54) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 68),I= 55, 60) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 68),I= 61, 66) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 68),I= 67, 72) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 68),I= 73, 78) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 68),I= 79, 84) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 68),I= 85, 90) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 68),I= 91, 96) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 68),I= 97,102) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 68),I=103,108) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 68),I=109,114) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 68),I=115,120) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ + DATA 
(CF(I),I=5830,5882) /4096,128,1280,-1024,128,-268,-88,-124 + $ ,992,56,-448,884,-232,-268,1010,884,-268,20,-160,-124,-142,-16 + $ ,2,-124,992,-106,38,-160,20,2,20,-16,128,2,-16,20,-124,2,20,56, + $ -124,128,-1024,-16,-160,128,-16,20,-160,-124,-142,-16,2/ C 1 T(5,7,1,6,2,3,4) - DATA (CF(I, 69),I= 1, 6) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-4.135802469135803D-01, - $ -1.358024691358025D-01,-6.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 69),I= 7, 12) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 69),I= 13, 18) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 69),I= 19, 24) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 69),I= 25, 30) /-2.191358024691358D-01 - $ ,1.753086419753086D+00,1.558641975308642D+00,1.586419753086420D - $ +00,1.530864197530864D+00,-1.913580246913580D-01/ - DATA (CF(I, 69),I= 31, 36) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 69),I= 37, 42) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 69),I= 43, 48) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 69),I= 49, 54) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 69),I= 55, 60) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 69),I= 61, 66) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 69),I= 67, 72) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 69),I= 73, 78) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 69),I= 79, 84) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 69),I= 85, 90) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 69),I= 91, 96) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 69),I= 97,102) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 69),I=103,108) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 69),I=109,114) 
/1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 69),I=115,120) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ + DATA (CF(I),I=5883,5934) /4096,-1024,1280,128,-232,272,-88,1028, + $ -232,-88,884,-232,-106,-124,-88,56,56,-124,-124,20,20,2,-88, + $ -106,1028,-124,200,20,-160,20,-16,2,-124,-142,-16,2,128,-16,20 + $ ,2,128,-16,-1024,128,-160,-16,56,-124,-124,20,20,2/ C 1 T(5,7,2,1,6,3,4) - DATA (CF(I, 70),I= 1, 6) /-1.635802469135803D-01, - $ -1.913580246913580D-01,1.364197530864198D+00, - $ -3.580246913580247D-01,8.641975308641975D-02, - $ -1.358024691358025D-01/ - DATA (CF(I, 70),I= 7, 12) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 70),I= 13, 18) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 70),I= 19, 24) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 70),I= 25, 30) /5.864197530864197D-02, - $ -2.191358024691358D-01,-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 70),I= 31, 36) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 70),I= 37, 42) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 70),I= 43, 48) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 70),I= 49, 54) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 70),I= 55, 60) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 70),I= 61, 66) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 70),I= 67, 72) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 70),I= 73, 78) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 70),I= 79, 84) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 70),I= 85, 90) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 70),I= 91, 96) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 70),I= 97,102) /3.086419753086420D-02, - 
$ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 70),I=103,108) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 70),I=109,114) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 70),I=115,120) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ + DATA (CF(I),I=5935,5985) /4096,128,-1024,884,-232,-268,1010,884, + $ -268,-268,-88,-124,992,56,-448,-124,-142,20,-160,2,-16,-106,38, + $ -124,992,20,-160,20,-124,2,20,56,-124,2,20,-16,128,2,-16,-16, + $ -160,128,-1024,-16,128,-124,-142,20,-160,2,-16/ C 1 T(5,7,2,6,1,3,4) - DATA (CF(I, 71),I= 1, 6) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 71),I= 7, 12) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 71),I= 13, 18) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 71),I= 19, 24) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 71),I= 25, 30) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 71),I= 31, 36) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 71),I= 37, 42) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 71),I= 43, 48) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 71),I= 49, 54) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 71),I= 55, 60) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 71),I= 61, 66) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 71),I= 67, 72) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 71),I= 73, 78) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 71),I= 79, 84) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 71),I= 85, 90) /3.086419753086420D-03, - $ 
-2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 71),I= 91, 96) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 71),I= 97,102) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 71),I=103,108) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 71),I=109,114) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 71),I=115,120) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ + DATA (CF(I),I=5986,6035) /4096,-1024,1010,1028,-142,1136,-124 + $ ,992,-268,-88,38,-142,-106,-124,2,-16,20,2,128,-16,20,-160,200 + $ ,20,1280,-160,-142,-124,2,-16,20,-160,-124,56,20,2,-124,20,-16 + $ ,128,-160,-16,-1024,128,2,-16,20,2,128,-16/ C 1 T(5,7,6,1,2,3,4) - DATA (CF(I, 72),I= 1, 6) /-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 72),I= 7, 12) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 72),I= 13, 18) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 72),I= 19, 24) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 72),I= 25, 30) /-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 72),I= 31, 36) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 72),I= 37, 42) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 72),I= 43, 48) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 72),I= 49, 54) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 72),I= 55, 60) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 72),I= 61, 66) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 72),I= 67, 72) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 72),I= 73, 78) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ 
-2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 72),I= 79, 84) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 72),I= 85, 90) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 72),I= 91, 96) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 72),I= 97,102) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 72),I=103,108) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 72),I=109,114) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 72),I=115,120) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ + DATA (CF(I),I=6036,6084) /4096,-268,-88,38,-142,-106,-124,1010 + $ ,1028,-142,1136,-124,992,20,2,2,-16,-16,128,200,20,20,-160,-160 + $ ,1280,-124,56,20,2,-124,20,-142,-124,2,-16,20,-160,-160,-16,-16 + $ ,128,128,-1024,20,2,2,-16,-16,128/ C 1 T(5,7,6,2,1,3,4) - DATA (CF(I, 73),I= 1, 6) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 73),I= 7, 12) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 73),I= 13, 18) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 73),I= 19, 24) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 73),I= 25, 30) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 73),I= 31, 36) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 73),I= 37, 42) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 73),I= 43, 48) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 73),I= 49, 54) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 73),I= 55, 60) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 73),I= 61, 66) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ 
,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 73),I= 67, 72) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 73),I= 73, 78) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 73),I= 79, 84) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 73),I= 85, 90) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 73),I= 91, 96) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 73),I= 97,102) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 73),I=103,108) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 73),I=109,114) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I, 73),I=115,120) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ + DATA (CF(I),I=6085,6132) /4096,-1024,-1024,128,128,1280,-1024 + $ ,128,128,-16,-16,-160,128,-16,1280,-160,1136,992,-16,-160,-160 + $ ,992,992,-448,1028,-124,-88,-106,20,200,-88,56,-232,884,-124, + $ -106,-232,-88,272,-232,1028,-88,2,20,20,-124,-124,56/ C 1 T(6,1,2,5,7,3,4) - DATA (CF(I, 74),I= 1, 6) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 74),I= 7, 12) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 74),I= 13, 18) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 74),I= 19, 24) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 74),I= 25, 30) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 74),I= 31, 36) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 74),I= 37, 42) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 74),I= 43, 48) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I, 74),I= 49, 54) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ 
-01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 74),I= 55, 60) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 74),I= 61, 66) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 74),I= 67, 72) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I, 74),I= 73, 78) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 74),I= 79, 84) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 74),I= 85, 90) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 74),I= 91, 96) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 74),I= 97,102) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 74),I=103,108) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 74),I=109,114) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 74),I=115,120) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ + DATA (CF(I),I=6133,6179) /4096,128,1280,-1024,128,128,-1024,-16, + $ -160,128,-16,-16,-160,-160,992,992,-448,128,-16,1280,-160,1136 + $ ,992,-124,992,-106,38,-160,20,56,-448,-88,-268,992,-124,884, + $ -268,-232,884,1010,-268,-16,2,-160,20,-142,-124/ C 1 T(6,1,2,7,5,3,4) - DATA (CF(I, 75),I= 1, 6) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 75),I= 7, 12) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 75),I= 13, 18) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 75),I= 19, 24) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 75),I= 25, 30) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 75),I= 31, 36) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 75),I= 37, 42) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ 
-1.913580246913580D-01/ - DATA (CF(I, 75),I= 43, 48) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 75),I= 49, 54) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 75),I= 55, 60) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 75),I= 61, 66) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 75),I= 67, 72) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 75),I= 73, 78) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 75),I= 79, 84) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 75),I= 85, 90) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 75),I= 91, 96) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 75),I= 97,102) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 75),I=103,108) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I, 75),I=109,114) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 75),I=115,120) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ + DATA (CF(I),I=6180,6225) /4096,-1024,1280,128,128,-16,1280,-160 + $ ,1136,992,-1024,128,128,-16,-16,-160,-160,-16,992,-448,-160,992 + $ ,-88,-106,1028,-124,200,20,-232,-88,272,-232,1028,-88,-88,56, + $ -232,884,-124,-106,20,2,-124,56,20,-124/ C 1 T(6,1,5,2,7,3,4) - DATA (CF(I, 76),I= 1, 6) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 76),I= 7, 12) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 76),I= 13, 18) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 76),I= 19, 24) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 76),I= 25, 30) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 
76),I= 31, 36) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 76),I= 37, 42) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 76),I= 43, 48) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I, 76),I= 49, 54) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 76),I= 55, 60) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 76),I= 61, 66) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 76),I= 67, 72) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I, 76),I= 73, 78) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 76),I= 79, 84) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 76),I= 85, 90) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 76),I= 91, 96) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 76),I= 97,102) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 76),I=103,108) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 76),I=109,114) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 76),I=115,120) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ + DATA (CF(I),I=6226,6270) /4096,128,-1024,-16,-160,-160,992,992, + $ -448,128,-1024,-16,-160,128,-16,-16,128,1136,992,1280,-160,-106 + $ ,38,-124,992,20,-160,884,-268,-232,884,1010,-268,56,-448,-88, + $ -268,992,-124,2,-16,-142,-124,-160,20/ C 1 T(6,1,5,7,2,3,4) - DATA (CF(I, 77),I= 1, 6) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 77),I= 7, 12) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 77),I= 13, 18) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 77),I= 19, 24) /3.086419753086420D-03, - $ 
-2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 77),I= 25, 30) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 77),I= 31, 36) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 77),I= 37, 42) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 77),I= 43, 48) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 77),I= 49, 54) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 77),I= 55, 60) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 77),I= 61, 66) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 77),I= 67, 72) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 77),I= 73, 78) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 77),I= 79, 84) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 77),I= 85, 90) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 77),I= 91, 96) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 77),I= 97,102) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 77),I=103,108) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I, 77),I=109,114) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 77),I=115,120) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ + DATA (CF(I),I=6271,6314) /4096,-1024,-16,128,1136,992,1280,-160, + $ -160,-16,992,-448,-160,992,-1024,128,128,-16,-16,-160,20,-160 + $ ,200,20,1280,-160,-124,992,1028,1010,1136,-142,-106,-124,-88, + $ -268,-142,38,128,-16,-16,2,2,20/ C 1 T(6,1,7,2,5,3,4) - DATA (CF(I, 78),I= 1, 6) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 78),I= 7, 12) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ 
-2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 78),I= 13, 18) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 78),I= 19, 24) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 78),I= 25, 30) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 78),I= 31, 36) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 78),I= 37, 42) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 78),I= 43, 48) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 78),I= 49, 54) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 78),I= 55, 60) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 78),I= 61, 66) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 78),I= 67, 72) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 78),I= 73, 78) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 78),I= 79, 84) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 78),I= 85, 90) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 78),I= 91, 96) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 78),I= 97,102) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 78),I=103,108) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 78),I=109,114) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I, 78),I=115,120) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ + DATA (CF(I),I=6315,6357) /4096,-160,-16,992,-448,-160,992,-16 + $ ,128,1136,992,1280,-160,128,-1024,-16,-160,128,-16,200,20,20, + $ -160,-160,1280,-106,-124,-88,-268,-142,38,-124,992,1028,1010 + $ ,1136,-142,-16,128,2,20,-16,2/ C 1 
T(6,1,7,5,2,3,4) - DATA (CF(I, 79),I= 1, 6) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 79),I= 7, 12) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 79),I= 13, 18) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 79),I= 19, 24) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 79),I= 25, 30) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 79),I= 31, 36) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 79),I= 37, 42) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 79),I= 43, 48) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 79),I= 49, 54) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 79),I= 55, 60) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 79),I= 61, 66) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 79),I= 67, 72) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I, 79),I= 73, 78) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 79),I= 79, 84) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 79),I= 85, 90) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I, 79),I= 91, 96) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 79),I= 97,102) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 79),I=103,108) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 79),I=109,114) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 79),I=115,120) /3.086419753086420D-02, - $ 
-1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ + DATA (CF(I),I=6358,6399) /4096,-1024,-1024,128,128,1280,1280, + $ -160,128,-16,992,1136,-160,992,-16,-160,-448,992,-88,56,-232 + $ ,884,-124,-106,1028,-124,-88,-106,20,200,272,-232,-232,-88,-88 + $ ,1028,20,-124,2,20,56,-124/ C 1 T(6,2,1,5,7,3,4) - DATA (CF(I, 80),I= 1, 6) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 80),I= 7, 12) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 80),I= 13, 18) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 80),I= 19, 24) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I, 80),I= 25, 30) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 80),I= 31, 36) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 80),I= 37, 42) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 80),I= 43, 48) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 80),I= 49, 54) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 80),I= 55, 60) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 80),I= 61, 66) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 80),I= 67, 72) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I, 80),I= 73, 78) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 80),I= 79, 84) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 80),I= 85, 90) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 80),I= 91, 96) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I, 80),I= 97,102) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 80),I=103,108) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ 
,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 80),I=109,114) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I, 80),I=115,120) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ + DATA (CF(I),I=6400,6440) /4096,128,1280,-1024,128,-160,992,-16, + $ -160,-448,992,1280,-160,128,-16,992,1136,56,-448,-88,-268,992, + $ -124,-124,992,-106,38,-160,20,-232,884,884,-268,-268,1010,-160 + $ ,20,-16,2,-124,-142/ C 1 T(6,2,1,7,5,3,4) - DATA (CF(I, 81),I= 1, 6) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 81),I= 7, 12) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 81),I= 13, 18) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 81),I= 19, 24) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 81),I= 25, 30) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 81),I= 31, 36) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 81),I= 37, 42) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 81),I= 43, 48) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 81),I= 49, 54) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 81),I= 55, 60) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 81),I= 61, 66) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 81),I= 67, 72) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I, 81),I= 73, 78) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 81),I= 79, 84) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 81),I= 85, 90) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 81),I= 91, 96) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ 
-2.469135802469136D-01/ - DATA (CF(I, 81),I= 97,102) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I, 81),I=103,108) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 81),I=109,114) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 81),I=115,120) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ + DATA (CF(I),I=6441,6480) /4096,-1024,1280,128,128,-16,-1024,128, + $ -160,-16,992,-448,-160,-16,992,-160,-232,-88,272,-232,1028,-88, + $ -88,-106,1028,-124,200,20,-232,884,-88,56,-106,-124,-124,56,20 + $ ,2,-124,20/ C 1 T(6,2,5,1,7,3,4) - DATA (CF(I, 82),I= 1, 6) /-1.913580246913580D-01 - $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D - $ -01,-1.358024691358025D-01,-1.635802469135803D-01/ - DATA (CF(I, 82),I= 7, 12) /8.641975308641975D-02, - $ -1.358024691358025D-01,-1.913580246913580D-01, - $ -1.635802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 82),I= 13, 18) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 82),I= 19, 24) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I, 82),I= 25, 30) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 82),I= 31, 36) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 82),I= 37, 42) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 82),I= 43, 48) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 82),I= 49, 54) /-1.913580246913580D-01, - $ -1.635802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 82),I= 55, 60) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 82),I= 61, 66) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 82),I= 67, 72) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I, 82),I= 73, 78) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 82),I= 79, 84) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 82),I= 85, 90) 
/-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 82),I= 91, 96) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 82),I= 97,102) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 82),I=103,108) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 82),I=109,114) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 82),I=115,120) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ + DATA (CF(I),I=6481,6519) /4096,128,-1024,-16,-160,128,-1024,-16 + $ ,128,1136,992,-16,128,-160,1280,884,-268,-232,884,1010,-268, + $ -106,38,-124,992,20,-160,-88,-268,56,-448,-124,992,-142,-124,2, + $ -16,20,-160/ C 1 T(6,2,5,7,1,3,4) - DATA (CF(I, 83),I= 1, 6) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 83),I= 7, 12) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 83),I= 13, 18) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 83),I= 19, 24) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 83),I= 25, 30) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 83),I= 31, 36) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 83),I= 37, 42) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 83),I= 43, 48) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 83),I= 49, 54) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 83),I= 55, 60) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 83),I= 61, 66) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 83),I= 67, 72) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 83),I= 73, 78) /-2.469135802469136D-02 - $ 
,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 83),I= 79, 84) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 83),I= 85, 90) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 83),I= 91, 96) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 83),I= 97,102) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I, 83),I=103,108) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 83),I=109,114) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I, 83),I=115,120) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ + DATA (CF(I),I=6520,6557) /4096,-1024,992,-448,-160,-16,992,-160 + $ ,128,-16,-1024,128,-160,-16,-124,992,1028,1010,1136,-142,20, + $ -160,200,20,1280,-160,-88,-268,-106,-124,38,-142,-16,2,128,-16 + $ ,20,2/ C 1 T(6,2,7,1,5,3,4) - DATA (CF(I, 84),I= 1, 6) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 84),I= 7, 12) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 84),I= 13, 18) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 84),I= 19, 24) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 84),I= 25, 30) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 84),I= 31, 36) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 84),I= 37, 42) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 84),I= 43, 48) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 84),I= 49, 54) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 84),I= 55, 60) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 84),I= 61, 66) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - 
$ -2.469135802469136D-01/ - DATA (CF(I, 84),I= 67, 72) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 84),I= 73, 78) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 84),I= 79, 84) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 84),I= 85, 90) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 84),I= 91, 96) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 84),I= 97,102) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 84),I=103,108) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 84),I=109,114) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I, 84),I=115,120) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ + DATA (CF(I),I=6558,6594) /4096,1136,992,-16,128,-160,1280,-16, + $ -160,128,-1024,-16,128,-106,-124,-88,-268,-142,38,200,20,20, + $ -160,-160,1280,1028,1010,-124,992,-142,1136,2,20,-16,128,2,-16/ C 1 T(6,2,7,5,1,3,4) - DATA (CF(I, 85),I= 1, 6) /1.530864197530864D+00, - $ -1.913580246913580D-01,1.753086419753086D+00, - $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D - $ +00/ - DATA (CF(I, 85),I= 7, 12) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 85),I= 13, 18) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 85),I= 19, 24) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I, 85),I= 25, 30) /-6.913580246913580D-01 - $ ,8.641975308641975D-02,1.530864197530864D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 85),I= 31, 36) /1.530864197530864D+00, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-1.635802469135803D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 85),I= 37, 42) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I, 85),I= 43, 48) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I, 85),I= 49, 54) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 85),I= 55, 60) /-2.469135802469136D-01 - 
-      DATA (CF(I, 85),I= 1, 6) /1.530864197530864D+00,
-     $ -1.913580246913580D-01,1.753086419753086D+00,
-     $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D
-     $ +00/
-      DATA (CF(I, 85),I= 7, 12) /-2.469135802469136D-01
-     $ ,3.086419753086420D-02,1.975308641975309D+00,
-     $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D
-     $ -02/
-      DATA (CF(I, 85),I= 13, 18) /-2.469135802469136D-02
-     $ ,3.086419753086420D-03,1.975308641975309D-01,
-     $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D
-     $ -03/
-      DATA (CF(I, 85),I= 19, 24) /-1.358024691358025D-01,
-     $ -4.135802469135803D-01,-1.635802469135803D-01,
-     $ -1.913580246913580D-01,5.864197530864197D-02,
-     $ -2.191358024691358D-01/
-      DATA (CF(I, 85),I= 25, 30) /-6.913580246913580D-01
-     $ ,8.641975308641975D-02,1.530864197530864D+00,
-     $ -1.913580246913580D-01,-1.358024691358025D-01,
-     $ -4.135802469135803D-01/
-      DATA (CF(I, 85),I= 31, 36) /1.530864197530864D+00,
-     $ -1.913580246913580D-01,-2.469135802469136D-01
-     $ ,3.086419753086420D-02,-1.635802469135803D-01
-     $ ,5.864197530864197D-02/
-      DATA (CF(I, 85),I= 37, 42) /-2.469135802469136D-01
-     $ ,3.086419753086420D-02,-2.469135802469136D-02
-     $ ,3.086419753086420D-03,-1.913580246913580D-01,
-     $ -2.191358024691358D-01/
-      DATA (CF(I, 85),I= 43, 48) /-3.580246913580247D-01
-     $ ,1.364197530864198D+00,1.364197530864198D+00,
-     $ -4.135802469135803D-01,-4.135802469135803D-01
-     $ ,1.558641975308642D+00/
-      DATA (CF(I, 85),I= 49, 54) /-2.469135802469136D-02
-     $ ,3.086419753086420D-03,1.975308641975309D-01,
-     $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D
-     $ -03/
-      DATA (CF(I, 85),I= 55, 60) /-2.469135802469136D-01
-     $ ,3.086419753086420D-02,-2.469135802469136D-02
-     $ ,3.086419753086420D-03,-1.913580246913580D-01,
-     $ -2.191358024691358D-01/
-      DATA (CF(I, 85),I= 61, 66) /-1.580246913580247D+00
-     $ ,1.975308641975309D-01,1.975308641975309D-01,
-     $ -2.469135802469136D-02,-2.469135802469136D-02,
-     $ -2.469135802469136D-01/
-      DATA (CF(I, 85),I= 67, 72) /-1.913580246913580D-01
-     $ ,3.086419753086420D-02,8.641975308641975D-02,
-     $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D
-     $ -02/
-      DATA (CF(I, 85),I= 73, 78) /1.975308641975309D-01,
-     $ -2.469135802469136D-02,-1.580246913580247D+00
-     $ ,1.975308641975309D-01,-2.469135802469136D-01,
-     $ -2.469135802469136D-02/
-      DATA (CF(I, 85),I= 79, 84) /1.975308641975309D+00,
-     $ -2.469135802469136D-01,1.975308641975309D-01,
-     $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D
-     $ +00/
-      DATA (CF(I, 85),I= 85, 90) /1.264197530864197D+01,
-     $ -1.580246913580247D+00,-1.580246913580247D+00
-     $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D
-     $ +00/
-      DATA (CF(I, 85),I= 91, 96) /1.530864197530864D+00,
-     $ -2.469135802469136D-01,-6.913580246913580D-01
-     $ ,1.530864197530864D+00,-2.469135802469136D-02,
-     $ -2.469135802469136D-01/
-      DATA (CF(I, 85),I= 97,102) /-3.580246913580247D-01
-     $ ,1.364197530864198D+00,-1.358024691358025D-01
-     $ ,8.641975308641975D-02,-1.635802469135803D-01,
-     $ -1.913580246913580D-01/
-      DATA (CF(I, 85),I=103,108) /4.197530864197531D-01,
-     $ -3.580246913580247D-01,-3.580246913580247D-01,
-     $ -1.358024691358025D-01,-1.358024691358025D-01
-     $ ,1.586419753086420D+00/
-      DATA (CF(I, 85),I=109,114) /1.586419753086420D+00,
-     $ -1.913580246913580D-01,-1.358024691358025D-01,
-     $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D
-     $ -01/
-      DATA (CF(I, 85),I=115,120) /-1.913580246913580D-01
-     $ ,3.086419753086420D-02,8.641975308641975D-02,
-     $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D
-     $ -02/
+      DATA (CF(I),I=6595,6630) /4096,-1024,-1024,128,128,1280,992,-160
+     $ ,-448,992,-16,-160,-232,884,-88,56,-106,-124,272,-232,-232,-88,
+     $ -88,1028,1028,-124,-88,-106,20,200,-124,20,56,-124,2,20/
 C 1 T(6,5,1,2,7,3,4)
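The index ranges of the new statements follow from this triangular packing: row I of an NCOLOR=120 matrix occupies NCOLOR-I+1 slots, so its first flat index is (I-1)*(2*NCOLOR-I+2)/2 + 1. For I=84 this gives 83*158/2 + 1 = 6558, for I=85 it gives 84*157/2 + 1 = 6595, and for I=86 it gives 85*156/2 + 1 = 6631, matching the ranges I=6558,6594, I=6595,6630 and I=6631,6665 in the surrounding hunks. As a sketch (ROWSTART is a hypothetical helper, not part of the patch):

      INTEGER FUNCTION ROWSTART(I)
C     First flat index of row I of the packed triangle, NCOLOR=120.
      INTEGER I
      ROWSTART = (I-1)*(2*120-I+2)/2 + 1
      END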
-      DATA (CF(I, 86),I= 1, 6) /-1.913580246913580D-01,
-     $ -1.635802469135803D-01,-2.191358024691358D-01
-     $ ,5.864197530864197D-02,-1.358024691358025D-01,
-     $ -4.135802469135803D-01/
-      DATA (CF(I, 86),I= 7, 12) /3.086419753086420D-02
-     $ ,3.086419753086420D-01,-2.469135802469136D-01
-     $ ,1.975308641975309D+00,3.086419753086420D-02,
-     $ -2.469135802469136D-01/
-      DATA (CF(I, 86),I= 13, 18) /3.086419753086420D-03
-     $ ,3.086419753086420D-02,-2.469135802469136D-02
-     $ ,1.975308641975309D-01,3.086419753086420D-03,
-     $ -2.469135802469136D-02/
-      DATA (CF(I, 86),I= 19, 24) /1.586419753086420D+00
-     $ ,1.558641975308642D+00,-1.913580246913580D-01
-     $ ,1.530864197530864D+00,-2.191358024691358D-01
-     $ ,1.753086419753086D+00/
-      DATA (CF(I, 86),I= 25, 30) /8.641975308641975D-02,
-     $ -1.358024691358025D-01,-1.913580246913580D-01,
-     $ -1.635802469135803D-01,-3.580246913580247D-01
-     $ ,1.364197530864198D+00/
-      DATA (CF(I, 86),I= 31, 36) /-1.913580246913580D-01
-     $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D
-     $ -01,-1.358024691358025D-01,-1.635802469135803D-01/
-      DATA (CF(I, 86),I= 37, 42) /3.086419753086420D-02,
-     $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D
-     $ -02,8.641975308641975D-02,-1.913580246913580D-01/
-      DATA (CF(I, 86),I= 43, 48) /4.197530864197531D-01,
-     $ -3.580246913580247D-01,-3.580246913580247D-01,
-     $ -1.358024691358025D-01,-1.358024691358025D-01
-     $ ,1.586419753086420D+00/
-      DATA (CF(I, 86),I= 49, 54) /3.086419753086420D-03
-     $ ,3.086419753086420D-02,-2.469135802469136D-02
-     $ ,1.975308641975309D-01,3.086419753086420D-03,
-     $ -2.469135802469136D-02/
-      DATA (CF(I, 86),I= 55, 60) /3.086419753086420D-02,
-     $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D
-     $ -02,8.641975308641975D-02,-1.913580246913580D-01/
-      DATA (CF(I, 86),I= 61, 66) /1.975308641975309D-01,
-     $ -1.580246913580247D+00,-2.469135802469136D-02,
-     $ -2.469135802469136D-01,1.975308641975309D-01,
-     $ -2.469135802469136D-02/
-      DATA (CF(I, 86),I= 67, 72) /3.086419753086420D-02,
-     $ -2.469135802469136D-01,-1.913580246913580D-01,
-     $ -2.191358024691358D-01,-2.469135802469136D-02
-     $ ,3.086419753086420D-03/
-      DATA (CF(I, 86),I= 73, 78) /-2.469135802469136D-02,
-     $ -2.469135802469136D-01,1.975308641975309D-01,
-     $ -1.580246913580247D+00,-2.469135802469136D-02
-     $ ,1.975308641975309D-01/
-      DATA (CF(I, 86),I= 79, 84) /-2.469135802469136D-01
-     $ ,1.530864197530864D+00,-2.469135802469136D-02,
-     $ -2.469135802469136D-01,-6.913580246913580D-01
-     $ ,1.530864197530864D+00/
-      DATA (CF(I, 86),I= 85, 90) /-1.580246913580247D+00
-     $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D
-     $ +00,-1.580246913580247D+00,1.975308641975309D-01/
-      DATA (CF(I, 86),I= 91, 96) /-2.469135802469136D-01
-     $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D
-     $ +00,1.975308641975309D-01,-2.469135802469136D-02/
-      DATA (CF(I, 86),I= 97,102) /-1.358024691358025D-01,
-     $ -4.135802469135803D-01,8.641975308641975D-02,
-     $ -6.913580246913580D-01,-1.913580246913580D-01
-     $ ,1.530864197530864D+00/
-      DATA (CF(I, 86),I=103,108) /-3.580246913580247D-01
-     $ ,1.364197530864198D+00,1.364197530864198D+00,
-     $ -4.135802469135803D-01,-4.135802469135803D-01
-     $ ,1.558641975308642D+00/
-      DATA (CF(I, 86),I=109,114) /-1.913580246913580D-01
-     $ ,1.530864197530864D+00,-1.635802469135803D-01
-     $ ,5.864197530864197D-02,-2.469135802469136D-01
-     $ ,3.086419753086420D-02/
-      DATA (CF(I, 86),I=115,120) /3.086419753086420D-02,
-     $ -2.469135802469136D-01,-1.913580246913580D-01,
-     $ -2.191358024691358D-01,-2.469135802469136D-02
-     $ ,3.086419753086420D-03/
+      DATA (CF(I),I=6631,6665) /4096,128,1280,-1024,128,-160,1280,992
+     $ ,1136,128,-16,-88,-268,56,-448,-124,992,-232,884,884,-268,-268
+     $ ,1010,-124,992,-106,38,-160,20,20,-160,-124,-142,-16,2/
 C 1 T(6,5,1,7,2,3,4)
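Nothing is lost by the integer conversion: every legacy coefficient is an exact multiple of 1/324 (for instance 3.086419753086420D-03 is exactly 1/324, and 8.641975308641975D-02 is 28/324), so rescaling by 324, or by 648 off the diagonal, yields whole numbers at the printed precision. A consistency check along these lines, assuming the legacy column-major table were still available as a hypothetical array CFOLD:

C     Sketch only: verify each legacy value is an exact multiple of
C     1/324 before it is replaced by a packed integer entry.
      INTEGER I, J
      DO J = 1, NCOLOR
        DO I = 1, NCOLOR
          IF (ABS(324D0*CFOLD(I,J)-NINT(324D0*CFOLD(I,J))).GT.1D-6)
     $      STOP 'CFOLD entry is not a multiple of 1/324'
        ENDDO
      ENDDO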
-      DATA (CF(I, 87),I= 1, 6) /-6.913580246913580D-01
-     $ ,8.641975308641975D-02,1.530864197530864D+00,
-     $ -1.913580246913580D-01,-1.358024691358025D-01,
-     $ -4.135802469135803D-01/
-      DATA (CF(I, 87),I= 7, 12) /1.530864197530864D+00,
-     $ -1.913580246913580D-01,-2.469135802469136D-01
-     $ ,3.086419753086420D-02,-1.635802469135803D-01
-     $ ,5.864197530864197D-02/
-      DATA (CF(I, 87),I= 13, 18) /-2.469135802469136D-01
-     $ ,3.086419753086420D-02,-2.469135802469136D-02
-     $ ,3.086419753086420D-03,-1.913580246913580D-01,
-     $ -2.191358024691358D-01/
-      DATA (CF(I, 87),I= 19, 24) /-3.580246913580247D-01
-     $ ,1.364197530864198D+00,1.364197530864198D+00,
-     $ -4.135802469135803D-01,-4.135802469135803D-01
-     $ ,1.558641975308642D+00/
-      DATA (CF(I, 87),I= 25, 30) /1.530864197530864D+00,
-     $ -1.913580246913580D-01,1.753086419753086D+00,
-     $ -2.191358024691358D-01,1.586419753086420D+00,1.558641975308642D
-     $ +00/
-      DATA (CF(I, 87),I= 31, 36) /-2.469135802469136D-01
-     $ ,3.086419753086420D-02,1.975308641975309D+00,
-     $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D
-     $ -02/
-      DATA (CF(I, 87),I= 37, 42) /-2.469135802469136D-02
-     $ ,3.086419753086420D-03,1.975308641975309D-01,
-     $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D
-     $ -03/
-      DATA (CF(I, 87),I= 43, 48) /-1.358024691358025D-01,
-     $ -4.135802469135803D-01,-1.635802469135803D-01,
-     $ -1.913580246913580D-01,5.864197530864197D-02,
-     $ -2.191358024691358D-01/
-      DATA (CF(I, 87),I= 49, 54) /-2.469135802469136D-01
-     $ ,3.086419753086420D-02,-2.469135802469136D-02
-     $ ,3.086419753086420D-03,-1.913580246913580D-01,
-     $ -2.191358024691358D-01/
-      DATA (CF(I, 87),I= 55, 60) /-2.469135802469136D-02
-     $ ,3.086419753086420D-03,1.975308641975309D-01,
-     $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D
-     $ -03/
-      DATA (CF(I, 87),I= 61, 66) /1.975308641975309D-01,
-     $ -2.469135802469136D-02,-1.580246913580247D+00
-     $ ,1.975308641975309D-01,-2.469135802469136D-01,
-     $ -2.469135802469136D-02/
-      DATA (CF(I, 87),I= 67, 72) /8.641975308641975D-02,
-     $ -1.913580246913580D-01,-1.913580246913580D-01
-     $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D
-     $ -03/
-      DATA (CF(I, 87),I= 73, 78) /1.975308641975309D+00,
-     $ -2.469135802469136D-01,1.975308641975309D-01,
-     $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D
-     $ +00/
-      DATA (CF(I, 87),I= 79, 84) /1.975308641975309D-01,
-     $ -2.469135802469136D-02,-1.580246913580247D+00
-     $ ,1.975308641975309D-01,-2.469135802469136D-01,
-     $ -2.469135802469136D-02/
-      DATA (CF(I, 87),I= 85, 90) /-1.580246913580247D+00
-     $ ,1.975308641975309D-01,1.264197530864197D+01,
-     $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D
-     $ -01/
-      DATA (CF(I, 87),I= 91, 96) /-6.913580246913580D-01
-     $ ,1.530864197530864D+00,1.530864197530864D+00,
-     $ -2.469135802469136D-01,-2.469135802469136D-01,
-     $ -2.469135802469136D-02/
-      DATA (CF(I, 87),I= 97,102) /4.197530864197531D-01,
-     $ -3.580246913580247D-01,-3.580246913580247D-01,
-     $ -1.358024691358025D-01,-1.358024691358025D-01
-     $ ,1.586419753086420D+00/
-      DATA (CF(I, 87),I=103,108) /-3.580246913580247D-01
-     $ ,1.364197530864198D+00,-1.358024691358025D-01
-     $ ,8.641975308641975D-02,-1.635802469135803D-01,
-     $ -1.913580246913580D-01/
-      DATA (CF(I, 87),I=109,114) /-1.358024691358025D-01,
-     $ -1.635802469135803D-01,1.586419753086420D+00,
-     $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D
-     $ -02/
-      DATA (CF(I, 87),I=115,120) /8.641975308641975D-02,
-     $ -1.913580246913580D-01,-1.913580246913580D-01
-     $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D
-     $ -03/
+      DATA (CF(I),I=6666,6699) /4096,-1024,1280,128,-448,992,992,-160,
+     $ -160,-16,272,-232,-232,-88,-88,1028,-232,884,-88,56,-106,-124,
+     $ -88,-106,1028,-124,200,20,56,-124,-124,20,20,2/
 C 1 T(6,5,2,1,7,3,4)
-      DATA (CF(I, 88),I= 1, 6) /8.641975308641975D-02,
-     $ -1.358024691358025D-01,-1.913580246913580D-01,
-     $ -1.635802469135803D-01,-3.580246913580247D-01
-     $ ,1.364197530864198D+00/
-      DATA (CF(I, 88),I= 7, 12) /-1.913580246913580D-01
-     $ ,1.586419753086420D+00,3.086419753086420D-02,3.086419753086420D
-     $ -01,-1.358024691358025D-01,-1.635802469135803D-01/
-      DATA (CF(I, 88),I= 13, 18) /3.086419753086420D-02,
-     $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D
-     $ -02,8.641975308641975D-02,-1.913580246913580D-01/
-      DATA (CF(I, 88),I= 19, 24) /4.197530864197531D-01,
-     $ -3.580246913580247D-01,-3.580246913580247D-01,
-     $ -1.358024691358025D-01,-1.358024691358025D-01
-     $ ,1.586419753086420D+00/
-      DATA (CF(I, 88),I= 25, 30) /-1.913580246913580D-01,
-     $ -1.635802469135803D-01,-2.191358024691358D-01
-     $
,5.864197530864197D-02,-1.358024691358025D-01, - $ -4.135802469135803D-01/ - DATA (CF(I, 88),I= 31, 36) /3.086419753086420D-02 - $ ,3.086419753086420D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 88),I= 37, 42) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 88),I= 43, 48) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I, 88),I= 49, 54) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I, 88),I= 55, 60) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 88),I= 61, 66) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 88),I= 67, 72) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 88),I= 73, 78) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 88),I= 79, 84) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 88),I= 85, 90) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 88),I= 91, 96) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 88),I= 97,102) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I, 88),I=103,108) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 88),I=109,114) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 88),I=115,120) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ + DATA (CF(I),I=6700,6732) /4096,128,-1024,992,1136,-160,1280,-16 + $ ,128,-232,884,884,-268,-268,1010,-88,-268,56,-448,-124,992,-106 + $ ,38,-124,992,20,-160,-124,-142,20,-160,2,-16/ C 1 T(6,5,2,7,1,3,4) - DATA (CF(I, 89),I= 1, 6) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 89),I= 7, 12) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 89),I= 13, 18) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 89),I= 19, 
24) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 89),I= 25, 30) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 89),I= 31, 36) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 89),I= 37, 42) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 89),I= 43, 48) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 89),I= 49, 54) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 89),I= 55, 60) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 89),I= 61, 66) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 89),I= 67, 72) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 89),I= 73, 78) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 89),I= 79, 84) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 89),I= 85, 90) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 89),I= 91, 96) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 89),I= 97,102) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I, 89),I=103,108) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I, 89),I=109,114) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 89),I=115,120) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ + DATA (CF(I),I=6733,6764) /4096,-1024,-16,128,-160,-16,-1024,128 + $ ,1028,1010,-124,992,-142,1136,-88,-268,-106,-124,38,-142,20, + $ -160,200,20,1280,-160,2,-16,20,2,128,-16/ C 1 T(6,5,7,1,2,3,4) - DATA (CF(I, 90),I= 1, 6) /-1.358024691358025D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01/ - DATA (CF(I, 90),I= 7, 12) /-1.635802469135803D-01, - $ -1.358024691358025D-01,3.086419753086420D-01,3.086419753086420D - $ 
-02,1.586419753086420D+00,-1.913580246913580D-01/ - DATA (CF(I, 90),I= 13, 18) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 90),I= 19, 24) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 90),I= 25, 30) /-4.135802469135803D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00/ - DATA (CF(I, 90),I= 31, 36) /5.864197530864197D-02, - $ -1.635802469135803D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 90),I= 37, 42) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 90),I= 43, 48) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 90),I= 49, 54) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I, 90),I= 55, 60) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 90),I= 61, 66) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 90),I= 67, 72) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 90),I= 73, 78) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I, 90),I= 79, 84) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 90),I= 85, 90) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 90),I= 91, 96) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 90),I= 97,102) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I, 90),I=103,108) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I, 90),I=109,114) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 90),I=115,120) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ + DATA (CF(I),I=6765,6795) /4096,-160,-16,-16,128,128,-1024,-88, + $ -268,-106,-124,38,-142,1028,1010,-124,992,-142,1136,200,20,20, + $ -160,-160,1280,20,2,2,-16,-16,128/ C 1 T(6,5,7,2,1,3,4) - DATA (CF(I, 91),I= 1, 6) /1.558641975308642D+00 - $ 
,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 91),I= 7, 12) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 91),I= 13, 18) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 91),I= 19, 24) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 91),I= 25, 30) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 91),I= 31, 36) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 91),I= 37, 42) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 91),I= 43, 48) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 91),I= 49, 54) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 91),I= 55, 60) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 91),I= 61, 66) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 91),I= 67, 72) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 91),I= 73, 78) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 91),I= 79, 84) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 91),I= 85, 90) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 91),I= 91, 96) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 91),I= 97,102) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 91),I=103,108) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 91),I=109,114) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 91),I=115,120) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ + DATA 
(CF(I),I=6796,6825) /4096,-1024,-1024,128,128,1280,2,-16,20 + $ ,2,128,-16,20,-160,-124,-142,-16,2,-124,20,56,-124,2,20,-1024 + $ ,128,128,-16,-16,-160/ C 1 T(6,7,1,2,5,3,4) - DATA (CF(I, 92),I= 1, 6) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 92),I= 7, 12) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 92),I= 13, 18) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 92),I= 19, 24) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 92),I= 25, 30) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 92),I= 31, 36) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 92),I= 37, 42) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 92),I= 43, 48) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 92),I= 49, 54) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 92),I= 55, 60) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 92),I= 61, 66) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 92),I= 67, 72) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 92),I= 73, 78) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 92),I= 79, 84) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 92),I= 85, 90) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 92),I= 91, 96) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 92),I= 97,102) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 92),I=103,108) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 92),I=109,114) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ 
-2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 92),I=115,120) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ + DATA (CF(I),I=6826,6854) /4096,128,1280,-1024,128,20,2,2,-16,-16 + $ ,128,-124,20,56,-124,2,20,20,-160,-124,-142,-16,2,128,-1024,-16 + $ ,-160,128,-16/ C 1 T(6,7,1,5,2,3,4) - DATA (CF(I, 93),I= 1, 6) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 93),I= 7, 12) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 93),I= 13, 18) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 93),I= 19, 24) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 93),I= 25, 30) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 93),I= 31, 36) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 93),I= 37, 42) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 93),I= 43, 48) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 93),I= 49, 54) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 93),I= 55, 60) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 93),I= 61, 66) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 93),I= 67, 72) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 93),I= 73, 78) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 93),I= 79, 84) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 93),I= 85, 90) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 93),I= 91, 96) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 93),I= 97,102) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 93),I=103,108) /3.086419753086420D-03, - $ 
-2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 93),I=109,114) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 93),I=115,120) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ + DATA (CF(I),I=6855,6882) /4096,-1024,1280,128,20,-160,-124,-142, + $ -16,2,2,-16,20,2,128,-16,56,-124,-124,20,20,2,128,-16,-1024,128 + $ ,-160,-16/ C 1 T(6,7,2,1,5,3,4) - DATA (CF(I, 94),I= 1, 6) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 94),I= 7, 12) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 94),I= 13, 18) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 94),I= 19, 24) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 94),I= 25, 30) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 94),I= 31, 36) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 94),I= 37, 42) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 94),I= 43, 48) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 94),I= 49, 54) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 94),I= 55, 60) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 94),I= 61, 66) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 94),I= 67, 72) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 94),I= 73, 78) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 94),I= 79, 84) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 94),I= 85, 90) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 94),I= 91, 96) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA 
(CF(I, 94),I= 97,102) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 94),I=103,108) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 94),I=109,114) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 94),I=115,120) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ + DATA (CF(I),I=6883,6909) /4096,128,-1024,-124,20,56,-124,2,20,20 + $ ,2,2,-16,-16,128,-124,-142,20,-160,2,-16,-16,-160,128,-1024,-16 + $ ,128/ C 1 T(6,7,2,5,1,3,4) - DATA (CF(I, 95),I= 1, 6) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 95),I= 7, 12) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 95),I= 13, 18) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 95),I= 19, 24) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 95),I= 25, 30) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 95),I= 31, 36) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 95),I= 37, 42) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 95),I= 43, 48) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 95),I= 49, 54) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 95),I= 55, 60) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 95),I= 61, 66) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 95),I= 67, 72) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 95),I= 73, 78) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 95),I= 79, 84) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 95),I= 85, 90) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ 
-2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I, 95),I= 91, 96) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I, 95),I= 97,102) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 95),I=103,108) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 95),I=109,114) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 95),I=115,120) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ + DATA (CF(I),I=6910,6935) /4096,-1024,-124,-142,20,-160,2,-16,56, + $ -124,-124,20,20,2,2,-16,20,2,128,-16,-16,128,-160,-16,-1024,128/ C 1 T(6,7,5,1,2,3,4) - DATA (CF(I, 96),I= 1, 6) /-3.580246913580247D-01 - $ ,4.197530864197531D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00,-3.580246913580247D-01, - $ -1.358024691358025D-01/ - DATA (CF(I, 96),I= 7, 12) /1.364197530864198D+00, - $ -3.580246913580247D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 96),I= 13, 18) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 96),I= 19, 24) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 96),I= 25, 30) /1.364197530864198D+00, - $ -3.580246913580247D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00,1.364197530864198D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 96),I= 31, 36) /-4.135802469135803D-01, - $ -1.358024691358025D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00,8.641975308641975D-02, - $ -6.913580246913580D-01/ - DATA (CF(I, 96),I= 37, 42) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 96),I= 43, 48) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 96),I= 49, 54) /-4.135802469135803D-01, - $ -1.358024691358025D-01,5.864197530864197D-02, - $ -2.191358024691358D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 96),I= 55, 60) /1.558641975308642D+00 - $ ,1.586419753086420D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I, 96),I= 61, 66) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 96),I= 67, 72) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 96),I= 73, 78) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 96),I= 79, 84) /1.530864197530864D+00 - $ 
,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 96),I= 85, 90) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I, 96),I= 91, 96) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I, 96),I= 97,102) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 96),I=103,108) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I, 96),I=109,114) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 96),I=115,120) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ + DATA (CF(I),I=6936,6960) /4096,56,-124,-124,20,20,2,-124,-142,20 + $ ,-160,2,-16,20,2,2,-16,-16,128,-160,-16,-16,128,128,-1024/ C 1 T(6,7,5,2,1,3,4) - DATA (CF(I, 97),I= 1, 6) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 97),I= 7, 12) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 97),I= 13, 18) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 97),I= 19, 24) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 97),I= 25, 30) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 97),I= 31, 36) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I, 97),I= 37, 42) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 97),I= 43, 48) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 97),I= 49, 54) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 97),I= 55, 60) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 97),I= 61, 66) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 97),I= 67, 72) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 97),I= 73, 78) 
/1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 97),I= 79, 84) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 97),I= 85, 90) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I, 97),I= 91, 96) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 97),I= 97,102) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I, 97),I=103,108) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 97),I=109,114) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 97),I=115,120) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ + DATA (CF(I),I=6961,6984) /4096,-1024,-1024,128,128,1280,-1024 + $ ,128,128,-16,-16,-160,128,-16,1280,-160,1136,992,-16,-160,-160 + $ ,992,992,-448/ C 1 T(7,1,2,5,6,3,4) - DATA (CF(I, 98),I= 1, 6) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I, 98),I= 7, 12) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 98),I= 13, 18) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 98),I= 19, 24) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 98),I= 25, 30) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I, 98),I= 31, 36) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 98),I= 37, 42) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I, 98),I= 43, 48) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I, 98),I= 49, 54) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I, 98),I= 55, 60) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 98),I= 61, 66) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ 
-1.358024691358025D-01/ - DATA (CF(I, 98),I= 67, 72) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I, 98),I= 73, 78) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 98),I= 79, 84) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 98),I= 85, 90) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 98),I= 91, 96) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 98),I= 97,102) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I, 98),I=103,108) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 98),I=109,114) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I, 98),I=115,120) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ + DATA (CF(I),I=6985,7007) /4096,128,1280,-1024,128,128,-1024,-16, + $ -160,128,-16,-16,-160,-160,992,992,-448,128,-16,1280,-160,1136 + $ ,992/ C 1 T(7,1,2,6,5,3,4) - DATA (CF(I, 99),I= 1, 6) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I, 99),I= 7, 12) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I, 99),I= 13, 18) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I, 99),I= 19, 24) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I, 99),I= 25, 30) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I, 99),I= 31, 36) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I, 99),I= 37, 42) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I, 99),I= 43, 48) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I, 99),I= 49, 54) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I, 99),I= 55, 60) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - 
$ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I, 99),I= 61, 66) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I, 99),I= 67, 72) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I, 99),I= 73, 78) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I, 99),I= 79, 84) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I, 99),I= 85, 90) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I, 99),I= 91, 96) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I, 99),I= 97,102) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I, 99),I=103,108) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I, 99),I=109,114) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I, 99),I=115,120) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ + DATA (CF(I),I=7008,7029) /4096,-1024,1280,128,128,-16,1280,-160 + $ ,1136,992,-1024,128,128,-16,-16,-160,-160,-16,992,-448,-160,992/ C 1 T(7,1,5,2,6,3,4) - DATA (CF(I,100),I= 1, 6) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,100),I= 7, 12) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,100),I= 13, 18) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,100),I= 19, 24) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,100),I= 25, 30) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I,100),I= 31, 36) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I,100),I= 37, 42) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I,100),I= 43, 48) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I,100),I= 49, 54) /3.086419753086420D-01 - $ 
,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I,100),I= 55, 60) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I,100),I= 61, 66) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I,100),I= 67, 72) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I,100),I= 73, 78) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,100),I= 79, 84) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I,100),I= 85, 90) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I,100),I= 91, 96) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I,100),I= 97,102) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I,100),I=103,108) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I,100),I=109,114) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I,100),I=115,120) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ + DATA (CF(I),I=7030,7050) /4096,128,-1024,-16,-160,-160,992,992, + $ -448,128,-1024,-16,-160,128,-16,-16,128,1136,992,1280,-160/ C 1 T(7,1,5,6,2,3,4) - DATA (CF(I,101),I= 1, 6) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,101),I= 7, 12) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,101),I= 13, 18) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,101),I= 19, 24) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I,101),I= 25, 30) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,101),I= 31, 36) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I,101),I= 37, 42) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I,101),I= 43, 48) 
/3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I,101),I= 49, 54) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I,101),I= 55, 60) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I,101),I= 61, 66) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I,101),I= 67, 72) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I,101),I= 73, 78) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I,101),I= 79, 84) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I,101),I= 85, 90) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I,101),I= 91, 96) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,101),I= 97,102) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I,101),I=103,108) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I,101),I=109,114) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,101),I=115,120) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ + DATA (CF(I),I=7051,7070) /4096,-1024,-16,128,1136,992,1280,-160, + $ -160,-16,992,-448,-160,992,-1024,128,128,-16,-16,-160/ C 1 T(7,1,6,2,5,3,4) - DATA (CF(I,102),I= 1, 6) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,102),I= 7, 12) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,102),I= 13, 18) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,102),I= 19, 24) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I,102),I= 25, 30) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I,102),I= 31, 36) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA 
(CF(I,102),I= 37, 42) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I,102),I= 43, 48) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I,102),I= 49, 54) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,102),I= 55, 60) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I,102),I= 61, 66) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I,102),I= 67, 72) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I,102),I= 73, 78) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I,102),I= 79, 84) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I,102),I= 85, 90) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I,102),I= 91, 96) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ -02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I,102),I= 97,102) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I,102),I=103,108) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,102),I=109,114) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I,102),I=115,120) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ + DATA (CF(I),I=7071,7089) /4096,-160,-16,992,-448,-160,992,-16 + $ ,128,1136,992,1280,-160,128,-1024,-16,-160,128,-16/ C 1 T(7,1,6,5,2,3,4) - DATA (CF(I,103),I= 1, 6) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I,103),I= 7, 12) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I,103),I= 13, 18) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I,103),I= 19, 24) /1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,103),I= 25, 30) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA 
(CF(I,103),I= 31, 36) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I,103),I= 37, 42) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,103),I= 43, 48) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,103),I= 49, 54) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I,103),I= 55, 60) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I,103),I= 61, 66) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I,103),I= 67, 72) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I,103),I= 73, 78) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I,103),I= 79, 84) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I,103),I= 85, 90) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I,103),I= 91, 96) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I,103),I= 97,102) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,103),I=103,108) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I,103),I=109,114) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I,103),I=115,120) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ + DATA (CF(I),I=7090,7107) /4096,-1024,-1024,128,128,1280,1280, + $ -160,128,-16,992,1136,-160,992,-16,-160,-448,992/ C 1 T(7,2,1,5,6,3,4) - DATA (CF(I,104),I= 1, 6) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I,104),I= 7, 12) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I,104),I= 13, 18) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I,104),I= 19, 24) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03,3.086419753086420D - $ 
-02,-2.469135802469136D-02,3.086419753086420D-03/ - DATA (CF(I,104),I= 25, 30) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,104),I= 31, 36) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,104),I= 37, 42) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I,104),I= 43, 48) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I,104),I= 49, 54) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I,104),I= 55, 60) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I,104),I= 61, 66) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I,104),I= 67, 72) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I,104),I= 73, 78) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I,104),I= 79, 84) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I,104),I= 85, 90) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I,104),I= 91, 96) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I,104),I= 97,102) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I,104),I=103,108) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I,104),I=109,114) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,104),I=115,120) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ + DATA (CF(I),I=7108,7124) /4096,128,1280,-1024,128,-160,992,-16, + $ -160,-448,992,1280,-160,128,-16,992,1136/ C 1 T(7,2,1,6,5,3,4) - DATA (CF(I,105),I= 1, 6) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I,105),I= 7, 12) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I,105),I= 13, 18) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ 
,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I,105),I= 19, 24) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01/ - DATA (CF(I,105),I= 25, 30) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I,105),I= 31, 36) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,105),I= 37, 42) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,105),I= 43, 48) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I,105),I= 49, 54) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I,105),I= 55, 60) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I,105),I= 61, 66) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I,105),I= 67, 72) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,105),I= 73, 78) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I,105),I= 79, 84) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I,105),I= 85, 90) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I,105),I= 91, 96) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I,105),I= 97,102) /1.975308641975309D-01, - $ -2.469135802469136D-02,1.975308641975309D+00, - $ -2.469135802469136D-01,1.753086419753086D+00,1.530864197530864D - $ +00/ - DATA (CF(I,105),I=103,108) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I,105),I=109,114) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I,105),I=115,120) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ + DATA (CF(I),I=7125,7140) /4096,-1024,1280,128,128,-16,-1024,128, + $ -160,-16,992,-448,-160,-16,992,-160/ C 1 T(7,2,5,1,6,3,4) - DATA (CF(I,106),I= 1, 6) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I,106),I= 7, 12) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ 
,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I,106),I= 13, 18) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I,106),I= 19, 24) /3.086419753086420D-03 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,8.641975308641975D-02/ - DATA (CF(I,106),I= 25, 30) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,106),I= 31, 36) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,106),I= 37, 42) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,106),I= 43, 48) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,106),I= 49, 54) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I,106),I= 55, 60) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I,106),I= 61, 66) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I,106),I= 67, 72) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,106),I= 73, 78) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I,106),I= 79, 84) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,106),I= 85, 90) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,106),I= 91, 96) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,106),I= 97,102) /-2.469135802469136D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -6.913580246913580D-01/ - DATA (CF(I,106),I=103,108) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I,106),I=109,114) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,106),I=115,120) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ + DATA (CF(I),I=7141,7155) /4096,128,-1024,-16,-160,128,-1024,-16 + $ ,128,1136,992,-16,128,-160,1280/ C 1 T(7,2,5,6,1,3,4) - DATA (CF(I,107),I= 1, 6) /-1.635802469135803D-01 - $ 
,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,107),I= 7, 12) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I,107),I= 13, 18) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I,107),I= 19, 24) /3.086419753086420D-03, - $ -2.469135802469136D-02,-2.191358024691358D-01, - $ -1.913580246913580D-01,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I,107),I= 25, 30) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,107),I= 31, 36) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,107),I= 37, 42) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,107),I= 43, 48) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I,107),I= 49, 54) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I,107),I= 55, 60) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I,107),I= 61, 66) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I,107),I= 67, 72) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I,107),I= 73, 78) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I,107),I= 79, 84) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I,107),I= 85, 90) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I,107),I= 91, 96) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,107),I= 97,102) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.753086419753086D+00,1.530864197530864D - $ +00,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I,107),I=103,108) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I,107),I=109,114) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I,107),I=115,120) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ + DATA 
(CF(I),I=7156,7169) /4096,-1024,992,-448,-160,-16,992,-160 + $ ,128,-16,-1024,128,-160,-16/ C 1 T(7,2,6,1,5,3,4) - DATA (CF(I,108),I= 1, 6) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I,108),I= 7, 12) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I,108),I= 13, 18) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I,108),I= 19, 24) /3.086419753086420D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02, - $ -1.913580246913580D-01/ - DATA (CF(I,108),I= 25, 30) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,108),I= 31, 36) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,108),I= 37, 42) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,108),I= 43, 48) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I,108),I= 49, 54) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I,108),I= 55, 60) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,108),I= 61, 66) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,108),I= 67, 72) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,108),I= 73, 78) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I,108),I= 79, 84) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I,108),I= 85, 90) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I,108),I= 91, 96) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,108),I= 97,102) /-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,108),I=103,108) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I,108),I=109,114) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - 
DATA (CF(I,108),I=115,120) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ + DATA (CF(I),I=7170,7182) /4096,1136,992,-16,128,-160,1280,-16, + $ -160,128,-1024,-16,128/ C 1 T(7,2,6,5,1,3,4) - DATA (CF(I,109),I= 1, 6) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I,109),I= 7, 12) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I,109),I= 13, 18) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I,109),I= 19, 24) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,109),I= 25, 30) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I,109),I= 31, 36) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I,109),I= 37, 42) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I,109),I= 43, 48) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I,109),I= 49, 54) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,109),I= 55, 60) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I,109),I= 61, 66) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,109),I= 67, 72) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,109),I= 73, 78) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I,109),I= 79, 84) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I,109),I= 85, 90) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I,109),I= 91, 96) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,109),I= 97,102) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I,109),I=103,108) /1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA 
(CF(I,109),I=109,114) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ - DATA (CF(I,109),I=115,120) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ + DATA (CF(I),I=7183,7194) /4096,-1024,-1024,128,128,1280,992,-160 + $ ,-448,992,-16,-160/ C 1 T(7,5,1,2,6,3,4) - DATA (CF(I,110),I= 1, 6) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I,110),I= 7, 12) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I,110),I= 13, 18) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I,110),I= 19, 24) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,110),I= 25, 30) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I,110),I= 31, 36) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I,110),I= 37, 42) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I,110),I= 43, 48) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I,110),I= 49, 54) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,110),I= 55, 60) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,110),I= 61, 66) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I,110),I= 67, 72) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I,110),I= 73, 78) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,110),I= 79, 84) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I,110),I= 85, 90) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I,110),I= 91, 96) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I,110),I= 97,102) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA 
(CF(I,110),I=103,108) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,110),I=109,114) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ - DATA (CF(I,110),I=115,120) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ + DATA (CF(I),I=7195,7205) /4096,128,1280,-1024,128,-160,1280,992 + $ ,1136,128,-16/ C 1 T(7,5,1,6,2,3,4) - DATA (CF(I,111),I= 1, 6) /8.641975308641975D-02, - $ -6.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,1.530864197530864D+00, - $ -1.913580246913580D-01/ - DATA (CF(I,111),I= 7, 12) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I,111),I= 13, 18) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I,111),I= 19, 24) /-2.469135802469136D-01 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,3.086419753086420D-03,-1.913580246913580D-01, - $ -2.191358024691358D-01/ - DATA (CF(I,111),I= 25, 30) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,1.586419753086420D+00,1.558641975308642D - $ +00,1.753086419753086D+00,-2.191358024691358D-01/ - DATA (CF(I,111),I= 31, 36) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I,111),I= 37, 42) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I,111),I= 43, 48) /-2.469135802469136D-02 - $ ,3.086419753086420D-03,1.975308641975309D-01, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,111),I= 49, 54) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I,111),I= 55, 60) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,111),I= 61, 66) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,111),I= 67, 72) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I,111),I= 73, 78) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I,111),I= 79, 84) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I,111),I= 85, 90) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I,111),I= 91, 96) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,111),I= 97,102) 
/1.975308641975309D+00, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,1.530864197530864D+00,1.753086419753086D - $ +00/ - DATA (CF(I,111),I=103,108) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I,111),I=109,114) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ - DATA (CF(I,111),I=115,120) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ + DATA (CF(I),I=7206,7215) /4096,-1024,1280,128,-448,992,992,-160, + $ -160,-16/ C 1 T(7,5,2,1,6,3,4) - DATA (CF(I,112),I= 1, 6) /-1.358024691358025D-01 - $ ,8.641975308641975D-02,-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.913580246913580D-01, - $ -1.635802469135803D-01/ - DATA (CF(I,112),I= 7, 12) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I,112),I= 13, 18) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I,112),I= 19, 24) /3.086419753086420D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02,8.641975308641975D-02,-1.913580246913580D-01/ - DATA (CF(I,112),I= 25, 30) /-1.635802469135803D-01, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -4.135802469135803D-01,-2.191358024691358D-01 - $ ,5.864197530864197D-02/ - DATA (CF(I,112),I= 31, 36) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I,112),I= 37, 42) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I,112),I= 43, 48) /3.086419753086420D-03 - $ ,3.086419753086420D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,112),I= 49, 54) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,112),I= 55, 60) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,112),I= 61, 66) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,112),I= 67, 72) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,112),I= 73, 78) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I,112),I= 79, 84) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,112),I= 85, 90) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,112),I= 91, 96) 
/-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,112),I= 97,102) /-2.469135802469136D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,112),I=103,108) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,112),I=109,114) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I,112),I=115,120) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ + DATA (CF(I),I=7216,7224) /4096,128,-1024,992,1136,-160,1280,-16 + $ ,128/ C 1 T(7,5,2,6,1,3,4) - DATA (CF(I,113),I= 1, 6) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I,113),I= 7, 12) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,113),I= 13, 18) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,113),I= 19, 24) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,113),I= 25, 30) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I,113),I= 31, 36) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I,113),I= 37, 42) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I,113),I= 43, 48) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I,113),I= 49, 54) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,113),I= 55, 60) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,113),I= 61, 66) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,113),I= 67, 72) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I,113),I= 73, 78) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I,113),I= 79, 84) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I,113),I= 85, 90) /3.086419753086420D-02, - 
$ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I,113),I= 91, 96) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,113),I= 97,102) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I,113),I=103,108) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I,113),I=109,114) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ - DATA (CF(I,113),I=115,120) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ + DATA (CF(I),I=7225,7232) /4096,-1024,-16,128,-160,-16,-1024,128/ C 1 T(7,5,6,1,2,3,4) - DATA (CF(I,114),I= 1, 6) /-3.580246913580247D-01, - $ -1.358024691358025D-01,4.197530864197531D-01, - $ -3.580246913580247D-01,1.586419753086420D+00, - $ -1.358024691358025D-01/ - DATA (CF(I,114),I= 7, 12) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I,114),I= 13, 18) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I,114),I= 19, 24) /-1.913580246913580D-01 - $ ,8.641975308641975D-02,3.086419753086420D-02,3.086419753086420D - $ -03,-1.913580246913580D-01,3.086419753086420D-02/ - DATA (CF(I,114),I= 25, 30) /1.364197530864198D+00, - $ -4.135802469135803D-01,-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.558641975308642D+00, - $ -4.135802469135803D-01/ - DATA (CF(I,114),I= 31, 36) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,114),I= 37, 42) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,114),I= 43, 48) /-2.191358024691358D-01, - $ -1.913580246913580D-01,3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,114),I= 49, 54) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,114),I= 55, 60) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,114),I= 61, 66) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,114),I= 67, 72) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I,114),I= 73, 78) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I,114),I= 79, 84) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 
- $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I,114),I= 85, 90) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I,114),I= 91, 96) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,114),I= 97,102) /1.530864197530864D+00, - $ -6.913580246913580D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,1.530864197530864D+00, - $ -2.469135802469136D-01/ - DATA (CF(I,114),I=103,108) /1.753086419753086D+00 - $ ,1.530864197530864D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I,114),I=109,114) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ - DATA (CF(I,114),I=115,120) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ + DATA (CF(I),I=7233,7239) /4096,-160,-16,-16,128,128,-1024/ C 1 T(7,5,6,2,1,3,4) - DATA (CF(I,115),I= 1, 6) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I,115),I= 7, 12) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I,115),I= 13, 18) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I,115),I= 19, 24) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,115),I= 25, 30) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,115),I= 31, 36) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I,115),I= 37, 42) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I,115),I= 43, 48) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I,115),I= 49, 54) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I,115),I= 55, 60) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I,115),I= 61, 66) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I,115),I= 67, 72) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,115),I= 73, 78) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ 
-03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,115),I= 79, 84) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I,115),I= 85, 90) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,115),I= 91, 96) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,115),I= 97,102) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I,115),I=103,108) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,115),I=109,114) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,115),I=115,120) /1.264197530864197D+01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01,1.975308641975309D - $ +00/ + DATA (CF(I),I=7240,7245) /4096,-1024,-1024,128,128,1280/ C 1 T(7,6,1,2,5,3,4) - DATA (CF(I,116),I= 1, 6) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I,116),I= 7, 12) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I,116),I= 13, 18) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I,116),I= 19, 24) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,116),I= 25, 30) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I,116),I= 31, 36) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I,116),I= 37, 42) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I,116),I= 43, 48) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,116),I= 49, 54) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,116),I= 55, 60) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I,116),I= 61, 66) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I,116),I= 67, 72) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ 
,3.086419753086420D-03/ - DATA (CF(I,116),I= 73, 78) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,116),I= 79, 84) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,116),I= 85, 90) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I,116),I= 91, 96) /1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -2.469135802469136D-02/ - DATA (CF(I,116),I= 97,102) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I,116),I=103,108) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,116),I=109,114) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,116),I=115,120) /-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01,1.975308641975309D - $ +00,-1.580246913580247D+00,1.975308641975309D-01/ + DATA (CF(I),I=7246,7250) /4096,128,1280,-1024,128/ C 1 T(7,6,1,5,2,3,4) - DATA (CF(I,117),I= 1, 6) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,117),I= 7, 12) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I,117),I= 13, 18) /-1.913580246913580D-01 - $ ,1.530864197530864D+00,-1.635802469135803D-01 - $ ,5.864197530864197D-02,-2.469135802469136D-01 - $ ,3.086419753086420D-02/ - DATA (CF(I,117),I= 19, 24) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I,117),I= 25, 30) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I,117),I= 31, 36) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I,117),I= 37, 42) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I,117),I= 43, 48) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,117),I= 49, 54) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I,117),I= 55, 60) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I,117),I= 61, 66) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I,117),I= 67, 72) 
/8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,117),I= 73, 78) /3.086419753086420D-02, - $ -2.469135802469136D-01,-1.913580246913580D-01, - $ -2.191358024691358D-01,-2.469135802469136D-02 - $ ,3.086419753086420D-03/ - DATA (CF(I,117),I= 79, 84) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,117),I= 85, 90) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,117),I= 91, 96) /1.975308641975309D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I,117),I= 97,102) /-2.469135802469136D-01 - $ ,1.975308641975309D+00,1.530864197530864D+00,1.753086419753086D - $ +00,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,117),I=103,108) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I,117),I=109,114) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I,117),I=115,120) /-1.580246913580247D+00 - $ ,1.975308641975309D-01,1.264197530864197D+01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01/ + DATA (CF(I),I=7251,7254) /4096,-1024,1280,128/ C 1 T(7,6,2,1,5,3,4) - DATA (CF(I,118),I= 1, 6) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I,118),I= 7, 12) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I,118),I= 13, 18) /1.586419753086420D+00, - $ -1.913580246913580D-01,-1.358024691358025D-01, - $ -1.635802469135803D-01,3.086419753086420D-02,3.086419753086420D - $ -01/ - DATA (CF(I,118),I= 19, 24) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,118),I= 25, 30) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I,118),I= 31, 36) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I,118),I= 37, 42) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I,118),I= 43, 48) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,118),I= 49, 54) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I,118),I= 55, 60) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,118),I= 61, 66) /-1.635802469135803D-01 - $ 
,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,118),I= 67, 72) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,118),I= 73, 78) /-1.913580246913580D-01 - $ ,3.086419753086420D-02,8.641975308641975D-02, - $ -1.913580246913580D-01,3.086419753086420D-03,3.086419753086420D - $ -02/ - DATA (CF(I,118),I= 79, 84) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,118),I= 85, 90) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,118),I= 91, 96) /-2.469135802469136D-02, - $ -2.469135802469136D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,118),I= 97,102) /1.530864197530864D+00, - $ -2.469135802469136D-01,-6.913580246913580D-01 - $ ,1.530864197530864D+00,-2.469135802469136D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,118),I=103,108) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I,118),I=109,114) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,118),I=115,120) /1.975308641975309D-01 - $ ,1.975308641975309D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01,1.975308641975309D-01, - $ -1.580246913580247D+00/ + DATA (CF(I),I=7255,7257) /4096,128,-1024/ C 1 T(7,6,2,5,1,3,4) - DATA (CF(I,119),I= 1, 6) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I,119),I= 7, 12) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,119),I= 13, 18) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,119),I= 19, 24) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,119),I= 25, 30) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I,119),I= 31, 36) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I,119),I= 37, 42) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I,119),I= 43, 48) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,119),I= 49, 54) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I,119),I= 55, 60) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - 
$ -1.913580246913580D-01,5.864197530864197D-02, - $ -2.191358024691358D-01/ - DATA (CF(I,119),I= 61, 66) /3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-01,3.086419753086420D - $ -02,1.975308641975309D+00,-2.469135802469136D-01/ - DATA (CF(I,119),I= 67, 72) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,119),I= 73, 78) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,119),I= 79, 84) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,119),I= 85, 90) /3.086419753086420D-03, - $ -2.469135802469136D-02,3.086419753086420D-02,3.086419753086420D - $ -03,1.975308641975309D-01,-2.469135802469136D-02/ - DATA (CF(I,119),I= 91, 96) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I,119),I= 97,102) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,119),I=103,108) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I,119),I=109,114) /-2.469135802469136D-02 - $ ,1.975308641975309D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02,-1.580246913580247D+00 - $ ,1.975308641975309D-01/ - DATA (CF(I,119),I=115,120) /1.975308641975309D-01, - $ -1.580246913580247D+00,1.975308641975309D+00,1.975308641975309D - $ -01,1.264197530864197D+01,-1.580246913580247D+00/ + DATA (CF(I),I=7258,7259) /4096,-1024/ C 1 T(7,6,5,1,2,3,4) - DATA (CF(I,120),I= 1, 6) /4.197530864197531D-01, - $ -3.580246913580247D-01,-3.580246913580247D-01, - $ -1.358024691358025D-01,-1.358024691358025D-01 - $ ,1.586419753086420D+00/ - DATA (CF(I,120),I= 7, 12) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,-1.358024691358025D-01 - $ ,8.641975308641975D-02,-1.635802469135803D-01, - $ -1.913580246913580D-01/ - DATA (CF(I,120),I= 13, 18) /-1.358024691358025D-01, - $ -1.635802469135803D-01,1.586419753086420D+00, - $ -1.913580246913580D-01,3.086419753086420D-01,3.086419753086420D - $ -02/ - DATA (CF(I,120),I= 19, 24) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,120),I= 25, 30) /-3.580246913580247D-01 - $ ,1.364197530864198D+00,1.364197530864198D+00, - $ -4.135802469135803D-01,-4.135802469135803D-01 - $ ,1.558641975308642D+00/ - DATA (CF(I,120),I= 31, 36) /-1.358024691358025D-01, - $ -4.135802469135803D-01,8.641975308641975D-02, - $ -6.913580246913580D-01,-1.913580246913580D-01 - $ ,1.530864197530864D+00/ - DATA (CF(I,120),I= 37, 42) /-1.635802469135803D-01 - $ ,5.864197530864197D-02,-1.913580246913580D-01 - $ ,1.530864197530864D+00,3.086419753086420D-02, - $ -2.469135802469136D-01/ - DATA (CF(I,120),I= 43, 48) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,120),I= 49, 54) /-1.358024691358025D-01, - $ -4.135802469135803D-01,-1.635802469135803D-01, - $ -1.913580246913580D-01,5.864197530864197D-02, - $ 
-2.191358024691358D-01/ - DATA (CF(I,120),I= 55, 60) /1.586419753086420D+00 - $ ,1.558641975308642D+00,-1.913580246913580D-01 - $ ,1.530864197530864D+00,-2.191358024691358D-01 - $ ,1.753086419753086D+00/ - DATA (CF(I,120),I= 61, 66) /3.086419753086420D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02, - $ -2.469135802469136D-01,-2.469135802469136D-01 - $ ,1.975308641975309D+00/ - DATA (CF(I,120),I= 67, 72) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,120),I= 73, 78) /8.641975308641975D-02, - $ -1.913580246913580D-01,-1.913580246913580D-01 - $ ,3.086419753086420D-02,3.086419753086420D-02,3.086419753086420D - $ -03/ - DATA (CF(I,120),I= 79, 84) /-1.913580246913580D-01, - $ -2.191358024691358D-01,3.086419753086420D-02, - $ -2.469135802469136D-01,3.086419753086420D-03, - $ -2.469135802469136D-02/ - DATA (CF(I,120),I= 85, 90) /3.086419753086420D-02 - $ ,3.086419753086420D-03,3.086419753086420D-03, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,120),I= 91, 96) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I,120),I= 97,102) /-6.913580246913580D-01 - $ ,1.530864197530864D+00,1.530864197530864D+00, - $ -2.469135802469136D-01,-2.469135802469136D-01, - $ -2.469135802469136D-02/ - DATA (CF(I,120),I=103,108) /1.530864197530864D+00 - $ ,1.753086419753086D+00,-2.469135802469136D-01 - $ ,1.975308641975309D+00,-2.469135802469136D-02 - $ ,1.975308641975309D-01/ - DATA (CF(I,120),I=109,114) /-2.469135802469136D-01, - $ -2.469135802469136D-02,-2.469135802469136D-02 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00/ - DATA (CF(I,120),I=115,120) /1.975308641975309D+00 - $ ,1.975308641975309D-01,1.975308641975309D-01, - $ -1.580246913580247D+00,-1.580246913580247D+00 - $ ,1.264197530864197D+01/ + DATA (CF(I),I=7260,7260) /4096/ C 1 T(7,6,5,2,1,3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -18811,10 +10175,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -18823,6 +10189,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/addmothers.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/addmothers.f index 9a31ed201d..6f32477b9e 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/addmothers.f @@ -21,7 +21,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, integer icol ! 
color selected integer isym(nexternal,99), jsym - integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,nc,ic + integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,ic integer mo_color,da_color(2),itmp integer ito(-nexternal+3:nexternal),iseed,maxcolor,maxorg integer icolalt(2,-nexternal+2:2*nexternal-3) @@ -113,14 +113,15 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif lconfig = vec_igraph1(ivec) endif - + is_LC=.true. + maxcolor=0 c c Choose a color flow which is certain to work with the propagator c structure of the chosen diagram and use that as an alternative c if (icol.eq.0) then do i=1,nexternal - icolalt(1,i)=0 + icolalt(1,i)=0 icolalt(2,i)=0 enddo else diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cluster.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cluster.f index b8995283ed..907894ea89 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cluster.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cluster.f @@ -556,6 +556,8 @@ logical function cluster(p, ivec) jwin = 0 cluster=.false. clustered=.false. + iwin =0 + jwin =0 do i=0,3 pcmsp(i)=0 enddo @@ -665,8 +667,11 @@ logical function cluster(p, ivec) c initialize graph storage igraphs(0)=0 nleft=nexternal -c cluster - if (iwin.eq.0.or.jwin.eq.0) stop 21 + if(iwin.eq.0.or.jwin.eq.0)then + cluster=.false. + return + endif +c cluster do n=1,nexternal-2 c combine winner imocl(n)=imap(iwin,2)+imap(jwin,2) diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/color_sum.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/color_sum.h new file mode 100644 index 0000000000..71b5b1eaad --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/color_sum.h @@ -0,0 +1,105 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
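The MATRIX1 hunks above replace the dense NCOLOR-by-NCOLOR REAL*8 color matrix, previously spelled out column by column in DATA statements, with a single packed integer array covering only the upper triangle: the inner loop now runs J from I to NCOLOR while advancing a running CF_INDEX, and the common denominator is divided out once at the end (MATRIX1 = MATRIX1/DENOM). The sketch below is a minimal C++ rendering of that summation, not the generated code itself; it assumes, as the data above suggest, that the packed entries are the color coefficients rescaled to integers by the common denominator, with off-diagonal entries stored doubled so that the real part of the triangular sum reproduces the full symmetric sum. All names (colorSumPacked, cfPacked, denom) are illustrative.

#include <complex>
#include <vector>

// Hedged sketch of a packed upper-triangular color sum (illustrative names,
// not the plugin's code). cfPacked holds ncolor*(ncolor+1)/2 integers, row by
// row with j >= i; for the 120 color flows above that is 7260 entries,
// matching the last packed index in the new DATA statements.
double colorSumPacked( const std::vector<std::complex<double>>& jamp, // one partial amplitude per color flow
                       const std::vector<int>& cfPacked,              // packed triangular color matrix
                       double denom )                                 // common denominator (DENOM in the Fortran)
{
  const int ncolor = static_cast<int>( jamp.size() );
  double me = 0;
  int idx = 0; // running index into the packed triangle, as CF_INDEX in MATRIX1
  for( int i = 0; i < ncolor; i++ )
  {
    std::complex<double> ztemp = 0; // as ZTEMP
    for( int j = i; j < ncolor; j++ ) ztemp += static_cast<double>( cfPacked[idx++] ) * jamp[j];
    me += ( ztemp * std::conj( jamp[i] ) ).real(); // diagonal counted once, doubled off-diagonals supply the symmetric rest
  }
  return me / denom; // single division at the end, as "MATRIX1 = MATRIX1/DENOM"
}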
+ +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype_ref( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + static __device__ inline const cxtype + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + }; +#else + class HostAccessJamp + { + public: + static inline cxtype_sv& + kernelAccessIcol( cxtype_sv* buffer, const int icol ) + { + return buffer[icol]; + } + static inline cxtype_sv& + kernelAccessIcol( fptype* buffer, const int icol ) + { + return reinterpret_cast<cxtype_sv*>( buffer )[icol]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + +
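The "new1" striding documented in DeviceAccessJamp above lays out the jamp buffer for one helicity as two contiguous ncolor*nevt planes, all real parts first and then all imaginary parts, with the event index running fastest. Each plane can therefore be read directly as a column-major nevt-by-ncolor matrix with leading dimension nevt, the shape a cuBLAS/hipBLAS GEMM over all events expects, which is presumably what makes the same layout usable both by the CUDA kernels and by the HASBLAS=hasBlas path. Below is a small sketch of the index arithmetic; the helper name jampIndexNew1 is hypothetical and not part of color_sum.h:

#include <cassert>
#include <cstddef>

// Hedged sketch: flat offset into the "new1" jamp buffer for one helicity.
// part = 0 selects the real plane, part = 1 the imaginary plane; within a
// plane the event index ievt runs fastest, so element (ievt, icol) of the
// column-major nevt-by-ncolor matrix sits at offset icol * nevt + ievt.
inline std::size_t jampIndexNew1( std::size_t part, std::size_t icol, std::size_t ievt, std::size_t ncolor, std::size_t nevt )
{
  assert( part < 2 && icol < ncolor && ievt < nevt );
  return part * ncolor * nevt + icol * nevt + ievt; // same arithmetic as kernelAccessIcol above
}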
//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values: MI200:gfx90a, MI300X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist??
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cuts.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cuts.f index 7898714201..bd50ab1357 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cuts.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cuts.f @@ -307,12 +307,18 @@ LOGICAL FUNCTION PASSCUTS(P, VECSIZE_USED) c c Limit S_hat c - if (dsqrt_shat.ne.0d0)then - if (nincoming.eq.2.and.sumdot(p(0,1),p(0,2),1d0) .lt. dsqrt_shat**2) then - passcuts=.false. - return - endif - endif + if(nincoming.eq.2) then + if (dsqrt_shat.ne.0d0.or.dsqrt_shatmax.ne.-1d0)then + xvar = sumdot(p(0,1),p(0,2),1d0) + if (xvar .lt. dsqrt_shat**2)then + passcuts=.false. 
+ return + else if (dsqrt_shatmax.ne.-1d0 .and. xvar .gt. dsqrt_shatmax**2)then + passcuts = .false. + return + endif + endif + endif C $B$ DESACTIVATE_CUT $E$ !This is a tag for MadWeight if(debug) write (*,*) '=============================' diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/diagram_boilerplate.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/diagram_boilerplate.h new file mode 100644 index 0000000000..96a34fb1bf --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/diagram_boilerplate.h @@ -0,0 +1,103 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin. + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-local-typedefs" // for CI_ACCESS and CD_ACCESS + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + + //------------- + // GPU only + //------------- + + //using namespace mg5amcGpu; + using W_ACCESS = DeviceAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = DeviceAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( channelIds ); +#endif + + // Wavefunctions + // Buffer wfs for one helicity and nevt events is a DeviceBufferSimple with ( nwf * nevt * nw6 * nx2 ) fptypes + // The striding between the nwf wavefunction buffers is ( nevt * nw6 * nx2 ) fptypes + // Internally diagramXXX methods pass a w_fp[iwf] to ixx/FFV methods (as argument 'fptype wavefunctions[]') + // Internally ixx/FFV methods call 'cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions )' and then use fi[iw6] + // This means that the fi pointer must point to a [RIRIRIRIRIRI] contiguous buffer of size nw6*nx2=12 + // The striding between events is nw6*nx2=12 and this is what W_ACCESS::kernelAccess must respect + // (En passant, note that this means that events cannot be contiguous in the present code, memory is not coalesced) + const int nevt = gridDim.x * blockDim.x; + fptype* w_fp[nwf]; + for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = wfs + iwf * nevt * nw6 * mgOnGpu::nx2; + + // Couplings + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic push +#pragma nv_diag_suppress 186 // e.g. 
<<warning #186-D: pointless comparison of unsigned integer with zero>> +#endif + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic pop +#endif + const fptype* COUPs[nxcoup]; + for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; + +#else + + //------------- + // C++ only + //------------- + + //using namespace mg5amcCpu; + using W_ACCESS = HostAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = HostAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current SIMD event page (C++) + unsigned int channelId = *channelIds; +#endif + + // Wavefunctions + // Reinterpret wfs as "cxtype_sv w_sv[nwf][nw6]" and build "fptype* w_fp[nwf]" where "w_fp[iwf] = (fptype*)( w_sv[iwf] )" + fptype (*w_fp)[nw6 * neppV * mgOnGpu::nx2] = (fptype (*)[nw6 * neppV * mgOnGpu::nx2])(wfs); + +#endif + + //------------- + // GPU or C++ + //------------- + + // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) + cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram + fptype* amp_fp; // proof of concept for using fptype* in the interface + amp_fp = reinterpret_cast<fptype*>( amp_sv ); + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) + fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); +#else + assert( channelIds == nullptr ); + assert( numerators == nullptr ); + assert( denominators == nullptr ); +#endif /* clang-format on */ + +#pragma GCC diagnostic pop diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/genps.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/genps.f index 1c32e93f5d..8503bdbec8 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/genps.f @@ -1373,6 +1373,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) double precision smin,smax,spole,swidth,s,jac double precision x logical pass + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' c c Local c @@ -1384,6 +1388,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1d0)then + smax = min(smax, dsqrt_shatmax**2) + endif + pass=.true. if (jac .eq. 0 .and. .not.
warned0) then print*,'Input jacobian 0 in genps' @@ -1628,7 +1636,10 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) DOUBLE PRECISION ETA,ETAMIN,ETAMAX logical warned data warned/.false./ - + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' C------------ C BEGIN CODE C------------ @@ -1645,7 +1656,11 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) C IF THERE IS NO S CHANNEL POLE USE BELOW: TAUMIN = 0d0 !SMIN/S !keep scale fix - TAUMAX = 1D0 + if (dsqrt_shatmax.ne.-1d0)then + TAUMAX=dsqrt_shatmax**2/S + else + TAUMAX = 1D0 + endif TAU = (TAUMAX-TAUMIN)*X(1)+TAUMIN SJACOBI= sjacobi*(TAUMAX-TAUMIN) @@ -1915,7 +1930,7 @@ double precision function get_channel_cut(p, config) if(sde_strat.eq.2)then t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) - get_channel_cut = get_channel_cut / ((t-Mass)*(t+Mass)+stot*1d-10)**2 + get_channel_cut = get_channel_cut / (t-Mass**2+stot*1d-10)**2 endif c write(*,*) i, "t, Mass, fact", t, Mass, ((t-Mass)*(t+Mass))**2,get_channel_cut t = t/stot @@ -1930,9 +1945,9 @@ double precision function get_channel_cut(p, config) t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) Width = prwidth(-i, config) - tmp = (t-Mass)*(t+Mass) + tmp = (t-Mass**2) tmp2 = Mass*Width - get_channel_cut = get_channel_cut* (tmp**2 - tmp2**2)/(tmp**2 + tmp2**2)**2 + get_channel_cut = get_channel_cut/(tmp**2 + tmp2**2) endif c write(*,*) i, "s, Mass, Width, fact", t, Mass, Width, (((t-Mass)*(t+Mass) )**2 + Width**2*Mass**2), get_channel_cut endif diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/myamp.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/myamp.f index 9e5f8d44dd..5360566ef4 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/myamp.f @@ -231,6 +231,7 @@ subroutine set_peaks double precision x1,x2,xk(nexternal) double precision dr,mtot,etot,xqfact double precision spmass + double precision stot ! the min of the physical stot and dsqrt_shatmax**2 (when the latter is set) integer i, iconfig, l1, l2, j, nt, nbw, iproc, k integer iden_part(-nexternal+1:nexternal) @@ -285,8 +286,8 @@ subroutine set_peaks integer lbw(0:nexternal) !Use of B.W. common /to_BW/ lbw - double precision stot,m1,m2 - common/to_stot/stot,m1,m2 + double precision real_stot,m1,m2 + common/to_stot/real_stot,m1,m2 include 'coupl.inc' !
needs VECSIZE_MEMMAX (defined in vector.inc) include 'cuts.inc' @@ -309,6 +310,12 @@ subroutine set_peaks c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1d0)then + stot = min(real_stot, dsqrt_shatmax**2) + else + stot = real_stot + endif + iconfig = this_config c needs to be initialise to avoid segfault do i = -nexternal,-1 diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/reweight.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/reweight.f index 0a0bafa7c1..189ba41449 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/reweight.f @@ -1186,7 +1186,7 @@ logical function setclscales(p, keepq2bck, ivec) if(nexternal.gt.3) pt2ijcl(nexternal-3)=q2fact(2) else if(.not.fixed_fac_scale1) q2fact(1)=scalefact**2*pt2ijcl(nexternal-2) - if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*q2fact(1) + if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*pt2ijcl(nexternal-2) endif elseif(jcentral(1).eq.0)then if(.not.fixed_fac_scale1) q2fact(1) = scalefact**2*pt2ijcl(jfirst(1)) diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc index 4eec5db13c..216a90a302 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; #endif diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/symmetry.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/symmetry.f index 309540a0a2..2afd9e9f75 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/symmetry.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/symmetry.f @@ -232,7 +232,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, c write(*,*) 'mapping',ic,mapconfig(i),icode if (icode .eq.
0) then c Create format string based on number of digits - write(formstr,'(a,i1,a)') '(I',nconf,'$)' + write(formstr,'(a,i1,a)') '(I',nconf,',$)' write(*,formstr) mapconfig(i) c Write symmetry factors write(formstr2,'(a,i2,a)') '(2i',nsym,')' @@ -242,10 +242,10 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode if(nconf+ncode+1.lt.10) then write(formstr,'(a,i1,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' else write(formstr,'(a,i2,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' endif write(*,formstr) dconfig c Write symmetry factors @@ -260,7 +260,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode write(27,formstr2) dconfig,use_config(i) endif - write(*,'(a$)') ' ' + write(*,'(a,$)') ' ' 100 call bw_increment_array(iarray,imax,ibase,done) enddo else diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/unwgt.f b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/unwgt.f index f602511c94..d1247f1849 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/unwgt.f +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/unwgt.f @@ -497,6 +497,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer ip, np, ic, nc integer ida(2),ito(-nexternal+3:nexternal),ns,nres,ires,icloop integer iseed + double precision beam_mass double precision pboost(0:3) double precision beta, get_betaz double precision ebi(0:3), ebo(0:3) @@ -506,7 +507,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer idup(nexternal,maxproc,maxsproc) integer mothup(2,nexternal) integer icolup(2,nexternal,maxflow,maxsproc) - + double precision eta integer nsym integer ievent @@ -638,21 +639,20 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) if (nincoming.eq.2) then if (xbk(1) .gt. 0d0 .and. xbk(1) .le. 1d0 .and. $ xbk(2) .gt. 0d0 .and. xbk(2) .le. 1d0) then - if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0).and.xbk(2).ne.1d0) then - ! construct the beam momenta in each frame and compute the related (z)boost - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4).and.ebeam(1).gt.10d0*m1)then - local_mass = 0d0 - else - local_mass = m1 - endif + if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0)) then + if((abs(lpp(1)).gt.2.and.abs(lpp(1)).ne.9).or.xbk(1).eq.1d0)then + beam_mass = pmass(1) + else + beam_mass = m1 + endif ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(1) ebo(1) = 0 ebo(2) = 0 - ebo(3) = DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(1).eq.1d0) then pb(0,isym(1,jsym)) = ebo(0) @@ -668,20 +668,19 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo else - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4.and.ebeam(2).gt.10d0*m2))then - local_mass = 0d0 - else - local_mass = m2 - endif - ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam + if((abs(lpp(2)).gt.2.and.abs(lpp(2)).ne.9).or.xbk(2).eq.1d0)then + beam_mass = pmass(2) + else + beam_mass = m2 + endif ebi(0) = p(0,2)/xbk(2) ! 
this assumes that particle 2 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = -1d0*DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = -1d0*DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(2) ebo(1) = 0 ebo(2) = 0 - ebo(3) = -1d0*DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = -1d0*DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(2).eq.1d0) then pb(0,isym(2,jsym)) = ebo(0) @@ -701,6 +700,21 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) write(*,*) 'Warning bad x1 or x2 in write_leshouche', $ xbk(1),xbk(2) endif + do j=1,nexternal + call zboost_with_beta(p(0,j),beta,pb(0,isym(j,jsym))) + pb(4,isym(j,jsym))=pmass(j) + enddo + + ! check for numerical_accuracy + if (pb(0,1).gt.ebeam(1).or.pb(0,2).gt.ebeam(2))then + ! go back to old method --more accurate when boosting with xbk close to one-- + eta = sqrt(xbk(1)*ebeam(1)/(xbk(2)*ebeam(2))) + pboost(0)=p(0,1)*(eta + 1d0/eta) + pboost(3)=p(0,1)*(eta - 1d0/eta) + do j=1,nexternal + call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) + enddo + endif else do j=1,nexternal call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) @@ -709,6 +723,8 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo endif + + if (IMIRROR.eq.2.and.pmass(1).ne.pmass(2)) then c Note that in this context isym(1,jsym) should never be "2" since the mass differ pb(4,isym(1,jsym))=pmass(2) diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/Gridpack/gridrun b/epochX/cudacpp/gg_ttggg.mad/bin/internal/Gridpack/gridrun index 8c8f7d3940..01d4ab53f5 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/Gridpack/gridrun +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/Gridpack/gridrun @@ -91,7 +91,7 @@ import internal.madevent_interface as cmd_interface try: - cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2]) + cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2], nprocs=args[3], maxevts=args[4]) except KeyboardInterrupt: print('Quit on KeyboardInterrupt') diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/Gridpack/run.sh b/epochX/cudacpp/gg_ttggg.mad/bin/internal/Gridpack/run.sh index 20adf572c2..2d149f96be 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/Gridpack/run.sh +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/Gridpack/run.sh @@ -14,6 +14,18 @@ # USAGE : run [num_events] [iseed] ## ############################################################################# +function usage() { + local retcode="${1:-1}" # default return code is 1 + echo "Usage:" + echo " run.sh [options] [num events] [seed]" + echo " run.sh [options] [num events] [seed] [granularity]" + echo "Options:" + echo " -h, --help print this message and exit" + echo " -p, --parallel [num procs] number of processes to run in parallel" + echo " -m, --maxevts [num events] maximum number of unweighted events per job" + exit $retcode +} + if [[ -d ./madevent ]]; then DIR='./madevent' else @@ -32,23 +44,46 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib # For Mac OS X export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib +pos_args=() +nprocs=1 +maxevts=2500 -if [[ ($1 != "") && ("$2" != "") && ("$3" == "") ]]; then - num_events=$1 - seed=$2 - gran=1 -elif [[ ($1 != "") && ("$2" != "") && ("$3" != "") ]]; then - num_events=$1 - seed=$2 - gran=$3 -else - echo "Warning: input is not correct. 
script requires two arguments: NB_EVENT SEED" fi +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage 0 ;; + -p|--parallel) + nprocs="$2" && shift && shift ;; + -m|--maxevts) + maxevts="$2" && shift && shift ;; + -*) + echo "Error: Unknown option $1" && usage ;; + *) + pos_args+=("$1") && shift ;; + esac +done + +case `echo "${pos_args[@]}" | wc -w | tr -d " "` in + "2") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=1 + ;; + "3") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=${pos_args[2]} + ;; + *) + echo "Error: number of arguments is not correct" + usage + ;; +esac -echo "Now generating $num_events events with random seed $seed and granularity $gran" +echo "Now generating $num_events events with random seed $seed and granularity $gran using $nprocs processes" ############ RUN THE PYTHON CODE ##################### -${DIR}/bin/gridrun $num_events $seed $gran +${DIR}/bin/gridrun $num_events $seed $gran $nprocs $maxevts ######################################################## ########### POSTPROCESSING ##################### diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py index 42d82818d0..2efb5954a6 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py @@ -353,7 +353,7 @@ def modify_init_cross(self, cross, allow_zero=False): assert "init" in self cross = dict(cross) - for key in cross.keys(): + for key in list(cross.keys()): if isinstance(key, str) and key.isdigit() and int(key) not in cross: cross[int(key)] = cross[key] @@ -1991,6 +1991,11 @@ def default_setup(self): self.add_param("PartonLevel:FSRinResonances", True, hidden=True, always_write_to_card=False, comment="Do not allow shower to run from decay product of unstable particle") self.add_param("ProcessLevel:resonanceDecays", True, hidden=True, always_write_to_card=False, comment="Do not allow unstable particle to decay.") + # Parameters only needed for main164 type of run (not pythia8/MG5 interface) + self.add_param("Main:HepMC", True, hidden=True, always_write_to_card=False, + comment="""Specify the type of output to be used by the main164 run. """) + self.add_param("HepMC:output", 'hepmc.gz', hidden=True, always_write_to_card=False, + comment="Specify the HepMC output file to be used by the main164 run.") # Add parameters controlling the subruns execution flow. # These parameters should not be part of PY8SubRun daughter. self.add_default_subruns('parameters') @@ -2087,8 +2092,10 @@ def MadGraphSet(self, name, value, **opts): force = False if name.lower() not in self or (force or name.lower() not in self.user_set): self.__setitem__(name, value, change_userdefine=False, **opts) - self.system_set.add(name.lower()) - + self.system_set.add(name.lower()) + else: + raise Exception("The parameter %s is already set to %s. You cannot change it."
% (name, self[name])) + def defaultSet(self, name, value, **opts): self.__setitem__(name, value, change_userdefine=False, **opts) @@ -2144,9 +2151,19 @@ def pythia8_formatting(value, formatv=None): else: return ','.join([PY8Card.pythia8_formatting(arg) for arg in value]) + # change of naming convention between the old MG5 interface and Pythia8's main164 + interface_to_164 = {'HEPMCoutput:file': 'HepMC:output', + 'SysCalc:fullCutVariation': '!SysCalc:fullCutVariation (not supported with 164)', + 'SysCalc:qCutList': '!SysCalc:qCutList (not supported with 164)', + 'SysCalc:qWeed': '!SysCalc:qWeed (not supported with 164)', + 'SysCalc:tmsList': '!SysCalc:tmsList (not supported with 164)', + 'HEPMCoutput:scaling' : '!HEPMCoutput:scaling (not supported with 164)', + 'LHEFInputs:nSubruns' : 'Main:numberOfSubruns'} + def write(self, output_file, template, read_subrun=False, - print_only_visible=False, direct_pythia_input=False, add_missing=True): + print_only_visible=False, direct_pythia_input=False, add_missing=True, + use_mg5amc_py8_interface=False): """ Write the card to output_file using a specific template. > 'print_only_visible' specifies whether or not the hidden parameters should be written out if they are in the hidden_params_to_always_write @@ -2155,7 +2172,12 @@ def write(self, output_file, template, read_subrun=False, in the self.visible_params_to_always_write list and are not user_set or system_set are commented. > If 'add_missing' is False then parameters that should be written_out but are absent - from the template will not be written out.""" + from the template will not be written out. + > 'use_mg5amc_py8_interface' indicates whether the MG5aMC-PY8 interface is used; + if it is not, some parameters need to be translated from the old convention to the new one + """ + + self.use_mg5amc_py8_interface = use_mg5amc_py8_interface # First list the visible parameters visible_param = [p for p in self if p.lower() not in self.hidden_param @@ -2297,7 +2319,16 @@ def group_params(params): else: # Just copy parameters which don't need to be specified if param.lower() not in self.params_to_never_write: - output.write(line) + + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param.strip()] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + output.write('%s=%s\n'%(param_entry,new_value)) + else: + output.write(line) else: output.write('! The following parameter was forced to be commented out by MG5aMC.\n') output.write('!
%s'%line) @@ -2313,6 +2344,7 @@ def group_params(params): if ((not direct_pythia_input) or (param.lower() in self.visible_params_to_always_write) or (param.lower() in self.user_set) or + (param.lower() in self.hidden_params_to_always_write) or (param.lower() in self.system_set)): template = '%s=%s' else: @@ -2321,6 +2353,19 @@ def group_params(params): # then they shouldn't be passed to Pythia template = '!%s=%s' + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + if 'Main:InternalAnalysis'.lower() in self.user_set and \ + self['Main:InternalAnalysis'].lower() == 'on': + output.write('InternalAnalysis:output = ./djrs.dat\n') + + #elif param in self.interface_to_164.values() and not direct_pythia_input: + # misc.sprint(use_mg5amc_py8_interface, direct_pythia_input,param) + # raise Exception('The parameter %s is not supported in the MG5aMC-PY8 interface. Please use the new interface.'%param_entry output.write(template%(param_entry, value_entry.replace(value,new_value))) @@ -2365,6 +2410,8 @@ def group_params(params): comment = '\n'.join('! %s'%c for c in self.comments[param.lower()].split('\n')) output.write(comment+'\n') + if not use_mg5amc_py8_interface and param in self.interface_to_164: + continue output.write('%s=%s\n'%(param,PY8Card.pythia8_formatting(self[param]))) # Don't close the file if we were reading a subrun, but simply write @@ -3318,7 +3365,6 @@ def retro_compatible_custom_fct(lines, mode=None): for i,line in enumerate(lines[:]): if search and re.search(include_pat, line): name = re.findall(include_pat, line)[0] - misc.sprint('DETECTED INCLUDE', name) if 'vector.inc' in name: search = False if 'run.inc' in name: @@ -3326,7 +3372,6 @@ def retro_compatible_custom_fct(lines, mode=None): search = False sol.append(line) if re.search(function_pat, line): - misc.sprint("DETECTED FCT") search = True return sol @@ -4050,8 +4095,8 @@ def post_set_fixed_fac_scale(card, value, change_userdefine, raiseerror, **opt): if 'fixed_fac_scale2' in card.user_set: card.user_set.remove('fixed_fac_scale2') - # #card['pdlabel1'] = value - # #card['pdlabel2'] = value + dict.__setitem__(card, 'fixed_fac_scale1', card['fixed_fac_scale']) + dict.__setitem__(card, 'fixed_fac_scale2', card['fixed_fac_scale']) @staticmethod def post_set(card, value, change_userdefine, raiseerror, name='unknown', **opt): @@ -4201,6 +4246,7 @@ def default_setup(self): self.add_param("bwcutoff", 15.0) self.add_param("cut_decays", False, cut='d') self.add_param('dsqrt_shat',0., cut=True) + self.add_param('dsqrt_shatmax', -1, cut=True) self.add_param("nhel", 0, include=False) self.add_param("limhel", 1e-8, hidden=True, comment="threshold to determine if an helicity contributes when not MC over helicity.") #pt cut @@ -4451,11 +4497,11 @@ def check_validity(self): time.sleep(5) if self['drjj'] != 0: if 'drjj' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjj\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjj\' to 0') self['drjj'] = 0 if self['drjl'] != 0: if 'drjl' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjl\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjl\' to 0') self['drjl'] = 0 if not self['auto_ptj_mjj']: if self['mmjj'] > self['xqcut']: @@ -4753,7 +4799,6 @@ def create_default_for_process(self, 
proc_characteristic, history, proc_def): self['fixed_fac_scale1'] = True self['nhel'] = 1 for i in beam_id_split[1]: - exit if abs(i) == 11: self['lpp1'] = -math.copysign(3,i) self['lpp2'] = math.copysign(3,i) @@ -5577,6 +5622,9 @@ def default_setup(self): #technical self.add_param('folding', [1,1,1], include=False) + + #bias + self.add_param('flavour_bias',[5,1], hidden=True, comment="Example: '5,100' means that the probability to generate an event with a bottom (or anti-bottom) quark is increased by a factor 100, but the weight of those events is reduced by a factor 100. Requires that the 'event_norm' is set to 'bias'.") #merging self.add_param('ickkw', 0, allowed=[-1,0,3,4], comment=" - 0: No merging\n - 3: FxFx Merging : http://amcatnlo.cern.ch/FxFx_merging.htm\n - 4: UNLOPS merging (No interface within MG5aMC)\n - -1: NNLL+NLO jet-veto computation. See arxiv:1412.8408 [hep-ph]") @@ -5790,6 +5838,17 @@ def check_validity(self): if self['mcatnlo_delta'] and not self['parton_shower'].lower() == 'pythia8': raise InvalidRunCard("MC@NLO-DELTA only possible with matching to Pythia8") + # check that the flavour_bias is consistent + if len(self['flavour_bias']) != 2: + raise InvalidRunCard("'flavour_bias' should contain exactly two numbers: the abs(PDG) of the flavour to enhance, and the enhancement multiplication factor.") + for i in self['flavour_bias']: + if i < 0: + raise InvalidRunCard("flavour and multiplication factor should be positive in the flavour_bias parameter") + if self['flavour_bias'][1] != 1 and self['event_norm'] != 'bias': + logger.warning('Non-trivial flavour enhancement factor: setting event normalisation to "bias"') + self['event_norm']='bias' + + # check that ebeam is bigger than the proton mass. for i in [1,2]: # do not for proton mass if not proton PDF (or when scan initialization) diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/check_param_card.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/check_param_card.py index bc785b5de6..a34705f6bc 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/check_param_card.py @@ -1092,11 +1092,11 @@ def write_summary(self, path, order=None, lastline=False, nbcol=20): to_print = self.cross[-1:] for info in to_print: name = info['run_name'] - bench = info['bench'] + bench = [float(x) for x in info['bench']] data = [] for k in keys: if k in info: - data.append(info[k]) + data.append(float(info[k])) else: data.append(0.) 
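Note on the PY8Card changes above: when the card is written for Pythia8's stock main164 driver (use_mg5amc_py8_interface=False together with direct_pythia_input=True), old-convention keys are renamed, or commented out with a leading '!', via the interface_to_164 map, and HepMC output additionally requires the Main:HepMC switch. A minimal standalone sketch of that translation, where only the mapping entries mirror the patch and translate_for_164 is an illustrative helper, not code from the diff:

import io

# a few entries copied from the interface_to_164 map; a leading '!' comments a key out
INTERFACE_TO_164 = {
    'HEPMCoutput:file': 'HepMC:output',
    'LHEFInputs:nSubruns': 'Main:numberOfSubruns',
    'SysCalc:qWeed': '!SysCalc:qWeed (not supported with 164)',
}

def translate_for_164(param, value, output):
    """Write one old-convention setting in the main164 convention."""
    entry = INTERFACE_TO_164.get(param, param)
    if entry == 'HepMC:output':
        # special case: HepMC output needs two flags
        output.write('%s=%s\n' % ('Main:HepMC', 'on'))
    output.write('%s=%s\n' % (entry, value))

out = io.StringIO()
translate_for_164('HEPMCoutput:file', 'events.hepmc', out)
print(out.getvalue())  # -> Main:HepMC=on, then HepMC:output=events.hepmc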
ff.write(formatting % tuple([name] + bench + data)) diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/cluster.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/cluster.py index 95ef45b5f3..66069293d2 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/cluster.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/cluster.py @@ -602,6 +602,7 @@ def __init__(self, *args, **opt): self.submitted = six.moves.queue.Queue() # one entry by job submitted self.stoprequest = threading.Event() #flag to ensure everything to close self.demons = [] + self.gpus_list = [] self.nb_done =0 if 'nb_core' in opt: self.nb_core = opt['nb_core'] @@ -623,23 +624,46 @@ def __init__(self, *args, **opt): self.done_pid_queue = six.moves.queue.Queue() self.fail_msg = None + mg5_gpu_env_str = 'MG5_GPU_VISIBLE_DEVICES' + gpu_variables = [['NVIDIA_VISIBLE_DEVICES', 'CUDA_VISIBLE_DEVICES'], + ['ROCR_VISIBLE_DEVICES', 'HIP_VISIBLE_DEVICES'],] + if mg5_gpu_env_str in os.environ: + new_var = os.environ[mg5_gpu_env_str].split(',') + if len(new_var) == 2: + gpu_variables.insert(0, new_var) + else: + logger.error('Invalid format for %s=%s, it should be a comma-separated list of two elements' % (mg5_gpu_env_str, os.environ[mg5_gpu_env_str])) + for get_var,set_var in gpu_variables: + if get_var in os.environ: + self.gpus_list = os.environ.get(get_var).split(',') + self.gpu_set_var = set_var + self.gpus_count = len(self.gpus_list) + logger.info('Found %s GPUs: %s' % (self.gpus_count, self.gpus_list)) def start_demon(self): import threading - t = threading.Thread(target=self.worker) + env2 = None + if len(self.gpus_list): + env2 = os.environ.copy() + this_gpu_idx = len(self.demons) % self.gpus_count + env2[self.gpu_set_var] = self.gpus_list[this_gpu_idx] + t = threading.Thread(target=self.worker, kwargs={'env2': env2}) + else: + t = threading.Thread(target=self.worker) t.daemon = True t.start() self.demons.append(t) - def worker(self): + def worker(self, env2=None): import six.moves.queue import six.moves._thread while not self.stoprequest.isSet(): try: args = self.queue.get(timeout=10) tag, exe, arg, opt = args + opt['env'] = env2 try: # check for executable case if isinstance(exe,str): diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/common_run_interface.py index 9ff7390cf5..69291df0d4 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/common_run_interface.py @@ -750,8 +750,8 @@ def __init__(self, me_dir, options, *args, **opts): else: self.ninitial = self.proc_characteristics['ninitial'] - def make_make_all_html_results(self, folder_names = [], jobs=[]): - return sum_html.make_all_html_results(self, folder_names, jobs) + def make_make_all_html_results(self, folder_names = [], jobs=[], get_attr=None): + return sum_html.make_all_html_results(self, folder_names, jobs, get_attr) def write_RunWeb(self, me_dir): @@ -1463,11 +1463,15 @@ def create_plot(self, mode='parton', event_path=None, output=None, tag=None): self.run_name, '%s_pts.dat' % tag) for observable_name, data_path in [('djr',djr_path), ('pt',pt_path)]: - if not self.generate_Pythia8_HwU_plots( + try: + if not self.generate_Pythia8_HwU_plots( PY8_plots_root_path, merging_scale_name, observable_name,data_path): - return False - + return False + except Exception as error: + if os.path.exists(data_path): + logger.info('plot information present in %s' % data_path) + return True if mode == 'Pythia8': plot_files = 
glob.glob(pjoin(PY8_plots_root_path,'*.gnuplot')) if not misc.which('gnuplot'): @@ -1964,12 +1968,16 @@ def do_systematics(self, line): self.cluster.wait(os.path.dirname(output), update_status, update_first=update_status) except Exception: self.cluster.remove() + for i in range(nb_submit): + os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) old_run_mode = self.options['run_mode'] self.options['run_mode'] =0 + out =False try: out = self.do_systematics(line) finally: self.options['run_mode'] = old_run_mode + return out #collect the data all_cross = [] for i in range(nb_submit): @@ -1995,18 +2003,21 @@ def do_systematics(self, line): self.run_card['event_norm'] in ['unity']: all_cross= [cross/nb_event for cross in all_cross] - sys_obj = systematics.call_systematics([input, None] + opts, - log=lambda x: logger.info(str(x)), - result=result_file, - running=False - ) + + sys_obj = systematics.call_systematics([input, None] + opts, + log=lambda x: logger.info(str(x)), + result=result_file, + running=False + ) + sys_obj.print_cross_sections(all_cross, nb_event, result_file) - + #concatenate the output file subprocess.call(['cat']+\ ['./tmp_%s_%s' % (i, os.path.basename(output)) for i in range(nb_submit)], stdout=open(output,'w'), cwd=os.path.dirname(output)) + for i in range(nb_submit): os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) # os.remove('%s/log_sys_%s.txt' % (os.path.dirname(output),i)) @@ -3831,7 +3842,7 @@ def store_scan_result(self): """return the information that need to be kept for the scan summary. Auto-width are automatically added.""" - return {'cross': self.results.current['cross']} + return {'cross': self.results.current['cross'], 'error': self.results.current['error']} def add_error_log_in_html(self, errortype=None): @@ -5135,10 +5146,10 @@ def init_run(self, cards): self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), - 'lhc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), - 'lcc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), @@ -6740,7 +6751,15 @@ def postcmd(self, stop, line): return ending_question - + def help_update(self): + logger.info(""" syntax: update dependent: Change the mass/width of particles which are not free parameter for the model. + update missing: add to the current param_card missing blocks/parameters. + update to_slha1: pass SLHA2 card to SLHA1 convention. (beta) + update to_slha2: pass SLHA1 card to SLHA2 convention. 
(beta) + update to_full [run_card] + update XXX [where XXX corresponds to a hidden block of the run_card]: + supported blocks are %s + """, ', '.join(self.update_block)) def do_update(self, line, timer=0): @@ -6756,6 +6775,8 @@ def do_update(self, line, timer=0): logger.warning('missing an argument (dependent or missing). Please retry') return + args[0] = args[0].lower() + if args[0] == 'dependent': if not self.mother_interface: logger.warning('Failed to update dependent parameter. This might create trouble for external programs (like MadSpin/shower/...)') @@ -6805,10 +6826,11 @@ def do_update(self, line, timer=0): self.modified_card.add('run') # delay writing of the run_card logger.info('add optional block %s to the run_card', args[0]) else: - self.help_update() + self.do_help('update') logger.warning('invalid options for update command. Please retry') + def update_to_full(self, line): """ trigger via update to_full LINE""" diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/extended_cmd.py index 789976beee..c321fd88e5 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/extended_cmd.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/extended_cmd.py @@ -1317,6 +1317,8 @@ def nice_error_handling(self, error, line): debug_file = open(self.debug_output, 'a') traceback.print_exc(file=debug_file) + if __debug__: + traceback.print_exc() if hasattr(error, 'filename'): debug_file.write("Related File: %s\n" % error.filename) # Create a nice error output @@ -1928,7 +1930,8 @@ def do_display(self, line, output=sys.stdout): for i, name in enumerate(split): try: __import__('.'.join(split[:i+1])) - exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1]))) + tmp = {} + exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1])), globals(),tmp) except ImportError: try: var = eval(args[1]) @@ -1939,7 +1942,7 @@ def do_display(self, line, output=sys.stdout): outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) else: - var = eval(args[1]) + var = eval(args[1], globals(), tmp) outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/file_writers.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/file_writers.py index 526756129f..74ba0d195c 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/file_writers.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/file_writers.py @@ -140,10 +140,6 @@ def preprocess_template(self, input_lines, context={}): else: raise self.FileWriterError("%s not string" % repr(input_lines)) - # Setup the contextual environment - for contextual_variable, value in context.items(): - exec('%s=%s'%(str(contextual_variable),repr(value))) - res = [] # The variable below tracks the conditional statements structure if_stack = [] @@ -166,7 +162,7 @@ def preprocess_template(self, input_lines, context={}): # Treat an if statement elif preproc_command.group('command')=='if': try: - if_stack.append(eval(preproc_command.group('body'))==True) + if_stack.append(eval(preproc_command.group('body'), globals(), context)==True) except Exception as e: raise self.FilePreProcessingError('Could not evaluate'+\ "python expression '%s' given the context %s provided."%\ diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/files.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/files.py index 551b71ddb6..3061b007e7 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/files.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/files.py @@ -147,9 +147,14 @@ def 
cp(path1, path2, log=True, error=False): path2 = format_path(path2) try: shutil.copy(path1, path2) + except shutil.Error as why: + logger.debug('no cp since identical: %s', why) + return except IOError as why: import madgraph.various.misc as misc try: + if 'same file' in str(why): + return if os.path.exists(path2): path2 = os.path.join(path2, os.path.split(path1)[1]) misc.copytree(path1, path2) @@ -157,12 +162,10 @@ def cp(path1, path2, log=True, error=False): if error: raise if log: - logger.warning(why) + logger.warning("fail to cp %s %s: %s", path1, path2, why) else: - misc.sprint("fail to cp", why) - except shutil.Error: - # idetical file - pass + misc.sprint("fail to cp",path1,path2, why) + def rm(path, log=True): """removes path, that can be a single element or a list""" diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_cardhtml-pl b/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_cardhtml-pl index 1810c6c082..6e0e06533d 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_cardhtml-pl +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_cardhtml-pl @@ -137,7 +137,7 @@ until($listpos>$#incard){ print PAGE " Model: $model \n"; print PAGE " \n \n
\n"; print PAGE " \n"; - print PAGE "\"\" \n"; + print PAGE "\"\" \n"; print PAGE "
\n"; print PAGE " \n \n \n"; print PAGE " \n"; diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_crossxhtml.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_crossxhtml.py index 681bf9d09b..3114a4350c 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_crossxhtml.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_crossxhtml.py @@ -133,7 +133,7 @@ class AllResults(dict): web = False - _run_entries = ['cross', 'error','nb_event_pythia','run_mode','run_statistics', + _run_entries = ['cross', 'error','axsec','nb_event_pythia','run_mode','run_statistics', 'nb_event','cross_pythia','error_pythia', 'nb_event_pythia8','cross_pythia8','error_pythia8', 'shower_dir'] diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_jpeg-pl b/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_jpeg-pl index 87d03da394..31b7e9fe55 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_jpeg-pl +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_jpeg-pl @@ -1,16 +1,16 @@ #!/usr/bin/perl -w #--------------------------------------------------------------------- -# Run GS to create jpeg files defined as $gs +# Run GS to create PNG files defined as $gs #--------------------------------------------------------------------- -system("/bin/bash -c \"rm -f matrix*.jpg\" "); +system("/bin/bash -c \"rm -f matrix*.png\" "); $imatrix = ""; if (! -e "matrix.ps") {$imatrix = 1;} -$max_jpg = 2; -if ($imatrix eq "") {$max_jpg = 5;} -# add 1 to max_jpg, to get max_jpg pages -$max_jpg += 1; +$max_png = 2; +if ($imatrix eq "") {$max_png = 5;} +# add 1 to max_png, to get max_png pages +$max_png += 1; open(PAGE,"> diagrams.html") || die "Error creating diagrams.html"; print PAGE "\ \n"; print PAGE "\ \n"; @@ -21,22 +21,22 @@ while ( -e "matrix$imatrix.ps"){ open(IN, "< matrix$imatrix.ps") || die "No file matrix$imatrix.ps"; open(OUT, "> matrix-1.ps") || die "Could not open file matrix-1.ps"; while () { - if ($_ =~ m/^%%Page: $max_jpg $max_jpg/) {last;} + if ($_ =~ m/^%%Page: $max_png $max_png/) {last;} else {print OUT $_, "\n";} } close(OUT); close(IN); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=matrix$imatrix\%00d.jpg \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-r150 \-sOutputFile\=matrix$imatrix\%00d.png \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; system "rm -f matrix-1.ps"; -# Determine how many jpg files we have +# Determine how many png files we have $pages=1; - while(-e "matrix$imatrix$pages.jpg"){ + while(-e "matrix$imatrix$pages.png"){ $pages++; }#end of while #reduce it by one - if ($pages > $max_jpg){ + if ($pages > $max_png){ $pages -= 1; } # Find name of process @@ -45,24 +45,24 @@ while ( -e "matrix$imatrix.ps"){ if ($proc =~ /Process: (.+?)(\s\w+=\d+)*$/) { $proc = $1; } print PAGE "

To save bandwidth not all diagrams were converted to jpeg."; + if (-e "matrix$imatrix$max_png.png" ) { + print PAGE "

To save bandwidth not all diagrams were converted to PNG."; print PAGE "

To view all diagrams click on "; print PAGE "\ postscript. \<\/A\> \ \n"; # # Delete files which aren't included in diagrams.html # - system ("/bin/bash -c \"rm -f matrix$max_jpg.jpg\" "); + system ("/bin/bash -c \"rm -f matrix$max_png.png\" "); } # -# Now create jpeg file for card +# Now create PNG file for card # - if (! -e "../../HTML/card.jpg") { + if (! -e "../../HTML/card.png") { system ("/bin/bash -c \"head -352 matrix$imatrix.ps >& junk.ps\" "); open(JUNK,">> junk.ps") || die "Error opening junk.ps"; @@ -72,7 +72,7 @@ while ( -e "matrix$imatrix.ps"){ system ("/bin/bash -c \"cat matrix$imatrix.ps | sed 1,352d >> junk.ps\" "); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=card.jpg \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.jpg ../../HTML/card.jpg > /dev/null\" "; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-sOutputFile\=card.png \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.png ../../HTML/card.png > /dev/null\" "; } if ($imatrix eq "") {$imatrix = 0;} $imatrix = $imatrix + 1; @@ -82,3 +82,4 @@ print PAGE "\n"; print PAGE "\<\/BODY\> \n"; print PAGE "\<\/HTML\> \n"; close(PAGE); + diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_ximprove.py index 415ecc9de0..d5d7fc8faf 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/gen_ximprove.py @@ -30,6 +30,7 @@ import stat import sys import six +import time from six.moves import range from six.moves import zip @@ -304,6 +305,7 @@ def get_helicity(self, to_submit=True, clean=True): logger.debug('(%s) nb_hel: %s zero amp: %s bad_amps_hel: %s/%s', split_file[-1], len(good_hels),len(bad_amps),len(bad_amps_perhel), len(good_hels)*nb_amp ) if len(good_hels) == 1: files.cp(matrix_file, matrix_file.replace('orig','optim')) + files.cp(matrix_file.replace('.f','.o'), matrix_file.replace('orig','optim').replace('.f','.o')) continue # avoid optimization if onlye one helicity gauge = self.cmd.proc_characteristics['gauge'] @@ -1059,6 +1061,7 @@ def __init__(self, cmd, opt=None): # parameter for the gridpack run self.nreq = 2000 self.iseed = 4321 + self.maxevts = 2500 # placeholder for information self.results = 0 #updated in launch/update_html @@ -1200,6 +1203,10 @@ def reset_multijob(self): def write_multijob(self, Channel, nb_split): """ """ if nb_split <=1: + try: + os.remove(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat')) + except OSError: + pass return f = open(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat'), 'w') f.write('%i\n' % nb_split) @@ -1828,17 +1835,17 @@ class gen_ximprove_gridpack(gen_ximprove_v4): max_request_event = 1e12 # split jobs if a channel if it needs more than that max_event_in_iter = 4000 min_event_in_iter = 500 - combining_job = sys.maxsize gen_events_security = 1.00 - def __new__(cls, *args, **opts): + def __new__(cls, cmd, opts): cls.force_class = 'gridpack' - return super(gen_ximprove_gridpack, cls).__new__(cls, *args, **opts) + return super(gen_ximprove_gridpack, cls).__new__(cls, cmd, opts) - def __init__(self, *args, **opts): + def __init__(self, cmd, opts): self.ngran = -1 + self.nprocs = 1 self.gscalefact = {} self.readonly = False if 'ngran' in opts: @@ -1846,9 +1853,18 @@ def __init__(self, *args, **opts): # del opts['ngran'] if 'readonly' in opts: self.readonly = opts['readonly'] - super(gen_ximprove_gridpack,self).__init__(*args, **opts) + if 'nprocs' in 
opts: + self.nprocs = int(opts['nprocs']) + if 'maxevts' in opts and self.nprocs > 1: + self.max_request_event = int(opts['maxevts']) + super(gen_ximprove_gridpack,self).__init__(cmd, opts) if self.ngran == -1: self.ngran = 1 + + if self.nprocs > 1: + self.combining_job = 0 + else: + self.combining_job = sys.maxsize def find_job_for_event(self): """return the list of channel that need to be improved""" @@ -1876,8 +1892,8 @@ def find_job_for_event(self): continue # no event to generate events self.gscalefact[tag] = max(1, 1/(goal_lum * C.get('axsec')/ self.ngran)) #need to generate events - logger.debug('request events for ', C.get('name'), 'cross=', - C.get('axsec'), 'needed events = ', goal_lum * C.get('axsec')) + logger.debug('request events for %s cross=%d needed events = %d', + C.get('name'), C.get('axsec'), goal_lum * C.get('axsec')) to_refine.append(C) logger.info('need to improve %s channels' % len(to_refine)) @@ -1897,8 +1913,13 @@ def get_job_for_event(self): for C in to_refine: #1. Compute the number of points are needed to reach target needed_event = max(goal_lum*C.get('axsec'), self.ngran) - nb_split = 1 - + nb_split = int(max(1,((needed_event-1)// self.max_request_event) +1)) + if not self.split_channels: + nb_split = 1 + if nb_split > self.max_splitting: + nb_split = self.max_splitting + nb_split=max(1, nb_split) + #2. estimate how many points we need in each iteration if C.get('nunwgt') > 0: nevents = needed_event / nb_split * (C.get('nevents') / C.get('nunwgt')) @@ -1908,13 +1929,16 @@ def get_job_for_event(self): nevents = self.max_event_in_iter if nevents < self.min_event_in_iter: + nb_split = int(nb_split * nevents / self.min_event_in_iter) + 1 # sr dangerous? nevents = self.min_event_in_iter # # forbid too low/too large value nevents = max(self.min_event_in_iter, min(self.max_event_in_iter, nevents)) logger.debug("%s : need %s event. Need %s split job of %s points", C.name, needed_event, nb_split, nevents) - + # write the multi-job information + self.write_multijob(C, nb_split) + #create the info dict assume no splitting for the default info = {'name': self.cmd.results.current['run_name'], 'script_name': 'unknown', @@ -1925,7 +1949,7 @@ def get_job_for_event(self): 'nevents': nevents, #int(nevents*self.gen_events_security)+1, 'maxiter': self.max_iter, 'miniter': self.min_iter, - 'precision': -1*int(needed_event)/C.get('axsec'), + 'precision': -goal_lum/nb_split, # -1*int(needed_event)/C.get('axsec'), 'requested_event': needed_event, 'nhel': self.run_card['nhel'], 'channel': C.name.replace('G',''), @@ -1938,27 +1962,59 @@ def get_job_for_event(self): basedir = pjoin(os.path.dirname(__file__), '..','..','SubProcesses', info['P_dir'], info['directory']) info['base_directory'] = basedir - jobs.append(info) - + if nb_split == 1: + jobs.append(info) + else: + for i in range(nb_split): + new_info = dict(info) + new_info['offset'] = i+1 + new_info['directory'] += self.alphabet[i % 26] + str((i+1)//26) + new_info['base_directory'] = info['directory'] + jobs.append(new_info) write_dir = '.' 
if self.readonly else None self.create_ajob(pjoin(self.me_dir, 'SubProcesses', 'refine.sh'), jobs, write_dir) + if self.nprocs > 1: + nprocs_cluster = cluster.MultiCore(nb_core=self.nprocs) + gridpack_start = time.time() + def gridpack_wait_monitoring(Idle, Running, Done): + if Idle+Running+Done == 0: + return + logger.info("Gridpack event generation: %s Idle, %s Running, %s Done [%s]" + % (Idle, Running, Done, misc.format_time(time.time()-gridpack_start))) + done = [] for j in jobs: - if j['P_dir'] in done: - continue - done.append(j['P_dir']) + if self.nprocs == 1: + if j['P_dir'] in done: + continue + done.append(j['P_dir']) + # Give a little status. Sometimes these jobs run very long, and having hours without any + # console output can be a bit frightening and make users think we are looping. + if len(done)%5==0: + logger.info(f"Working on job {len(done)} of {len(jobs)}") + # set the working directory path. pwd = pjoin(os.getcwd(),j['P_dir']) if self.readonly else pjoin(self.me_dir, 'SubProcesses', j['P_dir']) - exe = pjoin(pwd, 'ajob1') + exe = pjoin(pwd, j['script_name']) st = os.stat(exe) os.chmod(exe, st.st_mode | stat.S_IEXEC) # run the code\ - cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + if self.nprocs == 1: + cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + else: + nprocs_cluster.cluster_submit(exe, cwd=pwd, packet_member=j['packet']) write_dir = '.' if self.readonly else pjoin(self.me_dir, 'SubProcesses') + if self.nprocs > 1: + nprocs_cluster.wait(self.me_dir, gridpack_wait_monitoring) + + if self.readonly: + combine_runs.CombineRuns(write_dir) + else: + combine_runs.CombineRuns(self.me_dir) self.check_events(goal_lum, to_refine, jobs, write_dir) def check_events(self, goal_lum, to_refine, jobs, Sdir): diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/hel_recycle.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/hel_recycle.py index 1471de4bcb..978ba6575e 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/hel_recycle.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/hel_recycle.py @@ -550,7 +550,7 @@ def get_jamp_lines(self, line): def get_amp2_lines(self, line): if line.startswith(' DO I = 1, NCOLOR'): self.in_amp2 = False - elif not line.isspace(): + elif not line.isspace() and 'DENOM' not in line: self.template_dict['amp2_lines'] += f'{line[0:6]} {self.add_indices(line[6:])}' def prepare_bools(self): diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/histograms.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/histograms.py index 51ae2914fc..6fbdd98100 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/histograms.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/histograms.py @@ -1149,11 +1149,8 @@ def parse_one_histo_from_stream(self, stream, all_weight_header, boundaries = [0.0,0.0] for j, weight in \ enumerate(HwU.histo_bin_weight_re.finditer(line_bin)): - if (j == len(weight_header)): - continue - if j == len(all_weight_header): - raise HwU.ParseError("There is more bin weights"+\ - " specified than expected (%i)"%len(weight_header)) + #if (j == len(weight_header)): + # continue if selected_central_weight == all_weight_header[j]: bin_weights['central'] = float(weight.group('weight')) if all_weight_header[j] == 'boundary_xmin': @@ -1858,6 +1855,8 @@ def parse_histos_from_PY8_XML_stream(self, stream, run_id=None, # If merging cut is negative, then pick only the one of the central scale # If not specified, then take them all but use the PDF and scale weight # of the central merging_scale for the variation. 
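Two scheduling changes meet around this point. First, the cluster.py hunk earlier teaches MultiCore to pin each worker thread to one GPU: MG5_GPU_VISIBLE_DEVICES may name a 'get,set' pair of environment variables (e.g. 'NVIDIA_VISIBLE_DEVICES,CUDA_VISIBLE_DEVICES'), the device list is read from the get-variable, and start_demon assigns devices round-robin through a per-worker environment copy. Second, gen_ximprove_gridpack now splits a channel into ceil(needed_event/max_request_event) jobs and, when nprocs > 1, drives them through cluster.MultiCore(nb_core=self.nprocs). A minimal sketch of both ideas, with hypothetical helper names:

import os

def split_jobs(needed_event, max_request_event=2500):
    # same ceiling division as get_job_for_event:
    # ((needed_event - 1) // max_request_event) + 1
    return max(1, ((int(needed_event) - 1) // max_request_event) + 1)

def worker_gpu_env(worker_index, devices, set_var='CUDA_VISIBLE_DEVICES'):
    # round-robin pinning: worker k sees only devices[k % len(devices)]
    env = os.environ.copy()
    env[set_var] = devices[worker_index % len(devices)]
    return env

print(split_jobs(9000))                                       # -> 4 jobs of <= 2500 events
print(worker_gpu_env(5, ['0', '1'])['CUDA_VISIBLE_DEVICES'])  # -> '1'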
+ if not all_weights: + raise MadGraph5Error('No weights were found in the HwU XML source.') if merging_scale is None or merging_scale < 0.0: merging_scale_chosen = all_weights[2]['MERGING'] else: @@ -2480,14 +2479,14 @@ def get_main_central_plot_lines(HwU_name, block_position, color_index, # return [template_no_stat%rep_dic]+\ # ([template%rep_dic] if show_mc_uncertainties else []) - # The use of sqrt(-1) is just a trick to prevent the line to display + # The use of 1/0 is just a trick to prevent the line to display res = [] - rep_dic['data'] = '($3 < 0 ? sqrt(-1) : $3)' + rep_dic['data'] = '($3 < 0 ? 1/0 : $3)' res.append(template_no_stat%rep_dic) rep_dic['title'] = " title ''" if show_mc_uncertainties: res.append(template%rep_dic) - rep_dic['data'] = '($3 >= 0 ? sqrt(-1) : abs($3))' + rep_dic['data'] = '($3 >= 0 ? 1/0 : abs($3))' rep_dic['ls'] = ' ls %d'%(100+color_index) res.append(template_no_stat%rep_dic) if show_mc_uncertainties: @@ -2739,13 +2738,13 @@ def ratio_no_correlations(wgtsA, wgtsB): """#-- rendering subhistograms '%(subhistogram_type)s' %(unset label)s %(set_format_y)s +%(set_yscale)s set yrange [%(ymin).4e:%(ymax).4e] set origin %(origin_x).4e, %(origin_y).4e set size %(size_x).4e, %(size_y).4e set mytics %(mytics)d %(set_ytics)s %(set_format_x)s -%(set_yscale)s %(set_ylabel)s %(set_histo_label)s plot \\""" @@ -2878,7 +2877,7 @@ def ratio_no_correlations(wgtsA, wgtsB): # We decide to show uncertainties in the main plot only if they # are part of a monocolor band. Otherwise, they will only be - # shown in the first subplot. Notice that plotting 'sqrt(-1)' + # shown in the first subplot. Notice that plotting '1/0' # is just a trick so as to have only the key printed with no # line @@ -2890,7 +2889,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, scale variation'%title, band='scale' in use_band) else: uncertainty_plot_lines[-1]['scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] # And now PDF_variation if available if not PDF_var_pos is None and len(PDF_var_pos)>0: if 'pdf' in use_band: @@ -2899,7 +2898,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, PDF variation'%title, band='pdf' in use_band) else: uncertainty_plot_lines[-1]['pdf'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] # And now merging variation if available if not merging_var_pos is None and len(merging_var_pos)>0: if 'merging_scale' in use_band: @@ -2908,7 +2907,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, merging scale variation'%title, band='merging_scale' in use_band) else: uncertainty_plot_lines[-1]['merging_scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] # And now alpsfact variation if available if not alpsfact_var_pos is None and len(alpsfact_var_pos)>0: if 'alpsfact' in use_band: @@ -2917,7 +2916,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, alpsfact variation'%title, band='alpsfact' in use_band) else: uncertainty_plot_lines[-1]['alpsfact'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] # plot_lines.append( # "'%s' index %d using (($1+$2)/2):3 ls %d title '%s'"\ diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py 
b/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py index 0924927785..262d39a736 100644 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Aug 2023) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. import logging import os @@ -33,7 +33,7 @@ def compile(self, *args, **opts): if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') common_run_interface.CommonRunCmd.update_make_opts_full(path, - {'FPTYPE': self.run_card['floating_type'] }) + {'override FPTYPE': self.run_card['floating_type'] }) misc.sprint('FPTYPE checked') cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): @@ -76,7 +76,7 @@ def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + common_run_interface.CommonRunCmd.update_make_opts_full({'override FPTYPE': new_value}) else: raise Exception Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') @@ -133,7 +133,8 @@ def default_setup(self): super().default_setup() # change default value: self['cudacpp_backend'] = 'cuda' - self['vector_size'] = 16384 # already setup in default class (just change value) + self['vector_size'] = 32 # ZW: default to 32, might want to change to 64 to utilise AMD GPUs better as well # 16384 # already setup in default class (just change value) + self['nb_warp'] = 512 # number of warps per kernel call, for now setting to 16 384 / vector_size MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/lhe_parser.py index f6e47956cd..81d17f7cb1 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/lhe_parser.py @@ -1035,12 +1035,12 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): from_init = True if not from_init: - if group in grouped_cross: - grouped_cross[group] += self.allcross[i] - grouped_error[group] += self.error[i]**2 + if int(group) in grouped_cross: + grouped_cross[int(group)] += self.allcross[i] + grouped_error[int(group)] += self.error[i]**2 else: - grouped_cross[group] = self.allcross[i] - grouped_error[group] = self.error[i]**2 + grouped_cross[int(group)] = self.allcross[i] + grouped_error[int(group)] = self.error[i]**2 else: ban = banner_mod.Banner(ff.banner) for line in ban['init'].split('\n'): @@ -1048,11 +1048,11 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): if len(splitline)==4: cross, error, _, group = splitline if int(group) in grouped_cross: - grouped_cross[group] += float(cross) - grouped_error[group] += float(error)**2 + grouped_cross[int(group)] += float(cross) + grouped_error[int(group)] += float(error)**2 else: - grouped_cross[group] = float(cross) - grouped_error[group] = float(error)**2 + 
grouped_cross[int(group)] = float(cross) + grouped_error[int(group)] = float(error)**2 nb_group = len(grouped_cross) # compute the information for the first line @@ -1086,6 +1086,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): self.seek(0) if init_information["idbmup2"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup2"] = event[1].pdg self.seek(0) @@ -2166,10 +2168,13 @@ def check(self): abspz += abs(particle.pz) # check mass fourmass = FourMomentum(particle).mass - - if particle.mass and (abs(particle.mass) - fourmass)/ abs(particle.mass) > threshold: - raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) - + if particle.mass: + expected = (particle.E - math.sqrt(particle.E**2 -particle.mass**2))/particle.E + if expected > 1e-8: + mass_threshold = particle.E**2 - (particle.E-threshold)**2 + if (abs(particle.mass) - fourmass)/ mass_threshold > 5: + raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) + if E/absE > threshold: logger.critical(self) @@ -2953,8 +2958,8 @@ def pt(self): @property def pseudorapidity(self): - norm = math.sqrt(self.px**2 + self.py**2+self.pz**2) - return 0.5* math.log((norm - self.pz) / (norm + self.pz)) + norm = math.sqrt(self.px**2 + self.py**2 + self.pz**2) + return 0.5* math.log((norm + self.pz) / (norm - self.pz)) @property def rapidity(self): diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/madevent_interface.py index 85e5bcf5e3..4d5597c722 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/madevent_interface.py @@ -1171,10 +1171,10 @@ def check_survey(self, args, cmd='survey'): for opt,value in self._survey_options.items(): if arg.startswith('--%s=' % opt): exec('self.opts[\'%s\'] = %s(arg.split(\'=\')[-1])' % \ - (opt, value[0])) + (opt, value[0]), globals(), {'self':self, 'arg':arg}) arg = "" if arg != "": raise Exception - except Exception: + except Exception as error: self.help_survey() raise self.InvalidCmd('invalid %s argument'% arg) @@ -2827,10 +2827,10 @@ def print_results_in_shell(self, data): logger.info(" Nb of events after matching/merging : %d" % int(data['nb_event_pythia'])) if self.run_card['use_syst'] in self.true and \ (int(self.run_card['ickkw'])==1 or self.run_card['ktdurham']>0.0 - or self.run_card['ptlund']>0.0): + or self.run_card['ptlund']>0.0) and data['cross_pythia'] == -1: logger.info(" Notice that because Systematics computation is turned on, the merging did not veto events but modified their weights instead.\n"+\ " The resulting hepmc/stdhep file should therefore be use with those weights.") - else: + elif data['cross_pythia'] == -1: logger.info(" Nb of events after merging : %s" % data['nb_event_pythia']) logger.info(" " ) @@ -3055,6 +3055,7 @@ def do_multi_run(self, line): crossoversig = 0 inv_sq_err = 0 nb_event = 0 + madspin = False for i in range(nb_run): self.nb_refine = 0 self.exec_cmd('generate_events %s_%s -f' % (main_name, i), postcmd=False) @@ -3067,6 +3068,8 @@ def do_multi_run(self, line): inv_sq_err+=1.0/error**2 self.results[main_name][-1]['cross'] = crossoversig/inv_sq_err self.results[main_name][-1]['error'] = math.sqrt(1.0/inv_sq_err) + if 'decayed' in self.run_name: + madspin = True 
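The lhe_parser hunk above also fixes a sign error in the pseudorapidity property: the correct definition is eta = 0.5*ln((|p| + pz)/(|p| - pz)), which is positive for momenta along +z and matches the textbook identity eta = -ln(tan(theta/2)). A quick standalone check of the corrected formula:

import math

def pseudorapidity(px, py, pz):
    # corrected sign convention: positive eta for +z momenta
    norm = math.sqrt(px**2 + py**2 + pz**2)
    return 0.5 * math.log((norm + pz) / (norm - pz))

theta = 0.1  # polar angle in radians
print(pseudorapidity(math.sin(theta), 0.0, math.cos(theta)))  # ~2.9949
print(-math.log(math.tan(theta / 2)))                         # ~2.9949, agrees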
self.results.def_current(main_name) self.run_name = main_name self.update_status("Merging LHE files", level='parton') @@ -3074,9 +3077,12 @@ def do_multi_run(self, line): os.mkdir(pjoin(self.me_dir,'Events', self.run_name)) except Exception: pass - os.system('%(bin)s/merge.pl %(event)s/%(name)s_*/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' + + os.system('%(bin)s/merge.pl %(event)s/%(name)s_*%(madspin)s/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' % {'bin': self.dirbin, 'event': pjoin(self.me_dir,'Events'), - 'name': self.run_name}) + 'name': self.run_name, + 'madspin': '_decayed_*' if madspin else '' + }) eradir = self.options['exrootanalysis_path'] if eradir and misc.is_executable(pjoin(eradir,'ExRootLHEFConverter')): @@ -3656,9 +3662,11 @@ def do_refine(self, line): else: self.refine_mode = "new" - cross, error = self.make_make_all_html_results() + cross, error, across = self.make_make_all_html_results(get_attr=('xsec','xerru','axsec')) + self.results.add_detail('cross', cross) self.results.add_detail('error', error) + self.results.add_detail('axsec', across) self.results.add_detail('run_statistics', dict(self.results.get_detail('run_statistics'))) @@ -3757,6 +3765,8 @@ def split(a, n): k, m = divmod(len(a), n) return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + Gdirs = self.remove_empty_events(Gdirs) + partials_info = [] if len(Gdirs) >= max_G: start_unweight= time.perf_counter() @@ -3786,7 +3796,7 @@ def split(a, n): for i, local_G in enumerate(split(Gdirs, nb_chunk)): line = [pjoin(self.me_dir, "Events", self.run_name, "partials%d.lhe.gz" % i)] line.append(pjoin(self.me_dir, 'Events', self.run_name, '%s_%s_banner.txt' % (self.run_name, tag))) - line.append(str(self.results.current['cross'])) + line.append(str(self.results.current.get('axsec'))) line += local_G partials_info.append(self.do_combine_events_partial(' '.join(line), preprocess_only=True)) mycluster.submit(sys.executable, @@ -4223,7 +4233,7 @@ def mg5amc_py8_interface_consistency_warning(options): return None - def setup_Pythia8RunAndCard(self, PY8_Card, run_type): + def setup_Pythia8RunAndCard(self, PY8_Card, run_type, use_mg5amc_py8_interface): """ Setup the Pythia8 Run environment and card. In particular all the process and run specific parameters of the card are automatically set here. This function returns the path where HEPMC events will be output, if any.""" @@ -4338,10 +4348,10 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.systemSet('Beams:setProductionScalesFromLHEF',True) # Automatically set qWeed to xqcut if not defined by the user. - if PY8_Card['SysCalc:qWeed']==-1.0: + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qWeed']==-1.0: PY8_Card.MadGraphSet('SysCalc:qWeed',self.run_card['xqcut'], force=True) - if PY8_Card['SysCalc:qCutList']=='auto': + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qCutList']=='auto': if self.run_card['use_syst']: if self.run_card['sys_matchscale']=='auto': qcut = PY8_Card['JetMatching:qCut'] @@ -4368,7 +4378,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): # Specific MLM settings # PY8 should not implement the MLM veto since the driver should do it # if merging scale variation is turned on - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. 
PY8_Card.MadGraphSet('JetMatching:doVeto',False) @@ -4444,7 +4454,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.MadGraphSet('SpaceShower:pTmaxMatch',1) PY8_Card.MadGraphSet('SpaceShower:rapidityOrder',False) # PY8 should not implement the CKKW veto since the driver should do it. - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('Merging:applyVeto',False) @@ -4516,6 +4526,12 @@ def do_pythia8(self, line): else: no_default = False + if '--old_interface' in args: + use_mg5amc_py8_interface = True + args.remove('--old_interface') + else: + use_mg5amc_py8_interface = False + if not self.run_name: self.check_pythia8(args) self.configure_directory(html_opening =False) @@ -4545,20 +4561,27 @@ def do_pythia8(self, line): #"Please use 'event_norm = average' in the run_card to avoid this problem.") - - if not self.options['mg5amc_py8_interface_path'] or not \ - os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface')): - raise self.InvalidCmd( -"""The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. -Please install this tool with the following MG5_aMC command: - MG5_aMC> install mg5amc_py8_interface_path""") + if use_mg5amc_py8_interface: + if not self.options['mg5amc_py8_interface_path'] or not \ + os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface')): + raise self.InvalidCmd( + """The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. + Please install this tool with the following MG5_aMC command: + MG5_aMC> install mg5amc_py8_interface_path""") + else: + pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface') + warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) + if warnings: + logger.warning(warnings) else: - pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface') - warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) - if warnings: - logger.warning(warnings) + pythia_main = pjoin(self.options['pythia8_path'], 'share', 'Pythia8', 'examples', 'main164') + if not os.path.exists(pythia_main): + pythia_main = pjoin(self.options['pythia8_path'], 'examples', 'main164') + if not os.path.exists(pythia_main): + logger.warning('main164 not found (or not compiled). Will try the old interface instead.') + return self.do_pythia8(line + ' --old_interface') self.results.add_detail('run_mode', 'madevent') @@ -4583,14 +4606,19 @@ def do_pythia8(self, line): run_type = 'CKKW' # Edit the card and run environment according to the run specification - HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type) + HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type, use_mg5amc_py8_interface=use_mg5amc_py8_interface) + + if not use_mg5amc_py8_interface and self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + PY8_Card['Main:numberOfEvents']= self.run_card['nevents'] + # Now write the card. 
pythia_cmd_card = pjoin(self.me_dir, 'Events', self.run_name , '%s_pythia8.cmd' % tag) cmd_card = StringIO.StringIO() PY8_Card.write(cmd_card,pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Now setup the preamble to make sure that everything will use the locally # installed tools (if present) even if the user did not add it to its @@ -4632,7 +4660,7 @@ def do_pythia8(self, line): " command '/usr/bin/env %s' exists and returns a valid path."%shell) exe_cmd = "#!%s\n%s"%(shell_exe,' '.join( - [preamble+pythia_main, + [preamble+pythia_main, '' if use_mg5amc_py8_interface else '-c', os.path.basename(pythia_cmd_card)])) wrapper.write(exe_cmd) @@ -4699,6 +4727,7 @@ def do_pythia8(self, line): n_cores = max(min(min_n_core,n_cores),1) if self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + # No need for parallelization anymore self.cluster = None logger.info('Follow Pythia8 shower by running the '+ @@ -4744,20 +4773,22 @@ def do_pythia8(self, line): ParallelPY8Card.subruns[0].systemSet('Beams:LHEF','events.lhe.gz') ParallelPY8Card.write(pjoin(parallelization_dir,'PY8Card.dat'), pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Write the wrapper wrapper_path = pjoin(parallelization_dir,'run_PY8.sh') wrapper = open(wrapper_path,'w') if self.options['cluster_temp_path'] is None: exe_cmd = \ -"""#!%s -./%s PY8Card.dat >& PY8_log.txt -""" +"""#!%%s -./%%s %s PY8Card.dat >& PY8_log.txt +""" % ('' if use_mg5amc_py8_interface else '-c') + else: exe_cmd = \ -"""#!%s +"""#!%%s ln -s ./events_$1.lhe.gz ./events.lhe.gz -./%s PY8Card_$1.dat >& PY8_log.txt +./%%s %s PY8Card_$1.dat >& PY8_log.txt mkdir split_$1 if [ -f ./events.hepmc ]; then @@ -4776,7 +4807,7 @@ def do_pythia8(self, line): mv ./PY8_log.txt ./split_$1/ fi tar -czf split_$1.tar.gz split_$1 -""" +""" % ('' if use_mg5amc_py8_interface else '-c') exe_cmd = exe_cmd%(shell_exe,os.path.basename(pythia_main)) wrapper.write(exe_cmd) wrapper.close() @@ -4812,19 +4843,27 @@ def do_pythia8(self, line): pjoin(parallelization_dir,split_files[-1])) logger.info('Submitting Pythia8 jobs...') + for i, split_file in enumerate(split_files): # We must write a PY8Card tailored for each split so as to correct the normalization # HEPMCoutput:scaling of each weight since the showered lhe will no longer contain the # same original number of events - split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat')) + split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat'), setter='user') + assert split_PY8_Card['JetMatching:nJetMax'] == PY8_Card['JetMatching:nJetMax'] + + + # Make sure to use the number of split_events determined during the splitting. - split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i]) + split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i], force=True) + assert split_PY8_Card['Main:numberOfEvents'] == partition_for_PY8[i] split_PY8_Card.systemSet('HEPMCoutput:scaling',split_PY8_Card['HEPMCoutput:scaling']* - (float(partition_for_PY8[i]))) + (float(partition_for_PY8[i])), force=True) # Add_missing set to False so as to be sure not to add any additional parameter w.r.t # the ones in the original PY8 param_card copied. 
split_PY8_Card.write(pjoin(parallelization_dir,'PY8Card_%d.dat'%i), - pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False) + pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False, + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) in_files = [pjoin(parallelization_dir,os.path.basename(pythia_main)), pjoin(parallelization_dir,'PY8Card_%d.dat'%i), pjoin(parallelization_dir,split_file)] @@ -5073,7 +5112,7 @@ def wait_monitoring(Idle, Running, Done): # works both for fixed number of generated events and fixed accepted events self.results.add_detail('error_pythia', error_m) - if self.run_card['use_syst']: + if self.run_card['use_syst'] and use_mg5amc_py8_interface: self.results.add_detail('cross_pythia', -1) self.results.add_detail('error_pythia', 0) @@ -6132,7 +6171,102 @@ def get_Gdir(self, Pdir=None, symfact=None): mfactors[pjoin(P, "G%s" % tag)] = mfactor self.Gdirs = (Gdirs, mfactors) return self.get_Gdir(Pdir, symfact=symfact) + + ############################################################################ + def remove_empty_events(self, Gdir): + """return Gdir stripped of the entries providing empty events.lhe files.""" + + reasons = collections.defaultdict(list) + Gdirs = Gdir[:] + for G in Gdirs[:]: + try: + size = os.path.getsize(pjoin(G, 'events.lhe')) + except Exception as error: + size = 0 + if size <10: + Gdirs.remove(G) + try: + log = misc.BackRead(pjoin(G, 'log.txt')) + except Exception as error: + log = misc.BackRead(pjoin(G, 'run1_app.log')) + found = -1 + for line in log: + if 'Deleting file events.lhe' in line: + found = 0 + elif "Impossible BW configuration" in line: + reasons['bwconfig'].append(G) + break + elif found < -150: + reasons['not found'].append(G) + Gdirs.append(G) + break + elif found < 0: + found -= 1 + elif 'Loosen cuts or increase max_events' in line: + reasons['cuts'].append(G) + break + elif 'all returned zero' in line: + reasons['zero'].append(G) + break + elif found > 5: + reasons['unknown'].append(G) + break + else: + found += 1 + + if len(reasons): + logger.debug('Reasons for empty events.lhe:') + if len(reasons['unknown']): + logger.debug(' - unknown: %s' % len(reasons['unknown'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['unknown'][:10]])) + if len(reasons['not found']): + logger.debug(' - not found in log: %s' % len(reasons['not found'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['not found'][:10]])) + if len(reasons['zero']): + logger.debug(' - zero amplitudes: %s' % len(reasons['zero'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['zero'][:10]])) + if len(reasons['bwconfig']): + critical_bwconfig = set() + for G in reasons['bwconfig']: + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_bwconfig.add(os.sep.join(base.rsplit(os.sep)[-2:])) + for G in critical_bwconfig: + logger.warning('Gdirectory %s has no events.lhe file (no possible BW configuration).' % G) + + logger.debug(' - impossible BW configuration: %s' % len(reasons['bwconfig'])) + logger.debug(' - channel with no possible BW configuration: %s' % len(critical_bwconfig)) + + if len(reasons['cuts']): + critical_nb_cuts = collections.defaultdict(int) + for G in reasons['cuts']: + if '.' 
in os.path.basename(G): + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_nb_cuts[os.sep.join(base.rsplit(os.sep)[-2:])] += 1 + else: + critical_nb_cuts[''] += 1 + logger.warning('Gdirectory %s has no events.lhe file. (no points passed cuts found)' % G) + for G, nb in critical_nb_cuts.items(): + if not G: + continue + else: + logger.warning('%s channel %s.XXX has no events.lhe file. (no points passed cuts). No %s with events detected' % (nb, G, G)) + logger.debug(' - no points passed cuts: %s' % len(reasons['cuts'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['cuts'][:10]])) + logger.debug(' - without any BW handling (critical): %s' % critical_nb_cuts['']) + logger.debug(' - with BW but all zero (critical): %s' % sum([nb for v, nb in critical_nb_cuts.items() if v!=''], 0)) + #logger.debug(' - cuts (with BW conflict where other channel contributes): %s' % (len(reasons['cuts'])- critical_nb_cuts)) + + + return Gdirs + + ############################################################################ def set_run_name(self, name, tag=None, level='parton', reload_card=False, allow_new_tag=True): @@ -6749,7 +6883,7 @@ def get_subP_ids(path): class GridPackCmd(MadEventCmd): """The command for the gridpack --Those are not suppose to be use interactively--""" - def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **stdin): + def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, nprocs=1, maxevts=2500, *completekey, **stdin): """Initialize the command and directly run""" # Initialize properly @@ -6759,6 +6893,8 @@ def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **s self.random = seed self.random_orig = self.random self.granularity = gran + self.nprocs = nprocs + self.maxevts = maxevts self.options['automatic_html_opening'] = False #write the grid_card.dat on disk @@ -6874,7 +7010,7 @@ def launch(self, nb_event, seed): #misc.call([pjoin(self.me_dir,'bin','refine4grid'), # str(nb_event), '0', 'Madevent','1','GridRun_%s' % seed], # cwd=self.me_dir) - self.refine4grid(nb_event) + self.gridpack_cross = self.refine4grid(nb_event) # 3) Combine the events/pythia/... self.exec_cmd('combine_events') @@ -6902,6 +7038,8 @@ def refine4grid(self, nb_event): precision = nb_event + across= self.make_make_all_html_results(get_attr='axsec') + self.opts = dict([(key,value[1]) for (key,value) in \ self._survey_options.items()]) @@ -6915,8 +7053,9 @@ def refine4grid(self, nb_event): self.update_status('Refine results to %s' % precision, level=None) logger.info("Using random number seed offset = %s" % self.random) - refine_opt = {'err_goal': nb_event, 'split_channels': False, - 'ngran':self.granularity, 'readonly': self.readonly} + refine_opt = {'err_goal': nb_event, 'split_channels': True, + 'ngran':self.granularity, 'readonly': self.readonly, + 'nprocs': self.nprocs, 'maxevts': self.maxevts} x_improve = gen_ximprove.gen_ximprove_gridpack(self, refine_opt) x_improve.launch() # create the ajob for the refinment and run those! self.gscalefact = x_improve.gscalefact #store jacobian associate to the gridpack @@ -6926,7 +7065,7 @@ def refine4grid(self, nb_event): #print 'run combine!!!' 
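The new remove_empty_events helper above prunes G directories whose events.lhe is missing or nearly empty, then reads each channel log backwards to classify why: cuts too tight, all amplitudes zero, an impossible Breit-Wigner configuration, or unknown. A simplified sketch of that bucketing, where the marker strings are copied from the patch but classify_empty_gdir is an illustrative stand-in for the misc.BackRead loop:

import collections

MARKERS = [
    ('Impossible BW configuration', 'bwconfig'),
    ('Loosen cuts or increase max_events', 'cuts'),
    ('all returned zero', 'zero'),
]

def classify_empty_gdir(log_lines, tail=150):
    """Scan the last `tail` log lines for a known failure marker."""
    for line in reversed(log_lines[-tail:]):
        for marker, reason in MARKERS:
            if marker in line:
                return reason
    return 'unknown'

reasons = collections.defaultdict(list)
logs = {'P1_gg_ttx/G1': ['...', 'Loosen cuts or increase max_events'],
        'P1_gg_ttx/G2': ['...', 'Impossible BW configuration']}
for gdir, lines in logs.items():
    reasons[classify_empty_gdir(lines)].append(gdir)
print(dict(reasons))  # {'cuts': ['P1_gg_ttx/G1'], 'bwconfig': ['P1_gg_ttx/G2']}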
#combine_runs.CombineRuns(self.me_dir) - return + return across #update html output Presults = sum_html.collect_result(self) cross, error = Presults.xsec, Presults.xerru @@ -7051,10 +7190,13 @@ def do_combine_events(self, line): sum_axsec += result.get('axsec')*gscalefact[Gdir] if len(AllEvent) >= 80: #perform a partial unweighting - if self.results.current['cross'] == 0 and self.run_card['gridpack']: - nb_event= self.nb_event + if not self.results.current.get('axsec'): + if self.run_card['gridpack'] and self.gridpack_cross: + nb_event = min(abs(1.05*self.nb_event*sum_axsec/self.gridpack_cross),self.nb_event) + else: + nb_event= self.nb_event else: - nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current['cross']),self.run_card['nevents']) + nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current.get('axsec')),self.run_card['nevents'], self.nb_event, self.gridpack_cross, sum_axsec) AllEvent.unweight(pjoin(outdir, self.run_name, "partials%s.lhe.gz" % partials), get_wgt, log_level=5, trunc_error=1e-2, event_target=nb_event) AllEvent = lhe_parser.MultiEventFile() @@ -7068,6 +7210,7 @@ def do_combine_events(self, line): for data in partials_info: AllEvent.add(*data) + sum_xsec += data[1] if not hasattr(self,'proc_characteristic'): self.proc_characteristic = self.get_characteristics() diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/restore_data b/epochX/cudacpp/gg_ttggg.mad/bin/internal/restore_data index 6205bb9567..407ed7aa91 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/restore_data +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/restore_data @@ -48,8 +48,17 @@ for i in `cat subproc.mg` ; do cd ../ done +# check if we are on a Mac, otherwise assume Linux +if [[ "$OSTYPE" == "darwin"* ]]; then + # no nproc on Mac, so use sysctl instead + # use -S1024 because there is a limit on the length of the command + xargs_opts="-P $(sysctl -n hw.ncpu) -S1024" +else + xargs_opts="-P $(nproc --all)" +fi + find . -mindepth 2 -maxdepth 2 -type d -name 'G*' -print0 \ - | xargs --null -P "$(nproc --all)" -I{} bash -c " + | xargs --null ${xargs_opts} -I{} bash -c " cd {} for j in $1_results.dat ; do if [[ -e \$j ]] ; then diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/sum_html.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/sum_html.py index 9dd5826f71..fb8dd3a74a 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/sum_html.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/sum_html.py @@ -770,7 +770,7 @@ def collect_result(cmd, folder_names=[], jobs=None, main_dir=None): return all -def make_all_html_results(cmd, folder_names = [], jobs=[]): +def make_all_html_results(cmd, folder_names = [], jobs=[], get_attr=None): """ folder_names and jobs have been added for the amcatnlo runs """ run = cmd.results.current['run_name'] if not os.path.exists(pjoin(cmd.me_dir, 'HTML', run)): @@ -794,7 +794,12 @@ def make_all_html_results(cmd, folder_names = [], jobs=[]): fsock.write('%s

' % Presults.get_html(run, unit, cmd.me_dir)) fsock.write('%s
' % P_text) - return Presults.xsec, Presults.xerru + if not get_attr: + return Presults.xsec, Presults.xerru + else: + if isinstance(get_attr, tuple): + return [getattr(Presults, _) for _ in get_attr] + return getattr(Presults, get_attr) diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/ufomodel/write_param_card.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/ufomodel/write_param_card.py index 57a85b0614..33a89259f8 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/ufomodel/write_param_card.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/ufomodel/write_param_card.py @@ -116,9 +116,10 @@ def write_param(self, param, lhablock): def write_dep_param_block(self, lhablock): import cmath from parameters import all_parameters + param_values = {'cmath':cmath} for parameter in all_parameters: try: - exec("%s = %s" % (parameter.name, parameter.value)) + exec("%s = %s" % (parameter.name, parameter.value), globals(), param_values) except Exception: pass text = "## Not dependent paramater.\n" @@ -134,7 +135,7 @@ def write_dep_param_block(self, lhablock): prefix = "DECAY " for part, param in data: if isinstance(param.value, str): - value = complex(eval(param.value, globals(), param_values)).real + value = complex(eval(param.value, globals(), param_values)).real else: value = param.value diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/madevent b/epochX/cudacpp/gg_ttggg.mad/bin/madevent index dff9711b73..9c5363e682 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/madevent +++ b/epochX/cudacpp/gg_ttggg.mad/bin/madevent @@ -178,6 +178,17 @@ force_run = False if (args and args[0] == 'treatcards'): force_run=True + +# check that madgraph is not in PYTHONPATH +try: + import madgraph +except ImportError: + pass +else: + logging.getLogger('madgraph').error('Looks like you have madgraph in your PYTHONPATH (or you are running this executable from the main MG5aMC directory). This executable will likely not work in that case.') + + + # Call the cmd interface main loop try: if '-h' in args or '--help' in args: diff --git a/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h index 53dd560ed6..c30f753dcb 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h @@ -2,13 +2,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Sep 2010) for the MG5aMC backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -860,7 +860,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6] - template + template __device__ INLINE void VVV1_0( const fptype allV1[], const fptype allV2[], @@ -872,7 +872,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ INLINE void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -885,7 +885,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ INLINE void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -897,7 +897,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ INLINE void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -910,7 +910,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ INLINE void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -923,7 +923,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ INLINE void FFV1P0_3( const fptype allF1[], const fptype allF2[], @@ -936,7 +936,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV1_0( const fptype allV1[], const fptype allV2[], @@ -949,7 +949,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -963,7 +963,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV3_0( const fptype allV1[], const fptype allV2[], @@ -976,7 +976,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV3P0_1( const fptype allV2[], const fptype allV3[], @@ -990,7 +990,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV4_0( const fptype allV1[], const fptype 
allV2[], @@ -1003,7 +1003,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV4P0_1( const fptype allV2[], const fptype allV3[], @@ -1017,7 +1017,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6] - template + template __device__ void VVV1_0( const fptype allV1[], const fptype allV2[], @@ -1030,7 +1030,7 @@ namespace mg5amcCpu const cxtype_sv* V1 = W_ACCESS::kernelAccessConst( allV1 ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P1[4] = { +cxreal( V1[0] ), +cxreal( V1[1] ), +cximag( V1[1] ), +cximag( V1[0] ) }; @@ -1053,7 +1053,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -1066,7 +1066,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P2[4] = { +cxreal( V2[0] ), +cxreal( V2[1] ), +cximag( V2[1] ), +cximag( V2[0] ) }; @@ -1091,7 +1091,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -1104,7 +1104,7 @@ namespace mg5amcCpu const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. 
); const cxtype_sv TMP9 = ( F1[2] * ( F2[4] * ( V3[2] + V3[5] ) + F2[5] * ( V3[3] + cI * V3[4] ) ) + ( F1[3] * ( F2[4] * ( V3[3] - cI * V3[4] ) + F2[5] * ( V3[2] - V3[5] ) ) + ( F1[4] * ( F2[2] * ( V3[2] - V3[5] ) - F2[3] * ( V3[3] + cI * V3[4] ) ) + F1[5] * ( F2[2] * ( -V3[3] + cI * V3[4] ) + F2[3] * ( V3[2] + V3[5] ) ) ) ) ); @@ -1116,7 +1116,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -1129,7 +1129,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F1 = W_ACCESS::kernelAccess( allF1 ); const cxtype cI = cxmake( 0., 1. ); F1[0] = +F2[0] + V3[0]; @@ -1148,7 +1148,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -1161,7 +1161,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F2 = W_ACCESS::kernelAccess( allF2 ); const cxtype cI = cxmake( 0., 1. ); F2[0] = +F1[0] + V3[0]; @@ -1180,7 +1180,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ void FFV1P0_3( const fptype allF1[], const fptype allF2[], @@ -1193,7 +1193,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V3 = W_ACCESS::kernelAccess( allV3 ); const cxtype cI = cxmake( 0., 1. ); V3[0] = +F1[0] + F2[0]; @@ -1211,7 +1211,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ void VVVV1_0( const fptype allV1[], const fptype allV2[], @@ -1226,7 +1226,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. 
); const cxtype_sv TMP10 = ( V1[2] * V4[2] - V1[3] * V4[3] - V1[4] * V4[4] - V1[5] * V4[5] ); @@ -1241,7 +1241,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ void VVVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -1256,7 +1256,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); V1[0] = +V2[0] + V3[0] + V4[0]; @@ -1276,7 +1276,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ void VVVV3_0( const fptype allV1[], const fptype allV2[], @@ -1291,7 +1291,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const cxtype_sv TMP1 = ( V2[2] * V1[2] - V2[3] * V1[3] - V2[4] * V1[4] - V2[5] * V1[5] ); @@ -1306,7 +1306,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ void VVVV3P0_1( const fptype allV2[], const fptype allV3[], @@ -1321,7 +1321,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); V1[0] = +V2[0] + V3[0] + V4[0]; @@ -1341,7 +1341,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ void VVVV4_0( const fptype allV1[], const fptype allV2[], @@ -1356,7 +1356,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. 
); const cxtype_sv TMP1 = ( V2[2] * V1[2] - V2[3] * V1[3] - V2[4] * V1[4] - V2[5] * V1[5] ); @@ -1371,7 +1371,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ void VVVV4P0_1( const fptype allV2[], const fptype allV3[], @@ -1386,7 +1386,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); V1[0] = +V2[0] + V3[0] + V4[0]; diff --git a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc index 47a3a011b8..fd5642f3e3 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h index 76066c7bb1..f4b086fc96 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -310,7 +310,7 @@ namespace mg5amcCpu #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #endif // Compute the output couplings (e.g. gc10 and gc11) from the input gs - template + template __device__ inline void G2COUP( const fptype gs[], fptype couplings[], @@ -320,12 +320,12 @@ namespace mg5amcCpu using namespace Parameters_sm_dependentCouplings; const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr ); - fptype* GC_10s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); - fptype* GC_11s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); - fptype* GC_12s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_12 ); - cxtype_sv_ref GC_10s_sv = C_ACCESS::kernelAccess( GC_10s ); - cxtype_sv_ref GC_11s_sv = C_ACCESS::kernelAccess( GC_11s ); - cxtype_sv_ref GC_12s_sv = C_ACCESS::kernelAccess( GC_12s ); + fptype* GC_10s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); + fptype* GC_11s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); + fptype* GC_12s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_12 ); + cxtype_sv_ref GC_10s_sv = CD_ACCESS::kernelAccess( GC_10s ); + cxtype_sv_ref GC_11s_sv = CD_ACCESS::kernelAccess( GC_11s ); + cxtype_sv_ref GC_12s_sv = CD_ACCESS::kernelAccess( GC_12s ); GC_10s_sv = couplings_sv.GC_10; GC_11s_sv = couplings_sv.GC_11; GC_12s_sv = couplings_sv.GC_12; diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h index 7c6a082392..ca859a602e 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for computing color sums +// For both CUDA and HIP, by default, BLAS is supported (MGONGPU_HAS_NO_BLAS is undefined), but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) 
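To make the new build-time switch concrete, here is a minimal sketch (illustrative only, not part of this patch; the helper name dummyBlasProbe is invented) of how code built against mgOnGpuConfig.h can branch on MGONGPU_HAS_NO_BLAS, so that the cuBLAS path is only compiled when the toolkit provides it:

  // Illustrative sketch only: dummyBlasProbe is an invented name, not plugin API.
  #include "mgOnGpuConfig.h"
  #ifndef MGONGPU_HAS_NO_BLAS
  #include "cublas_v2.h"
  #endif
  inline bool dummyBlasProbe()
  {
  #ifndef MGONGPU_HAS_NO_BLAS
    cublasHandle_t handle; // BLAS build: try to create and destroy a handle
    if( cublasCreate( &handle ) != CUBLAS_STATUS_SUCCESS ) return false;
    cublasDestroy( handle );
    return true;
  #else
    return false; // noBLAS build (e.g. plain C++): the BLAS path is compiled out
  #endif
  }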
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h index 92d74fd6db..e98e925f2a 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -717,12 +717,24 @@ namespace mg5amcCpu : m_preal( &r ), m_pimag( &i ) {} // copy (create from) const refs cxtype_ref& operator=( const cxtype_ref& ) = delete; //__host__ __device__ cxtype_ref& operator=( cxtype_ref&& c ) {...} // REMOVED! Should copy refs or copy values? 
No longer needed in cxternary - __host__ __device__ cxtype_ref& operator=( const cxtype& c ) + __host__ __device__ cxtype_ref& operator=( const cxtype& c ) // copy (assign) const values { *m_preal = cxreal( c ); *m_pimag = cximag( c ); return *this; - } // copy (assign) non-const values + } + __host__ __device__ cxtype_ref& operator+=( const cxtype& c ) + { + *m_preal += cxreal( c ); + *m_pimag += cximag( c ); + return *this; + } + __host__ __device__ cxtype_ref& operator-=( const cxtype& c ) + { + *m_preal -= cxreal( c ); + *m_pimag -= cximag( c ); + return *this; + } __host__ __device__ operator cxtype() const { return cxmake( *m_preal, *m_pimag ); } private: fptype* const m_preal; // const pointer to non-const fptype R diff --git a/epochX/cudacpp/gg_ttggg.mad/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttggg.mad/test/cudacpp_test.mk index f703a1ae7c..1f9f8bbc46 100644 --- a/epochX/cudacpp/gg_ttggg.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_ttggg.mad/test/cudacpp_test.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) @@ -19,7 +19,7 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt index 37d3314a5d..a9c9e37bd0 100644 --- a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.3 2025-06-12 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -49,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -58,7 +58,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005965471267700195  +DEBUG: model prefixing takes 0.0055277347564697266  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -151,33 +151,33 @@ INFO: Please specify coupling orders to bypass this step. 
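The operator+= and operator-= overloads added to cxtype_ref in the mgOnGpuCxtypes.h hunk above let complex accumulation write directly through the split real/imaginary storage, without first materialising a cxtype temporary. A stand-alone sketch of that proxy-reference pattern (plain C++ with std::complex and invented names, instead of the plugin's fptype/cxtype machinery):

  // Sketch of a proxy reference over split real/imag arrays (simplified stand-in
  // for cxtype_ref; the names cxref, re, im are invented for illustration).
  #include <cassert>
  #include <complex>
  struct cxref
  {
    double* re; double* im;
    cxref& operator+=( const std::complex<double>& c ) { *re += c.real(); *im += c.imag(); return *this; }
    cxref& operator-=( const std::complex<double>& c ) { *re -= c.real(); *im -= c.imag(); return *this; }
    operator std::complex<double>() const { return { *re, *im }; }
  };
  int main()
  {
    double re[2] = { 1., 2. }, im[2] = { 3., 4. }; // SOA-style split storage
    cxref j0{ &re[0], &im[0] };
    j0 += std::complex<double>( 0.5, -1. ); // accumulate a contribution in place
    assert( re[0] == 1.5 && im[0] == 2. );
    return 0;
  }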
INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.863 s +1 processes with 1240 diagrams generated in 1.896 s Total: 1 processes with 1240 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttggg Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Processing color information for process: g g > t t~ g g g @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 218]  -DEBUG: type(subproc_group)= [output.py at line 219]  -DEBUG: type(fortran_model)= [output.py at line 220]  -DEBUG: type(me)= me=0 [output.py at line 221]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 222]  -INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. 
-Generated helas calls for 1 subprocesses (1240 diagrams) in 6.535 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  +DEBUG: type(subproc_group)= [output.py at line 223]  +DEBUG: type(fortran_model)= [output.py at line 224]  +DEBUG: type(me)= me=0 [output.py at line 225]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'diagram_boilerplate.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  +INFO: Creating files in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.539 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.348 s +ALOHA: aloha creates 5 routines in 0.343 s VVV1 VVV1 FFV1 @@ -190,17 +190,17 @@ ALOHA: aloha creates 5 routines in 0.348 s VVVV3 VVVV4 VVVV4 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. quit -real 0m12.948s -user 0m12.781s -sys 0m0.107s +real 0m12.885s +user 0m12.736s +sys 0m0.086s Code generation completed in 13 seconds diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h index 87aa648dd2..a45024704a 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -255,11 +255,15 @@ namespace mg5amcCpu throw std::logic_error( "Bridge constructor: FIXME! cannot choose gputhreads" ); // this should never happen! m_gpublocks = m_nevt / m_gputhreads; } +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters @@ -290,8 +294,10 @@ namespace mg5amcCpu throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! 
Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -347,7 +353,9 @@ namespace mg5amcCpu if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -400,7 +408,9 @@ namespace mg5amcCpu } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ )
+ +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ )
+ +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc index f463977c1a..a68ae314eb 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
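Further down in this file, the BLAS color sum is toggled at run time by an environment variable that is parsed only once per process. A condensed sketch of that read-once idiom (using a static lambda initialiser in place of the patch's explicit 'static bool first' flag; the helper name useBlasColorSum is invented) is:

  // Sketch only: read CUDACPP_RUNTIME_BLASCOLORSUM once and cache the decision.
  #include <cstdlib>
  #include <string>
  inline bool useBlasColorSum()
  {
    static const bool enabled = []() {
      const char* env = std::getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" );
      return env && std::string( env ) != ""; // set and non-empty => enable BLAS
    }();
    return enabled;
  }

The patch itself additionally throws a runtime_error when the variable is set in a build where BLAS was disabled at compile time; the sketch omits that guard.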
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,28 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() + , m_pHelWfs() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_helBlasHandles() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +353,82 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); + // Create the "one-helicity" wavefunction buffer that will be used for helicity filtering + m_pHelWfs.reset( new DeviceBufferSimple( CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? 
+ std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { +#ifndef MGONGPU_HAS_NO_BLAS + if( m_helBlasHandles[ihel] ) gpuBlasDestroy( m_helBlasHandles[ihel] ); +#endif + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +445,61 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. 
Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity + // Attach a different stream to each cuBLAS/hipBLAS handle + if( m_blasColorSum ) + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + checkGpuBlas( gpuBlasCreate( &m_helBlasHandles[ighel] ) ); + checkGpuBlas( gpuBlasSetStream( m_helBlasHandles[ighel], m_helStreams[ighel] ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_helBlasHandles[ighel], CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelWfs.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +507,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* ghelBlasHandles = ( m_blasColorSum ? m_helBlasHandles : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* ghelBlasHandles = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +527,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? 
m_hstChannelIds.data() : nullptr );
MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() );
#endif
- checkGpu( gpuPeekAtLastError() );
- checkGpu( gpuDeviceSynchronize() );
+ checkGpu( gpuPeekAtLastError() ); // is this needed?
+ checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places...
}
//--------------------------------------------------------------------------
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h
index 7acff4b308..c901874333 100644
--- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h
@@ -1,16 +1,19 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin.
#ifndef MATRIXELEMENTKERNELS_H
#define MATRIXELEMENTKERNELS_H 1
#include "mgOnGpuConfig.h"
+#include "CPPProcess.h"
+#include "GpuAbstraction.h"
#include "MemoryBuffers.h"
#include
+#include
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
@@ -134,7 +137,7 @@ namespace mg5amcCpu
// Does this host system support the SIMD used in the matrix element calculation?
// [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!]
- static bool hostSupportsSIMD( const bool verbose = true );
+ static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false
private:
@@ -191,12 +194,24 @@ namespace mg5amcCpu
// The buffer for the event-by-event couplings that depends on alphas QCD
DeviceBufferCouplings m_couplings;
+ // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime)
+ std::unique_ptr<DeviceBufferSimple> m_pHelMEs;
+
+ // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime)
+ std::unique_ptr<DeviceBufferSimple> m_pHelJamps;
+
+ // The super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime)
+ std::unique_ptr<DeviceBufferSimple> m_pHelWfs;
+
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // The buffer for the event-by-event numerators of multichannel factors
- DeviceBufferNumerators m_numerators;
+ // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime)
+ std::unique_ptr<DeviceBufferSimple> m_pHelNumerators;
- // The buffer for the event-by-event denominators of multichannel factors
- DeviceBufferDenominators m_denominators;
+ // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime)
+ std::unique_ptr<DeviceBufferSimple> m_pHelDenominators;
+
+ // The super-buffer of ncolor jamp2 buffers
+ DeviceBufferSimple m_colJamp2s;
#endif
#ifdef MGONGPU_CHANNELID_DEBUG
@@ -205,6 +220,23 @@ namespace mg5amcCpu
PinnedHostBufferChannelIds m_hstChannelIds;
#endif
+#ifndef MGONGPU_HAS_NO_BLAS
+ // Decide at runtime whether to use BLAS for color sums
+ bool m_blasColorSum;
+
+ // Decide at runtime whether TF32TENSOR math should be used in cuBLAS
+ bool m_blasTf32Tensor;
+
+ // The super-buffer of nGoodHel cuBLAS/hipBLAS
temporary buffers
+ std::unique_ptr<DeviceBufferSimple2> m_pHelBlasTmp;
+
+ // The array of cuBLAS/hipBLAS handles (one for each good helicity)
+ gpuBlasHandle_t m_helBlasHandles[CPPProcess::ncomb]; // reserve ncomb handles (but only nGoodHel <= ncomb will be used)
+#endif
+
+ // The array of GPU streams (one for each good helicity)
+ gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used)
+
// The number of blocks in the GPU grid
size_t m_gpublocks;
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessAmplitudes.h
index 0d92f69c43..a49f041e05 100644
--- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessAmplitudes.h
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessAmplitudes.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin.
#ifndef MemoryAccessAmplitudes_H
#define MemoryAccessAmplitudes_H 1
@@ -10,10 +10,6 @@
#include "mgOnGpuCxtypes.h"
-#include "MemoryAccessHelpers.h"
-
-#define MGONGPU_TRIVIAL_AMPLITUDES 1
-
// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725)
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
@@ -23,120 +19,11 @@ namespace mg5amcCpu
{
//----------------------------------------------------------------------------
-#ifndef MGONGPU_TRIVIAL_AMPLITUDES
-
- // A class describing the internal layout of memory buffers for amplitudes
- // This implementation uses an AOSOA[npagA][nx2][neppA] where nevt=npagA*neppA
- // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name]
- class MemoryAccessAmplitudesBase //_AOSOAv1
- {
- public:
-
- // Number of Events Per Page in the amplitude AOSOA memory buffer layout
- static constexpr int neppA = 1; // AOS (just a test...)
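
For reference, a minimal standalone sketch of the handle/stream lifecycle behind the m_helBlasHandles and m_helStreams members declared in MatrixElementKernels.h above, written directly against the CUDA/cuBLAS APIs that the gpuBlas*/gpuStream* wrappers abstract. Here ncomb, nGoodHel and useTf32 are hypothetical stand-ins, and the plugin's checkGpuBlas error handling is simplified to a check() helper; this is an illustration, not the plugin code.

#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

constexpr int ncomb = 32; // assumption: stand-in for CPPProcess::ncomb

static void check( bool ok, const char* msg )
{
  if( !ok ) { std::fprintf( stderr, "ERROR: %s\n", msg ); std::exit( 1 ); }
}

int main()
{
  const int nGoodHel = 4;     // assumption: normally returned by sigmaKin_setGoodHel
  const bool useTf32 = false; // assumption: stand-in for the CUDACPP_RUNTIME_CUBLASTF32TENSOR choice
  cudaStream_t helStreams[ncomb] = {};      // only the first nGoodHel entries are used
  cublasHandle_t helBlasHandles[ncomb] = {};
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
  {
    check( cudaStreamCreate( &helStreams[ighel] ) == cudaSuccess, "cudaStreamCreate" );
    check( cublasCreate( &helBlasHandles[ighel] ) == CUBLAS_STATUS_SUCCESS, "cublasCreate" );
    // Attach a different stream to each handle so per-helicity color sums can overlap
    check( cublasSetStream( helBlasHandles[ighel], helStreams[ighel] ) == CUBLAS_STATUS_SUCCESS, "cublasSetStream" );
    if( useTf32 ) // enable TF32 tensor cores (only meaningful for FP32 GEMMs)
      check( cublasSetMathMode( helBlasHandles[ighel], CUBLAS_TF32_TENSOR_OP_MATH ) == CUBLAS_STATUS_SUCCESS, "cublasSetMathMode" );
  }
  // ... enqueue one color-sum GEMM per good helicity on its own stream/handle here ...
  for( int ighel = 0; ighel < nGoodHel; ighel++ ) // mirror of the destructor: destroy only what was created
  {
    cublasDestroy( helBlasHandles[ighel] );
    cudaStreamDestroy( helStreams[ighel] );
  }
  return 0;
}
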
-
- private:
-
- friend class MemoryAccessHelper<MemoryAccessAmplitudesBase>;
- friend class KernelAccessHelper<MemoryAccessAmplitudesBase, false>;
- friend class KernelAccessHelper<MemoryAccessAmplitudesBase, true>;
-
- // The number of floating point components of a complex number
- static constexpr int nx2 = mgOnGpu::nx2;
-
- //--------------------------------------------------------------------------
- // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )"
- // (in other words: first locate the event record for a given event, then locate an element in that record)
- //--------------------------------------------------------------------------
-
- // Locate an event record (output) in a memory buffer (input) from the given event number (input)
- // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===]
- static __host__ __device__ inline fptype*
- ieventAccessRecord( fptype* buffer,
- const int ievt )
- {
- const int ipagA = ievt / neppA; // #event "A-page"
- const int ieppA = ievt % neppA; // #event in the current event A-page
- constexpr int ix2 = 0;
- return &( buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA] ); // AOSOA[ipagA][ix2][ieppA]
- }
-
- //--------------------------------------------------------------------------
-
- // Locate a field (output) of an event record (input) from the given field indexes (input)
- // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===]
- // [NB: expand variadic template "Ts... args" to "const int ix2" and rename "Field" as "Ix2"]
- static __host__ __device__ inline fptype&
- decodeRecord( fptype* buffer,
- const int ix2 )
- {
- constexpr int ipagA = 0;
- constexpr int ieppA = 0;
- return buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA]; // AOSOA[ipagA][ix2][ieppA]
- }
- };
-
- //----------------------------------------------------------------------------
-
- // A class providing access to memory buffers for a given event, based on explicit event numbers
- // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations
- class MemoryAccessAmplitudes : public MemoryAccessAmplitudesBase
- {
- public:
-
- // Locate an event record (output) in a memory buffer (input) from the given event number (input)
- // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===]
- static constexpr auto ieventAccessRecord = MemoryAccessHelper<MemoryAccessAmplitudesBase>::ieventAccessRecord;
-
- // Locate an event record (output) in a memory buffer (input) from the given event number (input)
- // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===]
- static constexpr auto ieventAccessRecordConst = MemoryAccessHelper<MemoryAccessAmplitudesBase>::ieventAccessRecordConst;
-
- // Locate a field (output) of an event record (input) from the given field indexes (input)
- // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===]
- static constexpr auto decodeRecordIx2 = MemoryAccessHelper<MemoryAccessAmplitudesBase>::decodeRecord;
-
- // Locate a field (output) of an event record (input) from the given field indexes (input)
- // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===]
- static constexpr auto decodeRecordIx2Const =
- MemoryAccessHelper<MemoryAccessAmplitudesBase>::template decodeRecordConst<int>;
-
- // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input)
- // [Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt, const int ix2 ) <===]
- static constexpr auto ieventAccessIx2 =
- MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessField<int>;
-
- // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input)
- // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===]
- static constexpr auto ieventAccessIx2Const =
- MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessFieldConst<int>;
- };
-
-#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES
-
- //----------------------------------------------------------------------------
-
- // A class providing access to memory buffers for a given event, based on implicit kernel rules
- // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations
+ // A class providing trivial access to amplitude memory buffers
template<bool onDevice>
class KernelAccessAmplitudes
{
public:
-
-#ifndef MGONGPU_TRIVIAL_AMPLITUDES
-
- // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
- // [Signature (non-const) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===]
- static constexpr auto kernelAccessIx2 =
- KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessField<int>;
-
- // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
- // [Signature (const) ===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===]
- static constexpr auto kernelAccessIx2Const =
- KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessFieldConst<int>;
-
-#else
-
static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer )
{
@@ -148,8 +35,6 @@ namespace mg5amcCpu
{
return reinterpret_cast<const cxtype_sv*>( buffer );
}
-
-#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES
};
//----------------------------------------------------------------------------
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessCouplings.h
index 55504a2b90..caee99a7fd 100644
--- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessCouplings.h
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessCouplings.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
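
The surviving KernelAccessAmplitudes accessors above rely on the fact that one event's amplitude is just a [real, imaginary] fptype pair, so the accessor is a pointer reinterpretation with no event indexing at all. A minimal standalone sketch of this "trivial access" idiom, using std::complex as a hypothetical stand-in for cxtype_sv in a scalar build:

#include <cassert>
#include <complex>

typedef double fptype;               // assumption: stand-in for the plugin's fptype
typedef std::complex<fptype> cxtype; // assumption: stand-in for cxtype_sv in scalar builds

inline cxtype* kernelAccess( fptype* buffer )
{
  return reinterpret_cast<cxtype*>( buffer ); // trivial: buffer holds one event's [re,im]
}

int main()
{
  fptype amp_fp[2] = { 1.5, -0.5 }; // one amplitude: real and imaginary parts
  cxtype* amp = kernelAccess( amp_fp );
  assert( amp->real() == 1.5 && amp->imag() == -0.5 );
  return 0;
}
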
#ifndef MemoryAccessCouplings_H #define MemoryAccessCouplings_H 1 @@ -235,7 +235,7 @@ namespace mg5amcCpu /* fptype_sv& real = kernelAccessIx2( buffer, 0 ); fptype_sv& imag = kernelAccessIx2( buffer, 1 ); - printf( "C_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv_ref( real, imag ); */ return cxtype_sv_ref( kernelAccessIx2( buffer, 0 ), @@ -250,7 +250,7 @@ namespace mg5amcCpu /* const fptype_sv& real = kernelAccessIx2Const( buffer, 0 ); const fptype_sv& imag = kernelAccessIx2Const( buffer, 1 ); - printf( "C_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv( real, imag ); */ return cxtype_sv( kernelAccessIx2Const( buffer, 0 ), diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessWavefunctions.h index 9f4c620bc7..bbffc1fb36 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessWavefunctions.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessWavefunctions_H #define MemoryAccessWavefunctions_H 1 @@ -10,9 +10,7 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 +#include "CPPProcess.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL @@ -23,147 +21,44 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // A class describing the internal layout of memory buffers for wavefunctions - // This implementation uses an AOSOA[npagW][nw6][nx2][neppW] where nevt=npagW*neppW - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessWavefunctionsBase //_AOSOAv1 +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessWavefunctions { public: - - // Number of Events Per Page in the wavefunction AOSOA memory buffer layout - static constexpr int neppW = 1; // AOS (just a test...) 
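
The removed AOSOA[npagW][nw6][nx2][neppW] layout above was already configured with neppW=1, i.e. a plain AOS. A small standalone sketch (hypothetical loop bounds) verifying that with neppW=1 the AOSOA index reduces to the simple ievt*nw6*nx2 offset used by the new DeviceAccessWavefunctions accessor below:

#include <cassert>

constexpr int nw6 = 6;   // components of a spin-1/2 or spin-1 wavefunction
constexpr int nx2 = 2;   // real and imaginary parts
constexpr int neppW = 1; // events per page in the old AOSOA layout (AOS)

// old-style AOSOA index
inline int aosoaIndex( int ievt, int iw6, int ix2 )
{
  const int ipagW = ievt / neppW;
  const int ieppW = ievt % neppW;
  return ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW;
}

int main()
{
  // with neppW=1 the AOSOA index equals the AOS offset ievt*nw6*nx2 + iw6*nx2 + ix2
  for( int ievt = 0; ievt < 4; ievt++ )
    for( int iw6 = 0; iw6 < nw6; iw6++ )
      for( int ix2 = 0; ix2 < nx2; ix2++ )
        assert( aosoaIndex( ievt, iw6, ix2 ) == ievt * nw6 * nx2 + iw6 * nx2 + ix2 );
  return 0;
}
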
-
- private:
-
- friend class MemoryAccessHelper<MemoryAccessWavefunctionsBase>;
- friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, false>;
- friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, true>;
-
- // The number of components of a (fermion or vector) wavefunction
- static constexpr int nw6 = mgOnGpu::nw6;
-
- // The number of floating point components of a complex number
- static constexpr int nx2 = mgOnGpu::nx2;
-
- //--------------------------------------------------------------------------
- // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )"
- // (in other words: first locate the event record for a given event, then locate an element in that record)
- //--------------------------------------------------------------------------
-
- // Locate an event record (output) in a memory buffer (input) from the given event number (input)
- // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===]
- static __host__ __device__ inline fptype*
- ieventAccessRecord( fptype* buffer,
- const int ievt )
+ static __host__ __device__ inline cxtype_sv*
+ kernelAccess( fptype* buffer )
{
- const int ipagW = ievt / neppW; // #event "W-page"
- const int ieppW = ievt % neppW; // #event in the current event W-page
- constexpr int iw6 = 0;
- constexpr int ix2 = 0;
- return &( buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW] ); // AOSOA[ipagW][iw6][ix2][ieppW]
+ const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+ return reinterpret_cast<cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 );
}
-
- //--------------------------------------------------------------------------
-
- // Locate a field (output) of an event record (input) from the given field indexes (input)
- // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===]
- // [NB: expand variadic template "Ts... args" to "const int iw6, const int ix2" and rename "Field" as "Iw6Ix2"]
- static __host__ __device__ inline fptype&
- decodeRecord( fptype* buffer,
- const int iw6,
- const int ix2 )
+ static __host__ __device__ inline const cxtype_sv*
+ kernelAccessConst( const fptype* buffer )
{
- constexpr int ipagW = 0;
- constexpr int ieppW = 0;
- return buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW]; // AOSOA[ipagW][iw6][ix2][ieppW]
+ const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+ return reinterpret_cast<const cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 );
+ }
};
+#endif
//----------------------------------------------------------------------------
- // A class providing access to memory buffers for a given event, based on explicit event numbers
- // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations
- class MemoryAccessWavefunctions : public MemoryAccessWavefunctionsBase
- {
- public:
-
- // Locate an event record (output) in a memory buffer (input) from the given event number (input)
- // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===]
- static constexpr auto ieventAccessRecord = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecord;
-
- // Locate an event record (output) in a memory buffer (input) from the given event number (input)
- // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===]
- static constexpr auto ieventAccessRecordConst = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecordConst;
-
- // Locate a field (output) of an event record (input) from the given field indexes (input)
- // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int iw6, const int ix2 ) <===]
- static constexpr auto decodeRecordIw6Ix2 = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::decodeRecord;
-
- // Locate a field (output) of an event record (input) from the given field indexes (input)
- // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int iw6, const int ix2 ) <===]
- static constexpr auto decodeRecordIw6Ix2Const =
- MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template decodeRecordConst<int, int>;
-
- // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input)
- // [Signature (non-const) ===> fptype& ieventAccessIw6Ix2( fptype* buffer, const ievt, const int iw6, const int ix2 ) <===]
- static constexpr auto ieventAccessIw6Ix2 =
- MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessField<int, int>;
-
- // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input)
- // [Signature (const) ===> const fptype& ieventAccessIw6Ix2Const( const fptype* buffer, const ievt, const int iw6, const int ix2 ) <===]
- static constexpr auto ieventAccessIw6Ix2Const =
- MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessFieldConst<int, int>;
- };
-
-#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS
-
- //----------------------------------------------------------------------------
-
- // A class providing access to memory buffers for a given event, based on implicit kernel rules
- // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations
- template<bool onDevice>
- class KernelAccessWavefunctions
+ class HostAccessWavefunctions
{
public:
-
-#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS
-
- // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes
(input)
- // [Signature (non-const) ===> fptype& kernelAccessIw6Ix2( fptype* buffer, const int iw6, const int ix2 ) <===]
- static constexpr auto kernelAccessIw6Ix2 =
- KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessField<int, int>;
-
- // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
- // [Signature (const) ===> const fptype& kernelAccessIw6Ix2Const( const fptype* buffer, const int iw6, const int ix2 ) <===]
- static constexpr auto kernelAccessIw6Ix2Const =
- KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessFieldConst<int, int>;
-
-#else
-
static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { return reinterpret_cast<cxtype_sv*>( buffer ); }
-
static __host__ __device__ inline const cxtype_sv* kernelAccessConst( const fptype* buffer ) { return reinterpret_cast<const cxtype_sv*>( buffer ); }
-
-#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS
};
//----------------------------------------------------------------------------
- typedef KernelAccessWavefunctions<false> HostAccessWavefunctions;
- typedef KernelAccessWavefunctions<true> DeviceAccessWavefunctions;
-
- //----------------------------------------------------------------------------
-
} // end namespace mg5amcGpu/mg5amcCpu
#endif // MemoryAccessWavefunctions_H
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h
index 65a101888d..c8db607db6 100644
--- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
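
To make the sizing of the per-helicity "super-buffers" allocated via the MemoryBuffers.h classes concrete, a standalone sketch using the e+ e- -> mu+ mu- toy values quoted in the code comments (ncomb=16, nwf=5, nw6=6, plus ncolor=1 for this single-color-flow process), with hypothetical nevt and nGoodHel; the formulas are the ones used in computeGoodHelicities above.

#include <cstdio>

int main()
{
  const long long nevt = 16384;  // assumption: gpublocks * gputhreads for a typical run
  const long long nGoodHel = 16; // assumption: all ncomb=16 helicities turn out to be good
  const long long ncolor = 1, nwf = 5, nw6 = 6, nx2 = 2;
  const long long nMEs = nGoodHel * nevt;                   // m_pHelMEs elements
  const long long nJamps = nGoodHel * ncolor * nx2 * nevt;  // m_pHelJamps elements
  const long long nWfs = nGoodHel * nwf * nw6 * nx2 * nevt; // m_pHelWfs elements
  std::printf( "MEs=%lld jamps=%lld wfs=%lld fptype elements\n", nMEs, nJamps, nWfs );
  // for gg->ttggg (ncolor=120, nwf=121, ncomb=128) the same formulas give far larger buffers
  return 0;
}
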
#ifndef MemoryBuffers_H
#define MemoryBuffers_H 1
@@ -34,6 +34,7 @@ namespace mg5amcCpu
static constexpr size_t nparf = CPPProcess::nparf;
static constexpr size_t npar = CPPProcess::npar;
static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup;
+ static constexpr size_t ncolor = CPPProcess::ncolor;
}
//--------------------------------------------------------------------------
@@ -69,8 +70,8 @@ namespace mg5amcCpu
protected:
BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {}
- virtual ~BufferBase() {}
public:
+ virtual ~BufferBase() {}
T* data() { return m_data; }
const T* data() const { return m_data; }
T& operator[]( const size_t index ) { return m_data[index]; }
@@ -167,8 +168,14 @@ namespace mg5amcCpu
public:
HostBuffer( const size_t nevt )
: NumberOfEvents( nevt )
- , HostBufferBase<T>( sizePerEvent * nevt ) {}
- virtual ~HostBuffer() {}
+ , HostBufferBase<T>( sizePerEvent * nevt )
+ {
+ //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl;
+ }
+ virtual ~HostBuffer()
+ {
+ //std::cout << "HostBuffer::dtor " << this << std::endl;
+ }
virtual size_t nevt() const override final { return NumberOfEvents::nevt(); }
};
#endif
@@ -194,19 +201,33 @@ namespace mg5amcCpu
#ifdef MGONGPUCPP_GPUIMPL
// A class encapsulating a CUDA device buffer for a given number of events
template<typename T, size_t sizePerEvent>
- class DeviceBuffer : public DeviceBufferBase<T>, virtual private NumberOfEvents
+ class DeviceBuffer : public DeviceBufferBase<T>, virtual protected NumberOfEvents
{
public:
DeviceBuffer( const size_t nevt )
: NumberOfEvents( nevt )
- , DeviceBufferBase<T>( sizePerEvent * nevt ) {}
- virtual ~DeviceBuffer() {}
+ , DeviceBufferBase<T>( sizePerEvent * nevt )
+ {
+ //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl;
+ }
+ virtual ~DeviceBuffer()
+ {
+ //std::cout << "DeviceBuffer::dtor " << this << std::endl;
+ }
virtual size_t nevt() const override final { return NumberOfEvents::nevt(); }
};
#endif
//--------------------------------------------------------------------------
+#ifdef MGONGPUCPP_GPUIMPL
+ // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis
+ typedef DeviceBuffer<fptype, 1> DeviceBufferSimple;
+ typedef DeviceBuffer<fptype2, 1> DeviceBufferSimple2;
+#endif
+
+ //--------------------------------------------------------------------------
+
// A base class encapsulating a memory buffer for momenta random numbers
typedef BufferBase<fptype> BufferRndNumMomenta;
@@ -277,12 +298,12 @@ namespace mg5amcCpu
constexpr size_t sizePerEventNumerators = 1;
#ifndef MGONGPUCPP_GPUIMPL
- // A class encapsulating a C++ host buffer for gs
+ // A class encapsulating a C++ host buffer for numerators
typedef HostBuffer<fptype, sizePerEventNumerators> HostBufferNumerators;
#else
- // A class encapsulating a CUDA pinned host buffer for gs
+ // A class encapsulating a CUDA pinned host buffer for numerators
typedef PinnedHostBuffer<fptype, sizePerEventNumerators> PinnedHostBufferNumerators;
- // A class encapsulating a CUDA device buffer for gs
+ // A class encapsulating a CUDA device buffer for numerators
typedef DeviceBuffer<fptype, sizePerEventNumerators> DeviceBufferNumerators;
#endif
#endif
@@ -297,12 +318,12 @@ namespace mg5amcCpu
constexpr size_t sizePerEventDenominators = 1;
#ifndef MGONGPUCPP_GPUIMPL
- // A class encapsulating a C++ host buffer for gs
+ // A class encapsulating a C++ host buffer for denominators
typedef HostBuffer<fptype, sizePerEventDenominators> HostBufferDenominators;
#else
- // A class encapsulating a CUDA pinned host buffer for gs
+ // A class encapsulating a CUDA pinned host buffer for denominators
typedef PinnedHostBuffer<fptype, sizePerEventDenominators>
PinnedHostBufferDenominators;
- // A class encapsulating a CUDA device buffer for gs
+ // A class encapsulating a CUDA device buffer for denominators
typedef DeviceBuffer<fptype, sizePerEventDenominators> DeviceBufferDenominators;
#endif
#endif
@@ -316,12 +337,12 @@ namespace mg5amcCpu
constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2;
#ifndef MGONGPUCPP_GPUIMPL
- // A class encapsulating a C++ host buffer for gs
+ // A class encapsulating a C++ host buffer for couplings
typedef HostBuffer<fptype, sizePerEventCouplings> HostBufferCouplings;
#else
- // A class encapsulating a CUDA pinned host buffer for gs
+ // A class encapsulating a CUDA pinned host buffer for couplings
typedef PinnedHostBuffer<fptype, sizePerEventCouplings> PinnedHostBufferCouplings;
- // A class encapsulating a CUDA device buffer for gs
+ // A class encapsulating a CUDA device buffer for couplings
typedef DeviceBuffer<fptype, sizePerEventCouplings> DeviceBufferCouplings;
#endif
@@ -505,6 +526,16 @@ namespace mg5amcCpu
//--------------------------------------------------------------------------
+#ifdef MGONGPUCPP_GPUIMPL
+ // The size (number of elements) per event in a memory buffer for jamps
+ constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2;
+
+ // A class encapsulating a CUDA device buffer for color selection
+ typedef DeviceBuffer<fptype, sizePerEventJamps> DeviceBufferJamps;
+#endif
+
+ //--------------------------------------------------------------------------
+
#ifdef MGONGPUCPP_GPUIMPL
template<class Tdst, class Tsrc>
void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc
index 07099839d3..6b4b8dc8ce 100644
--- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc
@@ -1,13 +1,13 @@
// Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors.
// Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend.
//==========================================================================
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin.
//==========================================================================
// This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+// MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,20 +98,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 120; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,57 +166,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the 
current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - 
using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -226,31903 +279,2613 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 121; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
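
Since calculate_jamps no longer performs the color sum itself (that step now lives in a separate function/kernel via the color_sum.h include), a compact standalone illustration of the quadratic form |M|^2 = sum_ij conj(jamp_i) * cf_ij * jamp_j that the BLAS path evaluates as real matrix products over all events per helicity. The two-color cf matrix below is a hypothetical toy, not a physics color matrix, and the loop stands in for the GEMM that cuBLAS/hipBLAS performs in the plugin.

#include <cassert>
#include <complex>

typedef double fptype;
constexpr int ncolor = 2; // assumption: toy value (gg->ttggg uses 120)

fptype colorSum( const std::complex<fptype> jamp[ncolor], const fptype cf[ncolor][ncolor] )
{
  fptype me = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    std::complex<fptype> t = 0; // t_i = (cf * jamp)_i: the GEMM step when done for all events at once
    for( int j = 0; j < ncolor; j++ ) t += cf[i][j] * jamp[j];
    me += ( std::conj( jamp[i] ) * t ).real(); // dot-product step (imaginary parts cancel for symmetric cf)
  }
  return me;
}

int main()
{
  const fptype cf[2][2] = { { 16, -2 }, { -2, 16 } };             // assumption: toy symmetric color matrix
  const std::complex<fptype> jamp[2] = { { 1, 2 }, { 0, -1 } };   // assumption: toy partial amplitudes
  assert( colorSum( jamp, cf ) > 0 ); // the color sum is a positive quadratic form here (value 104)
  return 0;
}
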
-
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV };
+ // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code
cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles)
- cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram
-
- // Proof of concept for using fptype* in the interface
- fptype* w_fp[nwf];
- for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] );
- fptype* amp_fp;
- amp_fp = reinterpret_cast<fptype*>( amp_sv );
-
- // Local variables for the given CUDA event (ievt) or C++ event page (ipagV)
- // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination]
- cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!)
+ fptype* wfs = reinterpret_cast<fptype*>( w_sv );
+#else
+ // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt)
+ // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting
+ fptype* wfs = allWfs;
+#endif
// === Calculate wavefunctions and amplitudes for all diagrams in all processes ===
// === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ ===
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
- // Mixed fptypes #537: float for color algebra and double elsewhere
- // Delay color algebra and ME updates (only on even pages)
- cxtype_sv jamp_sv_previous[ncolor] = {};
- fptype* MEs_previous = 0;
-#endif
+
+ // *****************************
+ // *** START LOOP ON IPARITY ***
+ // *****************************
for( int iParity = 0; iParity < nParity; ++iParity )
- { // START LOOP ON IPARITY
+ {
#ifndef MGONGPUCPP_GPUIMPL
const int ievt0 = ievt00 + iParity * neppV;
#endif
- //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823)
- constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823)
- const fptype* allCOUPs[nxcoup];
-#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
-#pragma nv_diagnostic push
-#pragma nv_diag_suppress 186 // e.g.
<> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 1240 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - vxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); - - vxxxxx( momenta, 0., cHel[ihel][5], +1, w_fp[5], 5 ); - - vxxxxx( momenta, 0., cHel[ihel][6], +1, w_fp[6], 6 ); - - VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[7] ); - FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[8] ); - VVV1P0_1( w_fp[7], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[9] ); - VVV1P0_1( 
w_fp[8], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 1 - VVV1_0( w_fp[9], w_fp[10], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - - // *** DIAGRAM 2 OF 1240 *** - - // Wavefunction(s) for diagram number 2 - VVV1P0_1( w_fp[8], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[11] ); - - // Amplitude(s) for diagram number 2 - VVV1_0( w_fp[9], w_fp[11], w_fp[5], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - - // *** DIAGRAM 3 OF 1240 *** - - // Wavefunction(s) for diagram number 3 - // (none) - - // Amplitude(s) for diagram number 3 - VVVV1_0( w_fp[8], w_fp[5], w_fp[6], w_fp[9], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - VVVV3_0( w_fp[8], w_fp[5], w_fp[6], w_fp[9], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - VVVV4_0( w_fp[8], w_fp[5], w_fp[6], w_fp[9], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= amp_sv[0]; - jamp_sv[1] += amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[49] -= amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[55] += 
amp_sv[0]; - jamp_sv[90] += amp_sv[0]; - jamp_sv[92] -= amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - - // *** DIAGRAM 4 OF 1240 *** - - // Wavefunction(s) for diagram number 4 - VVV1P0_1( w_fp[7], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[12] ); - VVV1P0_1( w_fp[8], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[13] ); - - // Amplitude(s) for diagram number 4 - VVV1_0( w_fp[12], w_fp[13], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - - // *** DIAGRAM 5 OF 1240 *** - - // Wavefunction(s) for diagram number 5 - // (none) - - // Amplitude(s) for diagram number 5 - VVV1_0( w_fp[12], w_fp[11], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - - // *** DIAGRAM 6 OF 1240 *** - - // Wavefunction(s) for diagram number 6 - // (none) - - // Amplitude(s) for diagram number 6 - VVVV1_0( w_fp[8], w_fp[4], w_fp[6], w_fp[12], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - VVVV3_0( w_fp[8], w_fp[4], w_fp[6], w_fp[12], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - VVVV4_0( w_fp[8], w_fp[4], w_fp[6], w_fp[12], COUPs[2], 1.0, &_fp[0] ); -#ifdef 
-
-    // *** DIAGRAM 6 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 6
-    // (none)
-
-    // Amplitude(s) for diagram number 6
-    VVVV1_0( w_fp[8], w_fp[4], w_fp[6], w_fp[12], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] -= amp_sv[0];
-    jamp_sv[27] += amp_sv[0];
-    jamp_sv[48] += amp_sv[0];
-    jamp_sv[54] -= amp_sv[0];
-    jamp_sv[60] -= amp_sv[0];
-    jamp_sv[62] += amp_sv[0];
-    jamp_sv[66] -= amp_sv[0];
-    jamp_sv[68] += amp_sv[0];
-    jamp_sv[70] += amp_sv[0];
-    jamp_sv[71] -= amp_sv[0];
-    jamp_sv[73] += amp_sv[0];
-    jamp_sv[79] -= amp_sv[0];
-    jamp_sv[97] += amp_sv[0];
-    jamp_sv[103] -= amp_sv[0];
-    jamp_sv[114] -= amp_sv[0];
-    jamp_sv[116] += amp_sv[0];
-    VVVV3_0( w_fp[8], w_fp[4], w_fp[6], w_fp[12], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[2] -= amp_sv[0];
-    jamp_sv[26] += amp_sv[0];
-    jamp_sv[48] += amp_sv[0];
-    jamp_sv[54] -= amp_sv[0];
-    jamp_sv[60] -= amp_sv[0];
-    jamp_sv[62] += amp_sv[0];
-    jamp_sv[72] += amp_sv[0];
-    jamp_sv[78] -= amp_sv[0];
-    jamp_sv[97] += amp_sv[0];
-    jamp_sv[103] -= amp_sv[0];
-    jamp_sv[108] -= amp_sv[0];
-    jamp_sv[110] += amp_sv[0];
-    jamp_sv[112] += amp_sv[0];
-    jamp_sv[113] -= amp_sv[0];
-    jamp_sv[114] -= amp_sv[0];
-    jamp_sv[116] += amp_sv[0];
-    VVVV4_0( w_fp[8], w_fp[4], w_fp[6], w_fp[12], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[2] -= amp_sv[0];
-    jamp_sv[3] += amp_sv[0];
-    jamp_sv[26] += amp_sv[0];
-    jamp_sv[27] -= amp_sv[0];
-    jamp_sv[66] += amp_sv[0];
-    jamp_sv[68] -= amp_sv[0];
-    jamp_sv[70] -= amp_sv[0];
-    jamp_sv[71] += amp_sv[0];
-    jamp_sv[72] += amp_sv[0];
-    jamp_sv[73] -= amp_sv[0];
-    jamp_sv[78] -= amp_sv[0];
-    jamp_sv[79] += amp_sv[0];
-    jamp_sv[108] -= amp_sv[0];
-    jamp_sv[110] += amp_sv[0];
-    jamp_sv[112] += amp_sv[0];
-    jamp_sv[113] -= amp_sv[0];
-
-    // *** DIAGRAM 7 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 7
-    VVV1P0_1( w_fp[7], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[14] );
-
-    // Amplitude(s) for diagram number 7
-    VVV1_0( w_fp[14], w_fp[13], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[5] -= amp_sv[0];
-    jamp_sv[29] += amp_sv[0];
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[55] -= amp_sv[0];
-    jamp_sv[60] -= amp_sv[0];
-    jamp_sv[62] += amp_sv[0];
-    jamp_sv[64] += amp_sv[0];
-    jamp_sv[65] -= amp_sv[0];
-    jamp_sv[66] -= amp_sv[0];
-    jamp_sv[68] += amp_sv[0];
-    jamp_sv[73] += amp_sv[0];
-    jamp_sv[79] -= amp_sv[0];
-    jamp_sv[90] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-    jamp_sv[97] += amp_sv[0];
-    jamp_sv[103] -= amp_sv[0];
-
-    // *** DIAGRAM 8 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 8
-    // (none)
-
-    // Amplitude(s) for diagram number 8
-    VVV1_0( w_fp[14], w_fp[10], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] -= amp_sv[0];
-    jamp_sv[28] += amp_sv[0];
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[55] -= amp_sv[0];
-    jamp_sv[66] -= amp_sv[0];
-    jamp_sv[68] += amp_sv[0];
-    jamp_sv[73] += amp_sv[0];
-    jamp_sv[79] -= amp_sv[0];
-    jamp_sv[84] -= amp_sv[0];
-    jamp_sv[86] += amp_sv[0];
-    jamp_sv[88] += amp_sv[0];
-    jamp_sv[89] -= amp_sv[0];
-    jamp_sv[90] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-    jamp_sv[96] += amp_sv[0];
-    jamp_sv[102] -= amp_sv[0];
-
-    // *** DIAGRAM 9 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 9
-    // (none)
-
-    // Amplitude(s) for diagram number 9
-    VVVV1_0( w_fp[8], w_fp[4], w_fp[5], w_fp[14], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[5] -= amp_sv[0];
-    jamp_sv[29] += amp_sv[0];
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[55] -= amp_sv[0];
-    jamp_sv[60] -= amp_sv[0];
-    jamp_sv[62] += amp_sv[0];
-    jamp_sv[64] += amp_sv[0];
-    jamp_sv[65] -= amp_sv[0];
-    jamp_sv[66] -= amp_sv[0];
-    jamp_sv[68] += amp_sv[0];
-    jamp_sv[73] += amp_sv[0];
-    jamp_sv[79] -= amp_sv[0];
-    jamp_sv[90] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-    jamp_sv[97] += amp_sv[0];
-    jamp_sv[103] -= amp_sv[0];
-    VVVV3_0( w_fp[8], w_fp[4], w_fp[5], w_fp[14], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] -= amp_sv[0];
-    jamp_sv[28] += amp_sv[0];
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[55] -= amp_sv[0];
-    jamp_sv[66] -= amp_sv[0];
-    jamp_sv[68] += amp_sv[0];
-    jamp_sv[73] += amp_sv[0];
-    jamp_sv[79] -= amp_sv[0];
-    jamp_sv[84] -= amp_sv[0];
-    jamp_sv[86] += amp_sv[0];
-    jamp_sv[88] += amp_sv[0];
-    jamp_sv[89] -= amp_sv[0];
-    jamp_sv[90] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-    jamp_sv[96] += amp_sv[0];
-    jamp_sv[102] -= amp_sv[0];
-    VVVV4_0( w_fp[8], w_fp[4], w_fp[5], w_fp[14], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] -= amp_sv[0];
-    jamp_sv[5] += amp_sv[0];
-    jamp_sv[28] += amp_sv[0];
-    jamp_sv[29] -= amp_sv[0];
-    jamp_sv[60] += amp_sv[0];
-    jamp_sv[62] -= amp_sv[0];
-    jamp_sv[64] -= amp_sv[0];
-    jamp_sv[65] += amp_sv[0];
-    jamp_sv[84] -= amp_sv[0];
-    jamp_sv[86] += amp_sv[0];
-    jamp_sv[88] += amp_sv[0];
-    jamp_sv[89] -= amp_sv[0];
-    jamp_sv[96] += amp_sv[0];
-    jamp_sv[97] -= amp_sv[0];
-    jamp_sv[102] -= amp_sv[0];
-    jamp_sv[103] += amp_sv[0];
-
-    // *** DIAGRAM 10 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 10
-    VVVV1P0_1( w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[15] );
-    VVVV3P0_1( w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[16] );
-    VVVV4P0_1( w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[17] );
-
-    // Amplitude(s) for diagram number 10
-    VVV1_0( w_fp[8], w_fp[6], w_fp[15], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += amp_sv[0];
-    jamp_sv[2] -= amp_sv[0];
-    jamp_sv[24] -= amp_sv[0];
-    jamp_sv[26] += amp_sv[0];
-    jamp_sv[60] -= amp_sv[0];
-    jamp_sv[62] += amp_sv[0];
-    jamp_sv[84] += amp_sv[0];
-    jamp_sv[86] -= amp_sv[0];
-    jamp_sv[96] -= amp_sv[0];
-    jamp_sv[97] += amp_sv[0];
-    jamp_sv[102] += amp_sv[0];
-    jamp_sv[103] -= amp_sv[0];
-    jamp_sv[112] += amp_sv[0];
-    jamp_sv[113] -= amp_sv[0];
-    jamp_sv[118] -= amp_sv[0];
-    jamp_sv[119] += amp_sv[0];
-    VVV1_0( w_fp[8], w_fp[6], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[2] -= amp_sv[0];
-    jamp_sv[26] += amp_sv[0];
-    jamp_sv[48] += amp_sv[0];
-    jamp_sv[54] -= amp_sv[0];
-    jamp_sv[60] -= amp_sv[0];
-    jamp_sv[62] += amp_sv[0];
-    jamp_sv[72] += amp_sv[0];
-    jamp_sv[78] -= amp_sv[0];
-    jamp_sv[97] += amp_sv[0];
-    jamp_sv[103] -= amp_sv[0];
-    jamp_sv[108] -= amp_sv[0];
-    jamp_sv[110] += amp_sv[0];
-    jamp_sv[112] += amp_sv[0];
-    jamp_sv[113] -= amp_sv[0];
-    jamp_sv[114] -= amp_sv[0];
-    jamp_sv[116] += amp_sv[0];
-    VVV1_0( w_fp[8], w_fp[6], w_fp[17], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] -= amp_sv[0];
-    jamp_sv[24] += amp_sv[0];
-    jamp_sv[48] += amp_sv[0];
-    jamp_sv[54] -= amp_sv[0];
-    jamp_sv[72] += amp_sv[0];
-    jamp_sv[78] -= amp_sv[0];
-    jamp_sv[84] -= amp_sv[0];
-    jamp_sv[86] += amp_sv[0];
-    jamp_sv[96] += amp_sv[0];
-    jamp_sv[102] -= amp_sv[0];
-    jamp_sv[108] -= amp_sv[0];
-    jamp_sv[110] += amp_sv[0];
-    jamp_sv[114] -= amp_sv[0];
-    jamp_sv[116] += amp_sv[0];
-    jamp_sv[118] += amp_sv[0];
-    jamp_sv[119] -= amp_sv[0];
-
-    // *** DIAGRAM 11 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 11
-    VVVV1P0_1( w_fp[7], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[18] );
-    VVVV3P0_1( w_fp[7], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[19] );
-    VVVV4P0_1( w_fp[7], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[20] );
-
-    // Amplitude(s) for diagram number 11
-    VVV1_0( w_fp[8], w_fp[5], w_fp[18], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] += amp_sv[0];
-    jamp_sv[4] -= amp_sv[0];
-    jamp_sv[25] -= amp_sv[0];
-    jamp_sv[28] += amp_sv[0];
-    jamp_sv[66] -= amp_sv[0];
-    jamp_sv[68] += amp_sv[0];
-    jamp_sv[72] -= amp_sv[0];
-    jamp_sv[73] += amp_sv[0];
-    jamp_sv[78] += amp_sv[0];
-    jamp_sv[79] -= amp_sv[0];
-    jamp_sv[88] += amp_sv[0];
-    jamp_sv[89] -= amp_sv[0];
-    jamp_sv[94] -= amp_sv[0];
-    jamp_sv[95] += amp_sv[0];
-    jamp_sv[108] += amp_sv[0];
-    jamp_sv[110] -= amp_sv[0];
-    VVV1_0( w_fp[8], w_fp[5], w_fp[19], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] -= amp_sv[0];
-    jamp_sv[28] += amp_sv[0];
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[55] -= amp_sv[0];
-    jamp_sv[66] -= amp_sv[0];
-    jamp_sv[68] += amp_sv[0];
-    jamp_sv[73] += amp_sv[0];
-    jamp_sv[79] -= amp_sv[0];
-    jamp_sv[84] -= amp_sv[0];
-    jamp_sv[86] += amp_sv[0];
-    jamp_sv[88] += amp_sv[0];
-    jamp_sv[89] -= amp_sv[0];
-    jamp_sv[90] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-    jamp_sv[96] += amp_sv[0];
-    jamp_sv[102] -= amp_sv[0];
-    VVV1_0( w_fp[8], w_fp[5], w_fp[20], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] -= amp_sv[0];
-    jamp_sv[25] += amp_sv[0];
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[55] -= amp_sv[0];
-    jamp_sv[72] += amp_sv[0];
-    jamp_sv[78] -= amp_sv[0];
-    jamp_sv[84] -= amp_sv[0];
-    jamp_sv[86] += amp_sv[0];
-    jamp_sv[90] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-    jamp_sv[94] += amp_sv[0];
-    jamp_sv[95] -= amp_sv[0];
-    jamp_sv[96] += amp_sv[0];
-    jamp_sv[102] -= amp_sv[0];
-    jamp_sv[108] -= amp_sv[0];
-    jamp_sv[110] += amp_sv[0];
-
-    // *** DIAGRAM 12 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 12
-    VVVV1P0_1( w_fp[7], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[21] );
-    VVVV3P0_1( w_fp[7], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[22] );
-    VVVV4P0_1( w_fp[7], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[23] );
-
-    // Amplitude(s) for diagram number 12
-    VVV1_0( w_fp[8], w_fp[4], w_fp[21], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] += amp_sv[0];
-    jamp_sv[5] -= amp_sv[0];
-    jamp_sv[27] -= amp_sv[0];
-    jamp_sv[29] += amp_sv[0];
-    jamp_sv[48] -= amp_sv[0];
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[54] += amp_sv[0];
-    jamp_sv[55] -= amp_sv[0];
-    jamp_sv[64] += amp_sv[0];
-    jamp_sv[65] -= amp_sv[0];
-    jamp_sv[70] -= amp_sv[0];
-    jamp_sv[71] += amp_sv[0];
-    jamp_sv[90] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-    jamp_sv[114] += amp_sv[0];
-    jamp_sv[116] -= amp_sv[0];
-    VVV1_0( w_fp[8], w_fp[4], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[5] -= amp_sv[0];
-    jamp_sv[29] += amp_sv[0];
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[55] -= amp_sv[0];
-    jamp_sv[60] -= amp_sv[0];
-    jamp_sv[62] += amp_sv[0];
-    jamp_sv[64] += amp_sv[0];
-    jamp_sv[65] -= amp_sv[0];
-    jamp_sv[66] -= amp_sv[0];
-    jamp_sv[68] += amp_sv[0];
-    jamp_sv[73] += amp_sv[0];
-    jamp_sv[79] -= amp_sv[0];
-    jamp_sv[90] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-    jamp_sv[97] += amp_sv[0];
-    jamp_sv[103] -= amp_sv[0];
-    VVV1_0( w_fp[8], w_fp[4], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] -= amp_sv[0];
-    jamp_sv[27] += amp_sv[0];
-    jamp_sv[48] += amp_sv[0];
-    jamp_sv[54] -= amp_sv[0];
-    jamp_sv[60] -= amp_sv[0];
-    jamp_sv[62] += amp_sv[0];
-    jamp_sv[66] -= amp_sv[0];
-    jamp_sv[68] += amp_sv[0];
-    jamp_sv[70] += amp_sv[0];
-    jamp_sv[71] -= amp_sv[0];
-    jamp_sv[73] += amp_sv[0];
-    jamp_sv[79] -= amp_sv[0];
-    jamp_sv[97] += amp_sv[0];
-    jamp_sv[103] -= amp_sv[0];
-    jamp_sv[114] -= amp_sv[0];
-    jamp_sv[116] += amp_sv[0];
-
-    // *** DIAGRAM 13 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 13
-    VVV1P0_1( w_fp[4], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[24] );
-
-    // Amplitude(s) for diagram number 13
-    VVVV1_0( w_fp[7], w_fp[8], w_fp[24], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += amp_sv[0];
-    jamp_sv[2] -= amp_sv[0];
-    jamp_sv[4] -= amp_sv[0];
-    jamp_sv[5] += amp_sv[0];
-    jamp_sv[24] -= amp_sv[0];
-    jamp_sv[26] += amp_sv[0];
-    jamp_sv[28] += amp_sv[0];
-    jamp_sv[29] -= amp_sv[0];
-    jamp_sv[64] -= amp_sv[0];
-    jamp_sv[65] += amp_sv[0];
-    jamp_sv[88] += amp_sv[0];
-    jamp_sv[89] -= amp_sv[0];
-    jamp_sv[112] += amp_sv[0];
-    jamp_sv[113] -= amp_sv[0];
-    jamp_sv[118] -= amp_sv[0];
-    jamp_sv[119] += amp_sv[0];
-    VVVV3_0( w_fp[7], w_fp[8], w_fp[24], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += amp_sv[0];
-    jamp_sv[2] -= amp_sv[0];
-    jamp_sv[24] -= amp_sv[0];
-    jamp_sv[26] += amp_sv[0];
-    jamp_sv[60] -= amp_sv[0];
-    jamp_sv[62] += amp_sv[0];
-    jamp_sv[84] += amp_sv[0];
-    jamp_sv[86] -= amp_sv[0];
-    jamp_sv[96] -= amp_sv[0];
-    jamp_sv[97] += amp_sv[0];
-    jamp_sv[102] += amp_sv[0];
-    jamp_sv[103] -= amp_sv[0];
-    jamp_sv[112] += amp_sv[0];
-    jamp_sv[113] -= amp_sv[0];
-    jamp_sv[118] -= amp_sv[0];
-    jamp_sv[119] += amp_sv[0];
-    VVVV4_0( w_fp[7], w_fp[8], w_fp[24], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] += amp_sv[0];
-    jamp_sv[5] -= amp_sv[0];
-    jamp_sv[28] -= amp_sv[0];
-    jamp_sv[29] += amp_sv[0];
-    jamp_sv[60] -= amp_sv[0];
-    jamp_sv[62] += amp_sv[0];
-    jamp_sv[64] += amp_sv[0];
-    jamp_sv[65] -= amp_sv[0];
-    jamp_sv[84] += amp_sv[0];
-    jamp_sv[86] -= amp_sv[0];
-    jamp_sv[88] -= amp_sv[0];
-    jamp_sv[89] += amp_sv[0];
-    jamp_sv[96] -= amp_sv[0];
-    jamp_sv[97] += amp_sv[0];
-    jamp_sv[102] += amp_sv[0];
-    jamp_sv[103] -= amp_sv[0];
-
-    // *** DIAGRAM 14 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 14
-    VVV1P0_1( w_fp[7], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[25] );
-
-    // Amplitude(s) for diagram number 14
-    VVV1_0( w_fp[24], w_fp[6], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += amp_sv[0];
-    jamp_sv[2] -= amp_sv[0];
-    jamp_sv[4] -= amp_sv[0];
-    jamp_sv[5] += amp_sv[0];
-    jamp_sv[24] -= amp_sv[0];
-    jamp_sv[26] += amp_sv[0];
-    jamp_sv[28] += amp_sv[0];
-    jamp_sv[29] -= amp_sv[0];
-    jamp_sv[64] -= amp_sv[0];
-    jamp_sv[65] += amp_sv[0];
-    jamp_sv[88] += amp_sv[0];
-    jamp_sv[89] -= amp_sv[0];
-    jamp_sv[112] += amp_sv[0];
-    jamp_sv[113] -= amp_sv[0];
-    jamp_sv[118] -= amp_sv[0];
-    jamp_sv[119] += amp_sv[0];
-
-    // *** DIAGRAM 15 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 15
-    VVV1P0_1( w_fp[7], w_fp[24], COUPs[0], 1.0, 0., 0., w_fp[26] );
-
-    // Amplitude(s) for diagram number 15
-    VVV1_0( w_fp[8], w_fp[6], w_fp[26], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += amp_sv[0];
-    jamp_sv[2] -= amp_sv[0];
-    jamp_sv[24] -= amp_sv[0];
-    jamp_sv[26] += amp_sv[0];
-    jamp_sv[60] -= amp_sv[0];
-    jamp_sv[62] += amp_sv[0];
-    jamp_sv[84] += amp_sv[0];
-    jamp_sv[86] -= amp_sv[0];
-    jamp_sv[96] -= amp_sv[0];
-    jamp_sv[97] += amp_sv[0];
-    jamp_sv[102] += amp_sv[0];
-    jamp_sv[103] -= amp_sv[0];
-    jamp_sv[112] += amp_sv[0];
-    jamp_sv[113] -= amp_sv[0];
-    jamp_sv[118] -= amp_sv[0];
-    jamp_sv[119] += amp_sv[0];
-
-    // *** DIAGRAM 16 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 16
-    // (none)
-
-    // Amplitude(s) for diagram number 16
-    VVV1_0( w_fp[8], w_fp[24], w_fp[14], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] += amp_sv[0];
-    jamp_sv[5] -= amp_sv[0];
-    jamp_sv[28] -= amp_sv[0];
-    jamp_sv[29] += amp_sv[0];
-    jamp_sv[60] -= amp_sv[0];
-    jamp_sv[62] += amp_sv[0];
-    jamp_sv[64] += amp_sv[0];
-    jamp_sv[65] -= amp_sv[0];
-    jamp_sv[84] += amp_sv[0];
-    jamp_sv[86] -= amp_sv[0];
-    jamp_sv[88] -= amp_sv[0];
-    jamp_sv[89] += amp_sv[0];
-    jamp_sv[96] -= amp_sv[0];
-    jamp_sv[97] += amp_sv[0];
-    jamp_sv[102] += amp_sv[0];
-    jamp_sv[103] -= amp_sv[0];
-
-    // *** DIAGRAM 17 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 17
-    VVV1P0_1( w_fp[4], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[27] );
-
-    // Amplitude(s) for diagram number 17
-    VVVV1_0( w_fp[7], w_fp[8], w_fp[27], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] += amp_sv[0];
-    jamp_sv[2] -= amp_sv[0];
-    jamp_sv[3] += amp_sv[0];
-    jamp_sv[4] -= amp_sv[0];
-    jamp_sv[25] -= amp_sv[0];
-    jamp_sv[26] += amp_sv[0];
-    jamp_sv[27] -= amp_sv[0];
-    jamp_sv[28] += amp_sv[0];
-    jamp_sv[70] -= amp_sv[0];
-    jamp_sv[71] += amp_sv[0];
-    jamp_sv[88] += amp_sv[0];
-    jamp_sv[89] -= amp_sv[0];
-    jamp_sv[94] -= amp_sv[0];
-    jamp_sv[95] += amp_sv[0];
-    jamp_sv[112] += amp_sv[0];
-    jamp_sv[113] -= amp_sv[0];
-    VVVV3_0( w_fp[7], w_fp[8], w_fp[27], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] += amp_sv[0];
-    jamp_sv[4] -= amp_sv[0];
-    jamp_sv[25] -= amp_sv[0];
-    jamp_sv[28] += amp_sv[0];
-    jamp_sv[66] -= amp_sv[0];
-    jamp_sv[68] += amp_sv[0];
-    jamp_sv[72] -= amp_sv[0];
-    jamp_sv[73] += amp_sv[0];
-    jamp_sv[78] += amp_sv[0];
-    jamp_sv[79] -= amp_sv[0];
-    jamp_sv[88] += amp_sv[0];
-    jamp_sv[89] -= amp_sv[0];
-    jamp_sv[94] -= amp_sv[0];
-    jamp_sv[95] += amp_sv[0];
-    jamp_sv[108] += amp_sv[0];
-    jamp_sv[110] -= amp_sv[0];
-    VVVV4_0( w_fp[7], w_fp[8], w_fp[27], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[2] += amp_sv[0];
-    jamp_sv[3] -= amp_sv[0];
-    jamp_sv[26] -= amp_sv[0];
-    jamp_sv[27] += amp_sv[0];
-    jamp_sv[66] -= amp_sv[0];
-    jamp_sv[68] += amp_sv[0];
-    jamp_sv[70] += amp_sv[0];
-    jamp_sv[71] -= amp_sv[0];
-    jamp_sv[72] -= amp_sv[0];
-    jamp_sv[73] += amp_sv[0];
-    jamp_sv[78] += amp_sv[0];
-    jamp_sv[79] -= amp_sv[0];
-    jamp_sv[108] += amp_sv[0];
-    jamp_sv[110] -= amp_sv[0];
-    jamp_sv[112] -= amp_sv[0];
-    jamp_sv[113] += amp_sv[0];
-
-    // *** DIAGRAM 18 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 18
-    // (none)
-
-    // Amplitude(s) for diagram number 18
-    VVV1_0( w_fp[27], w_fp[5], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] += amp_sv[0];
-    jamp_sv[2] -= amp_sv[0];
-    jamp_sv[3] += amp_sv[0];
-    jamp_sv[4] -= amp_sv[0];
-    jamp_sv[25] -= amp_sv[0];
-    jamp_sv[26] += amp_sv[0];
-    jamp_sv[27] -= amp_sv[0];
-    jamp_sv[28] += amp_sv[0];
-    jamp_sv[70] -= amp_sv[0];
-    jamp_sv[71] += amp_sv[0];
-    jamp_sv[88] += amp_sv[0];
-    jamp_sv[89] -= amp_sv[0];
-    jamp_sv[94] -= amp_sv[0];
-    jamp_sv[95] += amp_sv[0];
-    jamp_sv[112] += amp_sv[0];
-    jamp_sv[113] -= amp_sv[0];
-
-    // *** DIAGRAM 19 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 19
-    VVV1P0_1( w_fp[7], w_fp[27], COUPs[0], 1.0, 0., 0., w_fp[28] );
-
-    // Amplitude(s) for diagram number 19
-    VVV1_0( w_fp[8], w_fp[5], w_fp[28], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] += amp_sv[0];
-    jamp_sv[4] -= amp_sv[0];
-    jamp_sv[25] -= amp_sv[0];
-    jamp_sv[28] += amp_sv[0];
-    jamp_sv[66] -= amp_sv[0];
-    jamp_sv[68] += amp_sv[0];
-    jamp_sv[72] -= amp_sv[0];
-    jamp_sv[73] += amp_sv[0];
-    jamp_sv[78] += amp_sv[0];
-    jamp_sv[79] -= amp_sv[0];
-    jamp_sv[88] += amp_sv[0];
-    jamp_sv[89] -= amp_sv[0];
-    jamp_sv[94] -= amp_sv[0];
-    jamp_sv[95] += amp_sv[0];
-    jamp_sv[108] += amp_sv[0];
-    jamp_sv[110] -= amp_sv[0];
-
-    // *** DIAGRAM 20 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 20
-    // (none)
-
-    // Amplitude(s) for diagram number 20
-    VVV1_0( w_fp[8], w_fp[27], w_fp[12], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[2] += amp_sv[0];
-    jamp_sv[3] -= amp_sv[0];
-    jamp_sv[26] -= amp_sv[0];
-    jamp_sv[27] += amp_sv[0];
-    jamp_sv[66] -= amp_sv[0];
-    jamp_sv[68] += amp_sv[0];
-    jamp_sv[70] += amp_sv[0];
-    jamp_sv[71] -= amp_sv[0];
-    jamp_sv[72] -= amp_sv[0];
-    jamp_sv[73] += amp_sv[0];
-    jamp_sv[78] += amp_sv[0];
-    jamp_sv[79] -= amp_sv[0];
-    jamp_sv[108] += amp_sv[0];
-    jamp_sv[110] -= amp_sv[0];
-    jamp_sv[112] -= amp_sv[0];
-    jamp_sv[113] += amp_sv[0];
-
-    // *** DIAGRAM 21 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 21
-    VVV1P0_1( w_fp[5], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[29] );
-
-    // Amplitude(s) for diagram number 21
-    VVVV1_0( w_fp[7], w_fp[8], w_fp[4], w_fp[29], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += amp_sv[0];
-    jamp_sv[1] -= amp_sv[0];
-    jamp_sv[3] -= amp_sv[0];
-    jamp_sv[5] += amp_sv[0];
-    jamp_sv[24] -= amp_sv[0];
-    jamp_sv[25] += amp_sv[0];
-    jamp_sv[27] += amp_sv[0];
-    jamp_sv[29] -= amp_sv[0];
-    jamp_sv[64] -= amp_sv[0];
-    jamp_sv[65] += amp_sv[0];
-    jamp_sv[70] += amp_sv[0];
-    jamp_sv[71] -= amp_sv[0];
-    jamp_sv[94] += amp_sv[0];
-    jamp_sv[95] -= amp_sv[0];
-    jamp_sv[118] -= amp_sv[0];
-    jamp_sv[119] += amp_sv[0];
-    VVVV3_0( w_fp[7], w_fp[8], w_fp[4], w_fp[29], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += amp_sv[0];
-    jamp_sv[1] -= amp_sv[0];
-    jamp_sv[24] -= amp_sv[0];
-    jamp_sv[25] += amp_sv[0];
-    jamp_sv[48] -= amp_sv[0];
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[54] += amp_sv[0];
-    jamp_sv[55] -= amp_sv[0];
-    jamp_sv[90] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-    jamp_sv[94] += amp_sv[0];
-    jamp_sv[95] -= amp_sv[0];
-    jamp_sv[114] += amp_sv[0];
-    jamp_sv[116] -= amp_sv[0];
-    jamp_sv[118] -= amp_sv[0];
-    jamp_sv[119] += amp_sv[0];
-    VVVV4_0( w_fp[7], w_fp[8], w_fp[4], w_fp[29], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] += amp_sv[0];
-    jamp_sv[5] -= amp_sv[0];
-    jamp_sv[27] -= amp_sv[0];
-    jamp_sv[29] += amp_sv[0];
-    jamp_sv[48] -= amp_sv[0];
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[54] += amp_sv[0];
-    jamp_sv[55] -= amp_sv[0];
-    jamp_sv[64] += amp_sv[0];
-    jamp_sv[65] -= amp_sv[0];
-    jamp_sv[70] -= amp_sv[0];
-    jamp_sv[71] += amp_sv[0];
-    jamp_sv[90] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-    jamp_sv[114] += amp_sv[0];
-    jamp_sv[116] -= amp_sv[0];
-
-    // *** DIAGRAM 22 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 22
-    // (none)
-
-    // Amplitude(s) for diagram number 22
-    VVV1_0( w_fp[4], w_fp[29], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += amp_sv[0];
-    jamp_sv[1] -= amp_sv[0];
-    jamp_sv[3] -= amp_sv[0];
-    jamp_sv[5] += amp_sv[0];
-    jamp_sv[24] -= amp_sv[0];
-    jamp_sv[25] += amp_sv[0];
-    jamp_sv[27] += amp_sv[0];
-    jamp_sv[29] -= amp_sv[0];
-    jamp_sv[64] -= amp_sv[0];
-    jamp_sv[65] += amp_sv[0];
-    jamp_sv[70] += amp_sv[0];
-    jamp_sv[71] -= amp_sv[0];
-    jamp_sv[94] += amp_sv[0];
-    jamp_sv[95] -= amp_sv[0];
-    jamp_sv[118] -= amp_sv[0];
-    jamp_sv[119] += amp_sv[0];
-
-    // *** DIAGRAM 23 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 23
-    // (none)
-
-    // Amplitude(s) for diagram number 23
-    VVV1_0( w_fp[8], w_fp[29], w_fp[9], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += amp_sv[0];
-    jamp_sv[1] -= amp_sv[0];
-    jamp_sv[24] -= amp_sv[0];
-    jamp_sv[25] += amp_sv[0];
-    jamp_sv[48] -= amp_sv[0];
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[54] += amp_sv[0];
-    jamp_sv[55] -= amp_sv[0];
-    jamp_sv[90] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-    jamp_sv[94] += amp_sv[0];
-    jamp_sv[95] -= amp_sv[0];
-    jamp_sv[114] += amp_sv[0];
-    jamp_sv[116] -= amp_sv[0];
-    jamp_sv[118] -= amp_sv[0];
-    jamp_sv[119] += amp_sv[0];
-
-    // *** DIAGRAM 24 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 24
-    VVV1P0_1( w_fp[7], w_fp[29], COUPs[0], 1.0, 0., 0., w_fp[25] );
-
-    // Amplitude(s) for diagram number 24
-    VVV1_0( w_fp[8], w_fp[4], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] += amp_sv[0];
-    jamp_sv[5] -= amp_sv[0];
-    jamp_sv[27] -= amp_sv[0];
-    jamp_sv[29] += amp_sv[0];
-    jamp_sv[48] -= amp_sv[0];
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[54] += amp_sv[0];
-    jamp_sv[55] -= amp_sv[0];
-    jamp_sv[64] += amp_sv[0];
-    jamp_sv[65] -= amp_sv[0];
-    jamp_sv[70] -= amp_sv[0];
-    jamp_sv[71] += amp_sv[0];
-    jamp_sv[90] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-    jamp_sv[114] += amp_sv[0];
-    jamp_sv[116] -= amp_sv[0];
-
-    // *** DIAGRAM 25 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 25
-    VVVV1P0_1( w_fp[4], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[30] );
-    VVVV3P0_1( w_fp[4], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[31] );
-    VVVV4P0_1( w_fp[4], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[32] );
-
-    // Amplitude(s) for diagram number 25
-    VVV1_0( w_fp[7], w_fp[8], w_fp[30], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += amp_sv[0];
-    jamp_sv[1] -= amp_sv[0];
-    jamp_sv[3] -= amp_sv[0];
-    jamp_sv[5] += amp_sv[0];
-    jamp_sv[24] -= amp_sv[0];
-    jamp_sv[25] += amp_sv[0];
-    jamp_sv[27] += amp_sv[0];
-    jamp_sv[29] -= amp_sv[0];
-    jamp_sv[64] -= amp_sv[0];
-    jamp_sv[65] += amp_sv[0];
-    jamp_sv[70] += amp_sv[0];
-    jamp_sv[71] -= amp_sv[0];
-    jamp_sv[94] += amp_sv[0];
-    jamp_sv[95] -= amp_sv[0];
-    jamp_sv[118] -= amp_sv[0];
-    jamp_sv[119] += amp_sv[0];
-    VVV1_0( w_fp[7], w_fp[8], w_fp[31], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] -= amp_sv[0];
-    jamp_sv[2] += amp_sv[0];
-    jamp_sv[3] -= amp_sv[0];
-    jamp_sv[4] += amp_sv[0];
-    jamp_sv[25] += amp_sv[0];
-    jamp_sv[26] -= amp_sv[0];
-    jamp_sv[27] += amp_sv[0];
-    jamp_sv[28] -= amp_sv[0];
-    jamp_sv[70] += amp_sv[0];
-    jamp_sv[71] -= amp_sv[0];
-    jamp_sv[88] -= amp_sv[0];
-    jamp_sv[89] += amp_sv[0];
-    jamp_sv[94] += amp_sv[0];
-    jamp_sv[95] -= amp_sv[0];
-    jamp_sv[112] -= amp_sv[0];
-    jamp_sv[113] += amp_sv[0];
-    VVV1_0( w_fp[7], w_fp[8], w_fp[32], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] -= amp_sv[0];
-    jamp_sv[2] += amp_sv[0];
-    jamp_sv[4] += amp_sv[0];
-    jamp_sv[5] -= amp_sv[0];
-    jamp_sv[24] += amp_sv[0];
-    jamp_sv[26] -= amp_sv[0];
-    jamp_sv[28] -= amp_sv[0];
-    jamp_sv[29] += amp_sv[0];
-    jamp_sv[64] += amp_sv[0];
-    jamp_sv[65] -= amp_sv[0];
-    jamp_sv[88] -= amp_sv[0];
-    jamp_sv[89] += amp_sv[0];
-    jamp_sv[112] -= amp_sv[0];
-    jamp_sv[113] += amp_sv[0];
-    jamp_sv[118] += amp_sv[0];
-    jamp_sv[119] -= amp_sv[0];
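Throughout these blocks, each `jamp_sv[i]` accumulates the amplitudes of all diagrams feeding color flow `i`; the `+=`/`-=` signs (and, for the quark-line diagrams below, factors `cxtype( 0, 1 )`) are the coefficients of the color decomposition. Once all 1240 diagrams have been summed, the color flows are contracted with the constant color matrix. A hedged sketch of that final contraction, using the `ncolor`/`cf`/`denom` names of the generated CPPProcess.cc (an illustration, not the verbatim code):

```cpp
// Illustration only: color-sum the jamp_sv flows into the matrix element.
for( int icol = 0; icol < ncolor; icol++ )
{
  cxtype_sv ztemp_sv = cxzero_sv();
  for( int jcol = 0; jcol < ncolor; jcol++ )
    ztemp_sv += cf[icol][jcol] * jamp_sv[jcol];                           // color matrix times color flows
  deltaMEs += cxreal( ztemp_sv * cxconj( jamp_sv[icol] ) ) / denom[icol]; // |M|^2 contribution
}
```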
-
-    // *** DIAGRAM 26 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 26
-    FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[33] );
-    FFV1_2( w_fp[3], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[34] );
-    FFV1_1( w_fp[33], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[35] );
-
-    // Amplitude(s) for diagram number 26
-    FFV1_0( w_fp[34], w_fp[35], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+      allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup );
+    // Independent couplings, fixed for all events
+    for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup)
+      allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup );
+    // Dependent couplings, vary event-by-event
+    for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
+      COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 );
+    // Independent couplings, fixed for all events
+    for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup)
+      COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup];
 #endif
-    jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 27 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 27
-    FFV1_1( w_fp[33], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[36] );
-
-    // Amplitude(s) for diagram number 27
-    FFV1_0( w_fp[34], w_fp[36], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 28 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 28
-    FFV1P0_3( w_fp[3], w_fp[33], COUPs[1], 1.0, 0., 0., w_fp[37] );
-
-    // Amplitude(s) for diagram number 28
-    VVV1_0( w_fp[12], w_fp[37], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 29 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 29
-    // (none)
-
-    // Amplitude(s) for diagram number 29
-    FFV1_0( w_fp[3], w_fp[36], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[66] += amp_sv[0];
-    jamp_sv[68] -= amp_sv[0];
-    jamp_sv[70] -= amp_sv[0];
-    jamp_sv[71] += amp_sv[0];
-
-    // *** DIAGRAM 30 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 30
-    // (none)
-
-    // Amplitude(s) for diagram number 30
-    VVV1_0( w_fp[14], w_fp[37], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
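The lines added above belong to the coupling-access preamble of `calculate_wavefunctions`: the first `ndcoup` slots of `allCOUPs` point into the event-dependent couplings buffer, the following slots point into the constant independent couplings, and the per-event `COUPs` pointers are then derived from them. The change tagged FIX #823 bounds the independent-coupling loops by `nIPC` (the number of couplings actually stored in `cIPC`) instead of `nicoup`. A condensed sketch of the resulting layout (the array size `ndcoup + nIPC` is an assumption for illustration; the generated file sizes it differently):

```cpp
// Sketch of the coupling bookkeeping implied by the added lines above.
const fptype* allCOUPs[ndcoup + nIPC];               // assumed size, for illustration only
for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )  // dependent couplings: vary event-by-event
  allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup );
for( size_t iicoup = 0; iicoup < nIPC; iicoup++ )    // independent couplings: fixed for all events (FIX #823)
  allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup );
```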
-
-    // *** DIAGRAM 31 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 31
-    // (none)
-
-    // Amplitude(s) for diagram number 31
-    FFV1_0( w_fp[3], w_fp[35], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[60] += amp_sv[0];
-    jamp_sv[62] -= amp_sv[0];
-    jamp_sv[64] -= amp_sv[0];
-    jamp_sv[65] += amp_sv[0];
-
-    // *** DIAGRAM 32 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 32
-    // (none)
-
-    // Amplitude(s) for diagram number 32
-    FFV1_0( w_fp[3], w_fp[33], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[33], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[33], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 33 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 33
-    FFV1_2( w_fp[3], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[38] );
-    FFV1_1( w_fp[33], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[39] );
-
-    // Amplitude(s) for diagram number 33
-    FFV1_0( w_fp[38], w_fp[39], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 34 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 34
-    FFV1_2( w_fp[38], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[40] );
-
-    // Amplitude(s) for diagram number 34
-    FFV1_0( w_fp[40], w_fp[33], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 35 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 35
-    // (none)
-
-    // Amplitude(s) for diagram number 35
-    FFV1_0( w_fp[38], w_fp[33], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[55] -= amp_sv[0];
-    jamp_sv[66] -= amp_sv[0];
-    jamp_sv[68] += amp_sv[0];
-
-    // *** DIAGRAM 36 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 36
-    FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[41] );
-
-    // Amplitude(s) for diagram number 36
-    FFV1_0( w_fp[41], w_fp[39], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 37 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 37
-    FFV1_2( w_fp[41], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[42] );
-
-    // Amplitude(s) for diagram number 37
-    FFV1_0( w_fp[42], w_fp[33], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 38 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 38
-    // (none)
-
-    // Amplitude(s) for diagram number 38
-    FFV1_0( w_fp[41], w_fp[33], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[48] += amp_sv[0];
-    jamp_sv[54] -= amp_sv[0];
-    jamp_sv[60] -= amp_sv[0];
-    jamp_sv[62] += amp_sv[0];
-
-    // *** DIAGRAM 39 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 39
-    // (none)
-
-    // Amplitude(s) for diagram number 39
-    FFV1_0( w_fp[3], w_fp[39], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[48] += amp_sv[0];
-    jamp_sv[49] -= amp_sv[0];
-    jamp_sv[54] -= amp_sv[0];
-    jamp_sv[55] += amp_sv[0];
-
-    // *** DIAGRAM 40 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 40
-    // (none)
-
-    // Amplitude(s) for diagram number 40
-    FFV1_0( w_fp[34], w_fp[33], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[64] += amp_sv[0];
-    jamp_sv[65] -= amp_sv[0];
-    jamp_sv[70] -= amp_sv[0];
-    jamp_sv[71] += amp_sv[0];
-
-    // *** DIAGRAM 41 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 41
-    // (none)
-
-    // Amplitude(s) for diagram number 41
-    FFV1_0( w_fp[3], w_fp[33], w_fp[25], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 42 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 42
-    FFV1_1( w_fp[2], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[39] );
-    FFV1_1( w_fp[39], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[43] );
-
-    // Amplitude(s) for diagram number 42
-    FFV1_0( w_fp[34], w_fp[43], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 43 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 43
-    FFV1_1( w_fp[39], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[44] );
-
-    // Amplitude(s) for diagram number 43
-    FFV1_0( w_fp[34], w_fp[44], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 44 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 44
-    FFV1P0_3( w_fp[3], w_fp[39], COUPs[1], 1.0, 0., 0., w_fp[45] );
-
-    // Amplitude(s) for diagram number 44
-    VVV1_0( w_fp[9], w_fp[45], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 45 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 45
-    // (none)
-
-    // Amplitude(s) for diagram number 45
-    FFV1_0( w_fp[3], w_fp[44], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[90] += amp_sv[0];
-    jamp_sv[92] -= amp_sv[0];
-    jamp_sv[94] -= amp_sv[0];
-    jamp_sv[95] += amp_sv[0];
-
-    // *** DIAGRAM 46 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 46
-    // (none)
-
-    // Amplitude(s) for diagram number 46
-    VVV1_0( w_fp[14], w_fp[45], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 47 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 47
-    // (none)
-
-    // Amplitude(s) for diagram number 47
-    FFV1_0( w_fp[3], w_fp[43], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[84] += amp_sv[0];
-    jamp_sv[86] -= amp_sv[0];
-    jamp_sv[88] -= amp_sv[0];
-    jamp_sv[89] += amp_sv[0];
-
-    // *** DIAGRAM 48 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 48
-    // (none)
-
-    // Amplitude(s) for diagram number 48
-    FFV1_0( w_fp[3], w_fp[39], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[39], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[39], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 49 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 49
-    FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[46] );
-    FFV1_1( w_fp[39], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[47] );
-
-    // Amplitude(s) for diagram number 49
-    FFV1_0( w_fp[46], w_fp[47], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 50 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 50
-    FFV1_2( w_fp[46], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[48] );
-
-    // Amplitude(s) for diagram number 50
-    FFV1_0( w_fp[48], w_fp[39], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 51 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 51
-    // (none)
-
-    // Amplitude(s) for diagram number 51
-    FFV1_0( w_fp[46], w_fp[39], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[73] += amp_sv[0];
-    jamp_sv[79] -= amp_sv[0];
-    jamp_sv[90] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-
-    // *** DIAGRAM 52 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 52
-    // (none)
-
-    // Amplitude(s) for diagram number 52
-    FFV1_0( w_fp[41], w_fp[47], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 53 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 53
-    // (none)
-
-    // Amplitude(s) for diagram number 53
-    FFV1_0( w_fp[42], w_fp[39], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 54 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 54
-    // (none)
-
-    // Amplitude(s) for diagram number 54
-    FFV1_0( w_fp[41], w_fp[39], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[72] += amp_sv[0];
-    jamp_sv[78] -= amp_sv[0];
-    jamp_sv[84] -= amp_sv[0];
-    jamp_sv[86] += amp_sv[0];
-
-    // *** DIAGRAM 55 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 55
-    // (none)
-
-    // Amplitude(s) for diagram number 55
-    FFV1_0( w_fp[3], w_fp[47], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[72] += amp_sv[0];
-    jamp_sv[73] -= amp_sv[0];
-    jamp_sv[78] -= amp_sv[0];
-    jamp_sv[79] += amp_sv[0];
-
-    // *** DIAGRAM 56 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 56
-    // (none)
-
-    // Amplitude(s) for diagram number 56
-    FFV1_0( w_fp[34], w_fp[39], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[88] += amp_sv[0];
-    jamp_sv[89] -= amp_sv[0];
-    jamp_sv[94] -= amp_sv[0];
-    jamp_sv[95] += amp_sv[0];
-
-    // *** DIAGRAM 57 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 57
-    // (none)
-
-    // Amplitude(s) for diagram number 57
-    FFV1_0( w_fp[3], w_fp[39], w_fp[28], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 58 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 58
-    FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[47] );
-    FFV1_1( w_fp[47], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[49] );
-
-    // Amplitude(s) for diagram number 58
-    FFV1_0( w_fp[34], w_fp[49], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 59 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 59
-    FFV1_1( w_fp[47], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[50] );
-
-    // Amplitude(s) for diagram number 59
-    FFV1_0( w_fp[34], w_fp[50], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0];
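Every deleted block follows the same three-step pattern that the code generator unrolls once per diagram, as diagram 60 just below illustrates: compute any new internal wavefunctions into recycled `w_fp` scratch slots, evaluate one helicity-amplitude routine into `amp_fp[0]`, then scatter `amp_sv[0]` into the affected color flows. Schematically (lines taken from diagram 60, comments added):

```cpp
FFV1P0_3( w_fp[3], w_fp[47], COUPs[1], 1.0, 0., 0., w_fp[51] );  // step 1: internal-line wavefunction into scratch slot 51
VVV1_0( w_fp[9], w_fp[51], w_fp[5], COUPs[0], 1.0, &amp_fp[0] ); // step 2: diagram amplitude
jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];                       // step 3: one of eight color-flow updates for this diagram
```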
-
-    // *** DIAGRAM 60 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 60
-    FFV1P0_3( w_fp[3], w_fp[47], COUPs[1], 1.0, 0., 0., w_fp[51] );
-
-    // Amplitude(s) for diagram number 60
-    VVV1_0( w_fp[9], w_fp[51], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 61 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 61
-    // (none)
-
-    // Amplitude(s) for diagram number 61
-    FFV1_0( w_fp[3], w_fp[50], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[114] += amp_sv[0];
-    jamp_sv[116] -= amp_sv[0];
-    jamp_sv[118] -= amp_sv[0];
-    jamp_sv[119] += amp_sv[0];
-
-    // *** DIAGRAM 62 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 62
-    // (none)
-
-    // Amplitude(s) for diagram number 62
-    VVV1_0( w_fp[12], w_fp[51], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 63 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 63
-    // (none)
-
-    // Amplitude(s) for diagram number 63
-    FFV1_0( w_fp[3], w_fp[49], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[108] += amp_sv[0];
-    jamp_sv[110] -= amp_sv[0];
-    jamp_sv[112] -= amp_sv[0];
-    jamp_sv[113] += amp_sv[0];
-
-    // *** DIAGRAM 64 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 64
-    // (none)
-
-    // Amplitude(s) for diagram number 64
-    FFV1_0( w_fp[3], w_fp[47], w_fp[15], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[47], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[47], w_fp[17], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 65 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 65
-    FFV1_1( w_fp[47], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[52] );
-
-    // Amplitude(s) for diagram number 65
-    FFV1_0( w_fp[46], w_fp[52], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 66 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 66
-    // (none)
-
-    // Amplitude(s) for diagram number 66
-    FFV1_0( w_fp[48], w_fp[47], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 67 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 67
-    // (none)
-
-    // Amplitude(s) for diagram number 67
-    FFV1_0( w_fp[46], w_fp[47], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[97] += amp_sv[0];
-    jamp_sv[103] -= amp_sv[0];
-    jamp_sv[114] -= amp_sv[0];
-    jamp_sv[116] += amp_sv[0];
-
-    // *** DIAGRAM 68 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 68
-    // (none)
-
-    // Amplitude(s) for diagram number 68
-    FFV1_0( w_fp[38], w_fp[52], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 69 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 69
-    // (none)
-
-    // Amplitude(s) for diagram number 69
-    FFV1_0( w_fp[40], w_fp[47], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 70 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 70
-    // (none)
-
-    // Amplitude(s) for diagram number 70
-    FFV1_0( w_fp[38], w_fp[47], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[96] += amp_sv[0];
-    jamp_sv[102] -= amp_sv[0];
-    jamp_sv[108] -= amp_sv[0];
-    jamp_sv[110] += amp_sv[0];
-
-    // *** DIAGRAM 71 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 71
-    // (none)
-
-    // Amplitude(s) for diagram number 71
-    FFV1_0( w_fp[3], w_fp[52], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[96] += amp_sv[0];
-    jamp_sv[97] -= amp_sv[0];
-    jamp_sv[102] -= amp_sv[0];
-    jamp_sv[103] += amp_sv[0];
-
-    // *** DIAGRAM 72 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 72
-    // (none)
-
-    // Amplitude(s) for diagram number 72
-    FFV1_0( w_fp[34], w_fp[47], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[112] += amp_sv[0];
-    jamp_sv[113] -= amp_sv[0];
-    jamp_sv[118] -= amp_sv[0];
-    jamp_sv[119] += amp_sv[0];
-
-    // *** DIAGRAM 73 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 73
-    // (none)
-
-    // Amplitude(s) for diagram number 73
-    FFV1_0( w_fp[3], w_fp[47], w_fp[26], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 74 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 74
-    FFV1_1( w_fp[2], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[52] );
-    FFV1_2( w_fp[46], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[7] );
-
-    // Amplitude(s) for diagram number 74
-    FFV1_0( w_fp[7], w_fp[52], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 75 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 75
-    FFV1_2( w_fp[46], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[53] );
-
-    // Amplitude(s) for diagram number 75
-    FFV1_0( w_fp[53], w_fp[52], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 76 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 76
-    FFV1P0_3( w_fp[46], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[54] );
-
-    // Amplitude(s) for diagram number 76
-    VVV1_0( w_fp[12], w_fp[54], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 77 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 77
-    // (none)
-
-    // Amplitude(s) for diagram number 77
-    FFV1_0( w_fp[53], w_fp[2], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] += amp_sv[0];
-    jamp_sv[27] -= amp_sv[0];
-    jamp_sv[73] -= amp_sv[0];
-    jamp_sv[79] += amp_sv[0];
-
-    // *** DIAGRAM 78 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 78
-    // (none)
-
-    // Amplitude(s) for diagram number 78
-    VVV1_0( w_fp[14], w_fp[54], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 79 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 79
-    // (none)
-
-    // Amplitude(s) for diagram number 79
-    FFV1_0( w_fp[7], w_fp[2], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[5] += amp_sv[0];
-    jamp_sv[29] -= amp_sv[0];
-    jamp_sv[97] -= amp_sv[0];
-    jamp_sv[103] += amp_sv[0];
-
-    // *** DIAGRAM 80 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 80
-    // (none)
-
-    // Amplitude(s) for diagram number 80
-    FFV1_0( w_fp[46], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[46], w_fp[2], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[46], w_fp[2], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 81 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 81
-    // (none)
-
// Amplitude(s) for diagram number 81 - FFV1_0( w_fp[46], w_fp[52], w_fp[29], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - - // *** DIAGRAM 82 OF 1240 *** - - // Wavefunction(s) for diagram number 82 - // (none) - - // Amplitude(s) for diagram number 82 - FFV1_0( w_fp[48], w_fp[2], w_fp[29], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[90] += amp_sv[0]; - jamp_sv[92] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - - // *** DIAGRAM 83 OF 1240 *** - - // Wavefunction(s) for diagram number 83 - // (none) - - // Amplitude(s) for diagram number 83 - FFV1_0( w_fp[46], w_fp[2], w_fp[25], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 84 OF 1240 *** - - // Wavefunction(s) for diagram number 84 - FFV1_2( w_fp[38], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[25] ); - - // Amplitude(s) for diagram number 84 - FFV1_0( w_fp[25], w_fp[52], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 85 OF 1240 *** - - // Wavefunction(s) for diagram number 85 - FFV1_2( w_fp[38], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[48] ); - - // Amplitude(s) for diagram number 85 - FFV1_0( w_fp[48], w_fp[52], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 86 OF 1240 *** - - // Wavefunction(s) for diagram number 86 - FFV1P0_3( w_fp[38], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[23] ); - - // Amplitude(s) for diagram number 86 - VVV1_0( w_fp[9], w_fp[23], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 87 OF 1240 *** - - // Wavefunction(s) for diagram number 87 - // (none) - - // Amplitude(s) for diagram number 87 - FFV1_0( w_fp[48], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); -#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[49] -= amp_sv[0]; - jamp_sv[55] += amp_sv[0]; - - // *** DIAGRAM 88 OF 1240 *** - - // Wavefunction(s) for diagram number 88 - // (none) - - // Amplitude(s) for diagram number 88 - VVV1_0( w_fp[14], w_fp[23], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 89 OF 1240 *** - - // Wavefunction(s) for diagram number 89 - // (none) - - // Amplitude(s) for diagram number 89 - FFV1_0( w_fp[25], w_fp[2], w_fp[14], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] += amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[96] -= amp_sv[0]; - jamp_sv[102] += amp_sv[0]; - - // *** DIAGRAM 90 OF 1240 *** - - // Wavefunction(s) for diagram number 90 - // (none) - - // Amplitude(s) for diagram number 90 - FFV1_0( w_fp[38], w_fp[2], w_fp[18], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[19], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[20], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 91 OF 1240 *** - - // Wavefunction(s) for diagram number 91 - // (none) - - // Amplitude(s) for diagram number 91 - FFV1_0( w_fp[38], w_fp[52], w_fp[27], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // 
Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - - // *** DIAGRAM 92 OF 1240 *** - - // Wavefunction(s) for diagram number 92 - // (none) - - // Amplitude(s) for diagram number 92 - FFV1_0( w_fp[40], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[66] += amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - - // *** DIAGRAM 93 OF 1240 *** - - // Wavefunction(s) for diagram number 93 - // (none) - - // Amplitude(s) for diagram number 93 - FFV1_0( w_fp[38], w_fp[2], w_fp[28], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 94 OF 1240 *** - - // Wavefunction(s) for diagram number 94 - FFV1_2( w_fp[41], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[28] ); - - // Amplitude(s) for diagram number 94 - FFV1_0( w_fp[28], w_fp[52], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 95 OF 1240 *** - - // Wavefunction(s) for diagram number 95 - FFV1_2( w_fp[41], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[40] ); - - // Amplitude(s) for diagram number 95 - FFV1_0( w_fp[40], w_fp[52], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 96 OF 1240 *** - - // Wavefunction(s) for diagram number 96 - FFV1P0_3( w_fp[41], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[20] ); - - // Amplitude(s) for diagram number 96 - VVV1_0( w_fp[9], w_fp[20], w_fp[5], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 97 OF 1240 *** - - // Wavefunction(s) for diagram number 97 - // (none) - - // Amplitude(s) for diagram number 97 - FFV1_0( w_fp[40], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; 
- jamp_sv[24] -= amp_sv[0]; - jamp_sv[48] -= amp_sv[0]; - jamp_sv[54] += amp_sv[0]; - - // *** DIAGRAM 98 OF 1240 *** - - // Wavefunction(s) for diagram number 98 - // (none) - - // Amplitude(s) for diagram number 98 - VVV1_0( w_fp[12], w_fp[20], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 99 OF 1240 *** - - // Wavefunction(s) for diagram number 99 - // (none) - - // Amplitude(s) for diagram number 99 - FFV1_0( w_fp[28], w_fp[2], w_fp[12], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[78] += amp_sv[0]; - - // *** DIAGRAM 100 OF 1240 *** - - // Wavefunction(s) for diagram number 100 - // (none) - - // Amplitude(s) for diagram number 100 - FFV1_0( w_fp[41], w_fp[2], w_fp[15], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[17], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 101 OF 1240 *** - - // Wavefunction(s) for diagram number 101 - // (none) - - // Amplitude(s) for diagram number 101 - FFV1_0( w_fp[41], w_fp[52], w_fp[24], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[2] -= amp_sv[0]; - 
jamp_sv[24] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - - // *** DIAGRAM 102 OF 1240 *** - - // Wavefunction(s) for diagram number 102 - // (none) - - // Amplitude(s) for diagram number 102 - FFV1_0( w_fp[42], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[60] += amp_sv[0]; - jamp_sv[62] -= amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - - // *** DIAGRAM 103 OF 1240 *** - - // Wavefunction(s) for diagram number 103 - // (none) - - // Amplitude(s) for diagram number 103 - FFV1_0( w_fp[41], w_fp[2], w_fp[26], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 104 OF 1240 *** - - // Wavefunction(s) for diagram number 104 - FFV1_2( w_fp[3], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[26] ); - - // Amplitude(s) for diagram number 104 - FFV1_0( w_fp[26], w_fp[52], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - - // *** DIAGRAM 105 OF 1240 *** - - // Wavefunction(s) for diagram number 105 - VVV1P0_1( w_fp[24], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[42] ); - - // Amplitude(s) for diagram number 105 - FFV1_0( w_fp[3], w_fp[52], w_fp[42], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 106 OF 1240 *** - - // Wavefunction(s) for diagram number 106 - FFV1_1( w_fp[2], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[17] ); - - // Amplitude(s) for diagram number 106 - FFV1_0( w_fp[34], w_fp[17], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[64] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - - // *** DIAGRAM 107 OF 1240 *** - - // Wavefunction(s) for diagram number 107 - // (none) - - // Amplitude(s) for diagram number 107 - FFV1_0( w_fp[34], w_fp[2], w_fp[42], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0]; - 
jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 108 OF 1240 *** - - // Wavefunction(s) for diagram number 108 - // (none) - - // Amplitude(s) for diagram number 108 - FFV1_0( w_fp[3], w_fp[17], w_fp[14], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 109 OF 1240 *** - - // Wavefunction(s) for diagram number 109 - // (none) - - // Amplitude(s) for diagram number 109 - FFV1_0( w_fp[26], w_fp[2], w_fp[14], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 110 OF 1240 *** - - // Wavefunction(s) for diagram number 110 - FFV1_2( w_fp[3], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] ); - - // Amplitude(s) for diagram number 110 - FFV1_0( w_fp[14], w_fp[52], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - - // *** DIAGRAM 111 OF 1240 *** - - // Wavefunction(s) for diagram number 111 - VVV1P0_1( w_fp[27], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[16] ); - - // Amplitude(s) for diagram number 111 - FFV1_0( w_fp[3], w_fp[52], w_fp[16], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 112 OF 1240 *** - - // Wavefunction(s) for diagram number 112 - FFV1_1( w_fp[2], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[15] ); - - // Amplitude(s) for diagram number 112 - FFV1_0( w_fp[34], w_fp[15], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[70] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - - // *** DIAGRAM 113 OF 1240 *** - - // Wavefunction(s) for diagram 
number 113 - // (none) - - // Amplitude(s) for diagram number 113 - FFV1_0( w_fp[34], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 114 OF 1240 *** - - // Wavefunction(s) for diagram number 114 - // (none) - - // Amplitude(s) for diagram number 114 - FFV1_0( w_fp[3], w_fp[15], w_fp[12], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 115 OF 1240 *** - - // Wavefunction(s) for diagram number 115 - // (none) - - // Amplitude(s) for diagram number 115 - FFV1_0( w_fp[14], w_fp[2], w_fp[12], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 116 OF 1240 *** - - // Wavefunction(s) for diagram number 116 - FFV1_2( w_fp[3], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); - - // Amplitude(s) for diagram number 116 - FFV1_0( w_fp[12], w_fp[52], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - - // *** DIAGRAM 117 OF 1240 *** - - // Wavefunction(s) for diagram number 117 - VVV1P0_1( w_fp[4], w_fp[29], COUPs[0], 1.0, 0., 0., w_fp[19] ); - - // Amplitude(s) for diagram number 117 - FFV1_0( w_fp[3], w_fp[52], w_fp[19], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 118 OF 1240 *** - - // Wavefunction(s) for diagram number 118 - FFV1_1( w_fp[2], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[18] ); - - // Amplitude(s) for 
diagram number 118 - FFV1_0( w_fp[34], w_fp[18], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[94] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 119 OF 1240 *** - - // Wavefunction(s) for diagram number 119 - // (none) - - // Amplitude(s) for diagram number 119 - FFV1_0( w_fp[34], w_fp[2], w_fp[19], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 120 OF 1240 *** - - // Wavefunction(s) for diagram number 120 - // (none) - - // Amplitude(s) for diagram number 120 - FFV1_0( w_fp[3], w_fp[18], w_fp[9], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 121 OF 1240 *** - - // Wavefunction(s) for diagram number 121 - // (none) - - // Amplitude(s) for diagram number 121 - FFV1_0( w_fp[12], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 122 OF 1240 *** - - // Wavefunction(s) for diagram number 122 - // (none) - - // Amplitude(s) for diagram number 122 - FFV1_0( w_fp[3], w_fp[52], w_fp[30], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[52], w_fp[31], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += cxtype( 0, 1 ) 
* amp_sv[0]; - jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[52], w_fp[32], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 123 OF 1240 *** - - // Wavefunction(s) for diagram number 123 - // (none) - - // Amplitude(s) for diagram number 123 - FFV1_0( w_fp[34], w_fp[2], w_fp[30], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[34], w_fp[2], w_fp[31], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[34], w_fp[2], w_fp[32], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 124 OF 1240 *** - - // Wavefunction(s) for diagram number 124 - FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[34] ); - FFV1_2( w_fp[3], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[52] ); - FFV1_1( w_fp[34], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); - FFV1_2( w_fp[52], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[22] ); - - // Amplitude(s) for diagram number 124 - FFV1_0( w_fp[22], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[11] -= amp_sv[0]; - - // *** DIAGRAM 125 OF 1240 *** - - // Wavefunction(s) for diagram number 125 - FFV1_2( w_fp[52], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] ); - - // Amplitude(s) for diagram number 
125 - FFV1_0( w_fp[21], w_fp[9], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] -= amp_sv[0]; - - // *** DIAGRAM 126 OF 1240 *** - - // Wavefunction(s) for diagram number 126 - FFV1_1( w_fp[34], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[55] ); - FFV1_2( w_fp[52], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[56] ); - - // Amplitude(s) for diagram number 126 - FFV1_0( w_fp[56], w_fp[55], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[17] -= amp_sv[0]; - - // *** DIAGRAM 127 OF 1240 *** - - // Wavefunction(s) for diagram number 127 - // (none) - - // Amplitude(s) for diagram number 127 - FFV1_0( w_fp[21], w_fp[55], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[15] -= amp_sv[0]; - - // *** DIAGRAM 128 OF 1240 *** - - // Wavefunction(s) for diagram number 128 - FFV1_1( w_fp[34], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[57] ); - - // Amplitude(s) for diagram number 128 - FFV1_0( w_fp[56], w_fp[57], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[23] -= amp_sv[0]; - - // *** DIAGRAM 129 OF 1240 *** - - // Wavefunction(s) for diagram number 129 - // (none) - - // Amplitude(s) for diagram number 129 - FFV1_0( w_fp[22], w_fp[57], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[21] -= amp_sv[0]; - - // *** DIAGRAM 130 OF 1240 *** - - // Wavefunction(s) for diagram number 130 - FFV1P0_3( w_fp[52], w_fp[34], COUPs[1], 1.0, 0., 0., w_fp[58] ); - - // Amplitude(s) for diagram number 130 - VVV1_0( w_fp[24], w_fp[6], w_fp[58], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - - // *** DIAGRAM 131 OF 1240 *** - - // Wavefunction(s) for diagram number 131 - FFV1_1( w_fp[34], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[59] ); - - // Amplitude(s) for diagram number 131 - FFV1_0( w_fp[52], w_fp[59], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 132 OF 1240 *** - - // Wavefunction(s) for diagram number 132 - // (none) - - // Amplitude(s) for diagram number 132 - FFV1_0( w_fp[52], w_fp[57], w_fp[24], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 133 OF 1240 *** - - // Wavefunction(s) for diagram number 133 - // (none) 
- - // Amplitude(s) for diagram number 133 - VVV1_0( w_fp[27], w_fp[5], w_fp[58], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[11] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - - // *** DIAGRAM 134 OF 1240 *** - - // Wavefunction(s) for diagram number 134 - FFV1_1( w_fp[34], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[60] ); - - // Amplitude(s) for diagram number 134 - FFV1_0( w_fp[52], w_fp[60], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 135 OF 1240 *** - - // Wavefunction(s) for diagram number 135 - // (none) - - // Amplitude(s) for diagram number 135 - FFV1_0( w_fp[52], w_fp[55], w_fp[27], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 136 OF 1240 *** - - // Wavefunction(s) for diagram number 136 - // (none) - - // Amplitude(s) for diagram number 136 - VVV1_0( w_fp[4], w_fp[29], w_fp[58], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - - // *** DIAGRAM 137 OF 1240 *** - - // Wavefunction(s) for diagram number 137 - // (none) - - // Amplitude(s) for diagram number 137 - FFV1_0( w_fp[52], w_fp[9], w_fp[29], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 138 OF 1240 *** - - // Wavefunction(s) for diagram number 138 - FFV1_1( w_fp[34], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[58] ); - - // Amplitude(s) for diagram number 138 - FFV1_0( w_fp[52], w_fp[58], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 139 OF 1240 *** - - // Wavefunction(s) for diagram number 139 - // (none) - - // Amplitude(s) for diagram number 139 - FFV1_0( w_fp[52], w_fp[34], w_fp[30], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - FFV1_0( w_fp[52], w_fp[34], w_fp[31], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[11] -= amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - 
FFV1_0( w_fp[52], w_fp[34], w_fp[32], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] -= amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - - // *** DIAGRAM 140 OF 1240 *** - - // Wavefunction(s) for diagram number 140 - VVV1P0_1( w_fp[1], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[61] ); - FFV1P0_3( w_fp[3], w_fp[34], COUPs[1], 1.0, 0., 0., w_fp[62] ); - VVV1P0_1( w_fp[61], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[63] ); - - // Amplitude(s) for diagram number 140 - VVV1_0( w_fp[62], w_fp[63], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 141 OF 1240 *** - - // Wavefunction(s) for diagram number 141 - VVV1P0_1( w_fp[61], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[64] ); - - // Amplitude(s) for diagram number 141 - VVV1_0( w_fp[62], w_fp[64], w_fp[5], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 142 OF 1240 *** - - // Wavefunction(s) for diagram number 142 - // (none) - - // Amplitude(s) for diagram number 142 - VVVV1_0( w_fp[61], w_fp[5], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0]; - VVVV3_0( w_fp[61], w_fp[5], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; - VVVV4_0( w_fp[61], w_fp[5], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += 
cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 143 OF 1240 *** - - // Wavefunction(s) for diagram number 143 - FFV1_2( w_fp[3], w_fp[61], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[65] ); - - // Amplitude(s) for diagram number 143 - FFV1_0( w_fp[65], w_fp[55], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 144 OF 1240 *** - - // Wavefunction(s) for diagram number 144 - // (none) - - // Amplitude(s) for diagram number 144 - FFV1_0( w_fp[3], w_fp[55], w_fp[64], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[12] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - - // *** DIAGRAM 145 OF 1240 *** - - // Wavefunction(s) for diagram number 145 - // (none) - - // Amplitude(s) for diagram number 145 - FFV1_0( w_fp[65], w_fp[57], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 146 OF 1240 *** - - // Wavefunction(s) for diagram number 146 - // (none) - - // Amplitude(s) for diagram number 146 - FFV1_0( w_fp[3], w_fp[57], w_fp[63], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[18] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - - // *** DIAGRAM 147 OF 1240 *** - - // Wavefunction(s) for diagram number 147 - FFV1_1( w_fp[34], w_fp[61], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[66] ); - - // Amplitude(s) for diagram number 147 - FFV1_0( w_fp[38], w_fp[66], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 148 OF 1240 *** - - // Wavefunction(s) for diagram number 148 - FFV1P0_3( w_fp[38], w_fp[34], COUPs[1], 1.0, 0., 0., w_fp[67] ); - - // Amplitude(s) for diagram number 148 - VVV1_0( w_fp[61], w_fp[6], w_fp[67], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[7] -= amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - - // *** DIAGRAM 149 OF 1240 *** - - // Wavefunction(s) for diagram number 149 - // (none) - - // Amplitude(s) for diagram number 149 - FFV1_0( w_fp[38], w_fp[57], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel 
support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 150 OF 1240 *** - - // Wavefunction(s) for diagram number 150 - // (none) - - // Amplitude(s) for diagram number 150 - FFV1_0( w_fp[41], w_fp[66], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 151 OF 1240 *** - - // Wavefunction(s) for diagram number 151 - FFV1P0_3( w_fp[41], w_fp[34], COUPs[1], 1.0, 0., 0., w_fp[68] ); - - // Amplitude(s) for diagram number 151 - VVV1_0( w_fp[61], w_fp[5], w_fp[68], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - - // *** DIAGRAM 152 OF 1240 *** - - // Wavefunction(s) for diagram number 152 - // (none) - - // Amplitude(s) for diagram number 152 - FFV1_0( w_fp[41], w_fp[55], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 153 OF 1240 *** - - // Wavefunction(s) for diagram number 153 - // (none) - - // Amplitude(s) for diagram number 153 - FFV1_0( w_fp[3], w_fp[66], w_fp[29], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - - // *** DIAGRAM 154 OF 1240 *** - - // Wavefunction(s) for diagram number 154 - // (none) - - // Amplitude(s) for diagram number 154 - VVV1_0( w_fp[61], w_fp[29], w_fp[62], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 155 OF 1240 *** - - // Wavefunction(s) for diagram number 155 - // (none) - - // Amplitude(s) for diagram number 155 - FFV1_0( w_fp[3], w_fp[58], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - - // *** DIAGRAM 156 OF 1240 *** - - // Wavefunction(s) for diagram number 156 - VVV1P0_1( w_fp[1], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[66] ); - VVV1P0_1( w_fp[66], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[69] ); - - // Amplitude(s) for diagram number 156 - VVV1_0( w_fp[62], w_fp[69], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 157 OF 1240 *** - - // Wavefunction(s) for diagram number 157 - VVV1P0_1( w_fp[66], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[70] ); - - // Amplitude(s) for diagram number 157 - VVV1_0( w_fp[62], w_fp[70], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 158 OF 1240 *** - - // Wavefunction(s) for diagram number 158 - // (none) - - // Amplitude(s) for diagram number 158 - VVVV1_0( w_fp[66], w_fp[4], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; - VVVV3_0( w_fp[66], w_fp[4], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; - VVVV4_0( w_fp[66], w_fp[4], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 159 OF 1240 *** - - // Wavefunction(s) for diagram number 159 - FFV1_2( w_fp[3], w_fp[66], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[71] ); - - // Amplitude(s) for diagram number 159 - FFV1_0( w_fp[71], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and 
denominators_sv (#473) -#endif - jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 160 OF 1240 *** - - // Wavefunction(s) for diagram number 160 - // (none) - - // Amplitude(s) for diagram number 160 - FFV1_0( w_fp[3], w_fp[9], w_fp[70], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[6] += amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - - // *** DIAGRAM 161 OF 1240 *** - - // Wavefunction(s) for diagram number 161 - // (none) - - // Amplitude(s) for diagram number 161 - FFV1_0( w_fp[71], w_fp[57], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 162 OF 1240 *** - - // Wavefunction(s) for diagram number 162 - // (none) - - // Amplitude(s) for diagram number 162 - FFV1_0( w_fp[3], w_fp[57], w_fp[69], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[19] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - - // *** DIAGRAM 163 OF 1240 *** - - // Wavefunction(s) for diagram number 163 - FFV1_1( w_fp[34], w_fp[66], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[72] ); - - // Amplitude(s) for diagram number 163 - FFV1_0( w_fp[46], w_fp[72], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 164 OF 1240 *** - - // Wavefunction(s) for diagram number 164 - FFV1P0_3( w_fp[46], w_fp[34], COUPs[1], 1.0, 0., 0., w_fp[73] ); - - // Amplitude(s) for diagram number 164 - VVV1_0( w_fp[66], w_fp[6], w_fp[73], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] += amp_sv[0]; - jamp_sv[13] -= amp_sv[0]; - jamp_sv[19] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - - // *** DIAGRAM 165 OF 1240 *** - - // Wavefunction(s) for diagram number 165 - // (none) - - // Amplitude(s) for diagram number 165 - FFV1_0( w_fp[46], w_fp[57], w_fp[66], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 166 OF 1240 *** - - // Wavefunction(s) for diagram number 166 - // (none) - - // Amplitude(s) for diagram number 166 - FFV1_0( w_fp[41], w_fp[72], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 167 OF 1240 *** - - // Wavefunction(s) for diagram number 167 - // (none) - - // Amplitude(s) for diagram number 
-    VVV1_0( w_fp[66], w_fp[4], w_fp[68], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[2] += amp_sv[0];
-    jamp_sv[6] -= amp_sv[0];
-    jamp_sv[8] += amp_sv[0];
-    jamp_sv[12] -= amp_sv[0];
-
-    // *** DIAGRAM 168 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 168
-    // (none)
-
-    // Amplitude(s) for diagram number 168
-    FFV1_0( w_fp[41], w_fp[9], w_fp[66], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 169 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 169
-    // (none)
-
-    // Amplitude(s) for diagram number 169
-    FFV1_0( w_fp[3], w_fp[72], w_fp[27], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[2] += amp_sv[0];
-    jamp_sv[3] -= amp_sv[0];
-    jamp_sv[12] -= amp_sv[0];
-    jamp_sv[13] += amp_sv[0];
-
-    // *** DIAGRAM 170 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 170
-    // (none)
-
-    // Amplitude(s) for diagram number 170
-    VVV1_0( w_fp[66], w_fp[27], w_fp[62], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 171 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 171
-    // (none)
-
-    // Amplitude(s) for diagram number 171
-    FFV1_0( w_fp[3], w_fp[60], w_fp[66], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[10] += amp_sv[0];
-    jamp_sv[11] -= amp_sv[0];
-    jamp_sv[20] -= amp_sv[0];
-    jamp_sv[21] += amp_sv[0];
-
-    // *** DIAGRAM 172 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 172
-    VVV1P0_1( w_fp[1], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[72] );
-    VVV1P0_1( w_fp[72], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[74] );
-
-    // Amplitude(s) for diagram number 172
-    VVV1_0( w_fp[62], w_fp[74], w_fp[5], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 173 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 173
-    VVV1P0_1( w_fp[72], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[75] );
-
-    // Amplitude(s) for diagram number 173
-    VVV1_0( w_fp[62], w_fp[75], w_fp[4], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 174 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 174
-    // (none)
-
-    // Amplitude(s) for diagram number 174
-    VVVV1_0( w_fp[72], w_fp[4], w_fp[5], w_fp[62], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
-    VVVV3_0( w_fp[72], w_fp[4], w_fp[5], w_fp[62], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-    VVVV4_0( w_fp[72], w_fp[4], w_fp[5], w_fp[62], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 175 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 175
-    FFV1_2( w_fp[3], w_fp[72], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[76] );
-
-    // Amplitude(s) for diagram number 175
-    FFV1_0( w_fp[76], w_fp[9], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 176 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 176
-    // (none)
-
-    // Amplitude(s) for diagram number 176
-    FFV1_0( w_fp[3], w_fp[9], w_fp[75], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[7] += amp_sv[0];
-    jamp_sv[8] -= amp_sv[0];
-    jamp_sv[9] += amp_sv[0];
-    jamp_sv[10] -= amp_sv[0];
-
-    // *** DIAGRAM 177 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 177
-    // (none)
-
-    // Amplitude(s) for diagram number 177
-    FFV1_0( w_fp[76], w_fp[55], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 178 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 178
-    // (none)
-
-    // Amplitude(s) for diagram number 178
-    FFV1_0( w_fp[3], w_fp[55], w_fp[74], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[13] += amp_sv[0];
-    jamp_sv[14] -= amp_sv[0];
-    jamp_sv[15] += amp_sv[0];
-    jamp_sv[16] -= amp_sv[0];
-
-    // *** DIAGRAM 179 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 179
-    FFV1_1( w_fp[34], w_fp[72], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[77] );
-
-    // Amplitude(s) for diagram number 179
-    FFV1_0( w_fp[46], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 180 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 180
-    // (none)
-
-    // Amplitude(s) for diagram number 180
-    VVV1_0( w_fp[72], w_fp[5], w_fp[73], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[5] += amp_sv[0];
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[16] += amp_sv[0];
-    jamp_sv[19] -= amp_sv[0];
-
-    // *** DIAGRAM 181 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 181
-    // (none)
-
-    // Amplitude(s) for diagram number 181
-    FFV1_0( w_fp[46], w_fp[55], w_fp[72], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 182 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 182
-    // (none)
-
-    // Amplitude(s) for diagram number 182
-    FFV1_0( w_fp[38], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 183 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 183
-    // (none)
-
-    // Amplitude(s) for diagram number 183
-    VVV1_0( w_fp[72], w_fp[4], w_fp[67], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] += amp_sv[0];
-    jamp_sv[7] -= amp_sv[0];
-    jamp_sv[10] += amp_sv[0];
-    jamp_sv[18] -= amp_sv[0];
-
-    // *** DIAGRAM 184 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 184
-    // (none)
-
-    // Amplitude(s) for diagram number 184
-    FFV1_0( w_fp[38], w_fp[9], w_fp[72], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 185 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 185
-    // (none)
-
-    // Amplitude(s) for diagram number 185
-    FFV1_0( w_fp[3], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] += amp_sv[0];
-    jamp_sv[5] -= amp_sv[0];
-    jamp_sv[18] -= amp_sv[0];
-    jamp_sv[19] += amp_sv[0];
-
-    // *** DIAGRAM 186 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 186
-    // (none)
-
-    // Amplitude(s) for diagram number 186
-    VVV1_0( w_fp[72], w_fp[24], w_fp[62], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 187 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 187
-    // (none)
-
-    // Amplitude(s) for diagram number 187
-    FFV1_0( w_fp[3], w_fp[59], w_fp[72], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[8] += amp_sv[0];
-    jamp_sv[9] -= amp_sv[0];
-    jamp_sv[14] -= amp_sv[0];
-    jamp_sv[15] += amp_sv[0];
-
-    // *** DIAGRAM 188 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 188
-    FFV1_1( w_fp[34], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[77] );
-
-    // Amplitude(s) for diagram number 188
-    FFV1_0( w_fp[7], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[5] -= amp_sv[0];
-
-    // *** DIAGRAM 189 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 189
-    // (none)
-
-    // Amplitude(s) for diagram number 189
-    FFV1_0( w_fp[53], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] -= amp_sv[0];
-
-    // *** DIAGRAM 190 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 190
-    FFV1_2( w_fp[46], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[78] );
-
-    // Amplitude(s) for diagram number 190
-    FFV1_0( w_fp[78], w_fp[55], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[16] -= amp_sv[0];
-
-    // *** DIAGRAM 191 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 191
-    // (none)
-
-    // Amplitude(s) for diagram number 191
-    FFV1_0( w_fp[53], w_fp[55], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[13] -= amp_sv[0];
-
-    // *** DIAGRAM 192 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 192
-    // (none)
-
-    // Amplitude(s) for diagram number 192
-    FFV1_0( w_fp[78], w_fp[57], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[22] -= amp_sv[0];
-
-    // *** DIAGRAM 193 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 193
-    // (none)
-
-    // Amplitude(s) for diagram number 193
-    FFV1_0( w_fp[7], w_fp[57], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[19] -= amp_sv[0];
-
-    // *** DIAGRAM 194 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 194
-    // (none)
-
-    // Amplitude(s) for diagram number 194
-    FFV1_0( w_fp[46], w_fp[77], w_fp[29], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 195 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 195
-    // (none)
-
-    // Amplitude(s) for diagram number 195
-    VVV1_0( w_fp[1], w_fp[29], w_fp[73], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] += amp_sv[0];
-    jamp_sv[5] -= amp_sv[0];
-    jamp_sv[16] -= amp_sv[0];
-    jamp_sv[22] += amp_sv[0];
-
-    // *** DIAGRAM 196 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 196
-    // (none)
-
-    // Amplitude(s) for diagram number 196
-    FFV1_0( w_fp[46], w_fp[58], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 197 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 197
-    // (none)
-
-    // Amplitude(s) for diagram number 197
-    FFV1_0( w_fp[25], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] -= amp_sv[0];
-
-    // *** DIAGRAM 198 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 198
-    // (none)
-
-    // Amplitude(s) for diagram number 198
-    FFV1_0( w_fp[48], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] -= amp_sv[0];
-
-    // *** DIAGRAM 199 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 199
-    FFV1_2( w_fp[38], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[58] );
-
-    // Amplitude(s) for diagram number 199
-    FFV1_0( w_fp[58], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[10] -= amp_sv[0];
-
-    // *** DIAGRAM 200 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 200
-    // (none)
-
-    // Amplitude(s) for diagram number 200
-    FFV1_0( w_fp[48], w_fp[9], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[7] -= amp_sv[0];
-
-    // *** DIAGRAM 201 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 201
-    // (none)
-
-    // Amplitude(s) for diagram number 201
-    FFV1_0( w_fp[58], w_fp[57], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[20] -= amp_sv[0];
-
-    // *** DIAGRAM 202 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 202
-    // (none)
-
-    // Amplitude(s) for diagram number 202
-    FFV1_0( w_fp[25], w_fp[57], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[18] -= amp_sv[0];
-
-    // *** DIAGRAM 203 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 203
-    // (none)
-
-    // Amplitude(s) for diagram number 203
-    FFV1_0( w_fp[38], w_fp[77], w_fp[27], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 204 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 204
-    // (none)
-
-    // Amplitude(s) for diagram number 204
-    VVV1_0( w_fp[1], w_fp[27], w_fp[67], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] += amp_sv[0];
-    jamp_sv[4] -= amp_sv[0];
-    jamp_sv[10] -= amp_sv[0];
-    jamp_sv[20] += amp_sv[0];
-
-    // *** DIAGRAM 205 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 205
-    // (none)
-
-    // Amplitude(s) for diagram number 205
-    FFV1_0( w_fp[38], w_fp[60], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 206 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 206
-    // (none)
-
-    // Amplitude(s) for diagram number 206
-    FFV1_0( w_fp[28], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[2] -= amp_sv[0];
-
-    // *** DIAGRAM 207 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 207
-    // (none)
-
-    // Amplitude(s) for diagram number 207
-    FFV1_0( w_fp[40], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] -= amp_sv[0];
-
-    // *** DIAGRAM 208 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 208
-    FFV1_2( w_fp[41], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[60] );
-
-    // Amplitude(s) for diagram number 208
-    FFV1_0( w_fp[60], w_fp[9], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[8] -= amp_sv[0];
-
-    // *** DIAGRAM 209 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 209
-    // (none)
-
-    // Amplitude(s) for diagram number 209
-    FFV1_0( w_fp[40], w_fp[9], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[6] -= amp_sv[0];
-
-    // *** DIAGRAM 210 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 210
-    // (none)
-
-    // Amplitude(s) for diagram number 210
-    FFV1_0( w_fp[60], w_fp[55], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[14] -= amp_sv[0];
-
-    // *** DIAGRAM 211 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 211
-    // (none)
-
-    // Amplitude(s) for diagram number 211
-    FFV1_0( w_fp[28], w_fp[55], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[12] -= amp_sv[0];
-
-    // *** DIAGRAM 212 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 212
-    // (none)
-
-    // Amplitude(s) for diagram number 212
-    FFV1_0( w_fp[41], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 213 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 213
-    // (none)
-
-    // Amplitude(s) for diagram number 213
-    VVV1_0( w_fp[1], w_fp[24], w_fp[68], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += amp_sv[0];
-    jamp_sv[2] -= amp_sv[0];
-    jamp_sv[8] -= amp_sv[0];
-    jamp_sv[14] += amp_sv[0];
-
-    // *** DIAGRAM 214 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 214
-    // (none)
-
-    // Amplitude(s) for diagram number 214
-    FFV1_0( w_fp[41], w_fp[59], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 215 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 215
-    // (none)
-
-    // Amplitude(s) for diagram number 215
-    FFV1_0( w_fp[26], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 216 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 216
-    // (none)
-
-    // Amplitude(s) for diagram number 216
-    FFV1_0( w_fp[3], w_fp[77], w_fp[42], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += amp_sv[0];
-    jamp_sv[2] -= amp_sv[0];
-    jamp_sv[4] -= amp_sv[0];
-    jamp_sv[5] += amp_sv[0];
-
-    // *** DIAGRAM 217 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 217
-    VVV1P0_1( w_fp[1], w_fp[24], COUPs[0], 1.0, 0., 0., w_fp[59] );
-
-    // Amplitude(s) for diagram number 217
-    VVV1_0( w_fp[62], w_fp[59], w_fp[6], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 218 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 218
-    // (none)
-
-    // Amplitude(s) for diagram number 218
-    VVV1_0( w_fp[62], w_fp[1], w_fp[42], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 219 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 219
-    // (none)
-
-    // Amplitude(s) for diagram number 219
-    VVVV1_0( w_fp[1], w_fp[24], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-    VVVV3_0( w_fp[1], w_fp[24], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-    VVVV4_0( w_fp[1], w_fp[24], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 220 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 220
-    // (none)
-
-    // Amplitude(s) for diagram number 220
-    FFV1_0( w_fp[3], w_fp[57], w_fp[59], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[18] += amp_sv[0];
-    jamp_sv[19] -= amp_sv[0];
-    jamp_sv[21] -= amp_sv[0];
-    jamp_sv[23] += amp_sv[0];
-
-    // *** DIAGRAM 221 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 221
-    // (none)
-
-    // Amplitude(s) for diagram number 221
-    FFV1_0( w_fp[26], w_fp[57], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 222 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 222
-    // (none)
-
-    // Amplitude(s) for diagram number 222
-    FFV1_0( w_fp[14], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 223 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 223
-    // (none)
-
-    // Amplitude(s) for diagram number 223
-    FFV1_0( w_fp[3], w_fp[77], w_fp[16], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] += amp_sv[0];
-    jamp_sv[2] -= amp_sv[0];
-    jamp_sv[3] += amp_sv[0];
-    jamp_sv[4] -= amp_sv[0];
-
-    // *** DIAGRAM 224 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 224
-    VVV1P0_1( w_fp[1], w_fp[27], COUPs[0], 1.0, 0., 0., w_fp[68] );
-
-    // Amplitude(s) for diagram number 224
-    VVV1_0( w_fp[62], w_fp[68], w_fp[5], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 225 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 225
-    // (none)
-
-    // Amplitude(s) for diagram number 225
-    VVV1_0( w_fp[62], w_fp[1], w_fp[16], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 226 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 226
-    // (none)
-
-    // Amplitude(s) for diagram number 226
-    VVVV1_0( w_fp[1], w_fp[27], w_fp[5], w_fp[62], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-    VVVV3_0( w_fp[1], w_fp[27], w_fp[5], w_fp[62], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-    VVVV4_0( w_fp[1], w_fp[27], w_fp[5], w_fp[62], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 227 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 227
-    // (none)
-
-    // Amplitude(s) for diagram number 227
-    FFV1_0( w_fp[3], w_fp[55], w_fp[68], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[12] += amp_sv[0];
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[15] -= amp_sv[0];
-    jamp_sv[17] += amp_sv[0];
-
-    // *** DIAGRAM 228 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 228
-    // (none)
-
-    // Amplitude(s) for diagram number 228
-    FFV1_0( w_fp[14], w_fp[55], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 229 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 229
-    // (none)
-
-    // Amplitude(s) for diagram number 229
-    FFV1_0( w_fp[12], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 230 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 230
-    // (none)
-
-    // Amplitude(s) for diagram number 230
-    FFV1_0( w_fp[3], w_fp[77], w_fp[19], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += amp_sv[0];
-    jamp_sv[1] -= amp_sv[0];
-    jamp_sv[3] -= amp_sv[0];
-    jamp_sv[5] += amp_sv[0];
-
-    // *** DIAGRAM 231 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 231
-    VVV1P0_1( w_fp[1], w_fp[29], COUPs[0], 1.0, 0., 0., w_fp[67] );
-
-    // Amplitude(s) for diagram number 231
-    VVV1_0( w_fp[62], w_fp[67], w_fp[4], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 232 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 232
-    // (none)
-
-    // Amplitude(s) for diagram number 232
-    VVV1_0( w_fp[62], w_fp[1], w_fp[19], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 233 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 233
-    // (none)
-
-    // Amplitude(s) for diagram number 233
-    VVVV1_0( w_fp[1], w_fp[4], w_fp[29], w_fp[62], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-    VVVV3_0( w_fp[1], w_fp[4], w_fp[29], w_fp[62], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-    VVVV4_0( w_fp[1], w_fp[4], w_fp[29], w_fp[62], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 234 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 234
-    // (none)
-
-    // Amplitude(s) for diagram number 234
-    FFV1_0( w_fp[3], w_fp[9], w_fp[67], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[6] += amp_sv[0];
-    jamp_sv[7] -= amp_sv[0];
-    jamp_sv[9] -= amp_sv[0];
-    jamp_sv[11] += amp_sv[0];
-
-    // *** DIAGRAM 235 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 235
-    // (none)
-
-    // Amplitude(s) for diagram number 235
-    FFV1_0( w_fp[12], w_fp[9], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 236 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 236
-    VVVV1P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[73] );
-    VVVV3P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[79] );
-    VVVV4P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[80] );
-
-    // Amplitude(s) for diagram number 236
-    VVV1_0( w_fp[73], w_fp[6], w_fp[62], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-    VVV1_0( w_fp[79], w_fp[6], w_fp[62], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
-    VVV1_0( w_fp[80], w_fp[6], w_fp[62], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 237 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 237
-    // (none)
-
-    // Amplitude(s) for diagram number 237
-    FFV1_0( w_fp[3], w_fp[57], w_fp[73], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[18] += amp_sv[0];
-    jamp_sv[19] -= amp_sv[0];
-    jamp_sv[21] -= amp_sv[0];
-    jamp_sv[23] += amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[57], w_fp[79], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[19] -= amp_sv[0];
-    jamp_sv[20] += amp_sv[0];
-    jamp_sv[21] -= amp_sv[0];
-    jamp_sv[22] += amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[57], w_fp[80], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[18] -= amp_sv[0];
-    jamp_sv[20] += amp_sv[0];
-    jamp_sv[22] += amp_sv[0];
-    jamp_sv[23] -= amp_sv[0];
-
-    // *** DIAGRAM 238 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 238
-    // (none)
-
-    // Amplitude(s) for diagram number 238
-    FFV1_0( w_fp[41], w_fp[34], w_fp[73], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += amp_sv[0];
-    jamp_sv[2] -= amp_sv[0];
-    jamp_sv[8] -= amp_sv[0];
-    jamp_sv[14] += amp_sv[0];
-    FFV1_0( w_fp[41], w_fp[34], w_fp[79], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[2] -= amp_sv[0];
-    jamp_sv[6] += amp_sv[0];
-    jamp_sv[8] -= amp_sv[0];
-    jamp_sv[12] += amp_sv[0];
-    FFV1_0( w_fp[41], w_fp[34], w_fp[80], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] -= amp_sv[0];
-    jamp_sv[6] += amp_sv[0];
-    jamp_sv[12] += amp_sv[0];
-    jamp_sv[14] -= amp_sv[0];
-
-    // *** DIAGRAM 239 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 239
-    VVVV1P0_1( w_fp[1], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[57] );
-    VVVV3P0_1( w_fp[1], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[81] );
-    VVVV4P0_1( w_fp[1], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[82] );
-
-    // Amplitude(s) for diagram number 239
-    VVV1_0( w_fp[57], w_fp[5], w_fp[62], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-    VVV1_0( w_fp[81], w_fp[5], w_fp[62], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
-    VVV1_0( w_fp[82], w_fp[5], w_fp[62], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 240 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 240
-    // (none)
-
-    // Amplitude(s) for diagram number 240
-    FFV1_0( w_fp[3], w_fp[55], w_fp[57], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[12] += amp_sv[0];
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[15] -= amp_sv[0];
-    jamp_sv[17] += amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[55], w_fp[81], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[14] += amp_sv[0];
-    jamp_sv[15] -= amp_sv[0];
-    jamp_sv[16] += amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[55], w_fp[82], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[12] -= amp_sv[0];
-    jamp_sv[14] += amp_sv[0];
-    jamp_sv[16] += amp_sv[0];
-    jamp_sv[17] -= amp_sv[0];
-
-    // *** DIAGRAM 241 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 241
-    // (none)
-
-    // Amplitude(s) for diagram number 241
-    FFV1_0( w_fp[38], w_fp[34], w_fp[57], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] += amp_sv[0];
-    jamp_sv[4] -= amp_sv[0];
-    jamp_sv[10] -= amp_sv[0];
-    jamp_sv[20] += amp_sv[0];
-    FFV1_0( w_fp[38], w_fp[34], w_fp[81], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[4] -= amp_sv[0];
-    jamp_sv[7] += amp_sv[0];
-    jamp_sv[10] -= amp_sv[0];
-    jamp_sv[18] += amp_sv[0];
-    FFV1_0( w_fp[38], w_fp[34], w_fp[82], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] -= amp_sv[0];
-    jamp_sv[7] += amp_sv[0];
-    jamp_sv[18] += amp_sv[0];
-    jamp_sv[20] -= amp_sv[0];
-
-    // *** DIAGRAM 242 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 242
-    VVVV1P0_1( w_fp[1], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[55] );
-    VVVV3P0_1( w_fp[1], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[83] );
-    VVVV4P0_1( w_fp[1], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[84] );
-
-    // Amplitude(s) for diagram number 242
-    VVV1_0( w_fp[55], w_fp[4], w_fp[62], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-    VVV1_0( w_fp[83], w_fp[4], w_fp[62], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-    VVV1_0( w_fp[84], w_fp[4], w_fp[62], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 243 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 243
-    // (none)
-
-    // Amplitude(s) for diagram number 243
-    FFV1_0( w_fp[3], w_fp[9], w_fp[55], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[6] += amp_sv[0];
-    jamp_sv[7] -= amp_sv[0];
-    jamp_sv[9] -= amp_sv[0];
-    jamp_sv[11] += amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[9], w_fp[83], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[7] -= amp_sv[0];
-    jamp_sv[8] += amp_sv[0];
-    jamp_sv[9] -= amp_sv[0];
-    jamp_sv[10] += amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[9], w_fp[84], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[6] -= amp_sv[0];
-    jamp_sv[8] += amp_sv[0];
-    jamp_sv[10] += amp_sv[0];
-    jamp_sv[11] -= amp_sv[0];
-
-    // *** DIAGRAM 244 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 244
-    // (none)
-
-    // Amplitude(s) for diagram number 244
-    FFV1_0( w_fp[46], w_fp[34], w_fp[55], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] += amp_sv[0];
-    jamp_sv[5] -= amp_sv[0];
-    jamp_sv[16] -= amp_sv[0];
-    jamp_sv[22] += amp_sv[0];
-    FFV1_0( w_fp[46], w_fp[34], w_fp[83], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[5] -= amp_sv[0];
-    jamp_sv[13] += amp_sv[0];
-    jamp_sv[16] -= amp_sv[0];
-    jamp_sv[19] += amp_sv[0];
-    FFV1_0( w_fp[46], w_fp[34], w_fp[84], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] -= amp_sv[0];
-    jamp_sv[13] += amp_sv[0];
-    jamp_sv[19] += amp_sv[0];
-    jamp_sv[22] -= amp_sv[0];
-
-    // *** DIAGRAM 245 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 245
-    // (none)
-
-    // Amplitude(s) for diagram number 245
-    FFV1_0( w_fp[3], w_fp[77], w_fp[30], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += amp_sv[0];
-    jamp_sv[1] -= amp_sv[0];
-    jamp_sv[3] -= amp_sv[0];
-    jamp_sv[5] += amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[77], w_fp[31], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] -= amp_sv[0];
-    jamp_sv[2] += amp_sv[0];
-    jamp_sv[3] -= amp_sv[0];
-    jamp_sv[4] += amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[77], w_fp[32], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] -= amp_sv[0];
-    jamp_sv[2] += amp_sv[0];
-    jamp_sv[4] += amp_sv[0];
-    jamp_sv[5] -= amp_sv[0];
-
-    // *** DIAGRAM 246 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 246
-    // (none)
-
-    // Amplitude(s) for diagram number 246
-    VVV1_0( w_fp[1], w_fp[30], w_fp[62], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-    VVV1_0( w_fp[1], w_fp[31], w_fp[62], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-    VVV1_0( w_fp[1], w_fp[32], w_fp[62], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 247 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 247
-    FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[62] );
-    FFV1_1( w_fp[2], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[77] );
-    FFV1_2( w_fp[62], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[34] );
-    FFV1_1( w_fp[77], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] );
-
-    // Amplitude(s) for diagram number 247
-    FFV1_0( w_fp[34], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[41] -= amp_sv[0];
-
-    // *** DIAGRAM 248 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 248
-    FFV1_1( w_fp[77], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[85] );
-
-    // Amplitude(s) for diagram number 248
-    FFV1_0( w_fp[34], w_fp[85], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[47] -= amp_sv[0];
-
-    // *** DIAGRAM 249 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 249
-    FFV1_2( w_fp[62], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[86] );
-    FFV1_1( w_fp[77], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[87] );
-
-    // Amplitude(s) for diagram number 249
-    FFV1_0( w_fp[86], w_fp[87], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[35] -= amp_sv[0];
-
-    // *** DIAGRAM 250 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 250
-    // (none)
-
-    // Amplitude(s) for diagram number 250
-    FFV1_0( w_fp[86], w_fp[85], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[45] -= amp_sv[0];
-
-    // *** DIAGRAM 251 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 251
-    FFV1_2( w_fp[62], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[88] );
-
-    // Amplitude(s) for diagram number 251
-    FFV1_0( w_fp[88], w_fp[87], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[33] -= amp_sv[0];
-
-    // *** DIAGRAM 252 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 252
-    // (none)
-
-    // Amplitude(s) for diagram number 252
-    FFV1_0( w_fp[88], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[39] -= amp_sv[0];
-
-    // *** DIAGRAM 253 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 253
-    FFV1P0_3( w_fp[62], w_fp[77], COUPs[1], 1.0, 0., 0., w_fp[89] );
-
-    // Amplitude(s) for diagram number 253
-    VVV1_0( w_fp[24], w_fp[6], w_fp[89], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[33] += amp_sv[0];
-    jamp_sv[39] -= amp_sv[0];
-    jamp_sv[45] -= amp_sv[0];
-    jamp_sv[47] += amp_sv[0];
-
-    // *** DIAGRAM 254 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 254
-    FFV1_2( w_fp[62], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[90] );
-
-    // Amplitude(s) for diagram number 254
-    FFV1_0( w_fp[90], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 255 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 255
-    // (none)
-
-    // Amplitude(s) for diagram number 255
-    FFV1_0( w_fp[88], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 256 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 256
-    // (none)
-
-    // Amplitude(s) for diagram number 256
-    VVV1_0( w_fp[27], w_fp[5], w_fp[89], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[35] += amp_sv[0];
-    jamp_sv[39] -= amp_sv[0];
-    jamp_sv[41] += amp_sv[0];
-    jamp_sv[45] -= amp_sv[0];
-
-    // *** DIAGRAM 257 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 257
-    FFV1_2( w_fp[62], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[91] );
-
-    // Amplitude(s) for diagram number 257
-    FFV1_0( w_fp[91], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 258 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 258
for diagram number 258 - // (none) - - // Amplitude(s) for diagram number 258 - FFV1_0( w_fp[86], w_fp[77], w_fp[27], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 259 OF 1240 *** - - // Wavefunction(s) for diagram number 259 - // (none) - - // Amplitude(s) for diagram number 259 - VVV1_0( w_fp[4], w_fp[29], w_fp[89], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[33] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - - // *** DIAGRAM 260 OF 1240 *** - - // Wavefunction(s) for diagram number 260 - // (none) - - // Amplitude(s) for diagram number 260 - FFV1_0( w_fp[34], w_fp[77], w_fp[29], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 261 OF 1240 *** - - // Wavefunction(s) for diagram number 261 - FFV1_2( w_fp[62], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[89] ); - - // Amplitude(s) for diagram number 261 - FFV1_0( w_fp[89], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 262 OF 1240 *** - - // Wavefunction(s) for diagram number 262 - // (none) - - // Amplitude(s) for diagram number 262 - FFV1_0( w_fp[62], w_fp[77], w_fp[30], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[33] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - FFV1_0( w_fp[62], w_fp[77], w_fp[31], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[35] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - FFV1_0( w_fp[62], w_fp[77], w_fp[32], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[33] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[47] -= amp_sv[0]; - - // *** DIAGRAM 263 OF 1240 *** - - // Wavefunction(s) for diagram number 263 - FFV1P0_3( w_fp[62], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[92] ); - - // Amplitude(s) for diagram number 263 - VVV1_0( w_fp[92], w_fp[63], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] 
+= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 264 OF 1240 *** - - // Wavefunction(s) for diagram number 264 - // (none) - - // Amplitude(s) for diagram number 264 - VVV1_0( w_fp[92], w_fp[64], w_fp[5], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 265 OF 1240 *** - - // Wavefunction(s) for diagram number 265 - // (none) - - // Amplitude(s) for diagram number 265 - VVVV1_0( w_fp[61], w_fp[5], w_fp[6], w_fp[92], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - VVVV3_0( w_fp[61], w_fp[5], w_fp[6], w_fp[92], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - VVVV4_0( w_fp[61], w_fp[5], w_fp[6], w_fp[92], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 266 OF 1240 *** - - // Wavefunction(s) for diagram number 266 - FFV1_1( w_fp[2], w_fp[61], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[93] ); - - // Amplitude(s) for diagram number 266 - FFV1_0( w_fp[86], w_fp[93], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 267 OF 1240 *** - - // Wavefunction(s) for diagram number 267 - // (none) - - // Amplitude(s) for diagram number 267 - FFV1_0( w_fp[86], w_fp[2], w_fp[64], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // 
Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[35] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - - // *** DIAGRAM 268 OF 1240 *** - - // Wavefunction(s) for diagram number 268 - // (none) - - // Amplitude(s) for diagram number 268 - FFV1_0( w_fp[88], w_fp[93], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 269 OF 1240 *** - - // Wavefunction(s) for diagram number 269 - // (none) - - // Amplitude(s) for diagram number 269 - FFV1_0( w_fp[88], w_fp[2], w_fp[63], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[33] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[87] += amp_sv[0]; - - // *** DIAGRAM 270 OF 1240 *** - - // Wavefunction(s) for diagram number 270 - FFV1_2( w_fp[62], w_fp[61], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[94] ); - - // Amplitude(s) for diagram number 270 - FFV1_0( w_fp[94], w_fp[39], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 271 OF 1240 *** - - // Wavefunction(s) for diagram number 271 - FFV1P0_3( w_fp[62], w_fp[39], COUPs[1], 1.0, 0., 0., w_fp[95] ); - - // Amplitude(s) for diagram number 271 - VVV1_0( w_fp[61], w_fp[6], w_fp[95], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[81] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[93] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - - // *** DIAGRAM 272 OF 1240 *** - - // Wavefunction(s) for diagram number 272 - // (none) - - // Amplitude(s) for diagram number 272 - FFV1_0( w_fp[88], w_fp[39], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 273 OF 1240 *** - - // Wavefunction(s) for diagram number 273 - // (none) - - // Amplitude(s) for diagram number 273 - FFV1_0( w_fp[94], w_fp[47], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 274 OF 1240 *** - - // Wavefunction(s) for diagram number 274 - FFV1P0_3( w_fp[62], w_fp[47], COUPs[1], 1.0, 0., 0., w_fp[96] ); - - // Amplitude(s) for diagram number 274 - VVV1_0( w_fp[61], w_fp[5], w_fp[96], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[105] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[117] -= 
amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 275 OF 1240 *** - - // Wavefunction(s) for diagram number 275 - // (none) - - // Amplitude(s) for diagram number 275 - FFV1_0( w_fp[86], w_fp[47], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 276 OF 1240 *** - - // Wavefunction(s) for diagram number 276 - // (none) - - // Amplitude(s) for diagram number 276 - FFV1_0( w_fp[94], w_fp[2], w_fp[29], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[93] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 277 OF 1240 *** - - // Wavefunction(s) for diagram number 277 - // (none) - - // Amplitude(s) for diagram number 277 - VVV1_0( w_fp[61], w_fp[29], w_fp[92], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 278 OF 1240 *** - - // Wavefunction(s) for diagram number 278 - // (none) - - // Amplitude(s) for diagram number 278 - FFV1_0( w_fp[89], w_fp[2], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[33] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[59] += amp_sv[0]; - - // *** DIAGRAM 279 OF 1240 *** - - // Wavefunction(s) for diagram number 279 - // (none) - - // Amplitude(s) for diagram number 279 - VVV1_0( w_fp[92], w_fp[69], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 280 OF 1240 *** - - // Wavefunction(s) for diagram number 280 - // (none) - - // Amplitude(s) for diagram number 280 - VVV1_0( w_fp[92], w_fp[70], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] 
-= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 281 OF 1240 *** - - // Wavefunction(s) for diagram number 281 - // (none) - - // Amplitude(s) for diagram number 281 - VVVV1_0( w_fp[66], w_fp[4], w_fp[6], w_fp[92], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - VVVV3_0( w_fp[66], w_fp[4], w_fp[6], w_fp[92], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - VVVV4_0( w_fp[66], w_fp[4], w_fp[6], w_fp[92], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 282 OF 1240 *** - - // Wavefunction(s) for diagram number 282 - FFV1_1( w_fp[2], w_fp[66], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[94] ); - - // Amplitude(s) for diagram number 282 - FFV1_0( w_fp[34], w_fp[94], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 283 OF 1240 *** - - // Wavefunction(s) for diagram number 283 - // (none) - - // Amplitude(s) for diagram number 283 - FFV1_0( w_fp[34], w_fp[2], w_fp[70], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[41] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - - // *** DIAGRAM 284 OF 1240 *** - - // Wavefunction(s) for diagram number 284 - // (none) - - // Amplitude(s) for diagram number 284 - FFV1_0( w_fp[88], w_fp[94], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 285 OF 1240 *** - - // Wavefunction(s) for diagram number 285 - // (none) - - // Amplitude(s) for diagram number 285 - FFV1_0( w_fp[88], w_fp[2], w_fp[69], COUPs[1], 
1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[39] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - - // *** DIAGRAM 286 OF 1240 *** - - // Wavefunction(s) for diagram number 286 - FFV1_2( w_fp[62], w_fp[66], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[97] ); - - // Amplitude(s) for diagram number 286 - FFV1_0( w_fp[97], w_fp[33], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 287 OF 1240 *** - - // Wavefunction(s) for diagram number 287 - FFV1P0_3( w_fp[62], w_fp[33], COUPs[1], 1.0, 0., 0., w_fp[98] ); - - // Amplitude(s) for diagram number 287 - VVV1_0( w_fp[66], w_fp[6], w_fp[98], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[57] += amp_sv[0]; - jamp_sv[63] -= amp_sv[0]; - jamp_sv[69] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - - // *** DIAGRAM 288 OF 1240 *** - - // Wavefunction(s) for diagram number 288 - // (none) - - // Amplitude(s) for diagram number 288 - FFV1_0( w_fp[88], w_fp[33], w_fp[66], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 289 OF 1240 *** - - // Wavefunction(s) for diagram number 289 - // (none) - - // Amplitude(s) for diagram number 289 - FFV1_0( w_fp[97], w_fp[47], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 290 OF 1240 *** - - // Wavefunction(s) for diagram number 290 - // (none) - - // Amplitude(s) for diagram number 290 - VVV1_0( w_fp[66], w_fp[4], w_fp[96], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[107] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - - // *** DIAGRAM 291 OF 1240 *** - - // Wavefunction(s) for diagram number 291 - // (none) - - // Amplitude(s) for diagram number 291 - FFV1_0( w_fp[34], w_fp[47], w_fp[66], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 292 OF 1240 *** - - // Wavefunction(s) for diagram number 292 - // (none) - - // Amplitude(s) for diagram number 292 - FFV1_0( w_fp[97], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[69] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - 
-      jamp_sv[111] -= amp_sv[0];
-      jamp_sv[113] += amp_sv[0];
-
-      // *** DIAGRAM 293 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 293
-      // (none)
-
-      // Amplitude(s) for diagram number 293
-      VVV1_0( w_fp[66], w_fp[27], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 294 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 294
-      // (none)
-
-      // Amplitude(s) for diagram number 294
-      FFV1_0( w_fp[91], w_fp[2], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[39] += amp_sv[0];
-      jamp_sv[41] -= amp_sv[0];
-      jamp_sv[81] -= amp_sv[0];
-      jamp_sv[83] += amp_sv[0];
-
-      // *** DIAGRAM 295 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 295
-      // (none)
-
-      // Amplitude(s) for diagram number 295
-      VVV1_0( w_fp[92], w_fp[74], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 296 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 296
-      // (none)
-
-      // Amplitude(s) for diagram number 296
-      VVV1_0( w_fp[92], w_fp[75], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 297 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 297
-      // (none)
-
-      // Amplitude(s) for diagram number 297
-      VVVV1_0( w_fp[72], w_fp[4], w_fp[5], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0];
-      VVVV3_0( w_fp[72], w_fp[4], w_fp[5], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0];
-      VVVV4_0( w_fp[72], w_fp[4], w_fp[5], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 298 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 298
-      FFV1_1( w_fp[2], w_fp[72], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[97] );
-
-      // Amplitude(s) for diagram number 298
-      FFV1_0( w_fp[34], w_fp[97], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 299 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 299
-      // (none)
-
-      // Amplitude(s) for diagram number 299
-      FFV1_0( w_fp[34], w_fp[2], w_fp[75], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[47] += amp_sv[0];
-      jamp_sv[83] -= amp_sv[0];
-      jamp_sv[93] += amp_sv[0];
-      jamp_sv[107] -= amp_sv[0];
-
-      // *** DIAGRAM 300 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 300
-      // (none)
-
-      // Amplitude(s) for diagram number 300
-      FFV1_0( w_fp[86], w_fp[97], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 301 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 301
-      // (none)
-
-      // Amplitude(s) for diagram number 301
-      FFV1_0( w_fp[86], w_fp[2], w_fp[74], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[45] += amp_sv[0];
-      jamp_sv[59] -= amp_sv[0];
-      jamp_sv[69] += amp_sv[0];
-      jamp_sv[105] -= amp_sv[0];
-
-      // *** DIAGRAM 302 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 302
-      FFV1_2( w_fp[62], w_fp[72], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] );
-
-      // Amplitude(s) for diagram number 302
-      FFV1_0( w_fp[99], w_fp[33], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 303 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 303
-      // (none)
-
-      // Amplitude(s) for diagram number 303
-      VVV1_0( w_fp[72], w_fp[5], w_fp[98], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[59] += amp_sv[0];
-      jamp_sv[63] -= amp_sv[0];
-      jamp_sv[65] += amp_sv[0];
-      jamp_sv[69] -= amp_sv[0];
-
-      // *** DIAGRAM 304 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 304
-      // (none)
-
-      // Amplitude(s) for diagram number 304
-      FFV1_0( w_fp[86], w_fp[33], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 305 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 305
-      // (none)
-
-      // Amplitude(s) for diagram number 305
-      FFV1_0( w_fp[99], w_fp[39], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 306 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 306
-      // (none)
-
-      // Amplitude(s) for diagram number 306
-      VVV1_0( w_fp[72], w_fp[4], w_fp[95], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[83] += amp_sv[0];
-      jamp_sv[87] -= amp_sv[0];
-      jamp_sv[89] += amp_sv[0];
-      jamp_sv[93] -= amp_sv[0];
-
-      // *** DIAGRAM 307 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 307
-      // (none)
-
-      // Amplitude(s) for diagram number 307
-      FFV1_0( w_fp[34], w_fp[39], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 308 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 308
-      // (none)
-
-      // Amplitude(s) for diagram number 308
-      FFV1_0( w_fp[99], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[63] += amp_sv[0];
-      jamp_sv[65] -= amp_sv[0];
-      jamp_sv[87] -= amp_sv[0];
-      jamp_sv[89] += amp_sv[0];
-
-      // *** DIAGRAM 309 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 309
-      // (none)
-
-      // Amplitude(s) for diagram number 309
-      VVV1_0( w_fp[72], w_fp[24], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 310 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 310
-      // (none)
-
-      // Amplitude(s) for diagram number 310
-      FFV1_0( w_fp[90], w_fp[2], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[45] += amp_sv[0];
-      jamp_sv[47] -= amp_sv[0];
-      jamp_sv[105] -= amp_sv[0];
-      jamp_sv[107] += amp_sv[0];
-
-      // *** DIAGRAM 311 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 311
-      FFV1_2( w_fp[62], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] );
-
-      // Amplitude(s) for diagram number 311
-      FFV1_0( w_fp[99], w_fp[35], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[65] -= amp_sv[0];
-
-      // *** DIAGRAM 312 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 312
-      // (none)
-
-      // Amplitude(s) for diagram number 312
-      FFV1_0( w_fp[99], w_fp[36], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[71] -= amp_sv[0];
-
-      // *** DIAGRAM 313 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 313
-      FFV1_1( w_fp[33], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[100] );
-
-      // Amplitude(s) for diagram number 313
-      FFV1_0( w_fp[86], w_fp[100], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[59] -= amp_sv[0];
-
-      // *** DIAGRAM 314 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 314
-      // (none)
-
-      // Amplitude(s) for diagram number 314
-      FFV1_0( w_fp[86], w_fp[36], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[69] -= amp_sv[0];
-
-      // *** DIAGRAM 315 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 315
-      // (none)
-
-      // Amplitude(s) for diagram number 315
-      FFV1_0( w_fp[88], w_fp[100], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[57] -= amp_sv[0];
-
-      // *** DIAGRAM 316 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 316
-      // (none)
-
-      // Amplitude(s) for diagram number 316
-      FFV1_0( w_fp[88], w_fp[35], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[63] -= amp_sv[0];
-
-      // *** DIAGRAM 317 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 317
-      // (none)
-
-      // Amplitude(s) for diagram number 317
-      FFV1_0( w_fp[99], w_fp[33], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 318 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 318
-      // (none)
-
-      // Amplitude(s) for diagram number 318
-      VVV1_0( w_fp[1], w_fp[29], w_fp[98], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[57] += amp_sv[0];
-      jamp_sv[59] -= amp_sv[0];
-      jamp_sv[65] -= amp_sv[0];
-      jamp_sv[71] += amp_sv[0];
-
-      // *** DIAGRAM 319 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 319
-      // (none)
-
-      // Amplitude(s) for diagram number 319
-      FFV1_0( w_fp[89], w_fp[33], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 320 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 320
-      // (none)
-
-      // Amplitude(s) for diagram number 320
-      FFV1_0( w_fp[99], w_fp[43], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[89] -= amp_sv[0];
-
-      // *** DIAGRAM 321 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 321
-      // (none)
-
-      // Amplitude(s) for diagram number 321
-      FFV1_0( w_fp[99], w_fp[44], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[95] -= amp_sv[0];
-
-      // *** DIAGRAM 322 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 322
-      FFV1_1( w_fp[39], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[89] );
-
-      // Amplitude(s) for diagram number 322
-      FFV1_0( w_fp[34], w_fp[89], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[83] -= amp_sv[0];
-
-      // *** DIAGRAM 323 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 323
-      // (none)
-
-      // Amplitude(s) for diagram number 323
-      FFV1_0( w_fp[34], w_fp[44], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[93] -= amp_sv[0];
-
-      // *** DIAGRAM 324 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 324
-      // (none)
-
-      // Amplitude(s) for diagram number 324
-      FFV1_0( w_fp[88], w_fp[89], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[81] -= amp_sv[0];
-
-      // *** DIAGRAM 325 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 325
-      // (none)
-
-      // Amplitude(s) for diagram number 325
-      FFV1_0( w_fp[88], w_fp[43], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[87] -= amp_sv[0];
-
-      // *** DIAGRAM 326 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 326
-      // (none)
-
-      // Amplitude(s) for diagram number 326
-      FFV1_0( w_fp[99], w_fp[39], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 327 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 327
-      // (none)
-
-      // Amplitude(s) for diagram number 327
-      VVV1_0( w_fp[1], w_fp[27], w_fp[95], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[81] += amp_sv[0];
-      jamp_sv[83] -= amp_sv[0];
-      jamp_sv[89] -= amp_sv[0];
-      jamp_sv[95] += amp_sv[0];
-
-      // *** DIAGRAM 328 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 328
-      // (none)
-
-      // Amplitude(s) for diagram number 328
-      FFV1_0( w_fp[91], w_fp[39], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 329 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 329
-      // (none)
-
-      // Amplitude(s) for diagram number 329
-      FFV1_0( w_fp[99], w_fp[49], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[113] -= amp_sv[0];
-
-      // *** DIAGRAM 330 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 330
-      // (none)
-
-      // Amplitude(s) for diagram number 330
-      FFV1_0( w_fp[99], w_fp[50], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[119] -= amp_sv[0];
-
-      // *** DIAGRAM 331 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 331
-      FFV1_1( w_fp[47], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[91] );
-
-      // Amplitude(s) for diagram number 331
-      FFV1_0( w_fp[34], w_fp[91], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[107] -= amp_sv[0];
-
-      // *** DIAGRAM 332 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 332
-      // (none)
-
-      // Amplitude(s) for diagram number 332
-      FFV1_0( w_fp[34], w_fp[50], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[117] -= amp_sv[0];
-
-      // *** DIAGRAM 333 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 333
-      // (none)
-
-      // Amplitude(s) for diagram number 333
-      FFV1_0( w_fp[86], w_fp[91], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[105] -= amp_sv[0];
-
-      // *** DIAGRAM 334 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 334
-      // (none)
-
-      // Amplitude(s) for diagram number 334
-      FFV1_0( w_fp[86], w_fp[49], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[111] -= amp_sv[0];
-
-      // *** DIAGRAM 335 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 335
-      // (none)
-
-      // Amplitude(s) for diagram number 335
-      FFV1_0( w_fp[99], w_fp[47], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 336 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 336
-      // (none)
-
-      // Amplitude(s) for diagram number 336
-      VVV1_0( w_fp[1], w_fp[24], w_fp[96], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[105] += amp_sv[0];
-      jamp_sv[107] -= amp_sv[0];
-      jamp_sv[113] -= amp_sv[0];
-      jamp_sv[119] += amp_sv[0];
-
-      // *** DIAGRAM 337 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 337
-      // (none)
-
-      // Amplitude(s) for diagram number 337
-      FFV1_0( w_fp[90], w_fp[47], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 338 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 338
-      // (none)
-
-      // Amplitude(s) for diagram number 338
-      FFV1_0( w_fp[99], w_fp[17], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 339 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 339
-      // (none)
-
-      // Amplitude(s) for diagram number 339
-      FFV1_0( w_fp[99], w_fp[2], w_fp[42], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[65] += amp_sv[0];
-      jamp_sv[89] -= amp_sv[0];
-      jamp_sv[113] -= amp_sv[0];
-      jamp_sv[119] += amp_sv[0];
-
-      // *** DIAGRAM 340 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 340
-      // (none)
-
-      // Amplitude(s) for diagram number 340
-      VVV1_0( w_fp[92], w_fp[59], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 341 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 341
-      // (none)
-
-      // Amplitude(s) for diagram number 341
-      VVV1_0( w_fp[92], w_fp[1], w_fp[42], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 342 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 342
-      // (none)
-
-      // Amplitude(s) for diagram number 342
-      VVVV1_0( w_fp[1], w_fp[24], w_fp[6], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
-      VVVV3_0( w_fp[1], w_fp[24], w_fp[6], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
-      VVVV4_0( w_fp[1], w_fp[24], w_fp[6], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 343 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 343
-      // (none)
-
-      // Amplitude(s) for diagram number 343
-      FFV1_0( w_fp[88], w_fp[2], w_fp[59], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[33] += amp_sv[0];
-      jamp_sv[39] -= amp_sv[0];
-      jamp_sv[63] -= amp_sv[0];
-      jamp_sv[87] += amp_sv[0];
-
-      // *** DIAGRAM 344 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 344
-      // (none)
-
-      // Amplitude(s) for diagram number 344
-      FFV1_0( w_fp[88], w_fp[17], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 345 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 345
-      // (none)
-
-      // Amplitude(s) for diagram number 345
-      FFV1_0( w_fp[99], w_fp[15], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 346 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 346
-      // (none)
-
-      // Amplitude(s) for diagram number 346
-      FFV1_0( w_fp[99], w_fp[2], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[71] += amp_sv[0];
-      jamp_sv[89] -= amp_sv[0];
-      jamp_sv[95] += amp_sv[0];
-      jamp_sv[113] -= amp_sv[0];
-
-      // *** DIAGRAM 347 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 347
-      // (none)
-
-      // Amplitude(s) for diagram number 347
-      VVV1_0( w_fp[92], w_fp[68], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 348 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 348
-      // (none)
-
-      // Amplitude(s) for diagram number 348
-      VVV1_0( w_fp[92], w_fp[1], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 349 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 349
-      // (none)
-
-      // Amplitude(s) for diagram number 349
-      VVVV1_0( w_fp[1], w_fp[27], w_fp[5], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
-      VVVV3_0( w_fp[1], w_fp[27], w_fp[5], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0];
-      VVVV4_0( w_fp[1], w_fp[27], w_fp[5], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 350 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 350
-      // (none)
-
-      // Amplitude(s) for diagram number 350
-      FFV1_0( w_fp[86], w_fp[2], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[35] += amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[69] -= amp_sv[0];
-      jamp_sv[111] += amp_sv[0];
-
-      // *** DIAGRAM 351 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 351
-      // (none)
-
-      // Amplitude(s) for diagram number 351
-      FFV1_0( w_fp[86], w_fp[15], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 352 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 352
-      // (none)
-
-      // Amplitude(s) for diagram number 352
-      FFV1_0( w_fp[99], w_fp[18], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 353 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 353
-      // (none)
-
-      // Amplitude(s) for diagram number 353
-      FFV1_0( w_fp[99], w_fp[2], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[65] += amp_sv[0];
-      jamp_sv[71] -= amp_sv[0];
-      jamp_sv[95] -= amp_sv[0];
-      jamp_sv[119] += amp_sv[0];
-
-      // *** DIAGRAM 354 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 354
-      // (none)
-
-      // Amplitude(s) for diagram number 354
-      VVV1_0( w_fp[92], w_fp[67], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 355 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 355
-      // (none)
-
-      // Amplitude(s) for diagram number 355
-      VVV1_0( w_fp[92], w_fp[1], w_fp[19], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 356 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 356
-      // (none)
-
-      // Amplitude(s) for diagram number 356
-      VVVV1_0( w_fp[1], w_fp[4], w_fp[29], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
-      VVVV3_0( w_fp[1], w_fp[4], w_fp[29], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
-      VVVV4_0( w_fp[1], w_fp[4], w_fp[29], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 357 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 357
-      // (none)
-
-      // Amplitude(s) for diagram number 357
-      FFV1_0( w_fp[34], w_fp[2], w_fp[67], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[41] += amp_sv[0];
-      jamp_sv[47] -= amp_sv[0];
-      jamp_sv[93] -= amp_sv[0];
-      jamp_sv[117] += amp_sv[0];
-
-      // *** DIAGRAM 358 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 358
-      // (none)
-
-      // Amplitude(s) for diagram number 358
-      FFV1_0( w_fp[34], w_fp[18], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 359 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 359
-      // (none)
-
-      // Amplitude(s) for diagram number 359
-      VVV1_0( w_fp[73], w_fp[6], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
-      VVV1_0( w_fp[79], w_fp[6], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0];
-      VVV1_0( w_fp[80], w_fp[6], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 360 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 360
-      // (none)
-
-      // Amplitude(s) for diagram number 360
-      FFV1_0( w_fp[88], w_fp[2], w_fp[73], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[33] += amp_sv[0];
-      jamp_sv[39] -= amp_sv[0];
-      jamp_sv[63] -= amp_sv[0];
-      jamp_sv[87] += amp_sv[0];
-      FFV1_0( w_fp[88], w_fp[2], w_fp[79], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[39] -= amp_sv[0];
-      jamp_sv[57] += amp_sv[0];
-      jamp_sv[63] -= amp_sv[0];
-      jamp_sv[81] += amp_sv[0];
-      FFV1_0( w_fp[88], w_fp[2], w_fp[80], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[33] -= amp_sv[0];
-      jamp_sv[57] += amp_sv[0];
-      jamp_sv[81] += amp_sv[0];
-      jamp_sv[87] -= amp_sv[0];
-
-      // *** DIAGRAM 361 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 361
-      // (none)
-
-      // Amplitude(s) for diagram number 361
-      FFV1_0( w_fp[62], w_fp[47], w_fp[73], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[105] += amp_sv[0];
-      jamp_sv[107] -= amp_sv[0];
-      jamp_sv[113] -= amp_sv[0];
-      jamp_sv[119] += amp_sv[0];
-      FFV1_0( w_fp[62], w_fp[47], w_fp[79], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[107] -= amp_sv[0];
-      jamp_sv[111] += amp_sv[0];
-      jamp_sv[113] -= amp_sv[0];
-      jamp_sv[117] += amp_sv[0];
-      FFV1_0( w_fp[62], w_fp[47], w_fp[80], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[105] -= amp_sv[0];
-      jamp_sv[111] += amp_sv[0];
-      jamp_sv[117] += amp_sv[0];
-      jamp_sv[119] -= amp_sv[0];
-
-      // *** DIAGRAM 362 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 362
-      // (none)
-
-      // Amplitude(s) for diagram number 362
-      VVV1_0( w_fp[57], w_fp[5], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[111] -= cxtype( 0, 1
) * amp_sv[0]; - VVV1_0( w_fp[81], w_fp[5], w_fp[92], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[82], w_fp[5], w_fp[92], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 363 OF 1240 *** - - // Wavefunction(s) for diagram number 363 - // (none) - - // Amplitude(s) for diagram number 363 - FFV1_0( w_fp[86], w_fp[2], w_fp[57], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[35] += amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[69] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - FFV1_0( w_fp[86], w_fp[2], w_fp[81], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[45] -= amp_sv[0]; - jamp_sv[59] += amp_sv[0]; - jamp_sv[69] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - FFV1_0( w_fp[86], w_fp[2], w_fp[82], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[35] -= amp_sv[0]; - jamp_sv[59] += amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - - // *** DIAGRAM 364 OF 1240 *** - - // Wavefunction(s) for diagram number 364 - // (none) - - // Amplitude(s) for diagram number 364 - FFV1_0( w_fp[62], w_fp[39], w_fp[57], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[81] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - FFV1_0( w_fp[62], w_fp[39], w_fp[81], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[83] -= amp_sv[0]; - jamp_sv[87] += amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - FFV1_0( w_fp[62], w_fp[39], w_fp[82], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[81] -= amp_sv[0]; - jamp_sv[87] += amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - - // *** DIAGRAM 365 OF 1240 *** - - // Wavefunction(s) for diagram number 365 - // (none) - - // 
Amplitude(s) for diagram number 365 - VVV1_0( w_fp[55], w_fp[4], w_fp[92], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[83], w_fp[4], w_fp[92], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[84], w_fp[4], w_fp[92], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 366 OF 1240 *** - - // Wavefunction(s) for diagram number 366 - // (none) - - // Amplitude(s) for diagram number 366 - FFV1_0( w_fp[34], w_fp[2], w_fp[55], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[41] += amp_sv[0]; - jamp_sv[47] -= amp_sv[0]; - jamp_sv[93] -= amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - FFV1_0( w_fp[34], w_fp[2], w_fp[83], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[47] -= amp_sv[0]; - jamp_sv[83] += amp_sv[0]; - jamp_sv[93] -= amp_sv[0]; - jamp_sv[107] += amp_sv[0]; - FFV1_0( w_fp[34], w_fp[2], w_fp[84], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[41] -= amp_sv[0]; - jamp_sv[83] += amp_sv[0]; - jamp_sv[107] += amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - - // *** DIAGRAM 367 OF 1240 *** - - // Wavefunction(s) for diagram number 367 - // (none) - - // Amplitude(s) for diagram number 367 - FFV1_0( w_fp[62], w_fp[33], w_fp[55], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - FFV1_0( w_fp[62], w_fp[33], w_fp[83], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support 
updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[59] -= amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - FFV1_0( w_fp[62], w_fp[33], w_fp[84], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[57] -= amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - - // *** DIAGRAM 368 OF 1240 *** - - // Wavefunction(s) for diagram number 368 - // (none) - - // Amplitude(s) for diagram number 368 - FFV1_0( w_fp[99], w_fp[2], w_fp[30], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[65] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - FFV1_0( w_fp[99], w_fp[2], w_fp[31], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[71] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - FFV1_0( w_fp[99], w_fp[2], w_fp[32], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[65] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - - // *** DIAGRAM 369 OF 1240 *** - - // Wavefunction(s) for diagram number 369 - // (none) - - // Amplitude(s) for diagram number 369 - VVV1_0( w_fp[1], w_fp[30], w_fp[92], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[1], w_fp[31], w_fp[92], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0]; - VVV1_0( w_fp[1], w_fp[32], w_fp[92], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 370 OF 1240 *** - 
- // Wavefunction(s) for diagram number 370 - VVV1P0_1( w_fp[0], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[92] ); - FFV1_2( w_fp[3], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] ); - - // Amplitude(s) for diagram number 370 - FFV1_0( w_fp[99], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 371 OF 1240 *** - - // Wavefunction(s) for diagram number 371 - // (none) - - // Amplitude(s) for diagram number 371 - FFV1_0( w_fp[99], w_fp[85], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 372 OF 1240 *** - - // Wavefunction(s) for diagram number 372 - VVV1P0_1( w_fp[92], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[62] ); - FFV1P0_3( w_fp[3], w_fp[77], COUPs[1], 1.0, 0., 0., w_fp[34] ); - - // Amplitude(s) for diagram number 372 - VVV1_0( w_fp[62], w_fp[34], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 373 OF 1240 *** - - // Wavefunction(s) for diagram number 373 - // (none) - - // Amplitude(s) for diagram number 373 - FFV1_0( w_fp[3], w_fp[85], w_fp[62], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[42] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - - // *** DIAGRAM 374 OF 1240 *** - - // Wavefunction(s) for diagram number 374 - VVV1P0_1( w_fp[92], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[86] ); - - // Amplitude(s) for diagram number 374 - VVV1_0( w_fp[86], w_fp[34], w_fp[5], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 375 OF 1240 *** - - // Wavefunction(s) for diagram number 375 - // (none) - - // Amplitude(s) for diagram number 375 - FFV1_0( w_fp[3], w_fp[9], w_fp[86], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[36] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - - // *** DIAGRAM 376 OF 1240 
*** - - // Wavefunction(s) for diagram number 376 - VVVV1P0_1( w_fp[92], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[88] ); - VVVV3P0_1( w_fp[92], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[90] ); - VVVV4P0_1( w_fp[92], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[96] ); - - // Amplitude(s) for diagram number 376 - FFV1_0( w_fp[3], w_fp[77], w_fp[88], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[77], w_fp[90], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[77], w_fp[96], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 377 OF 1240 *** - - // Wavefunction(s) for diagram number 377 - FFV1_1( w_fp[77], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[95] ); - - // Amplitude(s) for diagram number 377 - FFV1_0( w_fp[38], w_fp[95], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 378 OF 1240 *** - - // Wavefunction(s) for diagram number 378 - FFV1_2( w_fp[38], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[98] ); - - // Amplitude(s) for diagram number 378 - FFV1_0( w_fp[98], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 379 OF 1240 *** - - // Wavefunction(s) for diagram number 379 - // (none) - - // Amplitude(s) for diagram number 379 - FFV1_0( w_fp[38], w_fp[77], w_fp[86], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[25] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[44] += amp_sv[0]; - - // 
*** DIAGRAM 380 OF 1240 *** - - // Wavefunction(s) for diagram number 380 - // (none) - - // Amplitude(s) for diagram number 380 - FFV1_0( w_fp[41], w_fp[95], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 381 OF 1240 *** - - // Wavefunction(s) for diagram number 381 - FFV1_2( w_fp[41], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[101] ); - - // Amplitude(s) for diagram number 381 - FFV1_0( w_fp[101], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 382 OF 1240 *** - - // Wavefunction(s) for diagram number 382 - // (none) - - // Amplitude(s) for diagram number 382 - FFV1_0( w_fp[41], w_fp[77], w_fp[62], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[24] += amp_sv[0]; - jamp_sv[30] -= amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[38] += amp_sv[0]; - - // *** DIAGRAM 383 OF 1240 *** - - // Wavefunction(s) for diagram number 383 - // (none) - - // Amplitude(s) for diagram number 383 - FFV1_0( w_fp[3], w_fp[95], w_fp[29], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[24] += amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[30] -= amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - - // *** DIAGRAM 384 OF 1240 *** - - // Wavefunction(s) for diagram number 384 - // (none) - - // Amplitude(s) for diagram number 384 - FFV1_0( w_fp[99], w_fp[77], w_fp[29], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[40] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - - // *** DIAGRAM 385 OF 1240 *** - - // Wavefunction(s) for diagram number 385 - VVV1P0_1( w_fp[92], w_fp[29], COUPs[0], 1.0, 0., 0., w_fp[95] ); - - // Amplitude(s) for diagram number 385 - FFV1_0( w_fp[3], w_fp[77], w_fp[95], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 386 OF 1240 *** - - // Wavefunction(s) for diagram number 386 - FFV1_1( w_fp[2], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[102] ); - - // Amplitude(s) for diagram number 386 - FFV1_0( w_fp[22], w_fp[102], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - 
jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 387 OF 1240 *** - - // Wavefunction(s) for diagram number 387 - // (none) - - // Amplitude(s) for diagram number 387 - FFV1_0( w_fp[21], w_fp[102], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 388 OF 1240 *** - - // Wavefunction(s) for diagram number 388 - FFV1P0_3( w_fp[52], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[103] ); - - // Amplitude(s) for diagram number 388 - VVV1_0( w_fp[62], w_fp[103], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 389 OF 1240 *** - - // Wavefunction(s) for diagram number 389 - // (none) - - // Amplitude(s) for diagram number 389 - FFV1_0( w_fp[21], w_fp[2], w_fp[62], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] += amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - - // *** DIAGRAM 390 OF 1240 *** - - // Wavefunction(s) for diagram number 390 - // (none) - - // Amplitude(s) for diagram number 390 - VVV1_0( w_fp[86], w_fp[103], w_fp[5], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 391 OF 1240 *** - - // Wavefunction(s) for diagram number 391 - // (none) - - // Amplitude(s) for diagram number 391 - FFV1_0( w_fp[22], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[11] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - - // *** DIAGRAM 392 OF 1240 *** - - // Wavefunction(s) for diagram number 392 - // (none) - - // Amplitude(s) for diagram number 392 - FFV1_0( w_fp[52], w_fp[2], w_fp[88], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; - 
jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[90], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[96], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 393 OF 1240 *** - - // Wavefunction(s) for diagram number 393 - FFV1_2( w_fp[52], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[104] ); - - // Amplitude(s) for diagram number 393 - FFV1_0( w_fp[104], w_fp[39], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 394 OF 1240 *** - - // Wavefunction(s) for diagram number 394 - FFV1_1( w_fp[39], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[105] ); - - // Amplitude(s) for diagram number 394 - FFV1_0( w_fp[52], w_fp[105], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 395 OF 1240 *** - - // Wavefunction(s) for diagram number 395 - // (none) - - // Amplitude(s) for diagram number 395 - FFV1_0( w_fp[52], w_fp[39], w_fp[86], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[75] += amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - - // *** DIAGRAM 396 OF 1240 *** - - // Wavefunction(s) for diagram number 396 - // (none) - - // Amplitude(s) for diagram number 396 - FFV1_0( w_fp[104], w_fp[47], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 397 OF 1240 *** - - // Wavefunction(s) for diagram number 397 - FFV1_1( w_fp[47], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[106] ); - - // Amplitude(s) for diagram number 397 - FFV1_0( w_fp[52], w_fp[106], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here 
the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 398 OF 1240 *** - - // Wavefunction(s) for diagram number 398 - // (none) - - // Amplitude(s) for diagram number 398 - FFV1_0( w_fp[52], w_fp[47], w_fp[62], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[99] += amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - jamp_sv[115] -= amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - - // *** DIAGRAM 399 OF 1240 *** - - // Wavefunction(s) for diagram number 399 - // (none) - - // Amplitude(s) for diagram number 399 - FFV1_0( w_fp[104], w_fp[2], w_fp[29], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[91] += amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[115] -= amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - - // *** DIAGRAM 400 OF 1240 *** - - // Wavefunction(s) for diagram number 400 - // (none) - - // Amplitude(s) for diagram number 400 - FFV1_0( w_fp[52], w_fp[102], w_fp[29], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - - // *** DIAGRAM 401 OF 1240 *** - - // Wavefunction(s) for diagram number 401 - // (none) - - // Amplitude(s) for diagram number 401 - FFV1_0( w_fp[52], w_fp[2], w_fp[95], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 402 OF 1240 *** - - // Wavefunction(s) for diagram number 402 - // (none) - - // Amplitude(s) for diagram number 402 - FFV1_0( w_fp[71], w_fp[102], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[10] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[52] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - - // *** DIAGRAM 403 OF 1240 *** - - // Wavefunction(s) for diagram number 403 - // (none) - - // Amplitude(s) for diagram number 403 - FFV1_0( w_fp[3], w_fp[102], w_fp[70], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 404 OF 1240 *** - - // 
Wavefunction(s) for diagram number 404 - // (none) - - // Amplitude(s) for diagram number 404 - FFV1_0( w_fp[99], w_fp[94], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[40] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[82] -= amp_sv[0]; - jamp_sv[83] += amp_sv[0]; - - // *** DIAGRAM 405 OF 1240 *** - - // Wavefunction(s) for diagram number 405 - // (none) - - // Amplitude(s) for diagram number 405 - FFV1_0( w_fp[99], w_fp[2], w_fp[70], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 406 OF 1240 *** - - // Wavefunction(s) for diagram number 406 - // (none) - - // Amplitude(s) for diagram number 406 - FFV1_0( w_fp[3], w_fp[94], w_fp[86], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 407 OF 1240 *** - - // Wavefunction(s) for diagram number 407 - // (none) - - // Amplitude(s) for diagram number 407 - FFV1_0( w_fp[71], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 408 OF 1240 *** - - // Wavefunction(s) for diagram number 408 - // (none) - - // Amplitude(s) for diagram number 408 - VVVV1_0( w_fp[92], w_fp[66], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[6] += amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[38] += amp_sv[0]; - jamp_sv[48] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[78] += amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - jamp_sv[99] += amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - jamp_sv[108] += amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - jamp_sv[116] -= amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - VVVV3_0( w_fp[92], w_fp[66], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with 
multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[6] += amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[48] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - jamp_sv[116] -= amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - VVVV4_0( w_fp[92], w_fp[66], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[10] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - - // *** DIAGRAM 409 OF 1240 *** - - // Wavefunction(s) for diagram number 409 - VVV1P0_1( w_fp[92], w_fp[66], COUPs[0], 1.0, 0., 0., w_fp[104] ); - - // Amplitude(s) for diagram number 409 - VVV1_0( w_fp[8], w_fp[6], w_fp[104], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[6] += amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[38] += amp_sv[0]; - jamp_sv[48] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[78] += amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - jamp_sv[99] += amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - jamp_sv[108] += amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - jamp_sv[116] -= amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - - // *** DIAGRAM 410 OF 1240 *** - - // Wavefunction(s) for diagram number 410 - VVV1P0_1( w_fp[92], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[107] ); - - // Amplitude(s) for diagram number 410 - VVV1_0( w_fp[66], w_fp[6], w_fp[107], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[6] += amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[48] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - jamp_sv[116] -= amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - - // *** DIAGRAM 411 OF 1240 *** - - // Wavefunction(s) for diagram number 411 - // (none) - - // Amplitude(s) for diagram number 411 - VVV1_0( w_fp[66], w_fp[8], w_fp[86], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[10] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - 
jamp_sv[80] += amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - - // *** DIAGRAM 412 OF 1240 *** - - // Wavefunction(s) for diagram number 412 - // (none) - - // Amplitude(s) for diagram number 412 - FFV1_0( w_fp[3], w_fp[47], w_fp[104], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 413 OF 1240 *** - - // Wavefunction(s) for diagram number 413 - // (none) - - // Amplitude(s) for diagram number 413 - FFV1_0( w_fp[3], w_fp[106], w_fp[66], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[98] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - - // *** DIAGRAM 414 OF 1240 *** - - // Wavefunction(s) for diagram number 414 - // (none) - - // Amplitude(s) for diagram number 414 - FFV1_0( w_fp[99], w_fp[47], w_fp[66], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[106] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - jamp_sv[116] -= amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - - // *** DIAGRAM 415 OF 1240 *** - - // Wavefunction(s) for diagram number 415 - // (none) - - // Amplitude(s) for diagram number 415 - FFV1_0( w_fp[41], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 416 OF 1240 *** - - // Wavefunction(s) for diagram number 416 - // (none) - - // Amplitude(s) for diagram number 416 - FFV1_0( w_fp[41], w_fp[102], w_fp[66], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[6] += amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[48] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - - // *** DIAGRAM 417 OF 1240 *** - - // Wavefunction(s) for diagram number 417 - // (none) - - // Amplitude(s) for diagram number 417 - FFV1_0( w_fp[101], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[36] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - - // *** DIAGRAM 418 OF 1240 *** - - 
// Wavefunction(s) for diagram number 418 - // (none) - - // Amplitude(s) for diagram number 418 - FFV1_0( w_fp[76], w_fp[102], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[8] += amp_sv[0]; - jamp_sv[9] -= amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[51] += amp_sv[0]; - - // *** DIAGRAM 419 OF 1240 *** - - // Wavefunction(s) for diagram number 419 - // (none) - - // Amplitude(s) for diagram number 419 - FFV1_0( w_fp[3], w_fp[102], w_fp[75], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 420 OF 1240 *** - - // Wavefunction(s) for diagram number 420 - // (none) - - // Amplitude(s) for diagram number 420 - FFV1_0( w_fp[99], w_fp[97], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[46] += amp_sv[0]; - jamp_sv[47] -= amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - jamp_sv[107] += amp_sv[0]; - - // *** DIAGRAM 421 OF 1240 *** - - // Wavefunction(s) for diagram number 421 - // (none) - - // Amplitude(s) for diagram number 421 - FFV1_0( w_fp[99], w_fp[2], w_fp[75], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 422 OF 1240 *** - - // Wavefunction(s) for diagram number 422 - // (none) - - // Amplitude(s) for diagram number 422 - FFV1_0( w_fp[3], w_fp[97], w_fp[62], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 423 OF 1240 *** - - // Wavefunction(s) for diagram number 423 - // (none) - - // Amplitude(s) for diagram number 423 - FFV1_0( w_fp[76], w_fp[2], w_fp[62], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] 
-= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 424 OF 1240 *** - - // Wavefunction(s) for diagram number 424 - // (none) - - // Amplitude(s) for diagram number 424 - VVVV1_0( w_fp[92], w_fp[72], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[7] += amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[44] += amp_sv[0]; - jamp_sv[49] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[75] += amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[84] += amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[92] -= amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[102] += amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - VVVV3_0( w_fp[92], w_fp[72], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[7] += amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[49] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[92] -= amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - VVVV4_0( w_fp[92], w_fp[72], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[8] -= amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[42] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - - // *** DIAGRAM 425 OF 1240 *** - - // Wavefunction(s) for diagram number 425 - VVV1P0_1( w_fp[92], w_fp[72], COUPs[0], 1.0, 0., 0., w_fp[104] ); - - // Amplitude(s) for diagram number 425 - VVV1_0( w_fp[8], w_fp[5], w_fp[104], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[7] += amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[44] += amp_sv[0]; - jamp_sv[49] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[75] += amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[84] += amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[92] -= amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[102] += amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - - // *** DIAGRAM 426 OF 1240 *** - - // Wavefunction(s) for diagram number 426 - // (none) - - // Amplitude(s) for diagram number 426 - VVV1_0( w_fp[72], w_fp[5], w_fp[107], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates 
numerators_sv and denominators_sv (#473) -#endif - jamp_sv[7] += amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[49] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[92] -= amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - - // *** DIAGRAM 427 OF 1240 *** - - // Wavefunction(s) for diagram number 427 - // (none) - - // Amplitude(s) for diagram number 427 - VVV1_0( w_fp[72], w_fp[8], w_fp[62], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[8] -= amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[42] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - - // *** DIAGRAM 428 OF 1240 *** - - // Wavefunction(s) for diagram number 428 - // (none) - - // Amplitude(s) for diagram number 428 - FFV1_0( w_fp[3], w_fp[39], w_fp[104], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 429 OF 1240 *** - - // Wavefunction(s) for diagram number 429 - // (none) - - // Amplitude(s) for diagram number 429 - FFV1_0( w_fp[3], w_fp[105], w_fp[72], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[74] += amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - - // *** DIAGRAM 430 OF 1240 *** - - // Wavefunction(s) for diagram number 430 - // (none) - - // Amplitude(s) for diagram number 430 - FFV1_0( w_fp[99], w_fp[39], w_fp[72], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[82] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[92] -= amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - - // *** DIAGRAM 431 OF 1240 *** - - // Wavefunction(s) for diagram number 431 - // (none) - - // Amplitude(s) for diagram number 431 - FFV1_0( w_fp[38], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[52] -= 
cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 432 OF 1240 *** - - // Wavefunction(s) for diagram number 432 - // (none) - - // Amplitude(s) for diagram number 432 - FFV1_0( w_fp[38], w_fp[102], w_fp[72], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[7] += amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[49] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - - // *** DIAGRAM 433 OF 1240 *** - - // Wavefunction(s) for diagram number 433 - // (none) - - // Amplitude(s) for diagram number 433 - FFV1_0( w_fp[98], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[42] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - - // *** DIAGRAM 434 OF 1240 *** - - // Wavefunction(s) for diagram number 434 - VVV1P0_1( w_fp[92], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[104] ); - - // Amplitude(s) for diagram number 434 - VVV1_0( w_fp[104], w_fp[10], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[7] -= amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[93] -= amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - - // *** DIAGRAM 435 OF 1240 *** - - // Wavefunction(s) for diagram number 435 - // (none) - - // Amplitude(s) for diagram number 435 - VVV1_0( w_fp[104], w_fp[11], w_fp[5], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[6] -= amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[30] -= amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[115] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - - // *** DIAGRAM 436 OF 1240 *** - - // Wavefunction(s) for diagram number 436 - // (none) - - // Amplitude(s) for diagram number 436 - VVVV1_0( w_fp[8], w_fp[5], w_fp[6], w_fp[104], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[7] -= amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[93] -= amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - 
-
- // *** DIAGRAM 436 OF 1240 ***
-
- // Wavefunction(s) for diagram number 436
- // (none)
-
- // Amplitude(s) for diagram number 436
- VVVV1_0( w_fp[8], w_fp[5], w_fp[6], w_fp[104], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[25] += amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[49] += amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- jamp_sv[93] -= amp_sv[0];
- jamp_sv[94] += amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
- VVVV3_0( w_fp[8], w_fp[5], w_fp[6], w_fp[104], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[48] += amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[116] += amp_sv[0];
- jamp_sv[117] -= amp_sv[0];
- jamp_sv[118] += amp_sv[0];
- VVVV4_0( w_fp[8], w_fp[5], w_fp[6], w_fp[104], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[25] -= amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[48] += amp_sv[0];
- jamp_sv[49] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[92] -= amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[94] -= amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[116] += amp_sv[0];
- jamp_sv[117] -= amp_sv[0];
- jamp_sv[118] += amp_sv[0];
-
- // *** DIAGRAM 437 OF 1240 ***
-
- // Wavefunction(s) for diagram number 437
- VVV1P0_1( w_fp[1], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[108] );
-
- // Amplitude(s) for diagram number 437
- VVV1_0( w_fp[62], w_fp[108], w_fp[6], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[9] += amp_sv[0];
- jamp_sv[24] -= amp_sv[0];
- jamp_sv[30] += amp_sv[0];
- jamp_sv[36] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[47] += amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[109] += amp_sv[0];
- jamp_sv[115] += amp_sv[0];
- jamp_sv[118] -= amp_sv[0];
-
- // *** DIAGRAM 438 OF 1240 ***
-
- // Wavefunction(s) for diagram number 438
- // (none)
-
- // Amplitude(s) for diagram number 438
- VVV1_0( w_fp[62], w_fp[1], w_fp[11], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[8] += amp_sv[0];
- jamp_sv[24] -= amp_sv[0];
- jamp_sv[30] += amp_sv[0];
- jamp_sv[36] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[50] -= amp_sv[0];
- jamp_sv[74] -= amp_sv[0];
- jamp_sv[84] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[102] += amp_sv[0];
- jamp_sv[104] -= amp_sv[0];
- jamp_sv[106] -= amp_sv[0];
- jamp_sv[107] += amp_sv[0];
- jamp_sv[109] += amp_sv[0];
- jamp_sv[115] += amp_sv[0];
- jamp_sv[118] -= amp_sv[0];
-
- // *** DIAGRAM 439 OF 1240 ***
-
- // Wavefunction(s) for diagram number 439
- // (none)
-
- // Amplitude(s) for diagram number 439
- VVVV1_0( w_fp[1], w_fp[8], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[9] += amp_sv[0];
- jamp_sv[24] -= amp_sv[0];
- jamp_sv[30] += amp_sv[0];
- jamp_sv[36] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[47] += amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[109] += amp_sv[0];
- jamp_sv[115] += amp_sv[0];
- jamp_sv[118] -= amp_sv[0];
- VVVV3_0( w_fp[1], w_fp[8], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[9] += amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[47] += amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- VVVV4_0( w_fp[1], w_fp[8], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[36] -= amp_sv[0];
- jamp_sv[38] += amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[99] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- jamp_sv[109] -= amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[118] += amp_sv[0];
-
- // *** DIAGRAM 440 OF 1240 ***
-
- // Wavefunction(s) for diagram number 440
- // (none)
-
- // Amplitude(s) for diagram number 440
- VVV1_0( w_fp[86], w_fp[108], w_fp[5], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[11] += amp_sv[0];
- jamp_sv[25] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[36] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[94] -= amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[109] += amp_sv[0];
-
- // *** DIAGRAM 441 OF 1240 ***
-
- // Wavefunction(s) for diagram number 441
- // (none)
-
- // Amplitude(s) for diagram number 441
- VVV1_0( w_fp[86], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[10] += amp_sv[0];
- jamp_sv[25] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[52] -= amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[78] += amp_sv[0];
- jamp_sv[80] -= amp_sv[0];
- jamp_sv[82] -= amp_sv[0];
- jamp_sv[83] += amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[94] -= amp_sv[0];
- jamp_sv[98] -= amp_sv[0];
- jamp_sv[108] += amp_sv[0];
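Diagrams 436 and 439 above (and 442 and 446 below) evaluate each four-gluon vertex as three separate calls, VVVV1_0, VVVV3_0 and VVVV4_0: the gggg vertex carries three independent Lorentz structures, and each structure feeds the color-flow accumulators jamp_sv with its own pattern of +1/-1 coefficients, while quark-gluon vertices also contribute +/-i factors (the cxtype( 0, 1 ) terms). A self-contained illustration of this folding in plain C++, with std::complex standing in for the plugin's cxtype_sv and dummy amplitude values:

#include <complex>
#include <vector>

int main()
{
  using cx = std::complex<double>;
  std::vector<cx> jamp( 120 ); // one accumulator per color flow (120 here, matching jamp_sv[0..119])
  const cx I( 0., 1. );
  cx amp( 0.3, -0.1 );         // first Lorentz structure of a vertex (dummy value)
  jamp[7] -= amp;              // color coefficient -1, as in the VVVV1_0 update above
  jamp[25] += amp;             // color coefficient +1
  amp = cx( -0.2, 0.4 );       // next structure of the same vertex (dummy value)
  jamp[6] -= amp;              // a different color-flow pattern, as in VVVV3_0
  jamp[74] += I * amp;         // a +i coefficient, as in the FFV1_0 updates elsewhere
  return 0;
}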
-
- // *** DIAGRAM 442 OF 1240 ***
-
- // Wavefunction(s) for diagram number 442
- // (none)
-
- // Amplitude(s) for diagram number 442
- VVVV1_0( w_fp[1], w_fp[8], w_fp[5], w_fp[86], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[11] += amp_sv[0];
- jamp_sv[25] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[36] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[94] -= amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[109] += amp_sv[0];
- VVVV3_0( w_fp[1], w_fp[8], w_fp[5], w_fp[86], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[11] += amp_sv[0];
- jamp_sv[36] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
- jamp_sv[109] += amp_sv[0];
- VVVV4_0( w_fp[1], w_fp[8], w_fp[5], w_fp[86], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[25] += amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[44] += amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[75] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[94] += amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
-
- // *** DIAGRAM 443 OF 1240 ***
-
- // Wavefunction(s) for diagram number 443
- VVVV1P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[109] );
- VVVV3P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[110] );
- VVVV4P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[111] );
-
- // Amplitude(s) for diagram number 443
- VVV1_0( w_fp[8], w_fp[6], w_fp[109], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[6] += amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[36] -= amp_sv[0];
- jamp_sv[38] += amp_sv[0];
- jamp_sv[48] -= amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[78] += amp_sv[0];
- jamp_sv[80] -= amp_sv[0];
- jamp_sv[98] -= amp_sv[0];
- jamp_sv[99] += amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- jamp_sv[108] += amp_sv[0];
- jamp_sv[109] -= amp_sv[0];
- jamp_sv[116] -= amp_sv[0];
- jamp_sv[117] += amp_sv[0];
- VVV1_0( w_fp[8], w_fp[6], w_fp[110], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[36] -= amp_sv[0];
- jamp_sv[38] += amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[99] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- jamp_sv[109] -= amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[118] += amp_sv[0];
- VVV1_0( w_fp[8], w_fp[6], w_fp[111], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[48] += amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[116] += amp_sv[0];
- jamp_sv[117] -= amp_sv[0];
- jamp_sv[118] += amp_sv[0];
-
- // *** DIAGRAM 444 OF 1240 ***
-
- // Wavefunction(s) for diagram number 444
- VVVV1P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[112] );
- VVVV3P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[113] );
- VVVV4P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[114] );
-
- // Amplitude(s) for diagram number 444
- VVV1_0( w_fp[8], w_fp[5], w_fp[112], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[7] += amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[44] += amp_sv[0];
- jamp_sv[49] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[74] -= amp_sv[0];
- jamp_sv[75] += amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[84] += amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[92] -= amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[102] += amp_sv[0];
- jamp_sv[104] -= amp_sv[0];
- VVV1_0( w_fp[8], w_fp[5], w_fp[113], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[25] += amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[44] += amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[75] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[94] += amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
- VVV1_0( w_fp[8], w_fp[5], w_fp[114], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[25] += amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[49] += amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- jamp_sv[93] -= amp_sv[0];
- jamp_sv[94] += amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
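Diagrams 443 and 444 above factorize the four-gluon vertex the other way round: the VVVV1P0_1/VVVV3P0_1/VVVV4P0_1 calls compute three off-shell internal gluons once (into w_fp[109..111] and w_fp[112..114]), which later diagrams in this hunk (456, 466, 476, 483) reuse inside plain three-point amplitude calls instead of re-evaluating the vertex. The calling convention, as a sketch inferred from the calls in this file rather than a verbatim copy of the generated HelAmps header: routines ending in _0 return an amplitude through their last argument, routines ending in P0_1/_1/_2 return an off-shell wavefunction, and the two scalar literals before the output are the propagator mass and width (0., 0. for gluons; cIPD[0], cIPD[1] for the massive top quark):

void VVV1_0( const fptype V1[], const fptype V2[], const fptype V3[],
             const fptype COUP[], double Ccoeff, fptype amp[] ); // fills amp_sv[0]
void VVVV1P0_1( const fptype V2[], const fptype V3[], const fptype V4[],
                const fptype COUP[], double Ccoeff,
                fptype M1, fptype W1, fptype V1[] );             // fills an off-shell gluon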
-
- // *** DIAGRAM 445 OF 1240 ***
-
- // Wavefunction(s) for diagram number 445
- // (none)
-
- // Amplitude(s) for diagram number 445
- VVV1_0( w_fp[1], w_fp[8], w_fp[88], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[9] -= amp_sv[0];
- jamp_sv[11] += amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[25] -= amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- jamp_sv[46] += amp_sv[0];
- jamp_sv[47] -= amp_sv[0];
- jamp_sv[51] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[94] -= amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[118] += amp_sv[0];
- VVV1_0( w_fp[1], w_fp[8], w_fp[90], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[11] += amp_sv[0];
- jamp_sv[25] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[36] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[94] -= amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[109] += amp_sv[0];
- VVV1_0( w_fp[1], w_fp[8], w_fp[96], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[9] += amp_sv[0];
- jamp_sv[24] -= amp_sv[0];
- jamp_sv[30] += amp_sv[0];
- jamp_sv[36] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[47] += amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[109] += amp_sv[0];
- jamp_sv[115] += amp_sv[0];
- jamp_sv[118] -= amp_sv[0];
-
- // *** DIAGRAM 446 OF 1240 ***
-
- // Wavefunction(s) for diagram number 446
- // (none)
-
- // Amplitude(s) for diagram number 446
- VVVV1_0( w_fp[92], w_fp[1], w_fp[8], w_fp[29], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[6] += amp_sv[0];
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[24] -= amp_sv[0];
- jamp_sv[25] += amp_sv[0];
- jamp_sv[30] += amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[48] -= amp_sv[0];
- jamp_sv[49] += amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- jamp_sv[93] -= amp_sv[0];
- jamp_sv[94] += amp_sv[0];
- jamp_sv[115] += amp_sv[0];
- jamp_sv[116] -= amp_sv[0];
- jamp_sv[117] += amp_sv[0];
- jamp_sv[118] -= amp_sv[0];
- VVVV3_0( w_fp[92], w_fp[1], w_fp[8], w_fp[29], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[6] += amp_sv[0];
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[9] -= amp_sv[0];
- jamp_sv[11] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- jamp_sv[46] += amp_sv[0];
- jamp_sv[47] -= amp_sv[0];
- jamp_sv[48] -= amp_sv[0];
- jamp_sv[49] += amp_sv[0];
- jamp_sv[51] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- jamp_sv[93] -= amp_sv[0];
- jamp_sv[116] -= amp_sv[0];
- jamp_sv[117] += amp_sv[0];
- VVVV4_0( w_fp[92], w_fp[1], w_fp[8], w_fp[29], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[9] -= amp_sv[0];
- jamp_sv[11] += amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[25] -= amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- jamp_sv[46] += amp_sv[0];
- jamp_sv[47] -= amp_sv[0];
- jamp_sv[51] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[94] -= amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[118] += amp_sv[0];
-
- // *** DIAGRAM 447 OF 1240 ***
-
- // Wavefunction(s) for diagram number 447
- // (none)
-
- // Amplitude(s) for diagram number 447
- VVV1_0( w_fp[8], w_fp[29], w_fp[104], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[6] += amp_sv[0];
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[24] -= amp_sv[0];
- jamp_sv[25] += amp_sv[0];
- jamp_sv[30] += amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[48] -= amp_sv[0];
- jamp_sv[49] += amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- jamp_sv[93] -= amp_sv[0];
- jamp_sv[94] += amp_sv[0];
- jamp_sv[115] += amp_sv[0];
- jamp_sv[116] -= amp_sv[0];
- jamp_sv[117] += amp_sv[0];
- jamp_sv[118] -= amp_sv[0];
-
- // *** DIAGRAM 448 OF 1240 ***
-
- // Wavefunction(s) for diagram number 448
- // (none)
-
- // Amplitude(s) for diagram number 448
- VVV1_0( w_fp[1], w_fp[29], w_fp[107], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[6] += amp_sv[0];
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[9] -= amp_sv[0];
- jamp_sv[11] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- jamp_sv[46] += amp_sv[0];
- jamp_sv[47] -= amp_sv[0];
- jamp_sv[48] -= amp_sv[0];
- jamp_sv[49] += amp_sv[0];
- jamp_sv[51] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[92] += amp_sv[0];
- jamp_sv[93] -= amp_sv[0];
- jamp_sv[116] -= amp_sv[0];
- jamp_sv[117] += amp_sv[0];
-
- // *** DIAGRAM 449 OF 1240 ***
-
- // Wavefunction(s) for diagram number 449
- // (none)
-
- // Amplitude(s) for diagram number 449
- VVV1_0( w_fp[1], w_fp[8], w_fp[95], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[9] -= amp_sv[0];
- jamp_sv[11] += amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[25] -= amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- jamp_sv[46] += amp_sv[0];
- jamp_sv[47] -= amp_sv[0];
- jamp_sv[51] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[94] -= amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[118] += amp_sv[0];
-
- // *** DIAGRAM 450 OF 1240 ***
-
- // Wavefunction(s) for diagram number 450
- // (none)
-
- // Amplitude(s) for diagram number 450
- VVV1_0( w_fp[104], w_fp[45], w_fp[6], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 451 OF 1240 ***
-
- // Wavefunction(s) for diagram number 451
- // (none)
-
- // Amplitude(s) for diagram number 451
- FFV1_0( w_fp[3], w_fp[44], w_fp[104], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[91] += amp_sv[0];
- jamp_sv[92] -= amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[94] -= amp_sv[0];
-
- // *** DIAGRAM 452 OF 1240 ***
-
- // Wavefunction(s) for diagram number 452
- // (none)
-
- // Amplitude(s) for diagram number 452
- FFV1_0( w_fp[99], w_fp[89], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 453 OF 1240 ***
-
- // Wavefunction(s) for diagram number 453
- // (none)
-
- // Amplitude(s) for diagram number 453
- FFV1_0( w_fp[99], w_fp[44], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 454 OF 1240 ***
-
- // Wavefunction(s) for diagram number 454
- // (none)
-
- // Amplitude(s) for diagram number 454
- FFV1_0( w_fp[3], w_fp[89], w_fp[86], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[78] += amp_sv[0];
- jamp_sv[80] -= amp_sv[0];
- jamp_sv[82] -= amp_sv[0];
- jamp_sv[83] += amp_sv[0];
-
- // *** DIAGRAM 455 OF 1240 ***
-
- // Wavefunction(s) for diagram number 455
- // (none)
-
- // Amplitude(s) for diagram number 455
- VVV1_0( w_fp[86], w_fp[1], w_fp[45], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 456 OF 1240 ***
-
- // Wavefunction(s) for diagram number 456
- // (none)
-
- // Amplitude(s) for diagram number 456
- FFV1_0( w_fp[3], w_fp[39], w_fp[112], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[39], w_fp[113], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[39], w_fp[114], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 457 OF 1240 ***
-
- // Wavefunction(s) for diagram number 457
- // (none)
-
- // Amplitude(s) for diagram number 457
- FFV1_0( w_fp[41], w_fp[39], w_fp[104], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[74] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
-
- // *** DIAGRAM 458 OF 1240 ***
-
- // Wavefunction(s) for diagram number 458
- // (none)
-
- // Amplitude(s) for diagram number 458
- FFV1_0( w_fp[41], w_fp[105], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 459 OF 1240 ***
-
- // Wavefunction(s) for diagram number 459
- // (none)
-
- // Amplitude(s) for diagram number 459
- FFV1_0( w_fp[101], w_fp[39], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 460 OF 1240 ***
-
- // Wavefunction(s) for diagram number 460
- // (none)
-
- // Amplitude(s) for diagram number 460
- VVV1_0( w_fp[104], w_fp[51], w_fp[5], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 461 OF 1240 ***
-
- // Wavefunction(s) for diagram number 461
- // (none)
-
- // Amplitude(s) for diagram number 461
- FFV1_0( w_fp[3], w_fp[50], w_fp[104], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[115] += amp_sv[0];
- jamp_sv[116] -= amp_sv[0];
- jamp_sv[117] += amp_sv[0];
- jamp_sv[118] -= amp_sv[0];
-
- // *** DIAGRAM 462 OF 1240 ***
-
- // Wavefunction(s) for diagram number 462
- // (none)
-
- // Amplitude(s) for diagram number 462
- FFV1_0( w_fp[99], w_fp[91], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 463 OF 1240 ***
-
- // Wavefunction(s) for diagram number 463
- // (none)
-
- // Amplitude(s) for diagram number 463
- FFV1_0( w_fp[99], w_fp[50], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 464 OF 1240 ***
-
- // Wavefunction(s) for diagram number 464
- // (none)
-
- // Amplitude(s) for diagram number 464
- FFV1_0( w_fp[3], w_fp[91], w_fp[62], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[102] += amp_sv[0];
- jamp_sv[104] -= amp_sv[0];
- jamp_sv[106] -= amp_sv[0];
- jamp_sv[107] += amp_sv[0];
-
- // *** DIAGRAM 465 OF 1240 ***
-
- // Wavefunction(s) for diagram number 465
- // (none)
-
- // Amplitude(s) for diagram number 465
- VVV1_0( w_fp[62], w_fp[1], w_fp[51], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 466 OF 1240 ***
-
- // Wavefunction(s) for diagram number 466
- // (none)
-
- // Amplitude(s) for diagram number 466
- FFV1_0( w_fp[3], w_fp[47], w_fp[109], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[47], w_fp[110], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[47], w_fp[111], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 467 OF 1240 ***
-
- // Wavefunction(s) for diagram number 467
- // (none)
-
- // Amplitude(s) for diagram number 467
- FFV1_0( w_fp[38], w_fp[47], w_fp[104], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[98] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
-
- // *** DIAGRAM 468 OF 1240 ***
-
- // Wavefunction(s) for diagram number 468
- // (none)
-
- // Amplitude(s) for diagram number 468
- FFV1_0( w_fp[38], w_fp[106], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 469 OF 1240 ***
-
- // Wavefunction(s) for diagram number 469
- // (none)
-
- // Amplitude(s) for diagram number 469
- FFV1_0( w_fp[98], w_fp[47], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 470 OF 1240 ***
-
- // Wavefunction(s) for diagram number 470
- // (none)
-
- // Amplitude(s) for diagram number 470
- VVV1_0( w_fp[104], w_fp[23], w_fp[6], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 471 OF 1240 ***
-
- // Wavefunction(s) for diagram number 471
- // (none)
-
- // Amplitude(s) for diagram number 471
- FFV1_0( w_fp[48], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[7] += amp_sv[0];
- jamp_sv[25] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[49] -= amp_sv[0];
-
- // *** DIAGRAM 472 OF 1240 ***
-
- // Wavefunction(s) for diagram number 472
- // (none)
-
- // Amplitude(s) for diagram number 472
- FFV1_0( w_fp[58], w_fp[102], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
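All of the amp_sv/jamp_sv arithmetic in this function is written once against "scalar-or-vector" (_sv) types: on the GPU each thread processes one event and cxtype_sv is a plain complex scalar, while in the vectorized C++ build it is a short SIMD vector of events. A rough sketch of that mechanism, simplified from mgOnGpuVectors.h (the macro and typedef names below are assumptions in this simplified form):

#ifdef MGONGPU_CPPSIMD
typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // gcc/clang vector extension
typedef cxtype_v cxtype_sv; // C++ SIMD: one vector "page" of events per call
#else
typedef cxtype cxtype_sv;   // CUDA (or scalar C++): one event per call
#endif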
-
- // *** DIAGRAM 473 OF 1240 ***
-
- // Wavefunction(s) for diagram number 473
- // (none)
-
- // Amplitude(s) for diagram number 473
- FFV1_0( w_fp[48], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 474 OF 1240 ***
-
- // Wavefunction(s) for diagram number 474
- // (none)
-
- // Amplitude(s) for diagram number 474
- FFV1_0( w_fp[58], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[10] += amp_sv[0];
- jamp_sv[52] -= amp_sv[0];
- jamp_sv[98] -= amp_sv[0];
- jamp_sv[108] += amp_sv[0];
-
- // *** DIAGRAM 475 OF 1240 ***
-
- // Wavefunction(s) for diagram number 475
- // (none)
-
- // Amplitude(s) for diagram number 475
- VVV1_0( w_fp[86], w_fp[1], w_fp[23], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 476 OF 1240 ***
-
- // Wavefunction(s) for diagram number 476
- // (none)
-
- // Amplitude(s) for diagram number 476
- FFV1_0( w_fp[38], w_fp[2], w_fp[112], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[38], w_fp[2], w_fp[113], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[38], w_fp[2], w_fp[114], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 477 OF 1240 ***
-
- // Wavefunction(s) for diagram number 477
- // (none)
-
- // Amplitude(s) for diagram number 477
- VVV1_0( w_fp[104], w_fp[20], w_fp[5], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 478 OF 1240 ***
-
- // Wavefunction(s) for diagram number 478
- // (none)
-
- // Amplitude(s) for diagram number 478
- FFV1_0( w_fp[40], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[6] += amp_sv[0];
- jamp_sv[24] -= amp_sv[0];
- jamp_sv[30] += amp_sv[0];
- jamp_sv[48] -= amp_sv[0];
-
- // *** DIAGRAM 479 OF 1240 ***
-
- // Wavefunction(s) for diagram number 479
- // (none)
-
- // Amplitude(s) for diagram number 479
- FFV1_0( w_fp[60], w_fp[102], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 480 OF 1240 ***
-
- // Wavefunction(s) for diagram number 480
- // (none)
-
- // Amplitude(s) for diagram number 480
- FFV1_0( w_fp[40], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 481 OF 1240 ***
-
- // Wavefunction(s) for diagram number 481
- // (none)
-
- // Amplitude(s) for diagram number 481
- FFV1_0( w_fp[60], w_fp[2], w_fp[62], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[8] += amp_sv[0];
- jamp_sv[50] -= amp_sv[0];
- jamp_sv[74] -= amp_sv[0];
- jamp_sv[84] += amp_sv[0];
-
- // *** DIAGRAM 482 OF 1240 ***
-
- // Wavefunction(s) for diagram number 482
- // (none)
-
- // Amplitude(s) for diagram number 482
- VVV1_0( w_fp[62], w_fp[1], w_fp[20], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 483 OF 1240 ***
-
- // Wavefunction(s) for diagram number 483
- // (none)
-
- // Amplitude(s) for diagram number 483
- FFV1_0( w_fp[41], w_fp[2], w_fp[109], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[41], w_fp[2], w_fp[110], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[41], w_fp[2], w_fp[111], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 484 OF 1240 ***
-
- // Wavefunction(s) for diagram number 484
- // (none)
-
- // Amplitude(s) for diagram number 484
- FFV1_0( w_fp[3], w_fp[18], w_fp[104], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 485 OF 1240 ***
-
- // Wavefunction(s) for diagram number 485
- // (none)
-
- // Amplitude(s) for diagram number 485
- FFV1_0( w_fp[12], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 486 OF 1240 ***
-
- // Wavefunction(s) for diagram number 486
- // (none)
-
- // Amplitude(s) for diagram number 486
- FFV1_0( w_fp[3], w_fp[102], w_fp[67], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 487 OF 1240 ***
-
- // Wavefunction(s) for diagram number 487
- // (none)
-
- // Amplitude(s) for diagram number 487
- FFV1_0( w_fp[12], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[6] += amp_sv[0];
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[48] -= amp_sv[0];
- jamp_sv[49] += amp_sv[0];
-
- // *** DIAGRAM 488 OF 1240 ***
-
- // Wavefunction(s) for diagram number 488
- // (none)
-
- // Amplitude(s) for diagram number 488
- FFV1_0( w_fp[99], w_fp[2], w_fp[67], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 489 OF 1240 ***
-
- // Wavefunction(s) for diagram number 489
- // (none)
-
- // Amplitude(s) for diagram number 489
- FFV1_0( w_fp[99], w_fp[18], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[92] += amp_sv[0];
- jamp_sv[93] -= amp_sv[0];
- jamp_sv[116] -= amp_sv[0];
- jamp_sv[117] += amp_sv[0];
-
- // *** DIAGRAM 490 OF 1240 ***
-
- // Wavefunction(s) for diagram number 490
- // (none)
-
- // Amplitude(s) for diagram number 490
- FFV1_0( w_fp[3], w_fp[102], w_fp[55], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[102], w_fp[83], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[102], w_fp[84], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
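For orientation on where these accumulators end up: after the last of the 1240 diagrams, the generated code contracts the color-flow amplitudes with the process color matrix to obtain |M|^2. A schematic of that final step, assuming ncolor = 120 (implied by the jamp indices up to 118 in this hunk) and the cf/denom color-matrix tables and cx helpers of the generated code:

fptype_sv deltaMEs = { 0 }; // running |M|^2 contribution for this helicity
for( int icol = 0; icol < ncolor; icol++ )
{
  cxtype_sv ztemp_sv = cxzero_sv();
  for( int jcol = 0; jcol < ncolor; jcol++ )
    ztemp_sv += cf[icol][jcol] * jamp_sv[jcol]; // color-matrix row times color flows
  deltaMEs += cxreal( ztemp_sv * cxconj( jamp_sv[icol] ) ) / denom[icol];
}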
-
- // *** DIAGRAM 491 OF 1240 ***
-
- // Wavefunction(s) for diagram number 491
- // (none)
-
- // Amplitude(s) for diagram number 491
- FFV1_0( w_fp[99], w_fp[2], w_fp[55], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[99], w_fp[2], w_fp[83], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[99], w_fp[2], w_fp[84], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 492 OF 1240 ***
-
- // Wavefunction(s) for diagram number 492
- // (none)
-
- // Amplitude(s) for diagram number 492
- VVV1_0( w_fp[92], w_fp[55], w_fp[8], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
- jamp_sv[9] += amp_sv[0];
- jamp_sv[11] -= amp_sv[0];
- jamp_sv[40] += amp_sv[0];
- jamp_sv[41] -= amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[47] += amp_sv[0];
- jamp_sv[48] += amp_sv[0];
- jamp_sv[49] -= amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[53] += amp_sv[0];
- jamp_sv[92] -= amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[116] += amp_sv[0];
- jamp_sv[117] -= amp_sv[0];
- VVV1_0( w_fp[92], w_fp[83], w_fp[8], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[7] += amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[9] += amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[47] += amp_sv[0];
- jamp_sv[49] -= amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[92] -= amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- VVV1_0( w_fp[92], w_fp[84], w_fp[8], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[6] += amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[11] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- jamp_sv[48] -= amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- jamp_sv[116] -= amp_sv[0];
- jamp_sv[117] += amp_sv[0];
-
- // *** DIAGRAM 493 OF 1240 ***
-
- // Wavefunction(s) for diagram number 493
- VVV1P0_1( w_fp[0], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[92] );
- FFV1_2( w_fp[3], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] );
-
- // Amplitude(s) for diagram number 493
- FFV1_0( w_fp[99], w_fp[87], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 494 OF 1240 ***
-
- // Wavefunction(s) for diagram number 494
- // (none)
-
- // Amplitude(s) for diagram number 494
- FFV1_0( w_fp[99], w_fp[85], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 495 OF 1240 ***
-
- // Wavefunction(s) for diagram number 495
- VVV1P0_1( w_fp[92], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[102] );
-
- // Amplitude(s) for diagram number 495
- VVV1_0( w_fp[102], w_fp[34], w_fp[6], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 496 OF 1240 ***
-
- // Wavefunction(s) for diagram number 496
- // (none)
-
- // Amplitude(s) for diagram number 496
- FFV1_0( w_fp[3], w_fp[85], w_fp[102], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[43] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[45] += amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
-
- // *** DIAGRAM 497 OF 1240 ***
-
- // Wavefunction(s) for diagram number 497
- VVV1P0_1( w_fp[92], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[104] );
-
- // Amplitude(s) for diagram number 497
- VVV1_0( w_fp[104], w_fp[34], w_fp[4], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 498 OF 1240 ***
-
- // Wavefunction(s) for diagram number 498
- // (none)
-
- // Amplitude(s) for diagram number 498
- FFV1_0( w_fp[3], w_fp[87], w_fp[104], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[30] += amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[35] += amp_sv[0];
-
- // *** DIAGRAM 499 OF 1240 ***
-
- // Wavefunction(s) for diagram number 499
- VVVV1P0_1( w_fp[92], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[111] );
- VVVV3P0_1( w_fp[92], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[110] );
- VVVV4P0_1( w_fp[92], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[109] );
-
- // Amplitude(s) for diagram number 499
- FFV1_0( w_fp[3], w_fp[77], w_fp[111], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[77], w_fp[110], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[77], w_fp[109], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 500 OF 1240 ***
-
- // Wavefunction(s) for diagram number 500
- FFV1_1( w_fp[77], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[62] );
-
- // Amplitude(s) for diagram number 500
- FFV1_0( w_fp[46], w_fp[62], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 501 OF 1240 ***
-
- // Wavefunction(s) for diagram number 501
- FFV1_2( w_fp[46], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[114] );
-
- // Amplitude(s) for diagram number 501
- FFV1_0( w_fp[114], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 502 OF 1240 ***
-
- // Wavefunction(s) for diagram number 502
- // (none)
-
- // Amplitude(s) for diagram number 502
- FFV1_0( w_fp[46], w_fp[77], w_fp[104], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[27] += amp_sv[0];
- jamp_sv[37] -= amp_sv[0];
- jamp_sv[43] -= amp_sv[0];
- jamp_sv[46] += amp_sv[0];
-
- // *** DIAGRAM 503 OF 1240 ***
-
- // Wavefunction(s) for diagram number 503
- // (none)
-
- // Amplitude(s) for diagram number 503
- FFV1_0( w_fp[41], w_fp[62], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 504 OF 1240 ***
-
- // Wavefunction(s) for diagram number 504
- FFV1_2( w_fp[41], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[113] );
-
- // Amplitude(s) for diagram number 504
- FFV1_0( w_fp[113], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 505 OF 1240 ***
-
- // Wavefunction(s) for diagram number 505
- // (none)
-
- // Amplitude(s) for diagram number 505
- FFV1_0( w_fp[41], w_fp[77], w_fp[102], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[26] += amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[32] += amp_sv[0];
- jamp_sv[36] -= amp_sv[0];
-
- // *** DIAGRAM 506 OF 1240 ***
-
- // Wavefunction(s) for diagram number 506
- // (none)
-
- // Amplitude(s) for diagram number 506
- FFV1_0( w_fp[3], w_fp[62], w_fp[27], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[26] += amp_sv[0];
- jamp_sv[27] -= amp_sv[0];
- jamp_sv[36] -= amp_sv[0];
- jamp_sv[37] += amp_sv[0];
-
- // *** DIAGRAM 507 OF 1240 ***
-
- // Wavefunction(s) for diagram number 507
- // (none)
-
- // Amplitude(s) for diagram number 507
- FFV1_0( w_fp[99], w_fp[77], w_fp[27], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[34] += amp_sv[0];
- jamp_sv[35] -= amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[45] += amp_sv[0];
-
- // *** DIAGRAM 508 OF 1240 ***
-
- // Wavefunction(s) for diagram number 508
- VVV1P0_1( w_fp[92], w_fp[27], COUPs[0], 1.0, 0., 0., w_fp[62] );
-
- // Amplitude(s) for diagram number 508
- FFV1_0( w_fp[3], w_fp[77], w_fp[62], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 509 OF 1240 ***
-
- // Wavefunction(s) for diagram number 509
- FFV1_1( w_fp[2], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[112] );
-
- // Amplitude(s) for diagram number
509 - FFV1_0( w_fp[56], w_fp[112], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 510 OF 1240 *** - - // Wavefunction(s) for diagram number 510 - // (none) - - // Amplitude(s) for diagram number 510 - FFV1_0( w_fp[21], w_fp[112], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 511 OF 1240 *** - - // Wavefunction(s) for diagram number 511 - // (none) - - // Amplitude(s) for diagram number 511 - VVV1_0( w_fp[102], w_fp[103], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 512 OF 1240 *** - - // Wavefunction(s) for diagram number 512 - // (none) - - // Amplitude(s) for diagram number 512 - FFV1_0( w_fp[21], w_fp[2], w_fp[102], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[15] += amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - - // *** DIAGRAM 513 OF 1240 *** - - // Wavefunction(s) for diagram number 513 - // (none) - - // Amplitude(s) for diagram number 513 - VVV1_0( w_fp[104], w_fp[103], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 514 OF 1240 *** - - // Wavefunction(s) for diagram number 514 - // (none) - - // Amplitude(s) for diagram number 514 - FFV1_0( w_fp[56], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[17] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - - // *** DIAGRAM 515 OF 1240 *** - - // Wavefunction(s) for diagram number 515 - // (none) - - // Amplitude(s) for diagram number 515 - FFV1_0( w_fp[52], w_fp[2], w_fp[111], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0]; - 
jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[110], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[109], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 516 OF 1240 *** - - // Wavefunction(s) for diagram number 516 - FFV1_2( w_fp[52], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[86] ); - - // Amplitude(s) for diagram number 516 - FFV1_0( w_fp[86], w_fp[33], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 517 OF 1240 *** - - // Wavefunction(s) for diagram number 517 - FFV1_1( w_fp[33], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[98] ); - - // Amplitude(s) for diagram number 517 - FFV1_0( w_fp[52], w_fp[98], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 518 OF 1240 *** - - // Wavefunction(s) for diagram number 518 - // (none) - - // Amplitude(s) for diagram number 518 - FFV1_0( w_fp[52], w_fp[33], w_fp[104], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[51] += amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - - // *** DIAGRAM 519 OF 1240 *** - - // Wavefunction(s) for diagram number 519 - // (none) - - // Amplitude(s) for diagram number 519 - FFV1_0( w_fp[86], w_fp[47], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 520 OF 1240 *** - - // Wavefunction(s) for diagram number 520 - FFV1_1( w_fp[47], w_fp[92], COUPs[1], 1.0, cIPD[0], 
cIPD[1], w_fp[106] ); - - // Amplitude(s) for diagram number 520 - FFV1_0( w_fp[52], w_fp[106], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 521 OF 1240 *** - - // Wavefunction(s) for diagram number 521 - // (none) - - // Amplitude(s) for diagram number 521 - FFV1_0( w_fp[52], w_fp[47], w_fp[102], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[101] += amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[115] -= amp_sv[0]; - - // *** DIAGRAM 522 OF 1240 *** - - // Wavefunction(s) for diagram number 522 - // (none) - - // Amplitude(s) for diagram number 522 - FFV1_0( w_fp[86], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[67] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - - // *** DIAGRAM 523 OF 1240 *** - - // Wavefunction(s) for diagram number 523 - // (none) - - // Amplitude(s) for diagram number 523 - FFV1_0( w_fp[52], w_fp[112], w_fp[27], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[15] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - - // *** DIAGRAM 524 OF 1240 *** - - // Wavefunction(s) for diagram number 524 - // (none) - - // Amplitude(s) for diagram number 524 - FFV1_0( w_fp[52], w_fp[2], w_fp[62], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 525 OF 1240 *** - - // Wavefunction(s) for diagram number 525 - // (none) - - // Amplitude(s) for diagram number 525 - FFV1_0( w_fp[65], w_fp[112], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[76] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - - // *** DIAGRAM 526 OF 1240 *** - - // Wavefunction(s) for diagram number 526 - // (none) - - // Amplitude(s) for diagram number 526 - FFV1_0( w_fp[3], w_fp[112], w_fp[64], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[72] += cxtype( 0, 1 ) * 
amp_sv[0]; - jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 527 OF 1240 *** - - // Wavefunction(s) for diagram number 527 - // (none) - - // Amplitude(s) for diagram number 527 - FFV1_0( w_fp[99], w_fp[93], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[34] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[58] -= amp_sv[0]; - jamp_sv[59] += amp_sv[0]; - - // *** DIAGRAM 528 OF 1240 *** - - // Wavefunction(s) for diagram number 528 - // (none) - - // Amplitude(s) for diagram number 528 - FFV1_0( w_fp[99], w_fp[2], w_fp[64], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 529 OF 1240 *** - - // Wavefunction(s) for diagram number 529 - // (none) - - // Amplitude(s) for diagram number 529 - FFV1_0( w_fp[3], w_fp[93], w_fp[104], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 530 OF 1240 *** - - // Wavefunction(s) for diagram number 530 - // (none) - - // Amplitude(s) for diagram number 530 - FFV1_0( w_fp[65], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 531 OF 1240 *** - - // Wavefunction(s) for diagram number 531 - // (none) - - // Amplitude(s) for diagram number 531 - VVVV1_0( w_fp[92], w_fp[61], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[12] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[30] -= amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[54] += amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - jamp_sv[114] += 
amp_sv[0]; - jamp_sv[115] -= amp_sv[0]; - VVVV3_0( w_fp[92], w_fp[61], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[12] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - VVVV4_0( w_fp[92], w_fp[61], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - - // *** DIAGRAM 532 OF 1240 *** - - // Wavefunction(s) for diagram number 532 - VVV1P0_1( w_fp[92], w_fp[61], COUPs[0], 1.0, 0., 0., w_fp[86] ); - - // Amplitude(s) for diagram number 532 - VVV1_0( w_fp[8], w_fp[6], w_fp[86], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[12] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[30] -= amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[54] += amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - jamp_sv[114] += amp_sv[0]; - jamp_sv[115] -= amp_sv[0]; - - // *** DIAGRAM 533 OF 1240 *** - - // Wavefunction(s) for diagram number 533 - VVV1P0_1( w_fp[92], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[101] ); - - // Amplitude(s) for diagram number 533 - VVV1_0( w_fp[61], w_fp[6], w_fp[101], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[12] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - - // *** DIAGRAM 534 OF 1240 *** - - // Wavefunction(s) for diagram number 534 - // (none) - - // Amplitude(s) for diagram number 534 - VVV1_0( w_fp[61], w_fp[8], w_fp[104], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - 
jamp_sv[30] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - - // *** DIAGRAM 535 OF 1240 *** - - // Wavefunction(s) for diagram number 535 - // (none) - - // Amplitude(s) for diagram number 535 - FFV1_0( w_fp[3], w_fp[47], w_fp[86], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 536 OF 1240 *** - - // Wavefunction(s) for diagram number 536 - // (none) - - // Amplitude(s) for diagram number 536 - FFV1_0( w_fp[3], w_fp[106], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[100] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - - // *** DIAGRAM 537 OF 1240 *** - - // Wavefunction(s) for diagram number 537 - // (none) - - // Amplitude(s) for diagram number 537 - FFV1_0( w_fp[99], w_fp[47], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[104] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - - // *** DIAGRAM 538 OF 1240 *** - - // Wavefunction(s) for diagram number 538 - // (none) - - // Amplitude(s) for diagram number 538 - FFV1_0( w_fp[41], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 539 OF 1240 *** - - // Wavefunction(s) for diagram number 539 - // (none) - - // Amplitude(s) for diagram number 539 - FFV1_0( w_fp[41], w_fp[112], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[12] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - - // *** DIAGRAM 540 OF 1240 *** - - // Wavefunction(s) for diagram number 540 - // (none) - - // Amplitude(s) for diagram number 540 - FFV1_0( w_fp[113], w_fp[2], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support 
updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[30] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - - // *** DIAGRAM 541 OF 1240 *** - - // Wavefunction(s) for diagram number 541 - // (none) - - // Amplitude(s) for diagram number 541 - FFV1_0( w_fp[76], w_fp[112], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[14] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[75] += amp_sv[0]; - - // *** DIAGRAM 542 OF 1240 *** - - // Wavefunction(s) for diagram number 542 - // (none) - - // Amplitude(s) for diagram number 542 - FFV1_0( w_fp[3], w_fp[112], w_fp[74], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 543 OF 1240 *** - - // Wavefunction(s) for diagram number 543 - // (none) - - // Amplitude(s) for diagram number 543 - FFV1_0( w_fp[99], w_fp[97], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[44] += amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - - // *** DIAGRAM 544 OF 1240 *** - - // Wavefunction(s) for diagram number 544 - // (none) - - // Amplitude(s) for diagram number 544 - FFV1_0( w_fp[99], w_fp[2], w_fp[74], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 545 OF 1240 *** - - // Wavefunction(s) for diagram number 545 - // (none) - - // Amplitude(s) for diagram number 545 - FFV1_0( w_fp[3], w_fp[97], w_fp[102], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 546 OF 1240 *** - - // Wavefunction(s) for diagram number 546 - // (none) - - // Amplitude(s) for diagram number 546 - FFV1_0( w_fp[76], w_fp[2], w_fp[102], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with 
multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 547 OF 1240 *** - - // Wavefunction(s) for diagram number 547 - // (none) - - // Amplitude(s) for diagram number 547 - VVVV1_0( w_fp[92], w_fp[72], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[13] += amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[46] += amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[51] += amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[60] += amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[73] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[103] += amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - VVVV3_0( w_fp[92], w_fp[72], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[13] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[73] -= amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - VVVV4_0( w_fp[92], w_fp[72], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[14] -= amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - - // *** DIAGRAM 548 OF 1240 *** - - // Wavefunction(s) for diagram number 548 - VVV1P0_1( w_fp[92], w_fp[72], COUPs[0], 1.0, 0., 0., w_fp[86] ); - - // Amplitude(s) for diagram number 548 - VVV1_0( w_fp[8], w_fp[4], w_fp[86], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[13] += amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[46] += amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[51] += amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[60] += amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[73] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[103] += amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - - // *** DIAGRAM 549 OF 1240 *** - - // Wavefunction(s) for diagram number 
549 - // (none) - - // Amplitude(s) for diagram number 549 - VVV1_0( w_fp[72], w_fp[4], w_fp[101], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[13] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[73] -= amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - - // *** DIAGRAM 550 OF 1240 *** - - // Wavefunction(s) for diagram number 550 - // (none) - - // Amplitude(s) for diagram number 550 - VVV1_0( w_fp[72], w_fp[8], w_fp[102], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[14] -= amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - - // *** DIAGRAM 551 OF 1240 *** - - // Wavefunction(s) for diagram number 551 - // (none) - - // Amplitude(s) for diagram number 551 - FFV1_0( w_fp[3], w_fp[33], w_fp[86], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 552 OF 1240 *** - - // Wavefunction(s) for diagram number 552 - // (none) - - // Amplitude(s) for diagram number 552 - FFV1_0( w_fp[3], w_fp[98], w_fp[72], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[50] += amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - - // *** DIAGRAM 553 OF 1240 *** - - // Wavefunction(s) for diagram number 553 - // (none) - - // Amplitude(s) for diagram number 553 - FFV1_0( w_fp[99], w_fp[33], w_fp[72], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[58] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - - // *** DIAGRAM 554 OF 1240 *** - - // Wavefunction(s) for diagram number 554 - // (none) - - // Amplitude(s) for diagram number 554 - FFV1_0( w_fp[46], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - 
jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 555 OF 1240 *** - - // Wavefunction(s) for diagram number 555 - // (none) - - // Amplitude(s) for diagram number 555 - FFV1_0( w_fp[46], w_fp[112], w_fp[72], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[13] += amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[73] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - - // *** DIAGRAM 556 OF 1240 *** - - // Wavefunction(s) for diagram number 556 - // (none) - - // Amplitude(s) for diagram number 556 - FFV1_0( w_fp[114], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[43] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - - // *** DIAGRAM 557 OF 1240 *** - - // Wavefunction(s) for diagram number 557 - VVV1P0_1( w_fp[92], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[86] ); - - // Amplitude(s) for diagram number 557 - VVV1_0( w_fp[86], w_fp[13], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[13] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[69] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - - // *** DIAGRAM 558 OF 1240 *** - - // Wavefunction(s) for diagram number 558 - // (none) - - // Amplitude(s) for diagram number 558 - VVV1_0( w_fp[86], w_fp[11], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[12] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - - // *** DIAGRAM 559 OF 1240 *** - - // Wavefunction(s) for diagram number 559 - // (none) - - // Amplitude(s) for diagram number 559 - VVVV1_0( w_fp[8], w_fp[4], w_fp[6], w_fp[86], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[13] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[67] -= 
amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[69] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - VVVV3_0( w_fp[8], w_fp[4], w_fp[6], w_fp[86], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[12] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - VVVV4_0( w_fp[8], w_fp[4], w_fp[6], w_fp[86], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[12] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[73] -= amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - - // *** DIAGRAM 560 OF 1240 *** - - // Wavefunction(s) for diagram number 560 - // (none) - - // Amplitude(s) for diagram number 560 - VVV1_0( w_fp[102], w_fp[108], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[15] += amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - - // *** DIAGRAM 561 OF 1240 *** - - // Wavefunction(s) for diagram number 561 - // (none) - - // Amplitude(s) for diagram number 561 - VVV1_0( w_fp[102], w_fp[1], w_fp[11], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[14] += amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[60] += amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[103] += amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - - // *** DIAGRAM 562 OF 1240 *** - - // Wavefunction(s) for diagram number 562 - // (none) - - // Amplitude(s) for diagram number 562 - VVVV1_0( w_fp[1], w_fp[8], w_fp[6], w_fp[102], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates 
numerators_sv and denominators_sv (#473) -#endif - jamp_sv[15] += amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - VVVV3_0( w_fp[1], w_fp[8], w_fp[6], w_fp[102], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[14] -= amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - VVVV4_0( w_fp[1], w_fp[8], w_fp[6], w_fp[102], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[14] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[30] -= amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[115] -= amp_sv[0]; - - // *** DIAGRAM 563 OF 1240 *** - - // Wavefunction(s) for diagram number 563 - // (none) - - // Amplitude(s) for diagram number 563 - VVV1_0( w_fp[104], w_fp[108], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[17] += amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - - // *** DIAGRAM 564 OF 1240 *** - - // Wavefunction(s) for diagram number 564 - // (none) - - // Amplitude(s) for diagram number 564 - VVV1_0( w_fp[104], w_fp[1], w_fp[13], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] += amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[54] += amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[58] -= amp_sv[0]; - jamp_sv[59] += amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[76] -= amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[114] += amp_sv[0]; - - // *** DIAGRAM 565 OF 1240 *** - - // Wavefunction(s) for diagram 
number 565 - // (none) - - // Amplitude(s) for diagram number 565 - VVVV1_0( w_fp[1], w_fp[8], w_fp[4], w_fp[104], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[17] += amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - VVVV3_0( w_fp[1], w_fp[8], w_fp[4], w_fp[104], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - VVVV4_0( w_fp[1], w_fp[8], w_fp[4], w_fp[104], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[46] += amp_sv[0]; - jamp_sv[51] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - - // *** DIAGRAM 566 OF 1240 *** - - // Wavefunction(s) for diagram number 566 - VVVV1P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[105] ); - VVVV3P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[95] ); - VVVV4P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[107] ); - - // Amplitude(s) for diagram number 566 - VVV1_0( w_fp[8], w_fp[6], w_fp[105], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[12] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[30] -= amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[54] += amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - jamp_sv[114] += amp_sv[0]; - jamp_sv[115] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[95], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[14] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[30] -= amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - 
jamp_sv[74] += amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[115] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[107], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[12] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - - // *** DIAGRAM 567 OF 1240 *** - - // Wavefunction(s) for diagram number 567 - VVVV1P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[96] ); - VVVV3P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[90] ); - VVVV4P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[88] ); - - // Amplitude(s) for diagram number 567 - VVV1_0( w_fp[8], w_fp[4], w_fp[96], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[13] += amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[46] += amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[51] += amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[60] += amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[73] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[103] += amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[90], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[46] += amp_sv[0]; - jamp_sv[51] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[88], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[13] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[69] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - - // *** DIAGRAM 568 OF 1240 *** - - // Wavefunction(s) for diagram number 568 - // (none) - - // Amplitude(s) for diagram number 568 - VVV1_0( w_fp[1], w_fp[8], w_fp[111], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code 
base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[15] -= amp_sv[0];
-      jamp_sv[17] += amp_sv[0];
-      jamp_sv[26] += amp_sv[0];
-      jamp_sv[27] -= amp_sv[0];
-      jamp_sv[34] -= amp_sv[0];
-      jamp_sv[35] += amp_sv[0];
-      jamp_sv[36] -= amp_sv[0];
-      jamp_sv[37] += amp_sv[0];
-      jamp_sv[44] += amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[67] += amp_sv[0];
-      jamp_sv[70] -= amp_sv[0];
-      jamp_sv[75] += amp_sv[0];
-      jamp_sv[77] -= amp_sv[0];
-      jamp_sv[109] -= amp_sv[0];
-      jamp_sv[112] += amp_sv[0];
-      VVV1_0( w_fp[1], w_fp[8], w_fp[110], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[17] += amp_sv[0];
-      jamp_sv[27] -= amp_sv[0];
-      jamp_sv[30] += amp_sv[0];
-      jamp_sv[32] -= amp_sv[0];
-      jamp_sv[34] -= amp_sv[0];
-      jamp_sv[35] += amp_sv[0];
-      jamp_sv[37] += amp_sv[0];
-      jamp_sv[43] += amp_sv[0];
-      jamp_sv[46] -= amp_sv[0];
-      jamp_sv[51] -= amp_sv[0];
-      jamp_sv[61] += amp_sv[0];
-      jamp_sv[67] += amp_sv[0];
-      jamp_sv[70] -= amp_sv[0];
-      jamp_sv[77] -= amp_sv[0];
-      jamp_sv[101] -= amp_sv[0];
-      jamp_sv[115] += amp_sv[0];
-      VVV1_0( w_fp[1], w_fp[8], w_fp[109], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[15] += amp_sv[0];
-      jamp_sv[26] -= amp_sv[0];
-      jamp_sv[30] += amp_sv[0];
-      jamp_sv[32] -= amp_sv[0];
-      jamp_sv[36] += amp_sv[0];
-      jamp_sv[43] += amp_sv[0];
-      jamp_sv[44] -= amp_sv[0];
-      jamp_sv[45] += amp_sv[0];
-      jamp_sv[46] -= amp_sv[0];
-      jamp_sv[51] -= amp_sv[0];
-      jamp_sv[61] += amp_sv[0];
-      jamp_sv[75] -= amp_sv[0];
-      jamp_sv[101] -= amp_sv[0];
-      jamp_sv[109] += amp_sv[0];
-      jamp_sv[112] -= amp_sv[0];
-      jamp_sv[115] += amp_sv[0];
-
-      // *** DIAGRAM 569 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 569
-      // (none)
-
-      // Amplitude(s) for diagram number 569
-      VVVV1_0( w_fp[92], w_fp[1], w_fp[8], w_fp[27], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[12] += amp_sv[0];
-      jamp_sv[13] -= amp_sv[0];
-      jamp_sv[26] -= amp_sv[0];
-      jamp_sv[27] += amp_sv[0];
-      jamp_sv[36] += amp_sv[0];
-      jamp_sv[37] -= amp_sv[0];
-      jamp_sv[67] -= amp_sv[0];
-      jamp_sv[68] += amp_sv[0];
-      jamp_sv[69] -= amp_sv[0];
-      jamp_sv[70] += amp_sv[0];
-      jamp_sv[72] -= amp_sv[0];
-      jamp_sv[73] += amp_sv[0];
-      jamp_sv[109] += amp_sv[0];
-      jamp_sv[110] -= amp_sv[0];
-      jamp_sv[111] += amp_sv[0];
-      jamp_sv[112] -= amp_sv[0];
-      VVVV3_0( w_fp[92], w_fp[1], w_fp[8], w_fp[27], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[12] += amp_sv[0];
-      jamp_sv[13] -= amp_sv[0];
-      jamp_sv[15] -= amp_sv[0];
-      jamp_sv[17] += amp_sv[0];
-      jamp_sv[34] -= amp_sv[0];
-      jamp_sv[35] += amp_sv[0];
-      jamp_sv[44] += amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[68] += amp_sv[0];
-      jamp_sv[69] -= amp_sv[0];
-      jamp_sv[72] -= amp_sv[0];
-      jamp_sv[73] += amp_sv[0];
-      jamp_sv[75] += amp_sv[0];
-      jamp_sv[77] -= amp_sv[0];
-      jamp_sv[110] -= amp_sv[0];
-      jamp_sv[111] += amp_sv[0];
-      VVVV4_0( w_fp[92], w_fp[1], w_fp[8], w_fp[27], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[15] -= amp_sv[0];
-      jamp_sv[17] += amp_sv[0];
-      jamp_sv[26] += amp_sv[0];
-      jamp_sv[27] -= amp_sv[0];
-      jamp_sv[34] -= amp_sv[0];
-      jamp_sv[35] += amp_sv[0];
-      jamp_sv[36] -= amp_sv[0];
-      jamp_sv[37] += amp_sv[0];
-      jamp_sv[44] += amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[67] += amp_sv[0];
-      jamp_sv[70] -= amp_sv[0];
-      jamp_sv[75] += amp_sv[0];
-      jamp_sv[77] -= amp_sv[0];
-      jamp_sv[109] -= amp_sv[0];
-      jamp_sv[112] += amp_sv[0];
-
-      // *** DIAGRAM 570 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 570
-      // (none)
-
-      // Amplitude(s) for diagram number 570
-      VVV1_0( w_fp[8], w_fp[27], w_fp[86], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[12] += amp_sv[0];
-      jamp_sv[13] -= amp_sv[0];
-      jamp_sv[26] -= amp_sv[0];
-      jamp_sv[27] += amp_sv[0];
-      jamp_sv[36] += amp_sv[0];
-      jamp_sv[37] -= amp_sv[0];
-      jamp_sv[67] -= amp_sv[0];
-      jamp_sv[68] += amp_sv[0];
-      jamp_sv[69] -= amp_sv[0];
-      jamp_sv[70] += amp_sv[0];
-      jamp_sv[72] -= amp_sv[0];
-      jamp_sv[73] += amp_sv[0];
-      jamp_sv[109] += amp_sv[0];
-      jamp_sv[110] -= amp_sv[0];
-      jamp_sv[111] += amp_sv[0];
-      jamp_sv[112] -= amp_sv[0];
-
-      // *** DIAGRAM 571 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 571
-      // (none)
-
-      // Amplitude(s) for diagram number 571
-      VVV1_0( w_fp[1], w_fp[27], w_fp[101], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[12] += amp_sv[0];
-      jamp_sv[13] -= amp_sv[0];
-      jamp_sv[15] -= amp_sv[0];
-      jamp_sv[17] += amp_sv[0];
-      jamp_sv[34] -= amp_sv[0];
-      jamp_sv[35] += amp_sv[0];
-      jamp_sv[44] += amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[68] += amp_sv[0];
-      jamp_sv[69] -= amp_sv[0];
-      jamp_sv[72] -= amp_sv[0];
-      jamp_sv[73] += amp_sv[0];
-      jamp_sv[75] += amp_sv[0];
-      jamp_sv[77] -= amp_sv[0];
-      jamp_sv[110] -= amp_sv[0];
-      jamp_sv[111] += amp_sv[0];
-
-      // *** DIAGRAM 572 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 572
-      // (none)
-
-      // Amplitude(s) for diagram number 572
-      VVV1_0( w_fp[1], w_fp[8], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[15] -= amp_sv[0];
-      jamp_sv[17] += amp_sv[0];
-      jamp_sv[26] += amp_sv[0];
-      jamp_sv[27] -= amp_sv[0];
-      jamp_sv[34] -= amp_sv[0];
-      jamp_sv[35] += amp_sv[0];
-      jamp_sv[36] -= amp_sv[0];
-      jamp_sv[37] += amp_sv[0];
-      jamp_sv[44] += amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[67] += amp_sv[0];
-      jamp_sv[70] -= amp_sv[0];
-      jamp_sv[75] += amp_sv[0];
-      jamp_sv[77] -= amp_sv[0];
-      jamp_sv[109] -= amp_sv[0];
-      jamp_sv[112] += amp_sv[0];
-
-      // *** DIAGRAM 573 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 573
-      // (none)
-
-      // Amplitude(s) for diagram number 573
-      VVV1_0( w_fp[86], w_fp[37], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 574 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 574
-      // (none)
-
-      // Amplitude(s) for diagram number 574
-      FFV1_0( w_fp[3], w_fp[36], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[67] += amp_sv[0];
-      jamp_sv[68] -= amp_sv[0];
-      jamp_sv[69] += amp_sv[0];
-      jamp_sv[70] -= amp_sv[0];
-
-      // *** DIAGRAM 575 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 575
-      // (none)
-
-      // Amplitude(s) for diagram number 575
-      FFV1_0( w_fp[99], w_fp[100], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 576 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 576
-      // (none)
-
-      // Amplitude(s) for diagram number 576
-      FFV1_0( w_fp[99], w_fp[36], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 577 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 577
-      // (none)
-
-      // Amplitude(s) for diagram number 577
-      FFV1_0( w_fp[3], w_fp[100], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[54] += amp_sv[0];
-      jamp_sv[56] -= amp_sv[0];
-      jamp_sv[58] -= amp_sv[0];
-      jamp_sv[59] += amp_sv[0];
-
-      // *** DIAGRAM 578 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 578
-      // (none)
-
-      // Amplitude(s) for diagram number 578
-      VVV1_0( w_fp[104], w_fp[1], w_fp[37], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 579 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 579
-      // (none)
-
-      // Amplitude(s) for diagram number 579
-      FFV1_0( w_fp[3], w_fp[33], w_fp[96], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[3], w_fp[33], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[3], w_fp[33], w_fp[88], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 580 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 580
-      // (none)
-
-      // Amplitude(s) for diagram number 580
-      FFV1_0( w_fp[41], w_fp[33], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[50] += amp_sv[0];
-      jamp_sv[54] -= amp_sv[0];
-      jamp_sv[56] += amp_sv[0];
-      jamp_sv[60] -= amp_sv[0];
-
-      // *** DIAGRAM 581 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 581
-      // (none)
-
-      // Amplitude(s) for diagram number 581
-      FFV1_0( w_fp[41], w_fp[98], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 582 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 582
-      // (none)
-
-      // Amplitude(s) for diagram number 582
-      FFV1_0( w_fp[113], w_fp[33], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 583 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 583
-      // (none)
-
-      // Amplitude(s) for diagram number 583
-      VVV1_0( w_fp[86], w_fp[51], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 584 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 584
-      // (none)
-
-      // Amplitude(s) for diagram number 584
-      FFV1_0( w_fp[3], w_fp[49], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[109] += amp_sv[0];
-      jamp_sv[110] -= amp_sv[0];
-      jamp_sv[111] += amp_sv[0];
-      jamp_sv[112] -= amp_sv[0];
-
-      // *** DIAGRAM 585 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 585
-      // (none)
-
-      // Amplitude(s) for diagram number 585
-      FFV1_0( w_fp[99], w_fp[91], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 586 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 586
-      // (none)
-
-      // Amplitude(s) for diagram number 586
-      FFV1_0( w_fp[99], w_fp[49], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 587 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 587
-      // (none)
-
-      // Amplitude(s) for diagram number 587
-      FFV1_0( w_fp[3], w_fp[91], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[103] += amp_sv[0];
-      jamp_sv[104] -= amp_sv[0];
-      jamp_sv[105] += amp_sv[0];
-      jamp_sv[106] -= amp_sv[0];
-
-      // *** DIAGRAM 588 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 588
-      // (none)
-
-      // Amplitude(s) for diagram number 588
-      VVV1_0( w_fp[102], w_fp[1], w_fp[51], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 589 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 589
-      // (none)
-
-      // Amplitude(s) for diagram number 589
-      FFV1_0( w_fp[3], w_fp[47], w_fp[105], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[3], w_fp[47], w_fp[95], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[3], w_fp[47], w_fp[107], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 590 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 590
-      // (none)
-
-      // Amplitude(s) for diagram number 590
-      FFV1_0( w_fp[46], w_fp[47], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[100] += amp_sv[0];
-      jamp_sv[103] -= amp_sv[0];
-      jamp_sv[106] += amp_sv[0];
-      jamp_sv[114] -= amp_sv[0];
-
-      // *** DIAGRAM 591 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 591
-      // (none)
-
-      // Amplitude(s) for diagram number 591
-      FFV1_0( w_fp[46], w_fp[106], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 592 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 592
-      // (none)
-
-      // Amplitude(s) for diagram number 592
-      FFV1_0( w_fp[114], w_fp[47], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 593 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 593
-      // (none)
-
-      // Amplitude(s) for diagram number 593
-      VVV1_0( w_fp[86], w_fp[54], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 594 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 594
-      // (none)
-
-      // Amplitude(s) for diagram number 594
-      FFV1_0( w_fp[53], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[13] += amp_sv[0];
-      jamp_sv[27] -= amp_sv[0];
-      jamp_sv[37] += amp_sv[0];
-      jamp_sv[73] -= amp_sv[0];
-
-      // *** DIAGRAM 595 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 595
-      // (none)
-
-      // Amplitude(s) for diagram number 595
-      FFV1_0( w_fp[78], w_fp[112], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 596 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 596
-      // (none)
-
-      // Amplitude(s) for diagram number 596
-      FFV1_0( w_fp[53], w_fp[112], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 597 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 597
-      // (none)
-
-      // Amplitude(s) for diagram number 597
-      FFV1_0( w_fp[78], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[16] += amp_sv[0];
-      jamp_sv[76] -= amp_sv[0];
-      jamp_sv[100] -= amp_sv[0];
-      jamp_sv[114] += amp_sv[0];
-
-      // *** DIAGRAM 598 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 598
-      // (none)
-
-      // Amplitude(s) for diagram number 598
-      VVV1_0( w_fp[104], w_fp[1], w_fp[54], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 599 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 599
-      // (none)
-
-      // Amplitude(s) for diagram number 599
-      FFV1_0( w_fp[46], w_fp[2], w_fp[96], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[46], w_fp[2], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[46], w_fp[2], w_fp[88], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 600 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 600
-      // (none)
-
-      // Amplitude(s) for diagram number 600
-      VVV1_0( w_fp[86], w_fp[20], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 601 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 601
-      // (none)
-
-      // Amplitude(s) for diagram number 601
-      FFV1_0( w_fp[28], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[12] += amp_sv[0];
-      jamp_sv[26] -= amp_sv[0];
-      jamp_sv[36] += amp_sv[0];
-      jamp_sv[72] -= amp_sv[0];
-
-      // *** DIAGRAM 602 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 602
-      // (none)
-
-      // Amplitude(s) for diagram number 602
-      FFV1_0( w_fp[60], w_fp[112], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 603 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 603
-      // (none)
-
-      // Amplitude(s) for diagram number 603
-      FFV1_0( w_fp[28], w_fp[112], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 604 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 604
-      // (none)
-
-      // Amplitude(s) for diagram number 604
-      FFV1_0( w_fp[60], w_fp[2], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[14] += amp_sv[0];
-      jamp_sv[50] -= amp_sv[0];
-      jamp_sv[60] += amp_sv[0];
-      jamp_sv[74] -= amp_sv[0];
-
-      // *** DIAGRAM 605 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 605
-      // (none)
-
-      // Amplitude(s) for diagram number 605
-      VVV1_0( w_fp[102], w_fp[1], w_fp[20], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 606 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 606
-      // (none)
-
-      // Amplitude(s) for diagram number 606
-      FFV1_0( w_fp[41], w_fp[2], w_fp[105], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[41], w_fp[2], w_fp[95], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[41], w_fp[2], w_fp[107], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 607 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 607
-      // (none)
-
-      // Amplitude(s) for diagram number 607
-      FFV1_0( w_fp[3], w_fp[15], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 608 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 608
-      // (none)
-
-      // Amplitude(s) for diagram number 608
-      FFV1_0( w_fp[14], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 609 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 609
-      // (none)
-
-      // Amplitude(s) for diagram number 609
-      FFV1_0( w_fp[3], w_fp[112], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 610 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 610
-      // (none)
-
-      // Amplitude(s) for diagram number 610
-      FFV1_0( w_fp[14], w_fp[112], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[12] += amp_sv[0];
-      jamp_sv[13] -= amp_sv[0];
-      jamp_sv[72] -= amp_sv[0];
-      jamp_sv[73] += amp_sv[0];
-
-      // *** DIAGRAM 611 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 611
-      // (none)
-
-      // Amplitude(s) for diagram number 611
-      FFV1_0( w_fp[99], w_fp[2], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 612 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 612
-      // (none)
-
-      // Amplitude(s) for diagram number 612
-      FFV1_0( w_fp[99], w_fp[15], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[68] += amp_sv[0];
-      jamp_sv[69] -= amp_sv[0];
-      jamp_sv[110] -= amp_sv[0];
-      jamp_sv[111] += amp_sv[0];
-
-      // *** DIAGRAM 613 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 613
-      // (none)
-
-      // Amplitude(s) for diagram number 613
-      FFV1_0( w_fp[3], w_fp[112], w_fp[57], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[3], w_fp[112], w_fp[81], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[3], w_fp[112], w_fp[82], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 614 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 614
-      // (none)
-
-      // Amplitude(s) for diagram number 614
-      FFV1_0( w_fp[99], w_fp[2], w_fp[57], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[99], w_fp[2], w_fp[81], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[99], w_fp[2], w_fp[82], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 615 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 615
-      // (none)
-
-      // Amplitude(s) for diagram number 615
-      VVV1_0( w_fp[92], w_fp[57], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[12] -= amp_sv[0];
-      jamp_sv[13] += amp_sv[0];
-      jamp_sv[15] += amp_sv[0];
-      jamp_sv[17] -= amp_sv[0];
-      jamp_sv[34] += amp_sv[0];
-      jamp_sv[35] -= amp_sv[0];
-      jamp_sv[44] -= amp_sv[0];
-      jamp_sv[45] += amp_sv[0];
-      jamp_sv[68] -= amp_sv[0];
-      jamp_sv[69] += amp_sv[0];
-      jamp_sv[72] += amp_sv[0];
-      jamp_sv[73] -= amp_sv[0];
-      jamp_sv[75] -= amp_sv[0];
-      jamp_sv[77] += amp_sv[0];
-      jamp_sv[110] += amp_sv[0];
-      jamp_sv[111] -= amp_sv[0];
-      VVV1_0( w_fp[92], w_fp[81], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[13] += amp_sv[0];
-      jamp_sv[14] -= amp_sv[0];
-      jamp_sv[15] += amp_sv[0];
-      jamp_sv[16] -= amp_sv[0];
-      jamp_sv[44] -= amp_sv[0];
-      jamp_sv[45] += amp_sv[0];
-      jamp_sv[58] += amp_sv[0];
-      jamp_sv[59] -= amp_sv[0];
-      jamp_sv[68] -= amp_sv[0];
-      jamp_sv[69] += amp_sv[0];
-      jamp_sv[73] -= amp_sv[0];
-      jamp_sv[74] += amp_sv[0];
-      jamp_sv[75] -= amp_sv[0];
-      jamp_sv[76] += amp_sv[0];
-      jamp_sv[104] += amp_sv[0];
-      jamp_sv[105] -= amp_sv[0];
-      VVV1_0( w_fp[92], w_fp[82], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[12] += amp_sv[0];
-      jamp_sv[14] -= amp_sv[0];
-      jamp_sv[16] -= amp_sv[0];
-      jamp_sv[17] += amp_sv[0];
-      jamp_sv[34] -= amp_sv[0];
-      jamp_sv[35] += amp_sv[0];
-      jamp_sv[58] += amp_sv[0];
-      jamp_sv[59] -= amp_sv[0];
-      jamp_sv[72] -= amp_sv[0];
-      jamp_sv[74] += amp_sv[0];
-      jamp_sv[76] += amp_sv[0];
-      jamp_sv[77] -= amp_sv[0];
-      jamp_sv[104] += amp_sv[0];
-      jamp_sv[105] -= amp_sv[0];
-      jamp_sv[110] -= amp_sv[0];
-      jamp_sv[111] += amp_sv[0];
-
-      // *** DIAGRAM 616 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 616
-      VVV1P0_1( w_fp[0], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[92] );
-      FFV1_2( w_fp[3], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] );
-
-      // Amplitude(s) for diagram number 616
-      FFV1_0( w_fp[99], w_fp[87], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 617 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 617
-      // (none)
-
-      // Amplitude(s) for diagram number 617
-      FFV1_0( w_fp[99], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 618 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 618
-      VVV1P0_1( w_fp[92], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[112] );
-
-      // Amplitude(s) for diagram number 618
-      VVV1_0( w_fp[112], w_fp[34], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 619 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 619
-      // (none)
-
-      // Amplitude(s) for diagram number 619
-      FFV1_0( w_fp[3], w_fp[9], w_fp[112], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[37] += amp_sv[0];
-      jamp_sv[38] -= amp_sv[0];
-      jamp_sv[39] += amp_sv[0];
-      jamp_sv[40] -= amp_sv[0];
-
-      // *** DIAGRAM 620 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 620
-      VVV1P0_1( w_fp[92], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[86] );
-
-      // Amplitude(s) for diagram number 620
-      VVV1_0( w_fp[86], w_fp[34], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 621 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 621
-      // (none)
-
-      // Amplitude(s) for diagram number 621
-      FFV1_0( w_fp[3], w_fp[87], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[31] += amp_sv[0];
-      jamp_sv[32] -= amp_sv[0];
-      jamp_sv[33] += amp_sv[0];
-      jamp_sv[34] -= amp_sv[0];
-
-      // *** DIAGRAM 622 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 622
-      VVVV1P0_1( w_fp[92], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[107] );
-      VVVV3P0_1( w_fp[92], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[95] );
-      VVVV4P0_1( w_fp[92], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[105] );
-
-      // Amplitude(s) for diagram number 622
-      FFV1_0( w_fp[3], w_fp[77], w_fp[107], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[3], w_fp[77], w_fp[95], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[3], w_fp[77], w_fp[105], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 623 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 623
-      FFV1_1( w_fp[77], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[102] );
-
-      // Amplitude(s) for diagram number 623
-      FFV1_0( w_fp[46], w_fp[102], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 624 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 624
-      FFV1_2( w_fp[46], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[88] );
-
-      // Amplitude(s) for diagram number 624
-      FFV1_0( w_fp[88], w_fp[77], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 625 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 625
-      // (none)
-
-      // Amplitude(s) for diagram number 625
-      FFV1_0( w_fp[46], w_fp[77], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[29] += amp_sv[0];
-      jamp_sv[37] -= amp_sv[0];
-      jamp_sv[40] += amp_sv[0];
-      jamp_sv[43] -= amp_sv[0];
-
-      // *** DIAGRAM 626 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 626
-      // (none)
-
-      // Amplitude(s) for diagram number 626
-      FFV1_0( w_fp[38], w_fp[102], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 627 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 627
-      FFV1_2( w_fp[38], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[90] );
-
-      // Amplitude(s) for diagram number 627
-      FFV1_0( w_fp[90], w_fp[77], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 628 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 628
-      // (none)
-
-      // Amplitude(s) for diagram number 628
-      FFV1_0( w_fp[38], w_fp[77], w_fp[112], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[28] += amp_sv[0];
-      jamp_sv[31] -= amp_sv[0];
-      jamp_sv[34] += amp_sv[0];
-      jamp_sv[42] -= amp_sv[0];
-
-      // *** DIAGRAM 629 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 629
-      // (none)
-
-      // Amplitude(s) for diagram number 629
-      FFV1_0( w_fp[3], w_fp[102], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[28] += amp_sv[0];
-      jamp_sv[29] -= amp_sv[0];
-      jamp_sv[42] -= amp_sv[0];
-      jamp_sv[43] += amp_sv[0];
-
-      // *** DIAGRAM 630 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 630
-      // (none)
-
-      // Amplitude(s) for diagram number 630
-      FFV1_0( w_fp[99], w_fp[77], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[32] += amp_sv[0];
-      jamp_sv[33] -= amp_sv[0];
-      jamp_sv[38] -= amp_sv[0];
-      jamp_sv[39] += amp_sv[0];
-
-      // *** DIAGRAM 631 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 631
-      VVV1P0_1( w_fp[92], w_fp[24], COUPs[0], 1.0, 0., 0., w_fp[102] );
-
-      // Amplitude(s) for diagram number 631
-      FFV1_0( w_fp[3], w_fp[77], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 632 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 632
-      FFV1_1( w_fp[2], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[96] );
-
-      // Amplitude(s) for diagram number 632
-      FFV1_0( w_fp[56], w_fp[96], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 633 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 633
-      // (none)
-
-      // Amplitude(s) for diagram number 633
-      FFV1_0( w_fp[22], w_fp[96], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 634 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 634
-      // (none)
-
-      // Amplitude(s) for diagram number 634
-      VVV1_0( w_fp[112], w_fp[103], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 635 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 635
-      // (none)
-
-      // Amplitude(s) for diagram number 635
-      FFV1_0( w_fp[22], w_fp[2], w_fp[112], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[21] += amp_sv[0];
-      jamp_sv[53] -= amp_sv[0];
-      jamp_sv[67] += amp_sv[0];
-      jamp_sv[99] -= amp_sv[0];
-
-      // *** DIAGRAM 636 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 636
-      // (none)
-
-      // Amplitude(s) for diagram number 636
-      VVV1_0( w_fp[86], w_fp[103], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 637 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 637
-      // (none)
-
-      // Amplitude(s) for diagram number 637
-      FFV1_0( w_fp[56], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[23] += amp_sv[0];
-      jamp_sv[77] -= amp_sv[0];
-      jamp_sv[91] += amp_sv[0];
-      jamp_sv[101] -= amp_sv[0];
-
-      // *** DIAGRAM 638 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 638
-      // (none)
-
-      // Amplitude(s) for diagram number 638
-      FFV1_0( w_fp[52], w_fp[2], w_fp[107], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[52], w_fp[2], w_fp[95], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[52], w_fp[2], w_fp[105], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 639 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 639
-      FFV1_2( w_fp[52], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[104] );
-
-      // Amplitude(s) for diagram number 639
-      FFV1_0( w_fp[104], w_fp[33], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 640 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 640
-      FFV1_1( w_fp[33], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[114] );
-
-      // Amplitude(s) for diagram number 640
-      FFV1_0( w_fp[52], w_fp[114], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 641 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 641
-      // (none)
-
-      // Amplitude(s) for diagram number 641
-      FFV1_0( w_fp[52], w_fp[33], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[53] += amp_sv[0];
-      jamp_sv[61] -= amp_sv[0];
-      jamp_sv[64] += amp_sv[0];
-      jamp_sv[67] -= amp_sv[0];
-
-      // *** DIAGRAM 642 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 642
-      // (none)
-
-      // Amplitude(s) for diagram number 642
-      FFV1_0( w_fp[104], w_fp[39], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 643 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 643
-      FFV1_1( w_fp[39], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[106] );
-
-      // Amplitude(s) for diagram number 643
-      FFV1_0( w_fp[52], w_fp[106], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 644 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 644
-      // (none)
-
-      // Amplitude(s) for diagram number 644
-      FFV1_0( w_fp[52], w_fp[39], w_fp[112], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[77] += amp_sv[0];
-      jamp_sv[85] -= amp_sv[0];
-      jamp_sv[88] += amp_sv[0];
-      jamp_sv[91] -= amp_sv[0];
-
-      // *** DIAGRAM 645 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 645
-      // (none)
-
-      // Amplitude(s) for diagram number 645
-      FFV1_0( w_fp[104], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[61] += amp_sv[0];
-      jamp_sv[64] -= amp_sv[0];
-      jamp_sv[85] -= amp_sv[0];
-      jamp_sv[88] += amp_sv[0];
-
-      // *** DIAGRAM 646 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 646
-      // (none)
-
-      // Amplitude(s) for diagram number 646
-      FFV1_0( w_fp[52], w_fp[96], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[21] += amp_sv[0];
-      jamp_sv[23] -= amp_sv[0];
-      jamp_sv[99] -= amp_sv[0];
-      jamp_sv[101] += amp_sv[0];
-
-      // *** DIAGRAM 647 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 647
-      // (none)
-
-      // Amplitude(s) for diagram number 647
-      FFV1_0( w_fp[52], w_fp[2], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 648 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 648
-      // (none)
-
-      // Amplitude(s) for diagram number 648
-      FFV1_0( w_fp[65], w_fp[96], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[22] += amp_sv[0];
-      jamp_sv[23] -= amp_sv[0];
-      jamp_sv[100] -= amp_sv[0];
-      jamp_sv[101] += amp_sv[0];
-
-      // *** DIAGRAM 649 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 649
-      // (none)
-
-      // Amplitude(s) for diagram number 649
-      FFV1_0( w_fp[3], w_fp[96], w_fp[63], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 650 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 650
-      // (none)
-
-      // Amplitude(s) for diagram number 650
-      FFV1_0( w_fp[99], w_fp[93], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[32] += amp_sv[0];
-      jamp_sv[33] -= amp_sv[0];
-      jamp_sv[56] -= amp_sv[0];
-      jamp_sv[57] += amp_sv[0];
-
-      // *** DIAGRAM 651 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 651
-      // (none)
-
-      // Amplitude(s) for diagram number 651
-      FFV1_0( w_fp[99], w_fp[2], w_fp[63], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 652 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 652
-      // (none)
-
-      // Amplitude(s) for diagram number 652
-      FFV1_0( w_fp[3], w_fp[93], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 653 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 653
-      // (none)
-
-      // Amplitude(s) for diagram number 653
-      FFV1_0( w_fp[65], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 654 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 654
-      // (none)
-
-      // Amplitude(s) for diagram number 654
-      VVVV1_0( w_fp[92], w_fp[61], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[18] += amp_sv[0];
-      jamp_sv[20] -= amp_sv[0];
-      jamp_sv[31] -= amp_sv[0];
-      jamp_sv[34] += amp_sv[0];
-      jamp_sv[55] += amp_sv[0];
-      jamp_sv[58] -= amp_sv[0];
-      jamp_sv[76] -= amp_sv[0];
-      jamp_sv[77] += amp_sv[0];
-      jamp_sv[80] += amp_sv[0];
-      jamp_sv[81] -= amp_sv[0];
-      jamp_sv[86] -= amp_sv[0];
-      jamp_sv[87] += amp_sv[0];
-      jamp_sv[90] += amp_sv[0];
-      jamp_sv[91] -= amp_sv[0];
-      jamp_sv[96] -= amp_sv[0];
-      jamp_sv[98] += amp_sv[0];
-      VVVV3_0( w_fp[92], w_fp[61], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[18] += amp_sv[0];
-      jamp_sv[20] -= amp_sv[0];
-      jamp_sv[22] -= amp_sv[0];
-      jamp_sv[23] += amp_sv[0];
-      jamp_sv[32] -= amp_sv[0];
-      jamp_sv[33] += amp_sv[0];
-      jamp_sv[56] += amp_sv[0];
-      jamp_sv[57] -= amp_sv[0];
-      jamp_sv[80] += amp_sv[0];
-      jamp_sv[81] -= amp_sv[0];
-      jamp_sv[86] -= amp_sv[0];
-      jamp_sv[87] += amp_sv[0];
-      jamp_sv[96] -= amp_sv[0];
-      jamp_sv[98] += amp_sv[0];
-      jamp_sv[100] += amp_sv[0];
-      jamp_sv[101] -= amp_sv[0];
-      VVVV4_0( w_fp[92], w_fp[61], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[22] -= amp_sv[0];
-      jamp_sv[23] += amp_sv[0];
-      jamp_sv[31] += amp_sv[0];
-      jamp_sv[32] -= amp_sv[0];
-      jamp_sv[33] += amp_sv[0];
-      jamp_sv[34] -= amp_sv[0];
-      jamp_sv[55] -= amp_sv[0];
-      jamp_sv[56] += amp_sv[0];
-      jamp_sv[57] -= amp_sv[0];
-      jamp_sv[58] += amp_sv[0];
-      jamp_sv[76] += amp_sv[0];
-      jamp_sv[77] -= amp_sv[0];
-      jamp_sv[90] -= amp_sv[0];
-      jamp_sv[91] += amp_sv[0];
-      jamp_sv[100] += amp_sv[0];
-      jamp_sv[101] -= amp_sv[0];
-
-      // *** DIAGRAM 655 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 655
-      VVV1P0_1( w_fp[92], w_fp[61], COUPs[0], 1.0, 0., 0., w_fp[104] );
-
-      // Amplitude(s) for diagram number 655
-      VVV1_0( w_fp[8], w_fp[5], w_fp[104], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[18] += amp_sv[0];
-      jamp_sv[20] -= amp_sv[0];
-      jamp_sv[31] -= amp_sv[0];
-      jamp_sv[34] += amp_sv[0];
-      jamp_sv[55] += amp_sv[0];
-      jamp_sv[58] -= amp_sv[0];
-      jamp_sv[76] -= amp_sv[0];
-      jamp_sv[77] += amp_sv[0];
-      jamp_sv[80] += amp_sv[0];
-      jamp_sv[81] -= amp_sv[0];
-      jamp_sv[86] -= amp_sv[0];
-      jamp_sv[87] += amp_sv[0];
-      jamp_sv[90] += amp_sv[0];
-      jamp_sv[91] -= amp_sv[0];
-      jamp_sv[96] -= amp_sv[0];
-      jamp_sv[98] += amp_sv[0];
-
-      // *** DIAGRAM 656 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 656
-      VVV1P0_1( w_fp[92], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[113] );
-
-      // Amplitude(s) for diagram number 656
-      VVV1_0( w_fp[61], w_fp[5], w_fp[113], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[18] += amp_sv[0];
-      jamp_sv[20] -= amp_sv[0];
-      jamp_sv[22] -= amp_sv[0];
-      jamp_sv[23] += amp_sv[0];
-      jamp_sv[32] -= amp_sv[0];
-      jamp_sv[33] += amp_sv[0];
-      jamp_sv[56] += amp_sv[0];
-      jamp_sv[57] -= amp_sv[0];
-      jamp_sv[80] += amp_sv[0];
-      jamp_sv[81] -= amp_sv[0];
-      jamp_sv[86] -= amp_sv[0];
-      jamp_sv[87] += amp_sv[0];
-      jamp_sv[96] -= amp_sv[0];
-      jamp_sv[98] += amp_sv[0];
-      jamp_sv[100] += amp_sv[0];
-      jamp_sv[101] -= amp_sv[0];
-
-      // *** DIAGRAM 657 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 657
-      // (none)
-
-      // Amplitude(s) for diagram number 657
-      VVV1_0( w_fp[61], w_fp[8], w_fp[86], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[22] -= amp_sv[0];
-      jamp_sv[23] += amp_sv[0];
-      jamp_sv[31] += amp_sv[0];
-      jamp_sv[32] -= amp_sv[0];
-      jamp_sv[33] += amp_sv[0];
-      jamp_sv[34] -= amp_sv[0];
-      jamp_sv[55] -= amp_sv[0];
-      jamp_sv[56] += amp_sv[0];
-      jamp_sv[57] -= amp_sv[0];
-      jamp_sv[58] += amp_sv[0];
-      jamp_sv[76] += amp_sv[0];
-      jamp_sv[77] -= amp_sv[0];
-      jamp_sv[90] -= amp_sv[0];
-      jamp_sv[91] += amp_sv[0];
-      jamp_sv[100] += amp_sv[0];
-      jamp_sv[101] -= amp_sv[0];
-
-      // *** DIAGRAM 658 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 658
-      // (none)
-
-      // Amplitude(s) for diagram number 658
-      FFV1_0( w_fp[3], w_fp[39], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 659 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 659
-      // (none)
-
-      // Amplitude(s) for diagram number 659
-      FFV1_0( w_fp[3], w_fp[106], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[76] += amp_sv[0];
-      jamp_sv[77] -= amp_sv[0];
-      jamp_sv[90] -= amp_sv[0];
-      jamp_sv[91] += amp_sv[0];
-
-      // *** DIAGRAM 660 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 660
-      // (none)
-
-      // Amplitude(s) for diagram number 660
-      FFV1_0( w_fp[99], w_fp[39], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[80] += amp_sv[0];
-      jamp_sv[81] -= amp_sv[0];
-      jamp_sv[86] -= amp_sv[0];
-      jamp_sv[87] += amp_sv[0];
-
-      // *** DIAGRAM 661 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 661
-      // (none)
-
-      // Amplitude(s) for diagram number 661
-      FFV1_0( w_fp[38], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 662 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 662
-      // (none)
-
-      // Amplitude(s) for diagram number 662
-      FFV1_0( w_fp[38], w_fp[96], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[18] += amp_sv[0];
-      jamp_sv[20] -= amp_sv[0];
-      jamp_sv[96] -= amp_sv[0];
-      jamp_sv[98] += amp_sv[0];
-
-      // *** DIAGRAM 663 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 663
-      // (none)
-
// Amplitude(s) for diagram number 663 - FFV1_0( w_fp[90], w_fp[2], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[31] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - - // *** DIAGRAM 664 OF 1240 *** - - // Wavefunction(s) for diagram number 664 - // (none) - - // Amplitude(s) for diagram number 664 - FFV1_0( w_fp[71], w_fp[96], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[20] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - jamp_sv[99] += amp_sv[0]; - - // *** DIAGRAM 665 OF 1240 *** - - // Wavefunction(s) for diagram number 665 - // (none) - - // Amplitude(s) for diagram number 665 - FFV1_0( w_fp[3], w_fp[96], w_fp[69], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 666 OF 1240 *** - - // Wavefunction(s) for diagram number 666 - // (none) - - // Amplitude(s) for diagram number 666 - FFV1_0( w_fp[99], w_fp[94], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[38] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - - // *** DIAGRAM 667 OF 1240 *** - - // Wavefunction(s) for diagram number 667 - // (none) - - // Amplitude(s) for diagram number 667 - FFV1_0( w_fp[99], w_fp[2], w_fp[69], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 668 OF 1240 *** - - // Wavefunction(s) for diagram number 668 - // (none) - - // Amplitude(s) for diagram number 668 - FFV1_0( w_fp[3], w_fp[94], w_fp[112], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 669 OF 1240 *** - - // Wavefunction(s) for diagram number 669 - // (none) - - 
// Amplitude(s) for diagram number 669 - FFV1_0( w_fp[71], w_fp[2], w_fp[112], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 670 OF 1240 *** - - // Wavefunction(s) for diagram number 670 - // (none) - - // Amplitude(s) for diagram number 670 - VVVV1_0( w_fp[92], w_fp[66], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[19] += amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[52] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[62] -= amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[66] += amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[79] += amp_sv[0]; - jamp_sv[82] -= amp_sv[0]; - jamp_sv[97] -= amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - VVVV3_0( w_fp[92], w_fp[66], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[19] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[62] -= amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[97] -= amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - VVVV4_0( w_fp[92], w_fp[66], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[20] -= amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - - // *** DIAGRAM 671 OF 1240 *** - - // Wavefunction(s) for diagram number 671 - VVV1P0_1( w_fp[92], w_fp[66], COUPs[0], 1.0, 0., 0., w_fp[104] ); - - // Amplitude(s) for diagram number 671 - VVV1_0( w_fp[8], w_fp[4], w_fp[104], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[19] += amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[52] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[62] -= amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[66] += amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[79] 
+= amp_sv[0]; - jamp_sv[82] -= amp_sv[0]; - jamp_sv[97] -= amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - - // *** DIAGRAM 672 OF 1240 *** - - // Wavefunction(s) for diagram number 672 - // (none) - - // Amplitude(s) for diagram number 672 - VVV1_0( w_fp[66], w_fp[4], w_fp[113], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[19] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[62] -= amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[97] -= amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - - // *** DIAGRAM 673 OF 1240 *** - - // Wavefunction(s) for diagram number 673 - // (none) - - // Amplitude(s) for diagram number 673 - VVV1_0( w_fp[66], w_fp[8], w_fp[112], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[20] -= amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - - // *** DIAGRAM 674 OF 1240 *** - - // Wavefunction(s) for diagram number 674 - // (none) - - // Amplitude(s) for diagram number 674 - FFV1_0( w_fp[3], w_fp[33], w_fp[104], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 675 OF 1240 *** - - // Wavefunction(s) for diagram number 675 - // (none) - - // Amplitude(s) for diagram number 675 - FFV1_0( w_fp[3], w_fp[114], w_fp[66], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[52] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - - // *** DIAGRAM 676 OF 1240 *** - - // Wavefunction(s) for diagram number 676 - // (none) - - // Amplitude(s) for diagram number 676 - FFV1_0( w_fp[99], w_fp[33], w_fp[66], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[56] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[62] -= amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - - // *** DIAGRAM 677 OF 1240 *** - - // Wavefunction(s) for diagram number 677 - // (none) - - // Amplitude(s) for diagram number 677 - FFV1_0( w_fp[46], w_fp[2], w_fp[104], COUPs[1], 
1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 678 OF 1240 *** - - // Wavefunction(s) for diagram number 678 - // (none) - - // Amplitude(s) for diagram number 678 - FFV1_0( w_fp[46], w_fp[96], w_fp[66], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[19] += amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[97] -= amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - - // *** DIAGRAM 679 OF 1240 *** - - // Wavefunction(s) for diagram number 679 - // (none) - - // Amplitude(s) for diagram number 679 - FFV1_0( w_fp[88], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[37] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - - // *** DIAGRAM 680 OF 1240 *** - - // Wavefunction(s) for diagram number 680 - VVV1P0_1( w_fp[92], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[104] ); - - // Amplitude(s) for diagram number 680 - VVV1_0( w_fp[104], w_fp[13], w_fp[5], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[19] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[63] -= amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - - // *** DIAGRAM 681 OF 1240 *** - - // Wavefunction(s) for diagram number 681 - // (none) - - // Amplitude(s) for diagram number 681 - VVV1_0( w_fp[104], w_fp[10], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[18] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - - // *** DIAGRAM 682 OF 1240 *** - - // Wavefunction(s) for diagram number 682 - // (none) - - // Amplitude(s) for diagram number 682 - VVVV1_0( w_fp[8], w_fp[4], w_fp[5], w_fp[104], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[19] -= amp_sv[0]; - jamp_sv[29] += 
amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[63] -= amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - VVVV3_0( w_fp[8], w_fp[4], w_fp[5], w_fp[104], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[18] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - VVVV4_0( w_fp[8], w_fp[4], w_fp[5], w_fp[104], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[18] -= amp_sv[0]; - jamp_sv[19] += amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[62] -= amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[97] -= amp_sv[0]; - - // *** DIAGRAM 683 OF 1240 *** - - // Wavefunction(s) for diagram number 683 - // (none) - - // Amplitude(s) for diagram number 683 - VVV1_0( w_fp[112], w_fp[108], w_fp[5], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[21] += amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[42] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - - // *** DIAGRAM 684 OF 1240 *** - - // Wavefunction(s) for diagram number 684 - // (none) - - // Amplitude(s) for diagram number 684 - VVV1_0( w_fp[112], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[20] += amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[42] += amp_sv[0]; - jamp_sv[52] -= amp_sv[0]; - jamp_sv[66] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[79] += amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[82] -= amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - - // *** DIAGRAM 685 OF 1240 *** - - // Wavefunction(s) for diagram number 685 - // (none) - - // Amplitude(s) for diagram number 685 - VVVV1_0( w_fp[1], w_fp[8], w_fp[5], w_fp[112], 
COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[21] += amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[42] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - VVVV3_0( w_fp[1], w_fp[8], w_fp[5], w_fp[112], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[20] -= amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - VVVV4_0( w_fp[1], w_fp[8], w_fp[5], w_fp[112], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[20] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - - // *** DIAGRAM 686 OF 1240 *** - - // Wavefunction(s) for diagram number 686 - // (none) - - // Amplitude(s) for diagram number 686 - VVV1_0( w_fp[86], w_fp[108], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[23] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - - // *** DIAGRAM 687 OF 1240 *** - - // Wavefunction(s) for diagram number 687 - // (none) - - // Amplitude(s) for diagram number 687 - VVV1_0( w_fp[86], w_fp[1], w_fp[13], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[22] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[55] += amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[58] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[76] -= amp_sv[0]; - jamp_sv[90] 
+= amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - - // *** DIAGRAM 688 OF 1240 *** - - // Wavefunction(s) for diagram number 688 - // (none) - - // Amplitude(s) for diagram number 688 - VVVV1_0( w_fp[1], w_fp[8], w_fp[4], w_fp[86], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[23] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - VVVV3_0( w_fp[1], w_fp[8], w_fp[4], w_fp[86], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[22] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - VVVV4_0( w_fp[1], w_fp[8], w_fp[4], w_fp[86], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[22] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - - // *** DIAGRAM 689 OF 1240 *** - - // Wavefunction(s) for diagram number 689 - VVVV1P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[98] ); - VVVV3P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[62] ); - VVVV4P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[101] ); - - // Amplitude(s) for diagram number 689 - VVV1_0( w_fp[8], w_fp[5], w_fp[98], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[18] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[55] += amp_sv[0]; - jamp_sv[58] -= amp_sv[0]; - jamp_sv[76] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[86] -= amp_sv[0]; - jamp_sv[87] += amp_sv[0]; - jamp_sv[90] += amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[96] -= amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[62], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[20] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[34] += 
amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[101], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[18] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - - // *** DIAGRAM 690 OF 1240 *** - - // Wavefunction(s) for diagram number 690 - VVVV1P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[109] ); - VVVV3P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[110] ); - VVVV4P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[111] ); - - // Amplitude(s) for diagram number 690 - VVV1_0( w_fp[8], w_fp[4], w_fp[109], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[19] += amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[52] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[62] -= amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[66] += amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[79] += amp_sv[0]; - jamp_sv[82] -= amp_sv[0]; - jamp_sv[97] -= amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[110], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[22] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[111], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[19] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[63] -= amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - - // *** DIAGRAM 691 OF 1240 *** - - // Wavefunction(s) for diagram number 691 - // (none) - - // Amplitude(s) for diagram number 691 - VVV1_0( w_fp[1], w_fp[8], w_fp[107], 
COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[21] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[38] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[99] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[95], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[23] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[105], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[21] += amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[42] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - - // *** DIAGRAM 692 OF 1240 *** - - // Wavefunction(s) for diagram number 692 - // (none) - - // Amplitude(s) for diagram number 692 - VVVV1_0( w_fp[92], w_fp[1], w_fp[8], w_fp[24], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[18] += amp_sv[0]; - jamp_sv[19] -= amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[42] += amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[63] -= amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - jamp_sv[86] -= amp_sv[0]; - jamp_sv[87] += amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[96] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - VVVV3_0( w_fp[92], w_fp[1], w_fp[8], w_fp[24], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[18] += amp_sv[0]; - jamp_sv[19] -= amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[38] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[63] -= amp_sv[0]; - jamp_sv[86] -= amp_sv[0]; - jamp_sv[87] += amp_sv[0]; - jamp_sv[96] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[99] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - VVVV4_0( w_fp[92], w_fp[1], w_fp[8], w_fp[24], COUPs[2], 1.0, &_fp[0] ); -#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[21] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[38] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[99] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - - // *** DIAGRAM 693 OF 1240 *** - - // Wavefunction(s) for diagram number 693 - // (none) - - // Amplitude(s) for diagram number 693 - VVV1_0( w_fp[8], w_fp[24], w_fp[104], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[18] += amp_sv[0]; - jamp_sv[19] -= amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[42] += amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[63] -= amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - jamp_sv[86] -= amp_sv[0]; - jamp_sv[87] += amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[96] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - - // *** DIAGRAM 694 OF 1240 *** - - // Wavefunction(s) for diagram number 694 - // (none) - - // Amplitude(s) for diagram number 694 - VVV1_0( w_fp[1], w_fp[24], w_fp[113], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[18] += amp_sv[0]; - jamp_sv[19] -= amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[38] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[63] -= amp_sv[0]; - jamp_sv[86] -= amp_sv[0]; - jamp_sv[87] += amp_sv[0]; - jamp_sv[96] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[99] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - - // *** DIAGRAM 695 OF 1240 *** - - // Wavefunction(s) for diagram number 695 - // (none) - - // Amplitude(s) for diagram number 695 - VVV1_0( w_fp[1], w_fp[8], w_fp[102], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[21] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[38] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[99] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - - // *** DIAGRAM 696 OF 1240 *** - - // Wavefunction(s) for diagram number 696 - // (none) - - // Amplitude(s) for diagram number 696 - VVV1_0( w_fp[104], w_fp[37], w_fp[5], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0]; - 
jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 697 OF 1240 *** - - // Wavefunction(s) for diagram number 697 - // (none) - - // Amplitude(s) for diagram number 697 - FFV1_0( w_fp[3], w_fp[35], w_fp[104], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[61] += amp_sv[0]; - jamp_sv[62] -= amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - - // *** DIAGRAM 698 OF 1240 *** - - // Wavefunction(s) for diagram number 698 - // (none) - - // Amplitude(s) for diagram number 698 - FFV1_0( w_fp[99], w_fp[100], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 699 OF 1240 *** - - // Wavefunction(s) for diagram number 699 - // (none) - - // Amplitude(s) for diagram number 699 - FFV1_0( w_fp[99], w_fp[35], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 700 OF 1240 *** - - // Wavefunction(s) for diagram number 700 - // (none) - - // Amplitude(s) for diagram number 700 - FFV1_0( w_fp[3], w_fp[100], w_fp[86], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[55] += amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[58] -= amp_sv[0]; - - // *** DIAGRAM 701 OF 1240 *** - - // Wavefunction(s) for diagram number 701 - // (none) - - // Amplitude(s) for diagram number 701 - VVV1_0( w_fp[86], w_fp[1], w_fp[37], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 702 OF 1240 *** - - // Wavefunction(s) for diagram number 702 - // (none) - - // Amplitude(s) for diagram number 702 - FFV1_0( w_fp[3], w_fp[33], w_fp[109], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[110], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // 
Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[111], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 703 OF 1240 *** - - // Wavefunction(s) for diagram number 703 - // (none) - - // Amplitude(s) for diagram number 703 - FFV1_0( w_fp[38], w_fp[33], w_fp[104], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[52] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - - // *** DIAGRAM 704 OF 1240 *** - - // Wavefunction(s) for diagram number 704 - // (none) - - // Amplitude(s) for diagram number 704 - FFV1_0( w_fp[38], w_fp[114], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 705 OF 1240 *** - - // Wavefunction(s) for diagram number 705 - // (none) - - // Amplitude(s) for diagram number 705 - FFV1_0( w_fp[90], w_fp[33], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 706 OF 1240 *** - - // Wavefunction(s) for diagram number 706 - // (none) - - // Amplitude(s) for diagram number 706 - VVV1_0( w_fp[104], w_fp[45], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 707 OF 1240 *** - - // Wavefunction(s) for diagram number 707 - // (none) - - // Amplitude(s) for diagram number 707 - FFV1_0( w_fp[3], w_fp[43], w_fp[104], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[85] += amp_sv[0]; - jamp_sv[86] -= amp_sv[0]; - 
jamp_sv[87] += amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - - // *** DIAGRAM 708 OF 1240 *** - - // Wavefunction(s) for diagram number 708 - // (none) - - // Amplitude(s) for diagram number 708 - FFV1_0( w_fp[99], w_fp[89], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 709 OF 1240 *** - - // Wavefunction(s) for diagram number 709 - // (none) - - // Amplitude(s) for diagram number 709 - FFV1_0( w_fp[99], w_fp[43], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 710 OF 1240 *** - - // Wavefunction(s) for diagram number 710 - // (none) - - // Amplitude(s) for diagram number 710 - FFV1_0( w_fp[3], w_fp[89], w_fp[112], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[79] += amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[82] -= amp_sv[0]; - - // *** DIAGRAM 711 OF 1240 *** - - // Wavefunction(s) for diagram number 711 - // (none) - - // Amplitude(s) for diagram number 711 - VVV1_0( w_fp[112], w_fp[1], w_fp[45], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 712 OF 1240 *** - - // Wavefunction(s) for diagram number 712 - // (none) - - // Amplitude(s) for diagram number 712 - FFV1_0( w_fp[3], w_fp[39], w_fp[98], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[39], w_fp[62], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[39], w_fp[101], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code 
base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 713 OF 1240 *** - - // Wavefunction(s) for diagram number 713 - // (none) - - // Amplitude(s) for diagram number 713 - FFV1_0( w_fp[46], w_fp[39], w_fp[104], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[76] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - - // *** DIAGRAM 714 OF 1240 *** - - // Wavefunction(s) for diagram number 714 - // (none) - - // Amplitude(s) for diagram number 714 - FFV1_0( w_fp[46], w_fp[106], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 715 OF 1240 *** - - // Wavefunction(s) for diagram number 715 - // (none) - - // Amplitude(s) for diagram number 715 - FFV1_0( w_fp[88], w_fp[39], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 716 OF 1240 *** - - // Wavefunction(s) for diagram number 716 - // (none) - - // Amplitude(s) for diagram number 716 - VVV1_0( w_fp[104], w_fp[54], w_fp[5], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 717 OF 1240 *** - - // Wavefunction(s) for diagram number 717 - // (none) - - // Amplitude(s) for diagram number 717 - FFV1_0( w_fp[7], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[19] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[97] -= amp_sv[0]; - - // *** DIAGRAM 718 OF 1240 *** - - // Wavefunction(s) for diagram number 718 - // (none) - - // Amplitude(s) for diagram number 718 - FFV1_0( w_fp[78], w_fp[96], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 719 OF 1240 *** - - // Wavefunction(s) for diagram number 719 - // (none) - - // 
Amplitude(s) for diagram number 719 - FFV1_0( w_fp[7], w_fp[96], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 720 OF 1240 *** - - // Wavefunction(s) for diagram number 720 - // (none) - - // Amplitude(s) for diagram number 720 - FFV1_0( w_fp[78], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[22] += amp_sv[0]; - jamp_sv[76] -= amp_sv[0]; - jamp_sv[90] += amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - - // *** DIAGRAM 721 OF 1240 *** - - // Wavefunction(s) for diagram number 721 - // (none) - - // Amplitude(s) for diagram number 721 - VVV1_0( w_fp[86], w_fp[1], w_fp[54], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 722 OF 1240 *** - - // Wavefunction(s) for diagram number 722 - // (none) - - // Amplitude(s) for diagram number 722 - FFV1_0( w_fp[46], w_fp[2], w_fp[109], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[110], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[111], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 723 OF 1240 *** - - // Wavefunction(s) for diagram number 723 - // (none) - - // Amplitude(s) for diagram number 723 - 
VVV1_0( w_fp[104], w_fp[23], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 724 OF 1240 *** - - // Wavefunction(s) for diagram number 724 - // (none) - - // Amplitude(s) for diagram number 724 - FFV1_0( w_fp[25], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[18] += amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[42] += amp_sv[0]; - jamp_sv[96] -= amp_sv[0]; - - // *** DIAGRAM 725 OF 1240 *** - - // Wavefunction(s) for diagram number 725 - // (none) - - // Amplitude(s) for diagram number 725 - FFV1_0( w_fp[58], w_fp[96], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 726 OF 1240 *** - - // Wavefunction(s) for diagram number 726 - // (none) - - // Amplitude(s) for diagram number 726 - FFV1_0( w_fp[25], w_fp[96], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 727 OF 1240 *** - - // Wavefunction(s) for diagram number 727 - // (none) - - // Amplitude(s) for diagram number 727 - FFV1_0( w_fp[58], w_fp[2], w_fp[112], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[20] += amp_sv[0]; - jamp_sv[52] -= amp_sv[0]; - jamp_sv[66] += amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - - // *** DIAGRAM 728 OF 1240 *** - - // Wavefunction(s) for diagram number 728 - // (none) - - // Amplitude(s) for diagram number 728 - VVV1_0( w_fp[112], w_fp[1], w_fp[23], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 729 OF 1240 *** - - // Wavefunction(s) for diagram number 729 - // (none) - - // Amplitude(s) for diagram number 729 - FFV1_0( w_fp[38], w_fp[2], w_fp[98], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] += cxtype( 0, 
-    jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[38], w_fp[2], w_fp[62], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[38], w_fp[2], w_fp[101], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 730 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 730
-    // (none)
-
-    // Amplitude(s) for diagram number 730
-    FFV1_0( w_fp[3], w_fp[17], w_fp[104], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 731 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 731
-    // (none)
-
-    // Amplitude(s) for diagram number 731
-    FFV1_0( w_fp[26], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 732 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 732
-    // (none)
-
-    // Amplitude(s) for diagram number 732
-    FFV1_0( w_fp[3], w_fp[96], w_fp[59], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 733 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 733
-    // (none)
-
-    // Amplitude(s) for diagram number 733
-    FFV1_0( w_fp[26], w_fp[96], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[18] += amp_sv[0];
-    jamp_sv[19] -= amp_sv[0];
-    jamp_sv[96] -= amp_sv[0];
-    jamp_sv[97] += amp_sv[0];
-
-    // *** DIAGRAM 734 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 734
-    // (none)
-
-    // Amplitude(s) for diagram number 734
-    FFV1_0( w_fp[99], w_fp[2], w_fp[59], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 735 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 735
-    // (none)
-
-    // Amplitude(s) for diagram number 735
-    FFV1_0( w_fp[99], w_fp[17], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[62] += amp_sv[0];
-    jamp_sv[63] -= amp_sv[0];
-    jamp_sv[86] -= amp_sv[0];
-    jamp_sv[87] += amp_sv[0];
-
-    // *** DIAGRAM 736 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 736
-    // (none)
-
-    // Amplitude(s) for diagram number 736
-    FFV1_0( w_fp[3], w_fp[96], w_fp[73], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[96], w_fp[79], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[96], w_fp[80], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 737 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 737
-    // (none)
-
-    // Amplitude(s) for diagram number 737
-    FFV1_0( w_fp[99], w_fp[2], w_fp[73], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[99], w_fp[2], w_fp[79], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[99], w_fp[2], w_fp[80], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 738 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 738
-    // (none)
-
-    // Amplitude(s) for diagram number 738
-    VVV1_0( w_fp[92], w_fp[73], w_fp[8], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[18] -= amp_sv[0];
-    jamp_sv[19] += amp_sv[0];
-    jamp_sv[21] += amp_sv[0];
-    jamp_sv[23] -= amp_sv[0];
-    jamp_sv[32] += amp_sv[0];
-    jamp_sv[33] -= amp_sv[0];
-    jamp_sv[38] -= amp_sv[0];
-    jamp_sv[39] += amp_sv[0];
-    jamp_sv[62] -= amp_sv[0];
-    jamp_sv[63] += amp_sv[0];
-    jamp_sv[86] += amp_sv[0];
-    jamp_sv[87] -= amp_sv[0];
-    jamp_sv[96] += amp_sv[0];
-    jamp_sv[97] -= amp_sv[0];
-    jamp_sv[99] -= amp_sv[0];
-    jamp_sv[101] += amp_sv[0];
-    VVV1_0( w_fp[92], w_fp[79], w_fp[8], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[19] += amp_sv[0];
-    jamp_sv[20] -= amp_sv[0];
-    jamp_sv[21] += amp_sv[0];
-    jamp_sv[22] -= amp_sv[0];
-    jamp_sv[38] -= amp_sv[0];
-    jamp_sv[39] += amp_sv[0];
-    jamp_sv[56] += amp_sv[0];
-    jamp_sv[57] -= amp_sv[0];
-    jamp_sv[62] -= amp_sv[0];
-    jamp_sv[63] += amp_sv[0];
-    jamp_sv[80] += amp_sv[0];
-    jamp_sv[81] -= amp_sv[0];
-    jamp_sv[97] -= amp_sv[0];
-    jamp_sv[98] += amp_sv[0];
-    jamp_sv[99] -= amp_sv[0];
-    jamp_sv[100] += amp_sv[0];
-    VVV1_0( w_fp[92], w_fp[80], w_fp[8], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[18] += amp_sv[0];
-    jamp_sv[20] -= amp_sv[0];
-    jamp_sv[22] -= amp_sv[0];
-    jamp_sv[23] += amp_sv[0];
-    jamp_sv[32] -= amp_sv[0];
-    jamp_sv[33] += amp_sv[0];
-    jamp_sv[56] += amp_sv[0];
-    jamp_sv[57] -= amp_sv[0];
-    jamp_sv[80] += amp_sv[0];
-    jamp_sv[81] -= amp_sv[0];
-    jamp_sv[86] -= amp_sv[0];
-    jamp_sv[87] += amp_sv[0];
-    jamp_sv[96] -= amp_sv[0];
-    jamp_sv[98] += amp_sv[0];
-    jamp_sv[100] += amp_sv[0];
-    jamp_sv[101] -= amp_sv[0];
-
-    // *** DIAGRAM 739 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 739
-    FFV1_1( w_fp[77], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[92] );
-
-    // Amplitude(s) for diagram number 739
-    FFV1_0( w_fp[7], w_fp[92], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[29] -= amp_sv[0];
-
-    // *** DIAGRAM 740 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 740
-    // (none)
-
-    // Amplitude(s) for diagram number 740
-    FFV1_0( w_fp[53], w_fp[92], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[27] -= amp_sv[0];
-
-    // *** DIAGRAM 741 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 741
-    FFV1_2( w_fp[46], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] );
-
-    // Amplitude(s) for diagram number 741
-    FFV1_0( w_fp[99], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[40] -= amp_sv[0];
-
-    // *** DIAGRAM 742 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 742
-    // (none)
-
-    // Amplitude(s) for diagram number 742
-    FFV1_0( w_fp[99], w_fp[85], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[46] -= amp_sv[0];
-
-    // *** DIAGRAM 743 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 743
-    // (none)
-
-    // Amplitude(s) for diagram number 743
-    FFV1_0( w_fp[53], w_fp[9], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[37] -= amp_sv[0];
-
-    // *** DIAGRAM 744 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 744
-    // (none)
-
-    // Amplitude(s) for diagram number 744
-    FFV1_0( w_fp[7], w_fp[85], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[43] -= amp_sv[0];
-
-    // *** DIAGRAM 745 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 745
-    // (none)
-
-    // Amplitude(s) for diagram number 745
-    FFV1_0( w_fp[46], w_fp[92], w_fp[29], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 746 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 746
-    // (none)
-
-    // Amplitude(s) for diagram number 746
-    FFV1_0( w_fp[99], w_fp[77], w_fp[29], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 747 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 747
-    VVV1P0_1( w_fp[0], w_fp[29], COUPs[0], 1.0, 0., 0., w_fp[96] );
-
-    // Amplitude(s) for diagram number 747
-    FFV1_0( w_fp[46], w_fp[77], w_fp[96], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[27] += amp_sv[0];
-    jamp_sv[29] -= amp_sv[0];
-    jamp_sv[40] -= amp_sv[0];
-    jamp_sv[46] += amp_sv[0];
-
-    // *** DIAGRAM 748 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 748
-    // (none)
-
-    // Amplitude(s) for diagram number 748
-    FFV1_0( w_fp[25], w_fp[92], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[28] -= amp_sv[0];
-
-    // *** DIAGRAM 749 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 749
-    // (none)
-
-    // Amplitude(s) for diagram number 749
-    FFV1_0( w_fp[48], w_fp[92], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[25] -= amp_sv[0];
-
-    // *** DIAGRAM 750 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 750
-    FFV1_2( w_fp[38], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[104] );
-
-    // Amplitude(s) for diagram number 750
-    FFV1_0( w_fp[104], w_fp[87], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[34] -= amp_sv[0];
-
-    // *** DIAGRAM 751 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 751
-    // (none)
-
-    // Amplitude(s) for diagram number 751
-    FFV1_0( w_fp[104], w_fp[85], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[44] -= amp_sv[0];
-
-    // *** DIAGRAM 752 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 752
-    // (none)
-
-    // Amplitude(s) for diagram number 752
-    FFV1_0( w_fp[48], w_fp[87], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[31] -= amp_sv[0];
-
-    // *** DIAGRAM 753 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 753
-    // (none)
-
-    // Amplitude(s) for diagram number 753
-    FFV1_0( w_fp[25], w_fp[85], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[42] -= amp_sv[0];
-
-    // *** DIAGRAM 754 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 754
-    // (none)
-
-    // Amplitude(s) for diagram number 754
-    FFV1_0( w_fp[38], w_fp[92], w_fp[27], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 755 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 755
-    // (none)
-
-    // Amplitude(s) for diagram number 755
-    FFV1_0( w_fp[104], w_fp[77], w_fp[27], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 756 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 756
-    VVV1P0_1( w_fp[0], w_fp[27], COUPs[0], 1.0, 0., 0., w_fp[101] );
-
-    // Amplitude(s) for diagram number 756
-    FFV1_0( w_fp[38], w_fp[77], w_fp[101], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[25] += amp_sv[0];
-    jamp_sv[28] -= amp_sv[0];
-    jamp_sv[34] -= amp_sv[0];
-    jamp_sv[44] += amp_sv[0];
-
-    // *** DIAGRAM 757 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 757
-    // (none)
-
-    // Amplitude(s) for diagram number 757
-    FFV1_0( w_fp[28], w_fp[92], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[26] -= amp_sv[0];
-
-    // *** DIAGRAM 758 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 758
-    // (none)
-
-    // Amplitude(s) for diagram number 758
-    FFV1_0( w_fp[40], w_fp[92], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[24] -= amp_sv[0];
-
-    // *** DIAGRAM 759 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 759
-    FFV1_2( w_fp[41], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[62] );
-
-    // Amplitude(s) for diagram number 759
-    FFV1_0( w_fp[62], w_fp[87], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[32] -= amp_sv[0];
-
-    // *** DIAGRAM 760 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 760
-    // (none)
-
-    // Amplitude(s) for diagram number 760
-    FFV1_0( w_fp[62], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[38] -= amp_sv[0];
-
-    // *** DIAGRAM 761 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 761
-    // (none)
-
-    // Amplitude(s) for diagram number 761
-    FFV1_0( w_fp[40], w_fp[87], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[30] -= amp_sv[0];
-
-    // *** DIAGRAM 762 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 762
-    // (none)
-
-    // Amplitude(s) for diagram number 762
-    FFV1_0( w_fp[28], w_fp[9], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[36] -= amp_sv[0];
-
-    // *** DIAGRAM 763 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 763
-    // (none)
-
-    // Amplitude(s) for diagram number 763
-    FFV1_0( w_fp[41], w_fp[92], w_fp[24], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 764 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 764
-    // (none)
-
-    // Amplitude(s) for diagram number 764
-    FFV1_0( w_fp[62], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 765 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 765
-    VVV1P0_1( w_fp[0], w_fp[24], COUPs[0], 1.0, 0., 0., w_fp[98] );
-
-    // Amplitude(s) for diagram number 765
-    FFV1_0( w_fp[41], w_fp[77], w_fp[98], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[24] += amp_sv[0];
-    jamp_sv[26] -= amp_sv[0];
-    jamp_sv[32] -= amp_sv[0];
-    jamp_sv[38] += amp_sv[0];
-
-    // *** DIAGRAM 766 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 766
-    // (none)
-
-    // Amplitude(s) for diagram number 766
-    FFV1_0( w_fp[26], w_fp[92], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 767 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 767
-    // (none)
-
-    // Amplitude(s) for diagram number 767
-    FFV1_0( w_fp[3], w_fp[92], w_fp[42], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[24] += amp_sv[0];
-    jamp_sv[26] -= amp_sv[0];
-    jamp_sv[28] -= amp_sv[0];
-    jamp_sv[29] += amp_sv[0];
-
-    // *** DIAGRAM 768 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 768
-    // (none)
-
-    // Amplitude(s) for diagram number 768
-    VVV1_0( w_fp[98], w_fp[34], w_fp[6], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 769 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 769
-    // (none)
-
-    // Amplitude(s) for diagram number 769
-    FFV1_0( w_fp[3], w_fp[85], w_fp[98], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[42] += amp_sv[0];
-    jamp_sv[43] -= amp_sv[0];
-    jamp_sv[45] -= amp_sv[0];
-    jamp_sv[47] += amp_sv[0];
-
-    // *** DIAGRAM 770 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 770
-    // (none)
-
-    // Amplitude(s) for diagram number 770
-    VVV1_0( w_fp[0], w_fp[34], w_fp[42], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 771 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 771
-    // (none)
-
-    // Amplitude(s) for diagram number 771
-    FFV1_0( w_fp[26], w_fp[85], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 772 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 772
-    VVVV1P0_1( w_fp[0], w_fp[24], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[85] );
-    VVVV3P0_1( w_fp[0], w_fp[24], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[112] );
-    VVVV4P0_1( w_fp[0], w_fp[24], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[111] );
-
-    // Amplitude(s) for diagram number 772
-    FFV1_0( w_fp[3], w_fp[77], w_fp[85], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[77], w_fp[112], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[77], w_fp[111], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 773 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 773
-    // (none)
-
-    // Amplitude(s) for diagram number 773
-    FFV1_0( w_fp[14], w_fp[92], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 774 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 774
-    // (none)
-
-    // Amplitude(s) for diagram number 774
-    FFV1_0( w_fp[3], w_fp[92], w_fp[16], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[25] += amp_sv[0];
-    jamp_sv[26] -= amp_sv[0];
-    jamp_sv[27] += amp_sv[0];
-    jamp_sv[28] -= amp_sv[0];
-
-    // *** DIAGRAM 775 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 775
-    // (none)
-
-    // Amplitude(s) for diagram number 775
-    VVV1_0( w_fp[101], w_fp[34], w_fp[5], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 776 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 776
-    // (none)
-
-    // Amplitude(s) for diagram number 776
-    FFV1_0( w_fp[3], w_fp[9], w_fp[101], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[36] += amp_sv[0];
-    jamp_sv[37] -= amp_sv[0];
-    jamp_sv[39] -= amp_sv[0];
-    jamp_sv[41] += amp_sv[0];
-
-    // *** DIAGRAM 777 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 777
-    // (none)
-
-    // Amplitude(s) for diagram number 777
-    VVV1_0( w_fp[0], w_fp[34], w_fp[16], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 778 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 778
-    // (none)
-
-    // Amplitude(s) for diagram number 778
-    FFV1_0( w_fp[14], w_fp[9], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 779 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 779
-    VVVV1P0_1( w_fp[0], w_fp[27], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[9] );
-    VVVV3P0_1( w_fp[0], w_fp[27], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[110] );
-    VVVV4P0_1( w_fp[0], w_fp[27], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[109] );
-
-    // Amplitude(s) for diagram number 779
-    FFV1_0( w_fp[3], w_fp[77], w_fp[9], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[77], w_fp[110], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[77], w_fp[109], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 780 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 780
-    // (none)
-
-    // Amplitude(s) for diagram number 780
-    FFV1_0( w_fp[12], w_fp[92], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 781 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 781
-    // (none)
-
-    // Amplitude(s) for diagram number 781
-    FFV1_0( w_fp[3], w_fp[92], w_fp[19], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[24] += amp_sv[0];
-    jamp_sv[25] -= amp_sv[0];
-    jamp_sv[27] -= amp_sv[0];
-    jamp_sv[29] += amp_sv[0];
-
-    // *** DIAGRAM 782 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 782
-    // (none)
-
-    // Amplitude(s) for diagram number 782
-    VVV1_0( w_fp[96], w_fp[34], w_fp[4], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 783 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 783
-    // (none)
-
-    // Amplitude(s) for diagram number 783
-    FFV1_0( w_fp[3], w_fp[87], w_fp[96], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[30] += amp_sv[0];
-    jamp_sv[31] -= amp_sv[0];
-    jamp_sv[33] -= amp_sv[0];
-    jamp_sv[35] += amp_sv[0];
-
-    // *** DIAGRAM 784 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 784
-    // (none)
-
-    // Amplitude(s) for diagram number 784
-    VVV1_0( w_fp[0], w_fp[34], w_fp[19], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 785 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 785
-    // (none)
-
-    // Amplitude(s) for diagram number 785
-    FFV1_0( w_fp[12], w_fp[87], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 786 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 786
-    VVVV1P0_1( w_fp[0], w_fp[4], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[87] );
-    VVVV3P0_1( w_fp[0], w_fp[4], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[34] );
-    VVVV4P0_1( w_fp[0], w_fp[4], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[86] );
-
-    // Amplitude(s) for diagram number 786
-    FFV1_0( w_fp[3], w_fp[77], w_fp[87], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[77], w_fp[34], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[77], w_fp[86], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 787 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 787
-    // (none)
-
-    // Amplitude(s) for diagram number 787
-    FFV1_0( w_fp[3], w_fp[92], w_fp[30], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[24] += amp_sv[0];
-    jamp_sv[25] -= amp_sv[0];
-    jamp_sv[27] -= amp_sv[0];
-    jamp_sv[29] += amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[92], w_fp[31], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[25] -= amp_sv[0];
-    jamp_sv[26] += amp_sv[0];
-    jamp_sv[27] -= amp_sv[0];
-    jamp_sv[28] += amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[92], w_fp[32], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[24] -= amp_sv[0];
-    jamp_sv[26] += amp_sv[0];
-    jamp_sv[28] += amp_sv[0];
-    jamp_sv[29] -= amp_sv[0];
-
-    // *** DIAGRAM 788 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 788
-    VVV1P0_1( w_fp[0], w_fp[30], COUPs[0], 1.0, 0., 0., w_fp[92] );
-    VVV1P0_1( w_fp[0], w_fp[31], COUPs[0], 1.0, 0., 0., w_fp[88] );
-    VVV1P0_1( w_fp[0], w_fp[32], COUPs[0], 1.0, 0., 0., w_fp[106] );
-
-    // Amplitude(s) for diagram number 788
-    FFV1_0( w_fp[3], w_fp[77], w_fp[92], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[77], w_fp[88], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[77], w_fp[106], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 789 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 789
-    FFV1_2( w_fp[52], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[90] );
-
-    // Amplitude(s) for diagram number 789
-    FFV1_0( w_fp[90], w_fp[35], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[64] -= amp_sv[0];
-
-    // *** DIAGRAM 790 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 790
-    // (none)
-
-    // Amplitude(s) for diagram number 790
-    FFV1_0( w_fp[90], w_fp[36], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[70] -= amp_sv[0];
-
-    // *** DIAGRAM 791 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 791
-    FFV1_1( w_fp[33], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[114] );
-
-    // Amplitude(s) for diagram number 791
-    FFV1_0( w_fp[22], w_fp[114], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[53] -= amp_sv[0];
-
-    // *** DIAGRAM 792 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 792
-    // (none)
-
-    // Amplitude(s) for diagram number 792
-    FFV1_0( w_fp[21], w_fp[114], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[51] -= amp_sv[0];
-
-    // *** DIAGRAM 793 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 793
-    // (none)
-
-    // Amplitude(s) for diagram number 793
-    FFV1_0( w_fp[22], w_fp[36], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[67] -= amp_sv[0];
-
-    // *** DIAGRAM 794 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 794
-    // (none)
-
-    // Amplitude(s) for diagram number 794
-    FFV1_0( w_fp[21], w_fp[35], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[61] -= amp_sv[0];
-
-    // *** DIAGRAM 795 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 795
-    // (none)
-
-    // Amplitude(s) for diagram number 795
-    FFV1_0( w_fp[90], w_fp[33], w_fp[29], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 796 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 796
-    // (none)
-
-    // Amplitude(s) for diagram number 796
-    FFV1_0( w_fp[52], w_fp[114], w_fp[29], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 797 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 797
-    // (none)
-
-    // Amplitude(s) for diagram number 797
-    FFV1_0( w_fp[52], w_fp[33], w_fp[96], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[51] += amp_sv[0];
-    jamp_sv[53] -= amp_sv[0];
-    jamp_sv[64] -= amp_sv[0];
-    jamp_sv[70] += amp_sv[0];
-
-    // *** DIAGRAM 798 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 798
-    // (none)
-
-    // Amplitude(s) for diagram number 798
-    FFV1_0( w_fp[90], w_fp[43], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[88] -= amp_sv[0];
-
-    // *** DIAGRAM 799 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 799
-    // (none)
-
-    // Amplitude(s) for diagram number 799
-    FFV1_0( w_fp[90], w_fp[44], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[94] -= amp_sv[0];
-
-    // *** DIAGRAM 800 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 800
-    FFV1_1( w_fp[39], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[102] );
-
-    // Amplitude(s) for diagram number 800
-    FFV1_0( w_fp[56], w_fp[102], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[77] -= amp_sv[0];
-
-    // *** DIAGRAM 801 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 801
-    // (none)
-
-    // Amplitude(s) for diagram number 801
-    FFV1_0( w_fp[21], w_fp[102], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[75] -= amp_sv[0];
-
-    // *** DIAGRAM 802 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 802
-    // (none)
-
-    // Amplitude(s) for diagram number 802
-    FFV1_0( w_fp[56], w_fp[44], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[91] -= amp_sv[0];
-
-    // *** DIAGRAM 803 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 803
-    // (none)
-
-    // Amplitude(s) for diagram number 803
-    FFV1_0( w_fp[21], w_fp[43], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[85] -= amp_sv[0];
-
-    // *** DIAGRAM 804 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 804
-    // (none)
-
-    // Amplitude(s) for diagram number 804
-    FFV1_0( w_fp[90], w_fp[39], w_fp[27], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 805 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 805
-    // (none)
-
-    // Amplitude(s) for diagram number 805
-    FFV1_0( w_fp[52], w_fp[102], w_fp[27], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 806 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 806
-    // (none)
-
-    // Amplitude(s) for diagram number 806
-    FFV1_0( w_fp[52], w_fp[39], w_fp[101], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[75] += amp_sv[0];
-    jamp_sv[77] -= amp_sv[0];
-    jamp_sv[88] -= amp_sv[0];
-    jamp_sv[94] += amp_sv[0];
-
-    // *** DIAGRAM 807 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 807
-    // (none)
-
-    // Amplitude(s) for diagram number 807
-    FFV1_0( w_fp[90], w_fp[49], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[112] -= amp_sv[0];
-
-    // *** DIAGRAM 808 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 808
-    // (none)
-
-    // Amplitude(s) for diagram number 808
-    FFV1_0( w_fp[90], w_fp[50], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[118] -= amp_sv[0];
-
-    // *** DIAGRAM 809 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 809
-    FFV1_1( w_fp[47], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[113] );
-
-    // Amplitude(s) for diagram number 809
-    FFV1_0( w_fp[56], w_fp[113], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[101] -= amp_sv[0];
-
-    // *** DIAGRAM 810 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 810
-    // (none)
-
-    // Amplitude(s) for diagram number 810
-    FFV1_0( w_fp[22], w_fp[113], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[99] -= amp_sv[0];
-
-    // *** DIAGRAM 811 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 811
-    // (none)
-
-    // Amplitude(s) for diagram number 811
-    FFV1_0( w_fp[56], w_fp[50], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[115] -= amp_sv[0];
-
-    // *** DIAGRAM 812 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 812
-    // (none)
-
-    // Amplitude(s) for diagram number 812
-    FFV1_0( w_fp[22], w_fp[49], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[109] -= amp_sv[0];
-
-    // *** DIAGRAM 813 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 813
-    // (none)
-
-    // Amplitude(s) for diagram number 813
-    FFV1_0( w_fp[90], w_fp[47], w_fp[24], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 814 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 814
-    // (none)
-
-    // Amplitude(s) for diagram number 814
-    FFV1_0( w_fp[52], w_fp[113], w_fp[24], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 815 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 815
-    // (none)
-
-    // Amplitude(s) for diagram number 815
-    FFV1_0( w_fp[52], w_fp[47], w_fp[98], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[99] += amp_sv[0];
-    jamp_sv[101] -= amp_sv[0];
-    jamp_sv[112] -= amp_sv[0];
-    jamp_sv[118] += amp_sv[0];
-
-    // *** DIAGRAM 816 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 816
-    // (none)
-
-    // Amplitude(s) for diagram number 816
-    FFV1_0( w_fp[90], w_fp[17], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 817 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 817
-    // (none)
-
-    // Amplitude(s) for diagram number 817
-    FFV1_0( w_fp[90], w_fp[2], w_fp[42], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[64] += amp_sv[0];
-    jamp_sv[88] -= amp_sv[0];
-    jamp_sv[112] -= amp_sv[0];
-    jamp_sv[118] += amp_sv[0];
-
-    // *** DIAGRAM 818 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 818
-    // (none)
-
-    // Amplitude(s) for diagram number 818
-    VVV1_0( w_fp[98], w_fp[103], w_fp[6], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 819 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 819
-    // (none)
-
-    // Amplitude(s) for diagram number 819
-    FFV1_0( w_fp[21], w_fp[2], w_fp[98], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[9] += amp_sv[0];
-    jamp_sv[15] -= amp_sv[0];
-    jamp_sv[61] -= amp_sv[0];
-    jamp_sv[85] += amp_sv[0];
-
-    // *** DIAGRAM 820 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 820
-    // (none)
-
-    // Amplitude(s) for diagram number 820
-    VVV1_0( w_fp[0], w_fp[103], w_fp[42], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 821 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 821
-    // (none)
-
-    // Amplitude(s) for diagram number 821
-    FFV1_0( w_fp[21], w_fp[17], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 822 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 822
-    // (none)
-
-    // Amplitude(s) for diagram number 822
-    FFV1_0( w_fp[52], w_fp[2], w_fp[85], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[52], w_fp[2], w_fp[112], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[52], w_fp[2], w_fp[111], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 823 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 823
-    // (none)
-
-    // Amplitude(s) for diagram number 823
-    FFV1_0( w_fp[90], w_fp[15], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 824 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 824
-    // (none)
-
-    // Amplitude(s) for diagram number 824
-    FFV1_0( w_fp[90], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[70] += amp_sv[0];
-    jamp_sv[88] -= amp_sv[0];
-    jamp_sv[94] += amp_sv[0];
-    jamp_sv[112] -= amp_sv[0];
-
-    // *** DIAGRAM 825 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 825
-    // (none)
-
-    // Amplitude(s) for diagram number 825
-    VVV1_0( w_fp[101], w_fp[103], w_fp[5], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 826 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 826
-    // (none)
-
-    // Amplitude(s) for diagram number 826
-    FFV1_0( w_fp[22], w_fp[2], w_fp[101], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[11] += amp_sv[0];
-    jamp_sv[21] -= amp_sv[0];
-    jamp_sv[67] -= amp_sv[0];
-    jamp_sv[109] += amp_sv[0];
-
-    // *** DIAGRAM 827 OF 1240 ***
DIAGRAM 827 OF 1240 *** - - // Wavefunction(s) for diagram number 827 - // (none) - - // Amplitude(s) for diagram number 827 - VVV1_0( w_fp[0], w_fp[103], w_fp[16], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 828 OF 1240 *** - - // Wavefunction(s) for diagram number 828 - // (none) - - // Amplitude(s) for diagram number 828 - FFV1_0( w_fp[22], w_fp[15], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 829 OF 1240 *** - - // Wavefunction(s) for diagram number 829 - // (none) - - // Amplitude(s) for diagram number 829 - FFV1_0( w_fp[52], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[110], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[109], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 830 OF 1240 *** - - // Wavefunction(s) for diagram number 830 - // (none) - - // Amplitude(s) for diagram number 830 - FFV1_0( w_fp[90], w_fp[18], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 831 OF 1240 *** - - // Wavefunction(s) for diagram 
number 831 - // (none) - - // Amplitude(s) for diagram number 831 - FFV1_0( w_fp[90], w_fp[2], w_fp[19], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[64] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - - // *** DIAGRAM 832 OF 1240 *** - - // Wavefunction(s) for diagram number 832 - // (none) - - // Amplitude(s) for diagram number 832 - VVV1_0( w_fp[96], w_fp[103], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 833 OF 1240 *** - - // Wavefunction(s) for diagram number 833 - // (none) - - // Amplitude(s) for diagram number 833 - FFV1_0( w_fp[56], w_fp[2], w_fp[96], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[17] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - - // *** DIAGRAM 834 OF 1240 *** - - // Wavefunction(s) for diagram number 834 - // (none) - - // Amplitude(s) for diagram number 834 - VVV1_0( w_fp[0], w_fp[103], w_fp[19], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 835 OF 1240 *** - - // Wavefunction(s) for diagram number 835 - // (none) - - // Amplitude(s) for diagram number 835 - FFV1_0( w_fp[56], w_fp[18], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 836 OF 1240 *** - - // Wavefunction(s) for diagram number 836 - // (none) - - // Amplitude(s) for diagram number 836 - FFV1_0( w_fp[52], w_fp[2], w_fp[87], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[34], COUPs[1], 1.0, &_fp[0] ); -#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 837 OF 1240 *** - - // Wavefunction(s) for diagram number 837 - // (none) - - // Amplitude(s) for diagram number 837 - FFV1_0( w_fp[90], w_fp[2], w_fp[30], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[64] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - FFV1_0( w_fp[90], w_fp[2], w_fp[31], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[70] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - FFV1_0( w_fp[90], w_fp[2], w_fp[32], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[64] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - - // *** DIAGRAM 838 OF 1240 *** - - // Wavefunction(s) for diagram number 838 - // (none) - - // Amplitude(s) for diagram number 838 - FFV1_0( w_fp[52], w_fp[2], w_fp[92], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[88], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], 
w_fp[106], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 839 OF 1240 *** - - // Wavefunction(s) for diagram number 839 - VVV1P0_1( w_fp[0], w_fp[61], COUPs[0], 1.0, 0., 0., w_fp[90] ); - - // Amplitude(s) for diagram number 839 - VVV1_0( w_fp[90], w_fp[10], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - - // *** DIAGRAM 840 OF 1240 *** - - // Wavefunction(s) for diagram number 840 - // (none) - - // Amplitude(s) for diagram number 840 - VVV1_0( w_fp[90], w_fp[11], w_fp[5], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= amp_sv[0]; - jamp_sv[6] += amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - - // *** DIAGRAM 841 OF 1240 *** - - // Wavefunction(s) for diagram number 841 - // (none) - - // Amplitude(s) for diagram number 841 - VVVV1_0( w_fp[8], w_fp[5], w_fp[6], w_fp[90], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - VVVV3_0( w_fp[8], w_fp[5], w_fp[6], w_fp[90], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= amp_sv[0]; - jamp_sv[6] += amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - jamp_sv[104] 
-= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - VVVV4_0( w_fp[8], w_fp[5], w_fp[6], w_fp[90], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= amp_sv[0]; - jamp_sv[1] += amp_sv[0]; - jamp_sv[6] += amp_sv[0]; - jamp_sv[7] -= amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[55] += amp_sv[0]; - jamp_sv[90] += amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[93] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - - // *** DIAGRAM 842 OF 1240 *** - - // Wavefunction(s) for diagram number 842 - VVV1P0_1( w_fp[0], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[56] ); - - // Amplitude(s) for diagram number 842 - VVV1_0( w_fp[56], w_fp[63], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 843 OF 1240 *** - - // Wavefunction(s) for diagram number 843 - // (none) - - // Amplitude(s) for diagram number 843 - VVV1_0( w_fp[56], w_fp[64], w_fp[5], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[7] -= amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[16] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[59] += amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[93] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - - // *** DIAGRAM 844 OF 1240 *** - - // Wavefunction(s) for diagram number 844 - // (none) - - // Amplitude(s) for diagram number 844 - VVVV1_0( w_fp[61], w_fp[5], w_fp[6], w_fp[56], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVVV3_0( w_fp[61], w_fp[5], w_fp[6], w_fp[56], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - 
jamp_sv[1] += amp_sv[0]; - jamp_sv[7] -= amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[16] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[59] += amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[93] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - VVVV4_0( w_fp[61], w_fp[5], w_fp[6], w_fp[56], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= amp_sv[0]; - jamp_sv[1] += amp_sv[0]; - jamp_sv[6] += amp_sv[0]; - jamp_sv[7] -= amp_sv[0]; - jamp_sv[16] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[59] += amp_sv[0]; - jamp_sv[93] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - - // *** DIAGRAM 845 OF 1240 *** - - // Wavefunction(s) for diagram number 845 - // (none) - - // Amplitude(s) for diagram number 845 - VVV1_0( w_fp[0], w_fp[63], w_fp[11], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[86] -= amp_sv[0]; - jamp_sv[96] -= amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 846 OF 1240 *** - - // Wavefunction(s) for diagram number 846 - // (none) - - // Amplitude(s) for diagram number 846 - VVV1_0( w_fp[0], w_fp[64], w_fp[10], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[7] -= amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[93] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - - // *** DIAGRAM 847 OF 1240 *** - - // Wavefunction(s) for diagram number 847 - VVVV1P0_1( w_fp[0], w_fp[61], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[103] ); - VVVV3P0_1( w_fp[0], w_fp[61], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[22] ); - VVVV4P0_1( w_fp[0], w_fp[61], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[21] ); - - // Amplitude(s) for diagram number 847 - VVV1_0( w_fp[8], w_fp[6], w_fp[103], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[86] -= amp_sv[0]; - jamp_sv[96] -= 
amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[22], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[12] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[21], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= amp_sv[0]; - jamp_sv[6] += amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - - // *** DIAGRAM 848 OF 1240 *** - - // Wavefunction(s) for diagram number 848 - VVVV1P0_1( w_fp[0], w_fp[61], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[105] ); - VVVV3P0_1( w_fp[0], w_fp[61], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[95] ); - VVVV4P0_1( w_fp[0], w_fp[61], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[107] ); - - // Amplitude(s) for diagram number 848 - VVV1_0( w_fp[8], w_fp[5], w_fp[105], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[7] -= amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[93] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[95], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[18] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[107], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - 
jamp_sv[55] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - - // *** DIAGRAM 849 OF 1240 *** - - // Wavefunction(s) for diagram number 849 - VVVV1P0_1( w_fp[0], w_fp[8], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[115] ); - VVVV3P0_1( w_fp[0], w_fp[8], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[116] ); - VVVV4P0_1( w_fp[0], w_fp[8], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[117] ); - - // Amplitude(s) for diagram number 849 - VVV1_0( w_fp[61], w_fp[6], w_fp[115], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[18] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[58] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[76] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[87] += amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - VVV1_0( w_fp[61], w_fp[6], w_fp[116], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[12] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[16] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[58] -= amp_sv[0]; - jamp_sv[59] += amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[76] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - VVV1_0( w_fp[61], w_fp[6], w_fp[117], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[7] -= amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[16] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[59] += amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[93] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - - // *** DIAGRAM 850 OF 1240 *** - - // Wavefunction(s) for diagram number 850 - VVVV1P0_1( w_fp[0], w_fp[8], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[118] ); - VVVV3P0_1( w_fp[0], w_fp[8], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[119] ); - VVVV4P0_1( w_fp[0], w_fp[8], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[120] ); - - // Amplitude(s) for diagram number 850 - VVV1_0( w_fp[61], w_fp[5], w_fp[118], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= amp_sv[0]; - jamp_sv[6] += amp_sv[0]; - jamp_sv[12] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - 
jamp_sv[98] -= amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - VVV1_0( w_fp[61], w_fp[5], w_fp[119], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[18] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - VVV1_0( w_fp[61], w_fp[5], w_fp[120], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 851 OF 1240 *** - - // Wavefunction(s) for diagram number 851 - // (none) - - // Amplitude(s) for diagram number 851 - VVVV1_0( w_fp[0], w_fp[61], w_fp[8], w_fp[29], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[30] -= amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[54] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[114] += amp_sv[0]; - jamp_sv[115] -= amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[61], w_fp[8], w_fp[29], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[61], w_fp[8], w_fp[29], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[55] += amp_sv[0]; - 
jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[90] += amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - - // *** DIAGRAM 852 OF 1240 *** - - // Wavefunction(s) for diagram number 852 - // (none) - - // Amplitude(s) for diagram number 852 - VVV1_0( w_fp[8], w_fp[29], w_fp[90], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[30] -= amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[54] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[114] += amp_sv[0]; - jamp_sv[115] -= amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 853 OF 1240 *** - - // Wavefunction(s) for diagram number 853 - // (none) - - // Amplitude(s) for diagram number 853 - VVV1_0( w_fp[61], w_fp[29], w_fp[56], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 854 OF 1240 *** - - // Wavefunction(s) for diagram number 854 - // (none) - - // Amplitude(s) for diagram number 854 - VVV1_0( w_fp[61], w_fp[8], w_fp[96], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[55] += amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[90] += amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - - // *** DIAGRAM 855 OF 1240 *** - - // Wavefunction(s) for diagram number 855 - // (none) - - // Amplitude(s) for diagram number 855 - VVV1_0( w_fp[90], w_fp[45], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 856 OF 1240 *** - - // Wavefunction(s) for diagram number 856 - // (none) - - // Amplitude(s) for diagram number 856 - FFV1_0( w_fp[3], w_fp[44], w_fp[90], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code 
base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[90] += amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[93] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - - // *** DIAGRAM 857 OF 1240 *** - - // Wavefunction(s) for diagram number 857 - // (none) - - // Amplitude(s) for diagram number 857 - FFV1_0( w_fp[65], w_fp[102], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 858 OF 1240 *** - - // Wavefunction(s) for diagram number 858 - // (none) - - // Amplitude(s) for diagram number 858 - FFV1_0( w_fp[3], w_fp[102], w_fp[64], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[72] += amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[76] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - - // *** DIAGRAM 859 OF 1240 *** - - // Wavefunction(s) for diagram number 859 - // (none) - - // Amplitude(s) for diagram number 859 - FFV1_0( w_fp[65], w_fp[44], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 860 OF 1240 *** - - // Wavefunction(s) for diagram number 860 - // (none) - - // Amplitude(s) for diagram number 860 - VVV1_0( w_fp[0], w_fp[64], w_fp[45], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 861 OF 1240 *** - - // Wavefunction(s) for diagram number 861 - // (none) - - // Amplitude(s) for diagram number 861 - FFV1_0( w_fp[3], w_fp[39], w_fp[105], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[39], w_fp[95], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] 
+= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[39], w_fp[107], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 862 OF 1240 *** - - // Wavefunction(s) for diagram number 862 - // (none) - - // Amplitude(s) for diagram number 862 - FFV1_0( w_fp[41], w_fp[39], w_fp[90], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[72] += amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - - // *** DIAGRAM 863 OF 1240 *** - - // Wavefunction(s) for diagram number 863 - // (none) - - // Amplitude(s) for diagram number 863 - FFV1_0( w_fp[41], w_fp[102], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 864 OF 1240 *** - - // Wavefunction(s) for diagram number 864 - // (none) - - // Amplitude(s) for diagram number 864 - FFV1_0( w_fp[62], w_fp[39], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 865 OF 1240 *** - - // Wavefunction(s) for diagram number 865 - // (none) - - // Amplitude(s) for diagram number 865 - VVV1_0( w_fp[90], w_fp[51], w_fp[5], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 866 OF 1240 *** - - // Wavefunction(s) for diagram number 866 - // (none) - - // Amplitude(s) for diagram number 866 - FFV1_0( w_fp[3], w_fp[50], w_fp[90], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[114] += amp_sv[0]; - jamp_sv[115] -= amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 867 OF 1240 *** - - // Wavefunction(s) for diagram number 867 - // (none) - - // Amplitude(s) for diagram number 867 - FFV1_0( w_fp[65], w_fp[113], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[100] += cxtype( 
0, 1 ) * amp_sv[0]; - jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 868 OF 1240 *** - - // Wavefunction(s) for diagram number 868 - // (none) - - // Amplitude(s) for diagram number 868 - FFV1_0( w_fp[3], w_fp[113], w_fp[63], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[96] += amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - - // *** DIAGRAM 869 OF 1240 *** - - // Wavefunction(s) for diagram number 869 - // (none) - - // Amplitude(s) for diagram number 869 - FFV1_0( w_fp[65], w_fp[50], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 870 OF 1240 *** - - // Wavefunction(s) for diagram number 870 - // (none) - - // Amplitude(s) for diagram number 870 - VVV1_0( w_fp[0], w_fp[63], w_fp[51], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 871 OF 1240 *** - - // Wavefunction(s) for diagram number 871 - // (none) - - // Amplitude(s) for diagram number 871 - FFV1_0( w_fp[3], w_fp[47], w_fp[103], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[22], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[21], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] += cxtype( 0, 1 ) * 
amp_sv[0]; - jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 872 OF 1240 *** - - // Wavefunction(s) for diagram number 872 - // (none) - - // Amplitude(s) for diagram number 872 - FFV1_0( w_fp[38], w_fp[47], w_fp[90], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[96] += amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - - // *** DIAGRAM 873 OF 1240 *** - - // Wavefunction(s) for diagram number 873 - // (none) - - // Amplitude(s) for diagram number 873 - FFV1_0( w_fp[38], w_fp[113], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 874 OF 1240 *** - - // Wavefunction(s) for diagram number 874 - // (none) - - // Amplitude(s) for diagram number 874 - FFV1_0( w_fp[104], w_fp[47], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 875 OF 1240 *** - - // Wavefunction(s) for diagram number 875 - // (none) - - // Amplitude(s) for diagram number 875 - VVV1_0( w_fp[90], w_fp[23], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 876 OF 1240 *** - - // Wavefunction(s) for diagram number 876 - // (none) - - // Amplitude(s) for diagram number 876 - FFV1_0( w_fp[48], w_fp[2], w_fp[90], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[7] -= amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[55] += amp_sv[0]; - - // *** DIAGRAM 877 OF 1240 *** - - // Wavefunction(s) for diagram number 877 - // (none) - - // Amplitude(s) for diagram number 877 - FFV1_0( w_fp[104], w_fp[93], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 878 OF 1240 *** - - // Wavefunction(s) for diagram number 878 - // (none) - - // Amplitude(s) for diagram number 878 - FFV1_0( w_fp[104], w_fp[2], w_fp[64], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[34] += amp_sv[0]; - jamp_sv[58] -= amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - - // *** DIAGRAM 879 OF 
1240 *** - - // Wavefunction(s) for diagram number 879 - // (none) - - // Amplitude(s) for diagram number 879 - FFV1_0( w_fp[48], w_fp[93], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 880 OF 1240 *** - - // Wavefunction(s) for diagram number 880 - // (none) - - // Amplitude(s) for diagram number 880 - VVV1_0( w_fp[0], w_fp[64], w_fp[23], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 881 OF 1240 *** - - // Wavefunction(s) for diagram number 881 - // (none) - - // Amplitude(s) for diagram number 881 - FFV1_0( w_fp[38], w_fp[2], w_fp[105], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[95], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[107], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 882 OF 1240 *** - - // Wavefunction(s) for diagram number 882 - // (none) - - // Amplitude(s) for diagram number 882 - VVV1_0( w_fp[90], w_fp[20], w_fp[5], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; - 
jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 883 OF 1240 *** - - // Wavefunction(s) for diagram number 883 - // (none) - - // Amplitude(s) for diagram number 883 - FFV1_0( w_fp[40], w_fp[2], w_fp[90], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[30] -= amp_sv[0]; - jamp_sv[54] += amp_sv[0]; - - // *** DIAGRAM 884 OF 1240 *** - - // Wavefunction(s) for diagram number 884 - // (none) - - // Amplitude(s) for diagram number 884 - FFV1_0( w_fp[62], w_fp[93], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 885 OF 1240 *** - - // Wavefunction(s) for diagram number 885 - // (none) - - // Amplitude(s) for diagram number 885 - FFV1_0( w_fp[62], w_fp[2], w_fp[63], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[32] += amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - - // *** DIAGRAM 886 OF 1240 *** - - // Wavefunction(s) for diagram number 886 - // (none) - - // Amplitude(s) for diagram number 886 - FFV1_0( w_fp[40], w_fp[93], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 887 OF 1240 *** - - // Wavefunction(s) for diagram number 887 - // (none) - - // Amplitude(s) for diagram number 887 - VVV1_0( w_fp[0], w_fp[63], w_fp[20], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 888 OF 1240 *** - - // Wavefunction(s) for diagram number 888 - // (none) - - // Amplitude(s) for diagram number 888 - FFV1_0( w_fp[41], w_fp[2], w_fp[103], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[22], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code 
base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 889 OF 1240 *** - - // Wavefunction(s) for diagram number 889 - // (none) - - // Amplitude(s) for diagram number 889 - FFV1_0( w_fp[3], w_fp[18], w_fp[90], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 890 OF 1240 *** - - // Wavefunction(s) for diagram number 890 - // (none) - - // Amplitude(s) for diagram number 890 - FFV1_0( w_fp[12], w_fp[2], w_fp[90], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 891 OF 1240 *** - - // Wavefunction(s) for diagram number 891 - // (none) - - // Amplitude(s) for diagram number 891 - FFV1_0( w_fp[3], w_fp[93], w_fp[96], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 892 OF 1240 *** - - // Wavefunction(s) for diagram number 892 - // (none) - - // Amplitude(s) for diagram number 892 - FFV1_0( w_fp[65], w_fp[2], w_fp[96], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] 
-= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 893 OF 1240 *** - - // Wavefunction(s) for diagram number 893 - // (none) - - // Amplitude(s) for diagram number 893 - FFV1_0( w_fp[12], w_fp[93], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[30] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[55] += amp_sv[0]; - - // *** DIAGRAM 894 OF 1240 *** - - // Wavefunction(s) for diagram number 894 - // (none) - - // Amplitude(s) for diagram number 894 - FFV1_0( w_fp[65], w_fp[18], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[90] += amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - - // *** DIAGRAM 895 OF 1240 *** - - // Wavefunction(s) for diagram number 895 - VVV1P0_1( w_fp[0], w_fp[66], COUPs[0], 1.0, 0., 0., w_fp[65] ); - - // Amplitude(s) for diagram number 895 - VVV1_0( w_fp[65], w_fp[13], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - - // *** DIAGRAM 896 OF 1240 *** - - // Wavefunction(s) for diagram number 896 - // (none) - - // Amplitude(s) for diagram number 896 - VVV1_0( w_fp[65], w_fp[11], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] -= amp_sv[0]; - jamp_sv[12] += amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - - // *** DIAGRAM 897 OF 1240 *** - - // Wavefunction(s) for diagram number 897 - // (none) - - // Amplitude(s) for diagram number 897 - VVVV1_0( w_fp[8], w_fp[4], w_fp[6], w_fp[65], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - 
jamp_sv[67] += amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - VVVV3_0( w_fp[8], w_fp[4], w_fp[6], w_fp[65], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] -= amp_sv[0]; - jamp_sv[12] += amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - VVVV4_0( w_fp[8], w_fp[4], w_fp[6], w_fp[65], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] -= amp_sv[0]; - jamp_sv[3] += amp_sv[0]; - jamp_sv[12] += amp_sv[0]; - jamp_sv[13] -= amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[66] += amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[69] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[79] += amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - - // *** DIAGRAM 898 OF 1240 *** - - // Wavefunction(s) for diagram number 898 - // (none) - - // Amplitude(s) for diagram number 898 - VVV1_0( w_fp[56], w_fp[69], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[8] += amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[19] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[63] -= amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[107] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - - // *** DIAGRAM 899 OF 1240 *** - - // Wavefunction(s) for diagram number 899 - // (none) - - // Amplitude(s) for diagram number 899 - VVV1_0( w_fp[56], w_fp[70], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] += amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[8] += amp_sv[0]; - jamp_sv[10] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[13] -= amp_sv[0]; - jamp_sv[19] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[63] -= amp_sv[0]; - jamp_sv[69] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[83] += amp_sv[0]; - jamp_sv[107] += amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - - // *** DIAGRAM 900 OF 1240 *** - - // Wavefunction(s) for diagram number 900 - // (none) - - // Amplitude(s) for diagram number 900 - VVVV1_0( w_fp[66], w_fp[4], w_fp[6], w_fp[56], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) 
-#endif - jamp_sv[2] += amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[8] += amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[19] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[63] -= amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[107] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - VVVV3_0( w_fp[66], w_fp[4], w_fp[6], w_fp[56], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] += amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[8] += amp_sv[0]; - jamp_sv[10] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[13] -= amp_sv[0]; - jamp_sv[19] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[63] -= amp_sv[0]; - jamp_sv[69] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[83] += amp_sv[0]; - jamp_sv[107] += amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - VVVV4_0( w_fp[66], w_fp[4], w_fp[6], w_fp[56], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] -= amp_sv[0]; - jamp_sv[3] += amp_sv[0]; - jamp_sv[10] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[12] += amp_sv[0]; - jamp_sv[13] -= amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[69] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[83] += amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - - // *** DIAGRAM 901 OF 1240 *** - - // Wavefunction(s) for diagram number 901 - // (none) - - // Amplitude(s) for diagram number 901 - VVV1_0( w_fp[0], w_fp[69], w_fp[11], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[8] += amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[62] -= amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[97] -= amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[107] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - - // *** DIAGRAM 902 OF 1240 *** - - // Wavefunction(s) for diagram number 902 - // (none) - - // Amplitude(s) for diagram number 902 - VVV1_0( w_fp[0], w_fp[70], w_fp[13], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] += amp_sv[0]; - jamp_sv[13] -= amp_sv[0]; - jamp_sv[19] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[48] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[63] -= amp_sv[0]; - jamp_sv[69] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[116] -= amp_sv[0]; - - // *** DIAGRAM 903 OF 1240 *** - - // Wavefunction(s) for diagram number 903 - VVVV1P0_1( w_fp[0], w_fp[66], w_fp[4], COUPs[2], 
1.0, 0., 0., w_fp[93] ); - VVVV3P0_1( w_fp[0], w_fp[66], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[90] ); - VVVV4P0_1( w_fp[0], w_fp[66], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[21] ); - - // Amplitude(s) for diagram number 903 - VVV1_0( w_fp[8], w_fp[6], w_fp[93], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[8] += amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[62] -= amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[97] -= amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[107] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[90], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[6] -= amp_sv[0]; - jamp_sv[8] += amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - jamp_sv[107] += amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[21], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] -= amp_sv[0]; - jamp_sv[12] += amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - - // *** DIAGRAM 904 OF 1240 *** - - // Wavefunction(s) for diagram number 904 - VVVV1P0_1( w_fp[0], w_fp[66], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[22] ); - VVVV3P0_1( w_fp[0], w_fp[66], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[103] ); - VVVV4P0_1( w_fp[0], w_fp[66], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[63] ); - - // Amplitude(s) for diagram number 904 - VVV1_0( w_fp[8], w_fp[4], w_fp[22], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] += amp_sv[0]; - jamp_sv[13] -= amp_sv[0]; - jamp_sv[19] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[48] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[63] -= amp_sv[0]; - jamp_sv[69] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[116] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[103], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[19] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[37] += 
amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[63] -= amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[63], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - - // *** DIAGRAM 905 OF 1240 *** - - // Wavefunction(s) for diagram number 905 - VVVV1P0_1( w_fp[0], w_fp[8], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[107] ); - VVVV3P0_1( w_fp[0], w_fp[8], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[95] ); - VVVV4P0_1( w_fp[0], w_fp[8], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[105] ); - - // Amplitude(s) for diagram number 905 - VVV1_0( w_fp[66], w_fp[6], w_fp[107], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[19] += amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[52] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[82] -= amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - VVV1_0( w_fp[66], w_fp[6], w_fp[95], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[6] -= amp_sv[0]; - jamp_sv[8] += amp_sv[0]; - jamp_sv[10] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[52] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[82] -= amp_sv[0]; - jamp_sv[83] += amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - jamp_sv[107] += amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - VVV1_0( w_fp[66], w_fp[6], w_fp[105], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] += amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[8] += amp_sv[0]; - jamp_sv[10] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[13] -= amp_sv[0]; - jamp_sv[19] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[63] -= amp_sv[0]; - jamp_sv[69] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[83] += amp_sv[0]; - jamp_sv[107] += amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - - // *** DIAGRAM 906 OF 1240 *** - - // Wavefunction(s) for diagram number 906 - // (none) - - // Amplitude(s) for diagram number 906 - VVV1_0( 
w_fp[66], w_fp[4], w_fp[118], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] -= amp_sv[0]; - jamp_sv[6] += amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[12] += amp_sv[0]; - jamp_sv[38] += amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - jamp_sv[99] += amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - VVV1_0( w_fp[66], w_fp[4], w_fp[119], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[19] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[38] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[63] -= amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - jamp_sv[99] += amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - VVV1_0( w_fp[66], w_fp[4], w_fp[120], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[8] += amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[19] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[63] -= amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[107] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - - // *** DIAGRAM 907 OF 1240 *** - - // Wavefunction(s) for diagram number 907 - // (none) - - // Amplitude(s) for diagram number 907 - VVVV1_0( w_fp[0], w_fp[66], w_fp[8], w_fp[27], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[78] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[108] += amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[66], w_fp[8], w_fp[27], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[66], w_fp[8], w_fp[27], COUPs[2], 
1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[10] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[66] += amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[79] += amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - - // *** DIAGRAM 908 OF 1240 *** - - // Wavefunction(s) for diagram number 908 - // (none) - - // Amplitude(s) for diagram number 908 - VVV1_0( w_fp[8], w_fp[27], w_fp[65], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[78] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[108] += amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - - // *** DIAGRAM 909 OF 1240 *** - - // Wavefunction(s) for diagram number 909 - // (none) - - // Amplitude(s) for diagram number 909 - VVV1_0( w_fp[66], w_fp[27], w_fp[56], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - - // *** DIAGRAM 910 OF 1240 *** - - // Wavefunction(s) for diagram number 910 - // (none) - - // Amplitude(s) for diagram number 910 - VVV1_0( w_fp[66], w_fp[8], w_fp[101], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[10] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[66] += amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[79] += amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - - // *** DIAGRAM 911 OF 1240 *** - - // Wavefunction(s) for diagram number 911 - // (none) - - // Amplitude(s) for diagram number 911 - VVV1_0( w_fp[65], w_fp[37], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] += cxtype( 
0, 1 ) * amp_sv[0]; - jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 912 OF 1240 *** - - // Wavefunction(s) for diagram number 912 - // (none) - - // Amplitude(s) for diagram number 912 - FFV1_0( w_fp[3], w_fp[36], w_fp[65], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[66] += amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[69] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - - // *** DIAGRAM 913 OF 1240 *** - - // Wavefunction(s) for diagram number 913 - // (none) - - // Amplitude(s) for diagram number 913 - FFV1_0( w_fp[71], w_fp[114], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 914 OF 1240 *** - - // Wavefunction(s) for diagram number 914 - // (none) - - // Amplitude(s) for diagram number 914 - FFV1_0( w_fp[3], w_fp[114], w_fp[70], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[48] += amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[52] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - - // *** DIAGRAM 915 OF 1240 *** - - // Wavefunction(s) for diagram number 915 - // (none) - - // Amplitude(s) for diagram number 915 - FFV1_0( w_fp[71], w_fp[36], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 916 OF 1240 *** - - // Wavefunction(s) for diagram number 916 - // (none) - - // Amplitude(s) for diagram number 916 - VVV1_0( w_fp[0], w_fp[70], w_fp[37], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 917 OF 1240 *** - - // Wavefunction(s) for diagram number 917 - // (none) - - // Amplitude(s) for diagram number 917 - FFV1_0( w_fp[3], w_fp[33], w_fp[22], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[103], COUPs[1], 1.0, &_fp[0] ); -#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[63], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 918 OF 1240 *** - - // Wavefunction(s) for diagram number 918 - // (none) - - // Amplitude(s) for diagram number 918 - FFV1_0( w_fp[41], w_fp[33], w_fp[65], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[48] += amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - - // *** DIAGRAM 919 OF 1240 *** - - // Wavefunction(s) for diagram number 919 - // (none) - - // Amplitude(s) for diagram number 919 - FFV1_0( w_fp[41], w_fp[114], w_fp[66], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 920 OF 1240 *** - - // Wavefunction(s) for diagram number 920 - // (none) - - // Amplitude(s) for diagram number 920 - FFV1_0( w_fp[62], w_fp[33], w_fp[66], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 921 OF 1240 *** - - // Wavefunction(s) for diagram number 921 - // (none) - - // Amplitude(s) for diagram number 921 - VVV1_0( w_fp[65], w_fp[51], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 922 OF 1240 *** - - // Wavefunction(s) for diagram number 922 - // (none) - - // Amplitude(s) for diagram number 922 - FFV1_0( w_fp[3], w_fp[49], w_fp[65], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[108] += 
amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - - // *** DIAGRAM 923 OF 1240 *** - - // Wavefunction(s) for diagram number 923 - // (none) - - // Amplitude(s) for diagram number 923 - FFV1_0( w_fp[71], w_fp[113], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 924 OF 1240 *** - - // Wavefunction(s) for diagram number 924 - // (none) - - // Amplitude(s) for diagram number 924 - FFV1_0( w_fp[3], w_fp[113], w_fp[69], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[97] += amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - jamp_sv[99] += amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - - // *** DIAGRAM 925 OF 1240 *** - - // Wavefunction(s) for diagram number 925 - // (none) - - // Amplitude(s) for diagram number 925 - FFV1_0( w_fp[71], w_fp[49], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 926 OF 1240 *** - - // Wavefunction(s) for diagram number 926 - // (none) - - // Amplitude(s) for diagram number 926 - VVV1_0( w_fp[0], w_fp[69], w_fp[51], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 927 OF 1240 *** - - // Wavefunction(s) for diagram number 927 - // (none) - - // Amplitude(s) for diagram number 927 - FFV1_0( w_fp[3], w_fp[47], w_fp[93], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[90], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[21], COUPs[1], 1.0, &_fp[0] 
); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 928 OF 1240 *** - - // Wavefunction(s) for diagram number 928 - // (none) - - // Amplitude(s) for diagram number 928 - FFV1_0( w_fp[46], w_fp[47], w_fp[65], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[97] += amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - - // *** DIAGRAM 929 OF 1240 *** - - // Wavefunction(s) for diagram number 929 - // (none) - - // Amplitude(s) for diagram number 929 - FFV1_0( w_fp[46], w_fp[113], w_fp[66], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 930 OF 1240 *** - - // Wavefunction(s) for diagram number 930 - // (none) - - // Amplitude(s) for diagram number 930 - FFV1_0( w_fp[99], w_fp[47], w_fp[66], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 931 OF 1240 *** - - // Wavefunction(s) for diagram number 931 - // (none) - - // Amplitude(s) for diagram number 931 - VVV1_0( w_fp[65], w_fp[54], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 932 OF 1240 *** - - // Wavefunction(s) for diagram number 932 - // (none) - - // Amplitude(s) for diagram number 932 - FFV1_0( w_fp[53], w_fp[2], w_fp[65], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] += amp_sv[0]; - jamp_sv[13] -= amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[79] += amp_sv[0]; - - // *** DIAGRAM 933 OF 1240 *** - - // Wavefunction(s) for diagram number 933 - // (none) - - // Amplitude(s) for diagram number 933 - FFV1_0( w_fp[99], w_fp[94], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 934 OF 
1240 *** - - // Wavefunction(s) for diagram number 934 - // (none) - - // Amplitude(s) for diagram number 934 - FFV1_0( w_fp[99], w_fp[2], w_fp[70], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[40] += amp_sv[0]; - jamp_sv[82] -= amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - - // *** DIAGRAM 935 OF 1240 *** - - // Wavefunction(s) for diagram number 935 - // (none) - - // Amplitude(s) for diagram number 935 - FFV1_0( w_fp[53], w_fp[94], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 936 OF 1240 *** - - // Wavefunction(s) for diagram number 936 - // (none) - - // Amplitude(s) for diagram number 936 - VVV1_0( w_fp[0], w_fp[70], w_fp[54], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 937 OF 1240 *** - - // Wavefunction(s) for diagram number 937 - // (none) - - // Amplitude(s) for diagram number 937 - FFV1_0( w_fp[46], w_fp[2], w_fp[22], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[103], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[63], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 938 OF 1240 *** - - // Wavefunction(s) for 
diagram number 938 - // (none) - - // Amplitude(s) for diagram number 938 - VVV1_0( w_fp[65], w_fp[20], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 939 OF 1240 *** - - // Wavefunction(s) for diagram number 939 - // (none) - - // Amplitude(s) for diagram number 939 - FFV1_0( w_fp[28], w_fp[2], w_fp[65], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[78] += amp_sv[0]; - - // *** DIAGRAM 940 OF 1240 *** - - // Wavefunction(s) for diagram number 940 - // (none) - - // Amplitude(s) for diagram number 940 - FFV1_0( w_fp[62], w_fp[94], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 941 OF 1240 *** - - // Wavefunction(s) for diagram number 941 - // (none) - - // Amplitude(s) for diagram number 941 - FFV1_0( w_fp[62], w_fp[2], w_fp[69], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[38] += amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - - // *** DIAGRAM 942 OF 1240 *** - - // Wavefunction(s) for diagram number 942 - // (none) - - // Amplitude(s) for diagram number 942 - FFV1_0( w_fp[28], w_fp[94], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 943 OF 1240 *** - - // Wavefunction(s) for diagram number 943 - // (none) - - // Amplitude(s) for diagram number 943 - VVV1_0( w_fp[0], w_fp[69], w_fp[20], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 944 OF 1240 *** - - // Wavefunction(s) for diagram number 944 - // (none) - - // Amplitude(s) for diagram number 944 - FFV1_0( w_fp[41], w_fp[2], w_fp[93], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - 
jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[90], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 945 OF 1240 *** - - // Wavefunction(s) for diagram number 945 - // (none) - - // Amplitude(s) for diagram number 945 - FFV1_0( w_fp[3], w_fp[15], w_fp[65], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 946 OF 1240 *** - - // Wavefunction(s) for diagram number 946 - // (none) - - // Amplitude(s) for diagram number 946 - FFV1_0( w_fp[14], w_fp[2], w_fp[65], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 947 OF 1240 *** - - // Wavefunction(s) for diagram number 947 - // (none) - - // Amplitude(s) for diagram number 947 - FFV1_0( w_fp[3], w_fp[94], w_fp[101], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] -= cxtype( 
0, 1 ) * amp_sv[0]; - jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 948 OF 1240 *** - - // Wavefunction(s) for diagram number 948 - // (none) - - // Amplitude(s) for diagram number 948 - FFV1_0( w_fp[71], w_fp[2], w_fp[101], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 949 OF 1240 *** - - // Wavefunction(s) for diagram number 949 - // (none) - - // Amplitude(s) for diagram number 949 - FFV1_0( w_fp[14], w_fp[94], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[36] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[79] += amp_sv[0]; - - // *** DIAGRAM 950 OF 1240 *** - - // Wavefunction(s) for diagram number 950 - // (none) - - // Amplitude(s) for diagram number 950 - FFV1_0( w_fp[71], w_fp[15], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[66] += amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - - // *** DIAGRAM 951 OF 1240 *** - - // Wavefunction(s) for diagram number 951 - VVV1P0_1( w_fp[0], w_fp[72], COUPs[0], 1.0, 0., 0., w_fp[71] ); - - // Amplitude(s) for diagram number 951 - VVV1_0( w_fp[71], w_fp[13], w_fp[5], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[5] -= amp_sv[0]; - jamp_sv[19] += amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[52] -= amp_sv[0]; - jamp_sv[58] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[76] -= amp_sv[0]; - jamp_sv[82] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - - // *** DIAGRAM 952 OF 1240 *** - - // Wavefunction(s) for diagram number 952 - // (none) - - // Amplitude(s) for diagram number 952 - VVV1_0( w_fp[71], w_fp[10], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] -= amp_sv[0]; - jamp_sv[18] += amp_sv[0]; - jamp_sv[42] += amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[52] -= amp_sv[0]; - jamp_sv[58] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[76] -= amp_sv[0]; - jamp_sv[82] -= amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[85] += amp_sv[0]; - jamp_sv[87] += amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - - // *** DIAGRAM 953 OF 1240 *** - - // Wavefunction(s) for diagram number 953 - // (none) - - // Amplitude(s) for diagram 
number 953
-      VVVV1_0( w_fp[8], w_fp[4], w_fp[5], w_fp[71], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[5] -= amp_sv[0];
-      jamp_sv[19] += amp_sv[0];
-      jamp_sv[43] += amp_sv[0];
-      jamp_sv[49] += amp_sv[0];
-      jamp_sv[52] -= amp_sv[0];
-      jamp_sv[58] -= amp_sv[0];
-      jamp_sv[60] -= amp_sv[0];
-      jamp_sv[61] += amp_sv[0];
-      jamp_sv[63] += amp_sv[0];
-      jamp_sv[65] -= amp_sv[0];
-      jamp_sv[68] += amp_sv[0];
-      jamp_sv[73] += amp_sv[0];
-      jamp_sv[76] -= amp_sv[0];
-      jamp_sv[82] -= amp_sv[0];
-      jamp_sv[92] += amp_sv[0];
-      jamp_sv[103] -= amp_sv[0];
-      VVVV3_0( w_fp[8], w_fp[4], w_fp[5], w_fp[71], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[4] -= amp_sv[0];
-      jamp_sv[18] += amp_sv[0];
-      jamp_sv[42] += amp_sv[0];
-      jamp_sv[49] += amp_sv[0];
-      jamp_sv[52] -= amp_sv[0];
-      jamp_sv[58] -= amp_sv[0];
-      jamp_sv[68] += amp_sv[0];
-      jamp_sv[73] += amp_sv[0];
-      jamp_sv[76] -= amp_sv[0];
-      jamp_sv[82] -= amp_sv[0];
-      jamp_sv[84] -= amp_sv[0];
-      jamp_sv[85] += amp_sv[0];
-      jamp_sv[87] += amp_sv[0];
-      jamp_sv[89] -= amp_sv[0];
-      jamp_sv[92] += amp_sv[0];
-      jamp_sv[102] -= amp_sv[0];
-      VVVV4_0( w_fp[8], w_fp[4], w_fp[5], w_fp[71], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[4] -= amp_sv[0];
-      jamp_sv[5] += amp_sv[0];
-      jamp_sv[18] += amp_sv[0];
-      jamp_sv[19] -= amp_sv[0];
-      jamp_sv[42] += amp_sv[0];
-      jamp_sv[43] -= amp_sv[0];
-      jamp_sv[60] += amp_sv[0];
-      jamp_sv[61] -= amp_sv[0];
-      jamp_sv[63] -= amp_sv[0];
-      jamp_sv[65] += amp_sv[0];
-      jamp_sv[84] -= amp_sv[0];
-      jamp_sv[85] += amp_sv[0];
-      jamp_sv[87] += amp_sv[0];
-      jamp_sv[89] -= amp_sv[0];
-      jamp_sv[102] -= amp_sv[0];
-      jamp_sv[103] += amp_sv[0];
-
-      // *** DIAGRAM 954 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 954
-      // (none)
-
-      // Amplitude(s) for diagram number 954
-      VVV1_0( w_fp[56], w_fp[74], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[4] += amp_sv[0];
-      jamp_sv[7] -= amp_sv[0];
-      jamp_sv[10] += amp_sv[0];
-      jamp_sv[13] -= amp_sv[0];
-      jamp_sv[14] += amp_sv[0];
-      jamp_sv[15] -= amp_sv[0];
-      jamp_sv[16] += amp_sv[0];
-      jamp_sv[18] -= amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[59] += amp_sv[0];
-      jamp_sv[69] -= amp_sv[0];
-      jamp_sv[83] += amp_sv[0];
-      jamp_sv[87] -= amp_sv[0];
-      jamp_sv[89] += amp_sv[0];
-      jamp_sv[93] -= amp_sv[0];
-      jamp_sv[105] += amp_sv[0];
-
-      // *** DIAGRAM 955 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 955
-      // (none)
-
-      // Amplitude(s) for diagram number 955
-      VVV1_0( w_fp[56], w_fp[75], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[5] += amp_sv[0];
-      jamp_sv[7] -= amp_sv[0];
-      jamp_sv[8] += amp_sv[0];
-      jamp_sv[9] -= amp_sv[0];
-      jamp_sv[10] += amp_sv[0];
-      jamp_sv[13] -= amp_sv[0];
-      jamp_sv[16] += amp_sv[0];
-      jamp_sv[19] -= amp_sv[0];
-      jamp_sv[47] -= amp_sv[0];
-      jamp_sv[59] += amp_sv[0];
-      jamp_sv[63] -= amp_sv[0];
-      jamp_sv[65] += amp_sv[0];
-      jamp_sv[69] -= amp_sv[0];
-      jamp_sv[83] += amp_sv[0];
-      jamp_sv[93] -= amp_sv[0];
-      jamp_sv[107] += amp_sv[0];
-
-      // *** DIAGRAM 956 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 956
-      // (none)
-
-      // Amplitude(s) for diagram number 956
-      VVVV1_0( w_fp[72], w_fp[4], w_fp[5], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[4] += amp_sv[0];
-      jamp_sv[7] -= amp_sv[0];
-      jamp_sv[10] += amp_sv[0];
-      jamp_sv[13] -= amp_sv[0];
-      jamp_sv[14] += amp_sv[0];
-      jamp_sv[15] -= amp_sv[0];
-      jamp_sv[16] += amp_sv[0];
-      jamp_sv[18] -= amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[59] += amp_sv[0];
-      jamp_sv[69] -= amp_sv[0];
-      jamp_sv[83] += amp_sv[0];
-      jamp_sv[87] -= amp_sv[0];
-      jamp_sv[89] += amp_sv[0];
-      jamp_sv[93] -= amp_sv[0];
-      jamp_sv[105] += amp_sv[0];
-      VVVV3_0( w_fp[72], w_fp[4], w_fp[5], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[5] += amp_sv[0];
-      jamp_sv[7] -= amp_sv[0];
-      jamp_sv[8] += amp_sv[0];
-      jamp_sv[9] -= amp_sv[0];
-      jamp_sv[10] += amp_sv[0];
-      jamp_sv[13] -= amp_sv[0];
-      jamp_sv[16] += amp_sv[0];
-      jamp_sv[19] -= amp_sv[0];
-      jamp_sv[47] -= amp_sv[0];
-      jamp_sv[59] += amp_sv[0];
-      jamp_sv[63] -= amp_sv[0];
-      jamp_sv[65] += amp_sv[0];
-      jamp_sv[69] -= amp_sv[0];
-      jamp_sv[83] += amp_sv[0];
-      jamp_sv[93] -= amp_sv[0];
-      jamp_sv[107] += amp_sv[0];
-      VVVV4_0( w_fp[72], w_fp[4], w_fp[5], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[4] -= amp_sv[0];
-      jamp_sv[5] += amp_sv[0];
-      jamp_sv[8] += amp_sv[0];
-      jamp_sv[9] -= amp_sv[0];
-      jamp_sv[14] -= amp_sv[0];
-      jamp_sv[15] += amp_sv[0];
-      jamp_sv[18] += amp_sv[0];
-      jamp_sv[19] -= amp_sv[0];
-      jamp_sv[45] += amp_sv[0];
-      jamp_sv[47] -= amp_sv[0];
-      jamp_sv[63] -= amp_sv[0];
-      jamp_sv[65] += amp_sv[0];
-      jamp_sv[87] += amp_sv[0];
-      jamp_sv[89] -= amp_sv[0];
-      jamp_sv[105] -= amp_sv[0];
-      jamp_sv[107] += amp_sv[0];
-
-      // *** DIAGRAM 957 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 957
-      // (none)
-
-      // Amplitude(s) for diagram number 957
-      VVV1_0( w_fp[0], w_fp[74], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[4] += amp_sv[0];
-      jamp_sv[7] -= amp_sv[0];
-      jamp_sv[10] += amp_sv[0];
-      jamp_sv[18] -= amp_sv[0];
-      jamp_sv[44] -= amp_sv[0];
-      jamp_sv[58] += amp_sv[0];
-      jamp_sv[68] -= amp_sv[0];
-      jamp_sv[73] -= amp_sv[0];
-      jamp_sv[74] += amp_sv[0];
-      jamp_sv[75] -= amp_sv[0];
-      jamp_sv[76] += amp_sv[0];
-      jamp_sv[83] += amp_sv[0];
-      jamp_sv[87] -= amp_sv[0];
-      jamp_sv[89] += amp_sv[0];
-      jamp_sv[93] -= amp_sv[0];
-      jamp_sv[104] += amp_sv[0];
-
-      // *** DIAGRAM 958 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 958
-      // (none)
-
-      // Amplitude(s) for diagram number 958
-      VVV1_0( w_fp[0], w_fp[75], w_fp[13], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[5] += amp_sv[0];
-      jamp_sv[13] -= amp_sv[0];
-      jamp_sv[16] += amp_sv[0];
-      jamp_sv[19] -= amp_sv[0];
-      jamp_sv[46] -= amp_sv[0];
-      jamp_sv[49] -= amp_sv[0];
-      jamp_sv[50] += amp_sv[0];
-      jamp_sv[51] -= amp_sv[0];
-      jamp_sv[52] += amp_sv[0];
-      jamp_sv[59] += amp_sv[0];
-      jamp_sv[63] -= amp_sv[0];
-      jamp_sv[65] += amp_sv[0];
-      jamp_sv[69] -= amp_sv[0];
-      jamp_sv[82] += amp_sv[0];
-      jamp_sv[92] -= amp_sv[0];
-      jamp_sv[106] += amp_sv[0];
-
-      // *** DIAGRAM 959 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 959
-      VVVV1P0_1( w_fp[0], w_fp[72], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[94] );
-      VVVV3P0_1( w_fp[0], w_fp[72], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[65] );
-      VVVV4P0_1( w_fp[0], w_fp[72], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[21] );
-
-      // Amplitude(s) for diagram number 959
-      VVV1_0( w_fp[8], w_fp[5], w_fp[94], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[4] += amp_sv[0];
-      jamp_sv[7] -= amp_sv[0];
-      jamp_sv[10] += amp_sv[0];
-      jamp_sv[18] -= amp_sv[0];
-      jamp_sv[44] -= amp_sv[0];
-      jamp_sv[58] += amp_sv[0];
-      jamp_sv[68] -= amp_sv[0];
-      jamp_sv[73] -= amp_sv[0];
-      jamp_sv[74] += amp_sv[0];
-      jamp_sv[75] -= amp_sv[0];
-      jamp_sv[76] += amp_sv[0];
-      jamp_sv[83] += amp_sv[0];
-      jamp_sv[87] -= amp_sv[0];
-      jamp_sv[89] += amp_sv[0];
-      jamp_sv[93] -= amp_sv[0];
-      jamp_sv[104] += amp_sv[0];
-      VVV1_0( w_fp[8], w_fp[5], w_fp[65], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[7] -= amp_sv[0];
-      jamp_sv[10] += amp_sv[0];
-      jamp_sv[42] += amp_sv[0];
-      jamp_sv[44] -= amp_sv[0];
-      jamp_sv[49] += amp_sv[0];
-      jamp_sv[52] -= amp_sv[0];
-      jamp_sv[74] += amp_sv[0];
-      jamp_sv[75] -= amp_sv[0];
-      jamp_sv[82] -= amp_sv[0];
-      jamp_sv[83] += amp_sv[0];
-      jamp_sv[84] -= amp_sv[0];
-      jamp_sv[85] += amp_sv[0];
-      jamp_sv[92] += amp_sv[0];
-      jamp_sv[93] -= amp_sv[0];
-      jamp_sv[102] -= amp_sv[0];
-      jamp_sv[104] += amp_sv[0];
-      VVV1_0( w_fp[8], w_fp[5], w_fp[21], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[4] -= amp_sv[0];
-      jamp_sv[18] += amp_sv[0];
-      jamp_sv[42] += amp_sv[0];
-      jamp_sv[49] += amp_sv[0];
-      jamp_sv[52] -= amp_sv[0];
-      jamp_sv[58] -= amp_sv[0];
-      jamp_sv[68] += amp_sv[0];
-      jamp_sv[73] += amp_sv[0];
-      jamp_sv[76] -= amp_sv[0];
-      jamp_sv[82] -= amp_sv[0];
-      jamp_sv[84] -= amp_sv[0];
-      jamp_sv[85] += amp_sv[0];
-      jamp_sv[87] += amp_sv[0];
-      jamp_sv[89] -= amp_sv[0];
-      jamp_sv[92] += amp_sv[0];
-      jamp_sv[102] -= amp_sv[0];
-
-      // *** DIAGRAM 960 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 960
-      VVVV1P0_1( w_fp[0], w_fp[72], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[90] );
-      VVVV3P0_1( w_fp[0], w_fp[72], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[93] );
-      VVVV4P0_1( w_fp[0], w_fp[72], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[69] );
-
-      // Amplitude(s) for diagram number 960
-      VVV1_0( w_fp[8], w_fp[4], w_fp[90], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[5] += amp_sv[0];
-      jamp_sv[13] -= amp_sv[0];
-      jamp_sv[16] += amp_sv[0];
-      jamp_sv[19] -= amp_sv[0];
-      jamp_sv[46] -= amp_sv[0];
-      jamp_sv[49] -= amp_sv[0];
-      jamp_sv[50] += amp_sv[0];
-      jamp_sv[51] -= amp_sv[0];
-      jamp_sv[52] += amp_sv[0];
-      jamp_sv[59] += amp_sv[0];
-      jamp_sv[63] -= amp_sv[0];
-      jamp_sv[65] += amp_sv[0];
-      jamp_sv[69] -= amp_sv[0];
-      jamp_sv[82] += amp_sv[0];
-      jamp_sv[92] -= amp_sv[0];
-      jamp_sv[106] += amp_sv[0];
-      VVV1_0( w_fp[8], w_fp[4], w_fp[93], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[13] -= amp_sv[0];
-      jamp_sv[16] += amp_sv[0];
-      jamp_sv[43] += amp_sv[0];
-      jamp_sv[46] -= amp_sv[0];
-      jamp_sv[50] += amp_sv[0];
-      jamp_sv[51] -= amp_sv[0];
-      jamp_sv[58] -= amp_sv[0];
-      jamp_sv[59] += amp_sv[0];
-      jamp_sv[60] -= amp_sv[0];
-      jamp_sv[61] += amp_sv[0];
-      jamp_sv[68] += amp_sv[0];
-      jamp_sv[69] -= amp_sv[0];
-      jamp_sv[73] += amp_sv[0];
-      jamp_sv[76] -= amp_sv[0];
-      jamp_sv[103] -= amp_sv[0];
-      jamp_sv[106] += amp_sv[0];
-      VVV1_0( w_fp[8], w_fp[4], w_fp[69], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[5] -= amp_sv[0];
-      jamp_sv[19] += amp_sv[0];
-      jamp_sv[43] += amp_sv[0];
-      jamp_sv[49] += amp_sv[0];
-      jamp_sv[52] -= amp_sv[0];
-      jamp_sv[58] -= amp_sv[0];
-      jamp_sv[60] -= amp_sv[0];
-      jamp_sv[61] += amp_sv[0];
-      jamp_sv[63] += amp_sv[0];
-      jamp_sv[65] -= amp_sv[0];
-      jamp_sv[68] += amp_sv[0];
-      jamp_sv[73] += amp_sv[0];
-      jamp_sv[76] -= amp_sv[0];
-      jamp_sv[82] -= amp_sv[0];
-      jamp_sv[92] += amp_sv[0];
-      jamp_sv[103] -= amp_sv[0];
-
-      // *** DIAGRAM 961 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 961
-      // (none)
-
-      // Amplitude(s) for diagram number 961
-      VVV1_0( w_fp[72], w_fp[5], w_fp[107], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[5] -= amp_sv[0];
-      jamp_sv[13] += amp_sv[0];
-      jamp_sv[16] -= amp_sv[0];
-      jamp_sv[19] += amp_sv[0];
-      jamp_sv[46] += amp_sv[0];
-      jamp_sv[49] += amp_sv[0];
-      jamp_sv[50] -= amp_sv[0];
-      jamp_sv[51] += amp_sv[0];
-      jamp_sv[52] -= amp_sv[0];
-      jamp_sv[59] -= amp_sv[0];
-      jamp_sv[63] += amp_sv[0];
-      jamp_sv[65] -= amp_sv[0];
-      jamp_sv[69] += amp_sv[0];
-      jamp_sv[82] -= amp_sv[0];
-      jamp_sv[92] += amp_sv[0];
-      jamp_sv[106] -= amp_sv[0];
-      VVV1_0( w_fp[72], w_fp[5], w_fp[95], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[7] -= amp_sv[0];
-      jamp_sv[8] += amp_sv[0];
-      jamp_sv[9] -= amp_sv[0];
-      jamp_sv[10] += amp_sv[0];
-      jamp_sv[46] += amp_sv[0];
-      jamp_sv[47] -= amp_sv[0];
-      jamp_sv[49] += amp_sv[0];
-      jamp_sv[50] -= amp_sv[0];
-      jamp_sv[51] += amp_sv[0];
-      jamp_sv[52] -= amp_sv[0];
-      jamp_sv[82] -= amp_sv[0];
-      jamp_sv[83] += amp_sv[0];
-      jamp_sv[92] += amp_sv[0];
-      jamp_sv[93] -= amp_sv[0];
-      jamp_sv[106] -= amp_sv[0];
-      jamp_sv[107] += amp_sv[0];
-      VVV1_0( w_fp[72], w_fp[5], w_fp[105], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[5] += amp_sv[0];
-      jamp_sv[7] -= amp_sv[0];
-      jamp_sv[8] += amp_sv[0];
-      jamp_sv[9] -= amp_sv[0];
-      jamp_sv[10] += amp_sv[0];
-      jamp_sv[13] -= amp_sv[0];
-      jamp_sv[16] += amp_sv[0];
-      jamp_sv[19] -= amp_sv[0];
-      jamp_sv[47] -= amp_sv[0];
-      jamp_sv[59] += amp_sv[0];
-      jamp_sv[63] -= amp_sv[0];
-      jamp_sv[65] += amp_sv[0];
-      jamp_sv[69] -= amp_sv[0];
-      jamp_sv[83] += amp_sv[0];
-      jamp_sv[93] -= amp_sv[0];
-      jamp_sv[107] += amp_sv[0];
-
-      // *** DIAGRAM 962 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 962
-      // (none)
-
-      // Amplitude(s) for diagram number 962
-      VVV1_0( w_fp[72], w_fp[4], w_fp[115], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[4] -= amp_sv[0];
-      jamp_sv[7] += amp_sv[0];
-      jamp_sv[10] -= amp_sv[0];
-      jamp_sv[18] += amp_sv[0];
-      jamp_sv[44] += amp_sv[0];
-      jamp_sv[58] -= amp_sv[0];
-      jamp_sv[68] += amp_sv[0];
-      jamp_sv[73] += amp_sv[0];
-      jamp_sv[74] -= amp_sv[0];
-      jamp_sv[75] += amp_sv[0];
-      jamp_sv[76] -= amp_sv[0];
-      jamp_sv[83] -= amp_sv[0];
-      jamp_sv[87] += amp_sv[0];
-      jamp_sv[89] -= amp_sv[0];
-      jamp_sv[93] += amp_sv[0];
-      jamp_sv[104] -= amp_sv[0];
-      VVV1_0( w_fp[72], w_fp[4], w_fp[116], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[13] -= amp_sv[0];
-      jamp_sv[14] += amp_sv[0];
-      jamp_sv[15] -= amp_sv[0];
-      jamp_sv[16] += amp_sv[0];
-      jamp_sv[44] += amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[58] -= amp_sv[0];
-      jamp_sv[59] += amp_sv[0];
-      jamp_sv[68] += amp_sv[0];
-      jamp_sv[69] -= amp_sv[0];
-      jamp_sv[73] += amp_sv[0];
-      jamp_sv[74] -= amp_sv[0];
-      jamp_sv[75] += amp_sv[0];
-      jamp_sv[76] -= amp_sv[0];
-      jamp_sv[104] -= amp_sv[0];
-      jamp_sv[105] += amp_sv[0];
-      VVV1_0( w_fp[72], w_fp[4], w_fp[117], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[4] += amp_sv[0];
-      jamp_sv[7] -= amp_sv[0];
-      jamp_sv[10] += amp_sv[0];
-      jamp_sv[13] -= amp_sv[0];
-      jamp_sv[14] += amp_sv[0];
-      jamp_sv[15] -= amp_sv[0];
-      jamp_sv[16] += amp_sv[0];
-      jamp_sv[18] -= amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[59] += amp_sv[0];
-      jamp_sv[69] -= amp_sv[0];
-      jamp_sv[83] += amp_sv[0];
-      jamp_sv[87] -= amp_sv[0];
-      jamp_sv[89] += amp_sv[0];
-      jamp_sv[93] -= amp_sv[0];
-      jamp_sv[105] += amp_sv[0];
-
-      // *** DIAGRAM 963 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 963
-      // (none)
-
-      // Amplitude(s) for diagram number 963
-      VVVV1_0( w_fp[0], w_fp[72], w_fp[8], w_fp[24], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[4] += amp_sv[0];
-      jamp_sv[5] -= amp_sv[0];
-      jamp_sv[18] -= amp_sv[0];
-      jamp_sv[19] += amp_sv[0];
-      jamp_sv[42] -= amp_sv[0];
-      jamp_sv[43] += amp_sv[0];
-      jamp_sv[60] -= amp_sv[0];
-      jamp_sv[61] += amp_sv[0];
-      jamp_sv[63] += amp_sv[0];
-      jamp_sv[65] -= amp_sv[0];
-      jamp_sv[84] += amp_sv[0];
-      jamp_sv[85] -= amp_sv[0];
-      jamp_sv[87] -= amp_sv[0];
-      jamp_sv[89] += amp_sv[0];
-      jamp_sv[102] += amp_sv[0];
-      jamp_sv[103] -= amp_sv[0];
-      VVVV3_0( w_fp[0], w_fp[72], w_fp[8], w_fp[24], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[4] += amp_sv[0];
-      jamp_sv[5] -= amp_sv[0];
-      jamp_sv[8] -= amp_sv[0];
-      jamp_sv[9] += amp_sv[0];
-      jamp_sv[14] += amp_sv[0];
-      jamp_sv[15] -= amp_sv[0];
-      jamp_sv[18] -= amp_sv[0];
-      jamp_sv[19] += amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[47] += amp_sv[0];
-      jamp_sv[63] += amp_sv[0];
-      jamp_sv[65] -= amp_sv[0];
-      jamp_sv[87] -= amp_sv[0];
-      jamp_sv[89] += amp_sv[0];
-      jamp_sv[105] += amp_sv[0];
-      jamp_sv[107] -= amp_sv[0];
-      VVVV4_0( w_fp[0], w_fp[72], w_fp[8], w_fp[24], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[8] -= amp_sv[0];
-      jamp_sv[9] += amp_sv[0];
-      jamp_sv[14] += amp_sv[0];
-      jamp_sv[15] -= amp_sv[0];
-      jamp_sv[42] += amp_sv[0];
-      jamp_sv[43] -= amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[47] += amp_sv[0];
-      jamp_sv[60] += amp_sv[0];
-      jamp_sv[61] -= amp_sv[0];
-      jamp_sv[84] -= amp_sv[0];
-      jamp_sv[85] += amp_sv[0];
-      jamp_sv[102] -= amp_sv[0];
-      jamp_sv[103] += amp_sv[0];
-      jamp_sv[105] += amp_sv[0];
-      jamp_sv[107] -= amp_sv[0];
-
-      // *** DIAGRAM 964 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 964
-      // (none)
-
-      // Amplitude(s) for diagram number 964
-      VVV1_0( w_fp[8], w_fp[24], w_fp[71], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[4] += amp_sv[0];
-      jamp_sv[5] -= amp_sv[0];
-      jamp_sv[18] -= amp_sv[0];
-      jamp_sv[19] += amp_sv[0];
-      jamp_sv[42] -= amp_sv[0];
-      jamp_sv[43] += amp_sv[0];
-      jamp_sv[60] -= amp_sv[0];
-      jamp_sv[61] += amp_sv[0];
-      jamp_sv[63] += amp_sv[0];
-      jamp_sv[65] -= amp_sv[0];
-      jamp_sv[84] += amp_sv[0];
-      jamp_sv[85] -= amp_sv[0];
-      jamp_sv[87] -= amp_sv[0];
-      jamp_sv[89] += amp_sv[0];
-      jamp_sv[102] += amp_sv[0];
-      jamp_sv[103] -= amp_sv[0];
-
-      // *** DIAGRAM 965 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 965
-      // (none)
-
-      // Amplitude(s) for diagram number 965
-      VVV1_0( w_fp[72], w_fp[24], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[4] += amp_sv[0];
-      jamp_sv[5] -= amp_sv[0];
-      jamp_sv[8] -= amp_sv[0];
-      jamp_sv[9] += amp_sv[0];
-      jamp_sv[14] += amp_sv[0];
-      jamp_sv[15] -= amp_sv[0];
-      jamp_sv[18] -= amp_sv[0];
-      jamp_sv[19] += amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[47] += amp_sv[0];
-      jamp_sv[63] += amp_sv[0];
-      jamp_sv[65] -= amp_sv[0];
-      jamp_sv[87] -= amp_sv[0];
-      jamp_sv[89] += amp_sv[0];
-      jamp_sv[105] += amp_sv[0];
-      jamp_sv[107] -= amp_sv[0];
-
-      // *** DIAGRAM 966 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 966
-      // (none)
-
-      // Amplitude(s) for diagram number 966
-      VVV1_0( w_fp[72], w_fp[8], w_fp[98], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[8] -= amp_sv[0];
-      jamp_sv[9] += amp_sv[0];
-      jamp_sv[14] += amp_sv[0];
-      jamp_sv[15] -= amp_sv[0];
-      jamp_sv[42] += amp_sv[0];
-      jamp_sv[43] -= amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[47] += amp_sv[0];
-      jamp_sv[60] += amp_sv[0];
-      jamp_sv[61] -= amp_sv[0];
-      jamp_sv[84] -= amp_sv[0];
-      jamp_sv[85] += amp_sv[0];
-      jamp_sv[102] -= amp_sv[0];
-      jamp_sv[103] += amp_sv[0];
-      jamp_sv[105] += amp_sv[0];
-      jamp_sv[107] -= amp_sv[0];
-
-      // *** DIAGRAM 967 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 967
-      // (none)
-
-      // Amplitude(s) for diagram number 967
-      VVV1_0( w_fp[71], w_fp[37], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 968 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 968
-      // (none)
-
-      // Amplitude(s) for diagram number 968
-      FFV1_0( w_fp[3], w_fp[35], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[60] += amp_sv[0];
-      jamp_sv[61] -= amp_sv[0];
-      jamp_sv[63] -= amp_sv[0];
-      jamp_sv[65] += amp_sv[0];
-
-      // *** DIAGRAM 969 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 969
-      // (none)
-
-      // Amplitude(s) for diagram number 969
-      FFV1_0( w_fp[76], w_fp[114], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 970 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 970
-      // (none)
-
-      // Amplitude(s) for diagram number 970
-      FFV1_0( w_fp[3], w_fp[114], w_fp[75], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[49] += amp_sv[0];
-      jamp_sv[50] -= amp_sv[0];
-      jamp_sv[51] += amp_sv[0];
-      jamp_sv[52] -= amp_sv[0];
-
-      // *** DIAGRAM 971 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 971
-      // (none)
-
-      // Amplitude(s) for diagram number 971
-      FFV1_0( w_fp[76], w_fp[35], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 972 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 972
-      // (none)
-
-      // Amplitude(s) for diagram number 972
-      VVV1_0( w_fp[0], w_fp[75], w_fp[37], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 973 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 973
-      // (none)
-
-      // Amplitude(s) for diagram number 973
-      FFV1_0( w_fp[3], w_fp[33], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[3], w_fp[33], w_fp[93], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[3], w_fp[33], w_fp[69], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 974 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 974
-      // (none)
-
-      // Amplitude(s) for diagram number 974
-      FFV1_0( w_fp[38], w_fp[33], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[49] += amp_sv[0];
-      jamp_sv[52] -= amp_sv[0];
-      jamp_sv[58] -= amp_sv[0];
-      jamp_sv[68] += amp_sv[0];
-
-      // *** DIAGRAM 975 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 975
-      // (none)
-
-      // Amplitude(s) for diagram number 975
-      FFV1_0( w_fp[38], w_fp[114], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 976 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 976
-      // (none)
-
-      // Amplitude(s) for diagram number 976
-      FFV1_0( w_fp[104], w_fp[33], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 977 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 977
-      // (none)
-
-      // Amplitude(s) for diagram number 977
-      VVV1_0( w_fp[71], w_fp[45], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 978 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 978
-      // (none)
-
-      // Amplitude(s) for diagram number 978
-      FFV1_0( w_fp[3], w_fp[43], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[84] += amp_sv[0];
-      jamp_sv[85] -= amp_sv[0];
-      jamp_sv[87] -= amp_sv[0];
-      jamp_sv[89] += amp_sv[0];
-
-      // *** DIAGRAM 979 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 979
-      // (none)
-
-      // Amplitude(s) for diagram number 979
-      FFV1_0( w_fp[76], w_fp[102], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 980 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 980
-      // (none)
-
-      // Amplitude(s) for diagram number 980
-      FFV1_0( w_fp[3], w_fp[102], w_fp[74], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[73] += amp_sv[0];
-      jamp_sv[74] -= amp_sv[0];
-      jamp_sv[75] += amp_sv[0];
-      jamp_sv[76] -= amp_sv[0];
-
-      // *** DIAGRAM 981 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 981
-      // (none)
-
-      // Amplitude(s) for diagram number 981
-      FFV1_0( w_fp[76], w_fp[43], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 982 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 982
-      // (none)
-
-      // Amplitude(s) for diagram number 982
-      VVV1_0( w_fp[0], w_fp[74], w_fp[45], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 983 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 983
-      // (none)
-
-      // Amplitude(s) for diagram number 983
-      FFV1_0( w_fp[3], w_fp[39], w_fp[94], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[3], w_fp[39], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[3], w_fp[39], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 984 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 984
-      // (none)
-
-      // Amplitude(s) for diagram number 984
-      FFV1_0( w_fp[46], w_fp[39], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[73] += amp_sv[0];
-      jamp_sv[76] -= amp_sv[0];
-      jamp_sv[82] -= amp_sv[0];
-      jamp_sv[92] += amp_sv[0];
-
-      // *** DIAGRAM 985 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 985
-      // (none)
-
-      // Amplitude(s) for diagram number 985
-      FFV1_0( w_fp[46], w_fp[102], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 986 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 986
-      // (none)
-
-      // Amplitude(s) for diagram number 986
-      FFV1_0( w_fp[99], w_fp[39], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 987 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 987
-      // (none)
-
-      // Amplitude(s) for diagram number 987
-      VVV1_0( w_fp[71], w_fp[54], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 988 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 988
-      // (none)
-
-      // Amplitude(s) for diagram number 988
-      FFV1_0( w_fp[7], w_fp[2], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[5] += amp_sv[0];
-      jamp_sv[19] -= amp_sv[0];
-      jamp_sv[43] -= amp_sv[0];
-      jamp_sv[103] += amp_sv[0];
-
-      // *** DIAGRAM 989 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 989
-      // (none)
-
-      // Amplitude(s) for diagram number 989
-      FFV1_0( w_fp[99], w_fp[97], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 990 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 990
-      // (none)
-
-      // Amplitude(s) for diagram number 990
-      FFV1_0( w_fp[99], w_fp[2], w_fp[75], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[46] += amp_sv[0];
-      jamp_sv[82] -= amp_sv[0];
-      jamp_sv[92] += amp_sv[0];
-      jamp_sv[106] -= amp_sv[0];
-
-      // *** DIAGRAM 991 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 991
-      // (none)
-
-      // Amplitude(s) for diagram number 991
-      FFV1_0( w_fp[7], w_fp[97], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 992 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 992
-      // (none)
-
-      // Amplitude(s) for diagram number 992
-      VVV1_0( w_fp[0], w_fp[75], w_fp[54], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 993 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 993
-      // (none)
-
-      // Amplitude(s) for diagram number 993
-      FFV1_0( w_fp[46], w_fp[2], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[46], w_fp[2], w_fp[93], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[46], w_fp[2], w_fp[69], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 994 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 994
-      // (none)
-
-      // Amplitude(s) for diagram number 994
-      VVV1_0( w_fp[71], w_fp[23], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 995 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 995
-      // (none)
-
-      // Amplitude(s) for diagram number 995
-      FFV1_0( w_fp[25], w_fp[2], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[4] += amp_sv[0];
-      jamp_sv[18] -= amp_sv[0];
-      jamp_sv[42] -= amp_sv[0];
-      jamp_sv[102] += amp_sv[0];
-
-      // *** DIAGRAM 996 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 996
-      // (none)
-
-      // Amplitude(s) for diagram number 996
-      FFV1_0( w_fp[104], w_fp[97], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 997 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 997
-      // (none)
-
-      // Amplitude(s) for diagram number 997
-      FFV1_0( w_fp[104], w_fp[2], w_fp[74], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[44] += amp_sv[0];
-      jamp_sv[58] -= amp_sv[0];
-      jamp_sv[68] += amp_sv[0];
-      jamp_sv[104] -= amp_sv[0];
-
-      // *** DIAGRAM 998 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 998
-      // (none)
-
-      // Amplitude(s) for diagram number 998
-      FFV1_0( w_fp[25], w_fp[97], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 999 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 999
-      // (none)
-
-      // Amplitude(s) for diagram number 999
-      VVV1_0( w_fp[0], w_fp[74], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 1000 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 1000
-      // (none)
-
-      // Amplitude(s) for diagram number 1000
-      FFV1_0( w_fp[38], w_fp[2], w_fp[94], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[38], w_fp[2], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0];
-      FFV1_0( w_fp[38], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 1001 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 1001
-      // (none)
-
-      // Amplitude(s) for diagram number 1001
-      FFV1_0( w_fp[3], w_fp[17], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 1002 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 1002
-      // (none)
-
-      // Amplitude(s) for diagram number 1002
-      FFV1_0( w_fp[26], w_fp[2], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 1003 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 1003
-      // (none)
-
-      // Amplitude(s) for diagram number 1003
-      FFV1_0( w_fp[3], w_fp[97], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 1004 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 1004
-      // (none)
-
-      // Amplitude(s) for diagram number 1004
-      FFV1_0( w_fp[76], w_fp[2], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 1005 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 1005
-      // (none)
-
-      // Amplitude(s) for diagram number 1005
-      FFV1_0( w_fp[26], w_fp[97], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[42] += amp_sv[0];
-      jamp_sv[43] -= amp_sv[0];
-      jamp_sv[102] -= amp_sv[0];
-      jamp_sv[103] += amp_sv[0];
-
-      // *** DIAGRAM 1006 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 1006
-      // (none)
-
-      // Amplitude(s) for diagram number 1006
-      FFV1_0( w_fp[76], w_fp[17], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[60] += amp_sv[0];
-      jamp_sv[61] -= amp_sv[0];
-      jamp_sv[84] -= amp_sv[0];
-      jamp_sv[85] += amp_sv[0];
-
-      // *** DIAGRAM 1007 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 1007
-      // (none)
-
-      // Amplitude(s) for diagram number 1007
-      VVV1_0( w_fp[56], w_fp[59], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[0] += amp_sv[0];
-      jamp_sv[2] -= amp_sv[0];
-      jamp_sv[8] -= amp_sv[0];
-      jamp_sv[14] += amp_sv[0];
-      jamp_sv[18] -= amp_sv[0];
-      jamp_sv[19] += amp_sv[0];
-      jamp_sv[21] += amp_sv[0];
-      jamp_sv[23] -= amp_sv[0];
-      jamp_sv[33] -= amp_sv[0];
-      jamp_sv[39] += amp_sv[0];
-      jamp_sv[63] += amp_sv[0];
-      jamp_sv[87] -= amp_sv[0];
-      jamp_sv[105] += amp_sv[0];
-      jamp_sv[107] -= amp_sv[0];
-      jamp_sv[113] -= amp_sv[0];
-      jamp_sv[119] += amp_sv[0];
-
-      // *** DIAGRAM 1008 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 1008
-      // (none)
-
-      // Amplitude(s) for diagram number 1008
-      VVV1_0( w_fp[56], w_fp[1], w_fp[42], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[0] += amp_sv[0];
-      jamp_sv[2] -= amp_sv[0];
-      jamp_sv[4] -= amp_sv[0];
-      jamp_sv[5] += amp_sv[0];
-      jamp_sv[9] -= amp_sv[0];
-      jamp_sv[15] += amp_sv[0];
-      jamp_sv[21] += amp_sv[0];
-      jamp_sv[23] -= amp_sv[0];
-      jamp_sv[33] -= amp_sv[0];
-      jamp_sv[39] += amp_sv[0];
-      jamp_sv[45] += amp_sv[0];
-      jamp_sv[47] -= amp_sv[0];
-      jamp_sv[65] += amp_sv[0];
-      jamp_sv[89] -= amp_sv[0];
-      jamp_sv[113] -= amp_sv[0];
-      jamp_sv[119] += amp_sv[0];
-
-      // *** DIAGRAM 1009 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 1009
-      // (none)
-
-      // Amplitude(s) for diagram number 1009
-      VVVV1_0( w_fp[1], w_fp[24], w_fp[6], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[0] += amp_sv[0];
-      jamp_sv[2] -= amp_sv[0];
-      jamp_sv[8] -= amp_sv[0];
-      jamp_sv[14] += amp_sv[0];
-      jamp_sv[18] -= amp_sv[0];
-      jamp_sv[19] += amp_sv[0];
-      jamp_sv[21] += amp_sv[0];
-      jamp_sv[23] -= amp_sv[0];
-      jamp_sv[33] -= amp_sv[0];
-      jamp_sv[39] += amp_sv[0];
-      jamp_sv[63] += amp_sv[0];
-      jamp_sv[87] -= amp_sv[0];
-      jamp_sv[105] += amp_sv[0];
-      jamp_sv[107] -= amp_sv[0];
-      jamp_sv[113] -= amp_sv[0];
-      jamp_sv[119] += amp_sv[0];
-      VVVV3_0( w_fp[1], w_fp[24], w_fp[6], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[4] += amp_sv[0];
-      jamp_sv[5] -= amp_sv[0];
-      jamp_sv[8] -= amp_sv[0];
-      jamp_sv[9] += amp_sv[0];
-      jamp_sv[14] += amp_sv[0];
-      jamp_sv[15] -= amp_sv[0];
-      jamp_sv[18] -= amp_sv[0];
-      jamp_sv[19] += amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[47] += amp_sv[0];
-      jamp_sv[63] += amp_sv[0];
-      jamp_sv[65] -= amp_sv[0];
-      jamp_sv[87] -= amp_sv[0];
-      jamp_sv[89] += amp_sv[0];
-      jamp_sv[105] += amp_sv[0];
-      jamp_sv[107] -= amp_sv[0];
-      VVVV4_0( w_fp[1], w_fp[24], w_fp[6], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[0] -= amp_sv[0];
-      jamp_sv[2] += amp_sv[0];
-      jamp_sv[4] += amp_sv[0];
-      jamp_sv[5] -= amp_sv[0];
-      jamp_sv[9] += amp_sv[0];
-      jamp_sv[15] -= amp_sv[0];
-      jamp_sv[21] -= amp_sv[0];
-      jamp_sv[23] += amp_sv[0];
-      jamp_sv[33] += amp_sv[0];
-      jamp_sv[39] -= amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[47] += amp_sv[0];
-      jamp_sv[65] -= amp_sv[0];
-      jamp_sv[89] += amp_sv[0];
-      jamp_sv[113] += amp_sv[0];
-      jamp_sv[119] -= amp_sv[0];
-
-      // *** DIAGRAM 1010 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 1010
-      // (none)
-
-      // Amplitude(s) for diagram number 1010
-      VVV1_0( w_fp[98], w_fp[108], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[9] += amp_sv[0];
-      jamp_sv[15] -= amp_sv[0];
-      jamp_sv[24] -= amp_sv[0];
-      jamp_sv[26] += amp_sv[0];
-      jamp_sv[32] += amp_sv[0];
-      jamp_sv[38] -= amp_sv[0];
-      jamp_sv[42] += amp_sv[0];
-      jamp_sv[43] -= amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[47] += amp_sv[0];
-      jamp_sv[61] -= amp_sv[0];
-      jamp_sv[85] += amp_sv[0];
-      jamp_sv[99] -= amp_sv[0];
-      jamp_sv[101] += amp_sv[0];
-      jamp_sv[112] += amp_sv[0];
-      jamp_sv[118] -= amp_sv[0];
-
-      // *** DIAGRAM 1011 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 1011
-      // (none)
-
-      // Amplitude(s) for diagram number 1011
-      VVV1_0( w_fp[98], w_fp[1], w_fp[11], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[8] += amp_sv[0];
-      jamp_sv[14] -= amp_sv[0];
-      jamp_sv[24] -= amp_sv[0];
-      jamp_sv[26] += amp_sv[0];
-      jamp_sv[32] += amp_sv[0];
-      jamp_sv[38] -= amp_sv[0];
-      jamp_sv[60] -= amp_sv[0];
-      jamp_sv[84] += amp_sv[0];
-      jamp_sv[99] -= amp_sv[0];
-      jamp_sv[101] += amp_sv[0];
-      jamp_sv[102] += amp_sv[0];
-      jamp_sv[103] -= amp_sv[0];
-      jamp_sv[105] -= amp_sv[0];
-      jamp_sv[107] += amp_sv[0];
-      jamp_sv[112] += amp_sv[0];
-      jamp_sv[118] -= amp_sv[0];
-
-      // *** DIAGRAM 1012 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 1012
-      // (none)
-
-      // Amplitude(s) for diagram number 1012
-      VVVV1_0( w_fp[1], w_fp[8], w_fp[6], w_fp[98], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[9] += amp_sv[0];
-      jamp_sv[15] -= amp_sv[0];
-      jamp_sv[24] -= amp_sv[0];
-      jamp_sv[26] += amp_sv[0];
-      jamp_sv[32] += amp_sv[0];
-      jamp_sv[38] -= amp_sv[0];
-      jamp_sv[42] += amp_sv[0];
-      jamp_sv[43] -= amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[47] += amp_sv[0];
-      jamp_sv[61] -= amp_sv[0];
-      jamp_sv[85] += amp_sv[0];
-      jamp_sv[99] -= amp_sv[0];
-      jamp_sv[101] += amp_sv[0];
-      jamp_sv[112] += amp_sv[0];
-      jamp_sv[118] -= amp_sv[0];
-      VVVV3_0( w_fp[1], w_fp[8], w_fp[6], w_fp[98], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[8] -= amp_sv[0];
-      jamp_sv[9] += amp_sv[0];
-      jamp_sv[14] += amp_sv[0];
-      jamp_sv[15] -= amp_sv[0];
-      jamp_sv[42] += amp_sv[0];
-      jamp_sv[43] -= amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[47] += amp_sv[0];
-      jamp_sv[60] += amp_sv[0];
-      jamp_sv[61] -= amp_sv[0];
-      jamp_sv[84] -= amp_sv[0];
-      jamp_sv[85] += amp_sv[0];
-      jamp_sv[102] -= amp_sv[0];
-      jamp_sv[103] += amp_sv[0];
-      jamp_sv[105] += amp_sv[0];
-      jamp_sv[107] -= amp_sv[0];
-      VVVV4_0( w_fp[1], w_fp[8], w_fp[6], w_fp[98], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[8] -= amp_sv[0];
-      jamp_sv[14] += amp_sv[0];
-      jamp_sv[24] += amp_sv[0];
-      jamp_sv[26] -= amp_sv[0];
-      jamp_sv[32] -= amp_sv[0];
-      jamp_sv[38] += amp_sv[0];
-      jamp_sv[60] += amp_sv[0];
-      jamp_sv[84] -= amp_sv[0];
-      jamp_sv[99] += amp_sv[0];
-      jamp_sv[101] -= amp_sv[0];
-      jamp_sv[102] -= amp_sv[0];
-      jamp_sv[103] += amp_sv[0];
-      jamp_sv[105] += amp_sv[0];
-      jamp_sv[107] -= amp_sv[0];
-      jamp_sv[112] -= amp_sv[0];
-      jamp_sv[118] += amp_sv[0];
-
-      // *** DIAGRAM 1013 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 1013
-      // (none)
-
-      // Amplitude(s) for diagram number 1013
-      VVV1_0( w_fp[0], w_fp[108], w_fp[42], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[9] += amp_sv[0];
-      jamp_sv[15] -= amp_sv[0];
-      jamp_sv[21] -= amp_sv[0];
-      jamp_sv[23] += amp_sv[0];
-      jamp_sv[24] -= amp_sv[0];
-      jamp_sv[26] += amp_sv[0];
-      jamp_sv[28] += amp_sv[0];
-      jamp_sv[29] -= amp_sv[0];
-      jamp_sv[33] += amp_sv[0];
-      jamp_sv[39] -= amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[47] += amp_sv[0];
-      jamp_sv[64] -= amp_sv[0];
-      jamp_sv[88] += amp_sv[0];
-      jamp_sv[112] += amp_sv[0];
-      jamp_sv[118] -= amp_sv[0];
-
-      // *** DIAGRAM 1014 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 1014
-      // (none)
-
-      // Amplitude(s) for diagram number 1014
-      VVV1_0( w_fp[0], w_fp[59], w_fp[11], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[0] += amp_sv[0];
-      jamp_sv[2] -= amp_sv[0];
-      jamp_sv[8] -= amp_sv[0];
-      jamp_sv[14] += amp_sv[0];
-      jamp_sv[32] -= amp_sv[0];
-      jamp_sv[38] += amp_sv[0];
-      jamp_sv[62] += amp_sv[0];
-      jamp_sv[86] -= amp_sv[0];
-      jamp_sv[96] -= amp_sv[0];
-      jamp_sv[97] += amp_sv[0];
-      jamp_sv[99] += amp_sv[0];
-      jamp_sv[101] -= amp_sv[0];
-      jamp_sv[105] += amp_sv[0];
-      jamp_sv[107] -= amp_sv[0];
-      jamp_sv[113] -= amp_sv[0];
-      jamp_sv[119] += amp_sv[0];
-
-      // *** DIAGRAM 1015 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 1015
-      VVVV1P0_1( w_fp[0], w_fp[1], w_fp[8], COUPs[2], 1.0, 0., 0., w_fp[11] );
-      VVVV3P0_1( w_fp[0], w_fp[1], w_fp[8], COUPs[2], 1.0, 0., 0., w_fp[42] );
-      VVVV4P0_1( w_fp[0], w_fp[1], w_fp[8], COUPs[2], 1.0, 0., 0., w_fp[76] );
-
-      // Amplitude(s) for diagram number 1015
-      VVV1_0( w_fp[24], w_fp[6], w_fp[11], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[9] += amp_sv[0];
-      jamp_sv[15] -= amp_sv[0];
-      jamp_sv[21] -= amp_sv[0];
-      jamp_sv[23] += amp_sv[0];
-      jamp_sv[24] -= amp_sv[0];
-      jamp_sv[26] += amp_sv[0];
-      jamp_sv[28] += amp_sv[0];
-      jamp_sv[29] -= amp_sv[0];
-      jamp_sv[33] += amp_sv[0];
-      jamp_sv[39] -= amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[47] += amp_sv[0];
-      jamp_sv[64] -= amp_sv[0];
-      jamp_sv[88] += amp_sv[0];
-      jamp_sv[112] += amp_sv[0];
-      jamp_sv[118] -= amp_sv[0];
-      VVV1_0( w_fp[24], w_fp[6], w_fp[42], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[0] -= amp_sv[0];
-      jamp_sv[2] += amp_sv[0];
-      jamp_sv[4] += amp_sv[0];
-      jamp_sv[5] -= amp_sv[0];
-      jamp_sv[9] += amp_sv[0];
-      jamp_sv[15] -= amp_sv[0];
-      jamp_sv[21] -= amp_sv[0];
-      jamp_sv[23] += amp_sv[0];
-      jamp_sv[33] += amp_sv[0];
-      jamp_sv[39] -= amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[47] += amp_sv[0];
-      jamp_sv[65] -= amp_sv[0];
-      jamp_sv[89] += amp_sv[0];
-      jamp_sv[113] += amp_sv[0];
-      jamp_sv[119] -= amp_sv[0];
-      VVV1_0( w_fp[24], w_fp[6], w_fp[76], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[0] -= amp_sv[0];
-      jamp_sv[2] += amp_sv[0];
-      jamp_sv[4] += amp_sv[0];
-      jamp_sv[5] -= amp_sv[0];
-      jamp_sv[24] += amp_sv[0];
-      jamp_sv[26] -= amp_sv[0];
-      jamp_sv[28] -= amp_sv[0];
-      jamp_sv[29] += amp_sv[0];
-      jamp_sv[64] += amp_sv[0];
-      jamp_sv[65] -= amp_sv[0];
-      jamp_sv[88] -= amp_sv[0];
-      jamp_sv[89] += amp_sv[0];
-      jamp_sv[112] -= amp_sv[0];
-      jamp_sv[113] += amp_sv[0];
-      jamp_sv[118] += amp_sv[0];
-      jamp_sv[119] -= amp_sv[0];
-
-      // *** DIAGRAM 1016 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 1016
-      VVVV1P0_1( w_fp[0], w_fp[1], w_fp[24], COUPs[2], 1.0, 0., 0., w_fp[97] );
-      VVVV3P0_1( w_fp[0], w_fp[1], w_fp[24], COUPs[2], 1.0, 0., 0., w_fp[71] );
-      VVVV4P0_1( w_fp[0], w_fp[1], w_fp[24], COUPs[2], 1.0, 0., 0., w_fp[21] );
-
-      // Amplitude(s) for diagram number 1016
-      VVV1_0( w_fp[8], w_fp[6], w_fp[97], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[0] += amp_sv[0];
-      jamp_sv[2] -= amp_sv[0];
-      jamp_sv[8] -= amp_sv[0];
-      jamp_sv[14] += amp_sv[0];
-      jamp_sv[32] -= amp_sv[0];
-      jamp_sv[38] += amp_sv[0];
-      jamp_sv[62] += amp_sv[0];
-      jamp_sv[86] -= amp_sv[0];
-      jamp_sv[96] -= amp_sv[0];
-      jamp_sv[97] += amp_sv[0];
-      jamp_sv[99] += amp_sv[0];
-      jamp_sv[101] -= amp_sv[0];
-      jamp_sv[105] += amp_sv[0];
-      jamp_sv[107] -= amp_sv[0];
-      jamp_sv[113] -= amp_sv[0];
-      jamp_sv[119] += amp_sv[0];
-      VVV1_0( w_fp[8], w_fp[6], w_fp[71], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[8] -= amp_sv[0];
-      jamp_sv[14] += amp_sv[0];
-      jamp_sv[24] += amp_sv[0];
-      jamp_sv[26] -= amp_sv[0];
-      jamp_sv[32] -= amp_sv[0];
-      jamp_sv[38] += amp_sv[0];
-      jamp_sv[60] += amp_sv[0];
-      jamp_sv[84] -= amp_sv[0];
-      jamp_sv[99] += amp_sv[0];
-      jamp_sv[101] -= amp_sv[0];
-      jamp_sv[102] -= amp_sv[0];
-      jamp_sv[103] += amp_sv[0];
-      jamp_sv[105] += amp_sv[0];
-      jamp_sv[107] -= amp_sv[0];
-      jamp_sv[112] -= amp_sv[0];
-      jamp_sv[118] += amp_sv[0];
-      VVV1_0( w_fp[8], w_fp[6], w_fp[21], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[0] -= amp_sv[0];
-      jamp_sv[2] += amp_sv[0];
-      jamp_sv[24] += amp_sv[0];
-      jamp_sv[26] -= amp_sv[0];
-      jamp_sv[60] += amp_sv[0];
-      jamp_sv[62] -= amp_sv[0];
-      jamp_sv[84] -= amp_sv[0];
-      jamp_sv[86] += amp_sv[0];
-      jamp_sv[96] += amp_sv[0];
-      jamp_sv[97] -= amp_sv[0];
-      jamp_sv[102] -= amp_sv[0];
-      jamp_sv[103] += amp_sv[0];
-      jamp_sv[112] -= amp_sv[0];
-      jamp_sv[113] += amp_sv[0];
-      jamp_sv[118] += amp_sv[0];
-      jamp_sv[119] -= amp_sv[0];
-
-      // *** DIAGRAM 1017 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 1017
-      // (none)
-
-      // Amplitude(s) for diagram number 1017
-      VVV1_0( w_fp[1], w_fp[24], w_fp[118], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[0] -= amp_sv[0];
-      jamp_sv[2] += amp_sv[0];
-      jamp_sv[8] += amp_sv[0];
-      jamp_sv[14] -= amp_sv[0];
-      jamp_sv[32] += amp_sv[0];
-      jamp_sv[38] -= amp_sv[0];
-      jamp_sv[62] -= amp_sv[0];
-      jamp_sv[86] += amp_sv[0];
-      jamp_sv[96] += amp_sv[0];
-      jamp_sv[97] -= amp_sv[0];
-      jamp_sv[99] -= amp_sv[0];
-      jamp_sv[101] += amp_sv[0];
-      jamp_sv[105] -= amp_sv[0];
-      jamp_sv[107] += amp_sv[0];
-      jamp_sv[113] += amp_sv[0];
-      jamp_sv[119] -= amp_sv[0];
-      VVV1_0( w_fp[1], w_fp[24], w_fp[119], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[18] -= amp_sv[0];
-      jamp_sv[19] += amp_sv[0];
-      jamp_sv[21] += amp_sv[0];
-      jamp_sv[23] -= amp_sv[0];
-      jamp_sv[32] += amp_sv[0];
-      jamp_sv[33] -= amp_sv[0];
-      jamp_sv[38] -= amp_sv[0];
-      jamp_sv[39] += amp_sv[0];
-      jamp_sv[62] -= amp_sv[0];
-      jamp_sv[63] += amp_sv[0];
-      jamp_sv[86] += amp_sv[0];
-      jamp_sv[87] -= amp_sv[0];
-      jamp_sv[96] += amp_sv[0];
-      jamp_sv[97] -= amp_sv[0];
-      jamp_sv[99] -= amp_sv[0];
-      jamp_sv[101] += amp_sv[0];
-      VVV1_0( w_fp[1], w_fp[24], w_fp[120], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[0] += amp_sv[0];
-      jamp_sv[2] -= amp_sv[0];
-      jamp_sv[8] -= amp_sv[0];
-      jamp_sv[14] += amp_sv[0];
-      jamp_sv[18] -= amp_sv[0];
-      jamp_sv[19] += amp_sv[0];
-      jamp_sv[21] += amp_sv[0];
-      jamp_sv[23] -= amp_sv[0];
-      jamp_sv[33] -= amp_sv[0];
-      jamp_sv[39] += amp_sv[0];
-      jamp_sv[63] += amp_sv[0];
-      jamp_sv[87] -= amp_sv[0];
-      jamp_sv[105] += amp_sv[0];
-      jamp_sv[107] -= amp_sv[0];
-      jamp_sv[113] -= amp_sv[0];
-      jamp_sv[119] += amp_sv[0];
-
-      // *** DIAGRAM 1018 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 1018
-      // (none)
-
-      // Amplitude(s) for diagram number 1018
-      VVV1_0( w_fp[1], w_fp[8], w_fp[85], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[9] -= amp_sv[0];
-      jamp_sv[15] += amp_sv[0];
-      jamp_sv[21] += amp_sv[0];
-      jamp_sv[23] -= amp_sv[0];
-      jamp_sv[24] += amp_sv[0];
-      jamp_sv[26] -= amp_sv[0];
-      jamp_sv[28] -= amp_sv[0];
-      jamp_sv[29] += amp_sv[0];
-      jamp_sv[33] -= amp_sv[0];
-      jamp_sv[39] += amp_sv[0];
-      jamp_sv[45] += amp_sv[0];
-      jamp_sv[47] -= amp_sv[0];
-      jamp_sv[64] += amp_sv[0];
-      jamp_sv[88] -= amp_sv[0];
-      jamp_sv[112] -= amp_sv[0];
-      jamp_sv[118] += amp_sv[0];
-      VVV1_0( w_fp[1], w_fp[8], w_fp[112], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[21] += amp_sv[0];
-      jamp_sv[23] -= amp_sv[0];
-      jamp_sv[28] -= amp_sv[0];
-      jamp_sv[29] += amp_sv[0];
-      jamp_sv[32] += amp_sv[0];
-      jamp_sv[33] -= amp_sv[0];
-      jamp_sv[38] -= amp_sv[0];
-      jamp_sv[39] += amp_sv[0];
-      jamp_sv[42] += amp_sv[0];
-      jamp_sv[43] -= amp_sv[0];
-      jamp_sv[61] -= amp_sv[0];
-      jamp_sv[64] += amp_sv[0];
-      jamp_sv[85] += amp_sv[0];
-      jamp_sv[88] -= amp_sv[0];
-      jamp_sv[99] -= amp_sv[0];
-      jamp_sv[101] += amp_sv[0];
-      VVV1_0( w_fp[1], w_fp[8], w_fp[111], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[9] += amp_sv[0];
-      jamp_sv[15] -= amp_sv[0];
-      jamp_sv[24] -= amp_sv[0];
-      jamp_sv[26] += amp_sv[0];
-      jamp_sv[32] += amp_sv[0];
-      jamp_sv[38] -= amp_sv[0];
-      jamp_sv[42] += amp_sv[0];
-      jamp_sv[43] -= amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[47] += amp_sv[0];
-      jamp_sv[61] -= amp_sv[0];
-      jamp_sv[85] += amp_sv[0];
-      jamp_sv[99] -= amp_sv[0];
-      jamp_sv[101] += amp_sv[0];
-      jamp_sv[112] += amp_sv[0];
-      jamp_sv[118] -= amp_sv[0];
-
-      // *** DIAGRAM 1019 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 1019
-      // (none)
-
-      // Amplitude(s) for diagram number 1019
-      VVV1_0( w_fp[56], w_fp[68], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[1] += amp_sv[0];
-      jamp_sv[4] -= amp_sv[0];
-      jamp_sv[10] -= amp_sv[0];
-      jamp_sv[12] -= amp_sv[0];
-      jamp_sv[13] += amp_sv[0];
-      jamp_sv[15] += amp_sv[0];
-      jamp_sv[17] -= amp_sv[0];
-      jamp_sv[20] += amp_sv[0];
-      jamp_sv[35] -= amp_sv[0];
-      jamp_sv[45] += amp_sv[0];
-      jamp_sv[69] += amp_sv[0];
-      jamp_sv[81] += amp_sv[0];
-      jamp_sv[83] -= amp_sv[0];
-      jamp_sv[89] -= amp_sv[0];
-      jamp_sv[95] += amp_sv[0];
-      jamp_sv[111] -= amp_sv[0];
-
-      // *** DIAGRAM 1020 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 1020
-      // (none)
-
-      // Amplitude(s) for diagram number 1020
-      VVV1_0( w_fp[56], w_fp[1], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[1] += amp_sv[0];
-      jamp_sv[2] -= amp_sv[0];
-      jamp_sv[3] += amp_sv[0];
-      jamp_sv[4] -= amp_sv[0];
-      jamp_sv[11] -= amp_sv[0];
-      jamp_sv[15] += amp_sv[0];
-      jamp_sv[17] -= amp_sv[0];
-      jamp_sv[21] += amp_sv[0];
-      jamp_sv[35] -= amp_sv[0];
-      jamp_sv[39] += amp_sv[0];
-      jamp_sv[41] -= amp_sv[0];
-      jamp_sv[45] += amp_sv[0];
-      jamp_sv[71] += amp_sv[0];
-      jamp_sv[89] -= amp_sv[0];
-      jamp_sv[95] += amp_sv[0];
-      jamp_sv[113] -= amp_sv[0];
-
-      // *** DIAGRAM 1021 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 1021
-      // (none)
-
-      // Amplitude(s) for diagram number 1021
-      VVVV1_0( w_fp[1], w_fp[27], w_fp[5], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[1] += amp_sv[0];
-      jamp_sv[4] -= amp_sv[0];
-      jamp_sv[10] -= amp_sv[0];
-      jamp_sv[12] -= amp_sv[0];
-      jamp_sv[13] += amp_sv[0];
-      jamp_sv[15] += amp_sv[0];
-      jamp_sv[17] -= amp_sv[0];
-      jamp_sv[20] += amp_sv[0];
-      jamp_sv[35] -= amp_sv[0];
-      jamp_sv[45] += amp_sv[0];
-      jamp_sv[69] += amp_sv[0];
-      jamp_sv[81] += amp_sv[0];
-      jamp_sv[83] -= amp_sv[0];
-      jamp_sv[89] -= amp_sv[0];
-      jamp_sv[95] += amp_sv[0];
-      jamp_sv[111] -= amp_sv[0];
-      VVVV3_0( w_fp[1], w_fp[27], w_fp[5], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[2] += amp_sv[0];
-      jamp_sv[3] -= amp_sv[0];
-      jamp_sv[10] -= amp_sv[0];
-      jamp_sv[11] += amp_sv[0];
-      jamp_sv[12] -= amp_sv[0];
-      jamp_sv[13] += amp_sv[0];
-      jamp_sv[20] += amp_sv[0];
-      jamp_sv[21] -= amp_sv[0];
-      jamp_sv[39] -= amp_sv[0];
-      jamp_sv[41] += amp_sv[0];
-      jamp_sv[69] += amp_sv[0];
-      jamp_sv[71] -= amp_sv[0];
-      jamp_sv[81] += amp_sv[0];
-      jamp_sv[83] -= amp_sv[0];
-      jamp_sv[111] -= amp_sv[0];
-      jamp_sv[113] += amp_sv[0];
-      VVVV4_0( w_fp[1], w_fp[27], w_fp[5], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[1] -= amp_sv[0];
-      jamp_sv[2] += amp_sv[0];
-      jamp_sv[3] -= amp_sv[0];
-      jamp_sv[4] += amp_sv[0];
-      jamp_sv[11] += amp_sv[0];
-      jamp_sv[15] -= amp_sv[0];
-      jamp_sv[17] += amp_sv[0];
-      jamp_sv[21] -= amp_sv[0];
-      jamp_sv[35] += amp_sv[0];
-      jamp_sv[39] -= amp_sv[0];
-      jamp_sv[41] += amp_sv[0];
-      jamp_sv[45] -= amp_sv[0];
-      jamp_sv[71] -= amp_sv[0];
-      jamp_sv[89] += amp_sv[0];
-      jamp_sv[95] -= amp_sv[0];
-      jamp_sv[113] += amp_sv[0];
-
-      // *** DIAGRAM 1022 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 1022
-      // (none)
-
-      // Amplitude(s) for diagram number 1022
-      VVV1_0( w_fp[101], w_fp[108], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-      jamp_sv[11] += amp_sv[0];
-      jamp_sv[21] -= amp_sv[0];
-      jamp_sv[25] -= amp_sv[0];
-      jamp_sv[28] += amp_sv[0];
-      jamp_sv[34] += amp_sv[0];
-      jamp_sv[36] += amp_sv[0];
-      jamp_sv[37] -= amp_sv[0];
-      jamp_sv[39] -= amp_sv[0];
-      jamp_sv[41] += amp_sv[0];
-      jamp_sv[44] -= amp_sv[0];
-      jamp_sv[67] -= amp_sv[0];
-      jamp_sv[75] -= amp_sv[0];
-      jamp_sv[77] += amp_sv[0];
-      jamp_sv[88] += amp_sv[0];
-      jamp_sv[94] -= amp_sv[0];
-      jamp_sv[109] += amp_sv[0];
-
-      // *** DIAGRAM 1023 OF 1240 ***
-
-      // Wavefunction(s) for diagram number 1023
-      // (none)
-
-      // Amplitude(s)
for diagram number 1023 - VVV1_0( w_fp[101], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[10] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[78] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[83] += amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[108] += amp_sv[0]; - - // *** DIAGRAM 1024 OF 1240 *** - - // Wavefunction(s) for diagram number 1024 - // (none) - - // Amplitude(s) for diagram number 1024 - VVVV1_0( w_fp[1], w_fp[8], w_fp[5], w_fp[101], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[11] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - VVVV3_0( w_fp[1], w_fp[8], w_fp[5], w_fp[101], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[10] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[66] += amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[79] += amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - VVVV4_0( w_fp[1], w_fp[8], w_fp[5], w_fp[101], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[10] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[44] += amp_sv[0]; - jamp_sv[66] += amp_sv[0]; - jamp_sv[75] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[79] += amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - - // *** DIAGRAM 1025 OF 1240 *** - - // Wavefunction(s) for diagram number 1025 - // (none) - - // Amplitude(s) for diagram number 1025 - VVV1_0( w_fp[0], w_fp[108], w_fp[16], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[11] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[45] 
-= amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - - // *** DIAGRAM 1026 OF 1240 *** - - // Wavefunction(s) for diagram number 1026 - // (none) - - // Amplitude(s) for diagram number 1026 - VVV1_0( w_fp[0], w_fp[68], w_fp[10], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[44] += amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[75] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - - // *** DIAGRAM 1027 OF 1240 *** - - // Wavefunction(s) for diagram number 1027 - // (none) - - // Amplitude(s) for diagram number 1027 - VVV1_0( w_fp[27], w_fp[5], w_fp[11], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[11] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - VVV1_0( w_fp[27], w_fp[5], w_fp[42], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - VVV1_0( w_fp[27], w_fp[5], w_fp[76], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - - // *** DIAGRAM 1028 OF 1240 *** - - // Wavefunction(s) for diagram number 1028 - VVVV1P0_1( w_fp[0], w_fp[1], w_fp[27], COUPs[2], 1.0, 0., 0., w_fp[10] ); - VVVV3P0_1( w_fp[0], w_fp[1], w_fp[27], COUPs[2], 1.0, 0., 0., w_fp[16] ); - VVVV4P0_1( w_fp[0], w_fp[1], w_fp[27], COUPs[2], 1.0, 0., 0., w_fp[111] ); - - // Amplitude(s) for diagram number 1028 - VVV1_0( w_fp[8], w_fp[5], w_fp[10], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base 
generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[44] += amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[75] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[16], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[10] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[44] += amp_sv[0]; - jamp_sv[66] += amp_sv[0]; - jamp_sv[75] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[79] += amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[111], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[66] += amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[73] -= amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[79] += amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - - // *** DIAGRAM 1029 OF 1240 *** - - // Wavefunction(s) for diagram number 1029 - // (none) - - // Amplitude(s) for diagram number 1029 - VVV1_0( w_fp[1], w_fp[27], w_fp[115], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[10] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[73] -= amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[83] += amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[27], w_fp[116], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[12] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[73] -= amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - VVV1_0( w_fp[1], w_fp[27], w_fp[117], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and 
denominators_sv (#473) -#endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - - // *** DIAGRAM 1030 OF 1240 *** - - // Wavefunction(s) for diagram number 1030 - // (none) - - // Amplitude(s) for diagram number 1030 - VVV1_0( w_fp[1], w_fp[8], w_fp[9], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[11] -= amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[110], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[15] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[109], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[11] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[36] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[109] += amp_sv[0]; - - // *** DIAGRAM 1031 OF 1240 *** - - // Wavefunction(s) for diagram number 1031 - // (none) - - // Amplitude(s) for diagram number 1031 - VVV1_0( w_fp[56], w_fp[67], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - - // *** DIAGRAM 1032 OF 1240 *** - - // Wavefunction(s) for diagram number 1032 - // (none) - - // Amplitude(s) for diagram 
number 1032 - VVV1_0( w_fp[56], w_fp[1], w_fp[19], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - jamp_sv[9] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[47] -= amp_sv[0]; - jamp_sv[65] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - - // *** DIAGRAM 1033 OF 1240 *** - - // Wavefunction(s) for diagram number 1033 - // (none) - - // Amplitude(s) for diagram number 1033 - VVVV1_0( w_fp[1], w_fp[4], w_fp[29], w_fp[56], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVVV3_0( w_fp[1], w_fp[4], w_fp[29], w_fp[56], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - VVVV4_0( w_fp[1], w_fp[4], w_fp[29], w_fp[56], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= amp_sv[0]; - jamp_sv[1] += amp_sv[0]; - jamp_sv[3] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - - // *** DIAGRAM 1034 OF 1240 *** - - // Wavefunction(s) for diagram number 1034 - // (none) - - // Amplitude(s) for diagram number 1034 - VVV1_0( w_fp[96], w_fp[108], w_fp[4], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[17] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[64] += 
amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - - // *** DIAGRAM 1035 OF 1240 *** - - // Wavefunction(s) for diagram number 1035 - // (none) - - // Amplitude(s) for diagram number 1035 - VVV1_0( w_fp[96], w_fp[1], w_fp[13], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] += amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[54] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[59] += amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[114] += amp_sv[0]; - - // *** DIAGRAM 1036 OF 1240 *** - - // Wavefunction(s) for diagram number 1036 - // (none) - - // Amplitude(s) for diagram number 1036 - VVVV1_0( w_fp[1], w_fp[8], w_fp[4], w_fp[96], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[17] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - VVVV3_0( w_fp[1], w_fp[8], w_fp[4], w_fp[96], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[55] += amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[90] += amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - VVVV4_0( w_fp[1], w_fp[8], w_fp[4], w_fp[96], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[46] += amp_sv[0]; - jamp_sv[51] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[55] += amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[90] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - - // *** DIAGRAM 1037 OF 1240 *** - - // Wavefunction(s) for diagram number 1037 - // (none) - - // Amplitude(s) for diagram number 1037 - VVV1_0( w_fp[0], w_fp[108], w_fp[19], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - 
jamp_sv[24] -= amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - - // *** DIAGRAM 1038 OF 1240 *** - - // Wavefunction(s) for diagram number 1038 - // (none) - - // Amplitude(s) for diagram number 1038 - VVV1_0( w_fp[0], w_fp[67], w_fp[13], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[46] += amp_sv[0]; - jamp_sv[48] -= amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[51] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[116] -= amp_sv[0]; - - // *** DIAGRAM 1039 OF 1240 *** - - // Wavefunction(s) for diagram number 1039 - // (none) - - // Amplitude(s) for diagram number 1039 - VVV1_0( w_fp[4], w_fp[29], w_fp[11], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - VVV1_0( w_fp[4], w_fp[29], w_fp[42], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= amp_sv[0]; - jamp_sv[1] += amp_sv[0]; - jamp_sv[3] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - VVV1_0( w_fp[4], w_fp[29], w_fp[76], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= amp_sv[0]; - jamp_sv[1] += amp_sv[0]; - jamp_sv[3] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - - // *** DIAGRAM 1040 OF 1240 *** - - // Wavefunction(s) for diagram number 1040 - VVVV1P0_1( w_fp[0], w_fp[1], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[76] ); - VVVV3P0_1( w_fp[0], w_fp[1], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[42] ); - VVVV4P0_1( w_fp[0], w_fp[1], w_fp[29], 
COUPs[2], 1.0, 0., 0., w_fp[11] ); - - // Amplitude(s) for diagram number 1040 - VVV1_0( w_fp[8], w_fp[4], w_fp[76], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[46] += amp_sv[0]; - jamp_sv[48] -= amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[51] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[116] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[42], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[46] += amp_sv[0]; - jamp_sv[51] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[55] += amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[90] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[11], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[49] -= amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[55] += amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[65] += amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[90] += amp_sv[0]; - jamp_sv[92] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - - // *** DIAGRAM 1041 OF 1240 *** - - // Wavefunction(s) for diagram number 1041 - // (none) - - // Amplitude(s) for diagram number 1041 - VVV1_0( w_fp[1], w_fp[29], w_fp[107], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - jamp_sv[16] += amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[49] -= amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[59] += amp_sv[0]; - jamp_sv[65] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[92] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[29], w_fp[95], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[6] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[49] -= amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[92] -= amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; 
- VVV1_0( w_fp[1], w_fp[29], w_fp[105], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - - // *** DIAGRAM 1042 OF 1240 *** - - // Wavefunction(s) for diagram number 1042 - // (none) - - // Amplitude(s) for diagram number 1042 - VVV1_0( w_fp[1], w_fp[8], w_fp[87], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[47] -= amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[34], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[17] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[86], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - - // *** DIAGRAM 1043 OF 1240 *** - - // Wavefunction(s) for diagram number 1043 - // (none) - - // Amplitude(s) for diagram number 1043 - VVVV1_0( w_fp[0], w_fp[1], w_fp[8], w_fp[30], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[65] += amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[95] -= 
amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[1], w_fp[8], w_fp[30], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - jamp_sv[9] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[47] -= amp_sv[0]; - jamp_sv[65] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[1], w_fp[8], w_fp[30], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[47] -= amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - VVVV1_0( w_fp[0], w_fp[1], w_fp[8], w_fp[31], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[25] += amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[94] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[1], w_fp[8], w_fp[31], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[1], w_fp[8], w_fp[31], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[11] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - VVVV1_0( w_fp[0], w_fp[1], w_fp[8], w_fp[32], COUPs[2], 1.0, &_fp[0] ); -#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[26] -= amp_sv[0]; - jamp_sv[28] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[88] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[112] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[1], w_fp[8], w_fp[32], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[1], w_fp[8], w_fp[32], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - - // *** DIAGRAM 1044 OF 1240 *** - - // Wavefunction(s) for diagram number 1044 - // (none) - - // Amplitude(s) for diagram number 1044 - VVV1_0( w_fp[1], w_fp[30], w_fp[56], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - jamp_sv[9] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[47] -= amp_sv[0]; - jamp_sv[65] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[31], w_fp[56], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[32], w_fp[56], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated 
with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - - // *** DIAGRAM 1045 OF 1240 *** - - // Wavefunction(s) for diagram number 1045 - // (none) - - // Amplitude(s) for diagram number 1045 - VVV1_0( w_fp[1], w_fp[8], w_fp[92], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[24] += amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[47] -= amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[118] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[88], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[11] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[25] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[94] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[106], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] += amp_sv[0]; - jamp_sv[15] -= amp_sv[0]; - jamp_sv[21] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[24] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[39] -= amp_sv[0]; - jamp_sv[45] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[118] -= amp_sv[0]; - - // *** DIAGRAM 1046 OF 1240 *** - - // Wavefunction(s) for diagram number 1046 - // (none) - - // Amplitude(s) for diagram number 1046 - FFV1_0( w_fp[58], w_fp[114], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[52] -= amp_sv[0]; - - // *** DIAGRAM 1047 OF 1240 *** - - // Wavefunction(s) for diagram number 1047 - // (none) - - // Amplitude(s) for diagram number 1047 - FFV1_0( w_fp[48], w_fp[114], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[49] -= amp_sv[0]; - - // *** DIAGRAM 1048 OF 1240 *** - - // Wavefunction(s) for diagram number 1048 - // (none) - - // 
Amplitude(s) for diagram number 1048 - FFV1_0( w_fp[104], w_fp[100], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[58] -= amp_sv[0]; - - // *** DIAGRAM 1049 OF 1240 *** - - // Wavefunction(s) for diagram number 1049 - // (none) - - // Amplitude(s) for diagram number 1049 - FFV1_0( w_fp[104], w_fp[36], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[68] -= amp_sv[0]; - - // *** DIAGRAM 1050 OF 1240 *** - - // Wavefunction(s) for diagram number 1050 - // (none) - - // Amplitude(s) for diagram number 1050 - FFV1_0( w_fp[48], w_fp[100], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[55] -= amp_sv[0]; - - // *** DIAGRAM 1051 OF 1240 *** - - // Wavefunction(s) for diagram number 1051 - // (none) - - // Amplitude(s) for diagram number 1051 - FFV1_0( w_fp[58], w_fp[36], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[66] -= amp_sv[0]; - - // *** DIAGRAM 1052 OF 1240 *** - - // Wavefunction(s) for diagram number 1052 - // (none) - - // Amplitude(s) for diagram number 1052 - FFV1_0( w_fp[60], w_fp[114], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[50] -= amp_sv[0]; - - // *** DIAGRAM 1053 OF 1240 *** - - // Wavefunction(s) for diagram number 1053 - // (none) - - // Amplitude(s) for diagram number 1053 - FFV1_0( w_fp[40], w_fp[114], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[48] -= amp_sv[0]; - - // *** DIAGRAM 1054 OF 1240 *** - - // Wavefunction(s) for diagram number 1054 - // (none) - - // Amplitude(s) for diagram number 1054 - FFV1_0( w_fp[62], w_fp[100], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[56] -= amp_sv[0]; - - // *** DIAGRAM 1055 OF 1240 *** - - // Wavefunction(s) for diagram number 1055 - // (none) - - // Amplitude(s) for diagram number 1055 - FFV1_0( w_fp[62], w_fp[35], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[62] -= amp_sv[0]; - - // *** DIAGRAM 1056 OF 1240 *** - - // Wavefunction(s) for diagram number 1056 - // (none) - - // Amplitude(s) for diagram number 1056 - FFV1_0( w_fp[40], w_fp[100], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[54] -= amp_sv[0]; - - // *** DIAGRAM 1057 OF 1240 *** - - // Wavefunction(s) for diagram number 1057 - // (none) - - // Amplitude(s) for diagram number 1057 - FFV1_0( w_fp[60], 
w_fp[35], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[60] -= amp_sv[0]; - - // *** DIAGRAM 1058 OF 1240 *** - - // Wavefunction(s) for diagram number 1058 - // (none) - - // Amplitude(s) for diagram number 1058 - FFV1_0( w_fp[3], w_fp[114], w_fp[67], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[48] += amp_sv[0]; - jamp_sv[49] -= amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - - // *** DIAGRAM 1059 OF 1240 *** - - // Wavefunction(s) for diagram number 1059 - // (none) - - // Amplitude(s) for diagram number 1059 - FFV1_0( w_fp[12], w_fp[114], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1060 OF 1240 *** - - // Wavefunction(s) for diagram number 1060 - // (none) - - // Amplitude(s) for diagram number 1060 - FFV1_0( w_fp[3], w_fp[100], w_fp[96], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[54] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[59] += amp_sv[0]; - - // *** DIAGRAM 1061 OF 1240 *** - - // Wavefunction(s) for diagram number 1061 - // (none) - - // Amplitude(s) for diagram number 1061 - VVV1_0( w_fp[96], w_fp[1], w_fp[37], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1062 OF 1240 *** - - // Wavefunction(s) for diagram number 1062 - // (none) - - // Amplitude(s) for diagram number 1062 - FFV1_0( w_fp[12], w_fp[100], w_fp[0], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1063 OF 1240 *** - - // Wavefunction(s) for diagram number 1063 - // (none) - - // Amplitude(s) for diagram number 1063 - VVV1_0( w_fp[0], w_fp[67], w_fp[37], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1064 OF 1240 *** - - // 
Wavefunction(s) for diagram number 1064
-    // (none)
-
-    // Amplitude(s) for diagram number 1064
-    FFV1_0( w_fp[3], w_fp[33], w_fp[76], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[33], w_fp[42], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[33], w_fp[11], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1065 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1065
-    // (none)
-
-    // Amplitude(s) for diagram number 1065
-    FFV1_0( w_fp[78], w_fp[102], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[76] -= amp_sv[0];
-
-    // *** DIAGRAM 1066 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1066
-    // (none)
-
-    // Amplitude(s) for diagram number 1066
-    FFV1_0( w_fp[53], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[73] -= amp_sv[0];
-
-    // *** DIAGRAM 1067 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1067
-    // (none)
-
-    // Amplitude(s) for diagram number 1067
-    FFV1_0( w_fp[99], w_fp[89], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[82] -= amp_sv[0];
-
-    // *** DIAGRAM 1068 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1068
-    // (none)
-
-    // Amplitude(s) for diagram number 1068
-    FFV1_0( w_fp[99], w_fp[44], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[92] -= amp_sv[0];
-
-    // *** DIAGRAM 1069 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1069
-    // (none)
-
-    // Amplitude(s) for diagram number 1069
-    FFV1_0( w_fp[53], w_fp[89], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[79] -= amp_sv[0];
-
-    // *** DIAGRAM 1070 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1070
-    // (none)
-
-    // Amplitude(s) for diagram number 1070
-    FFV1_0( w_fp[78], w_fp[44], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[90] -= amp_sv[0];
-
-    // *** DIAGRAM 1071 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1071
-    // (none)
-
-    // Amplitude(s) for diagram number 1071
-    FFV1_0( w_fp[60], w_fp[102], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[74] -= amp_sv[0];
-
-    // *** DIAGRAM 1072 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1072
-    // (none)
-
-    // Amplitude(s) for diagram number 1072
-    FFV1_0( w_fp[28], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[72] -= amp_sv[0];
-
-    // *** DIAGRAM 1073 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1073
-    // (none)
-
-    // Amplitude(s) for diagram number 1073
-    FFV1_0( w_fp[62], w_fp[89], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[80] -= amp_sv[0];
-
-    // *** DIAGRAM 1074 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1074
-    // (none)
-
-    // Amplitude(s) for diagram number 1074
-    FFV1_0( w_fp[62], w_fp[43], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[86] -= amp_sv[0];
-
-    // *** DIAGRAM 1075 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1075
-    // (none)
-
-    // Amplitude(s) for diagram number 1075
-    FFV1_0( w_fp[28], w_fp[89], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[78] -= amp_sv[0];
-
-    // *** DIAGRAM 1076 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1076
-    // (none)
-
-    // Amplitude(s) for diagram number 1076
-    FFV1_0( w_fp[60], w_fp[43], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[84] -= amp_sv[0];
-
-    // *** DIAGRAM 1077 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1077
-    // (none)
-
-    // Amplitude(s) for diagram number 1077
-    FFV1_0( w_fp[3], w_fp[102], w_fp[68], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[72] += amp_sv[0];
-    jamp_sv[73] -= amp_sv[0];
-    jamp_sv[75] -= amp_sv[0];
-    jamp_sv[77] += amp_sv[0];
-
-    // *** DIAGRAM 1078 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1078
-    // (none)
-
-    // Amplitude(s) for diagram number 1078
-    FFV1_0( w_fp[14], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1079 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1079
-    // (none)
-
-    // Amplitude(s) for diagram number 1079
-    FFV1_0( w_fp[3], w_fp[89], w_fp[101], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[78] += amp_sv[0];
-    jamp_sv[79] -= amp_sv[0];
-    jamp_sv[81] -= amp_sv[0];
-    jamp_sv[83] += amp_sv[0];
-
-    // *** DIAGRAM 1080 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1080
-    // (none)
-
-    // Amplitude(s) for diagram number 1080
-    VVV1_0( w_fp[101], w_fp[1], w_fp[45], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1081 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1081
-    // (none)
-
-    // Amplitude(s) for diagram number 1081
-    FFV1_0( w_fp[14], w_fp[89], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1082 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1082
-    // (none)
-
-    // Amplitude(s) for diagram number 1082
-    VVV1_0( w_fp[0], w_fp[68], w_fp[45], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1083 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1083
-    // (none)
-
-    // Amplitude(s) for diagram number 1083
-    FFV1_0( w_fp[3], w_fp[39], w_fp[10], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[39], w_fp[16], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[39], w_fp[111], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1084 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1084
-    // (none)
-
-    // Amplitude(s) for diagram number 1084
-    FFV1_0( w_fp[78], w_fp[113], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[100] -= amp_sv[0];
-
-    // *** DIAGRAM 1085 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1085
-    // (none)
-
-    // Amplitude(s) for diagram number 1085
-    FFV1_0( w_fp[7], w_fp[113], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[97] -= amp_sv[0];
-
-    // *** DIAGRAM 1086 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1086
-    // (none)
-
-    // Amplitude(s) for diagram number 1086
-    FFV1_0( w_fp[99], w_fp[91], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[106] -= amp_sv[0];
-
-    // *** DIAGRAM 1087 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1087
-    // (none)
-
-    // Amplitude(s) for diagram number 1087
-    FFV1_0( w_fp[99], w_fp[50], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[116] -= amp_sv[0];
-
-    // *** DIAGRAM 1088 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1088
-    // (none)
-
-    // Amplitude(s) for diagram number 1088
-    FFV1_0( w_fp[7], w_fp[91], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[103] -= amp_sv[0];
-
-    // *** DIAGRAM 1089 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1089
-    // (none)
-
-    // Amplitude(s) for diagram number 1089
-    FFV1_0( w_fp[78], w_fp[50], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[114] -= amp_sv[0];
-
-    // *** DIAGRAM 1090 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1090
-    // (none)
-
-    // Amplitude(s) for diagram number 1090
-    FFV1_0( w_fp[58], w_fp[113], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[98] -= amp_sv[0];
-
-    // *** DIAGRAM 1091 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1091
-    // (none)
-
-    // Amplitude(s) for diagram number 1091
-    FFV1_0( w_fp[25], w_fp[113], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[96] -= amp_sv[0];
-
-    // *** DIAGRAM 1092 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1092
-    // (none)
-
-    // Amplitude(s) for diagram number 1092
-    FFV1_0( w_fp[104], w_fp[91], w_fp[4], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[104] -= amp_sv[0];
-
-    // *** DIAGRAM 1093 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1093
-    // (none)
-
-    // Amplitude(s) for diagram number 1093
-    FFV1_0( w_fp[104], w_fp[49], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[110] -= amp_sv[0];
-
-    // *** DIAGRAM 1094 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1094
-    // (none)
-
-    // Amplitude(s) for diagram number 1094
-    FFV1_0( w_fp[25], w_fp[91], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[102] -= amp_sv[0];
-
-    // *** DIAGRAM 1095 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1095
-    // (none)
-
-    // Amplitude(s) for diagram number 1095
-    FFV1_0( w_fp[58], w_fp[49], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[108] -= amp_sv[0];
-
-    // *** DIAGRAM 1096 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1096
-    // (none)
-
-    // Amplitude(s) for diagram number 1096
-    FFV1_0( w_fp[3], w_fp[113], w_fp[59], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[96] += amp_sv[0];
-    jamp_sv[97] -= amp_sv[0];
-    jamp_sv[99] -= amp_sv[0];
-    jamp_sv[101] += amp_sv[0];
-
-    // *** DIAGRAM 1097 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1097
-    // (none)
-
-    // Amplitude(s) for diagram number 1097
-    FFV1_0( w_fp[26], w_fp[113], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1098 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1098
-    // (none)
-
-    // Amplitude(s) for diagram number 1098
-    FFV1_0( w_fp[3], w_fp[91], w_fp[98], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[102] += amp_sv[0];
-    jamp_sv[103] -= amp_sv[0];
-    jamp_sv[105] -= amp_sv[0];
-    jamp_sv[107] += amp_sv[0];
-
-    // *** DIAGRAM 1099 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1099
-    // (none)
-
-    // Amplitude(s) for diagram number 1099
-    VVV1_0( w_fp[98], w_fp[1], w_fp[51], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1100 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1100
-    // (none)
-
-    // Amplitude(s) for diagram number 1100
-    FFV1_0( w_fp[26], w_fp[91], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1101 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1101
-    // (none)
-
-    // Amplitude(s) for diagram number 1101
-    VVV1_0( w_fp[0], w_fp[59], w_fp[51], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1102 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1102
-    // (none)
-
-    // Amplitude(s) for diagram number 1102
-    FFV1_0( w_fp[3], w_fp[47], w_fp[97], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[47], w_fp[71], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[47], w_fp[21], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1103 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1103
-    // (none)
-
-    // Amplitude(s) for diagram number 1103
-    FFV1_0( w_fp[99], w_fp[2], w_fp[67], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[40] += amp_sv[0];
-    jamp_sv[46] -= amp_sv[0];
-    jamp_sv[92] -= amp_sv[0];
-    jamp_sv[116] += amp_sv[0];
-
-    // *** DIAGRAM 1104 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1104
-    // (none)
-
-    // Amplitude(s) for diagram number 1104
-    FFV1_0( w_fp[99], w_fp[18], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1105 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1105
-    // (none)
-
-    // Amplitude(s) for diagram number 1105
-    FFV1_0( w_fp[78], w_fp[2], w_fp[96], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[16] += amp_sv[0];
-    jamp_sv[22] -= amp_sv[0];
-    jamp_sv[90] -= amp_sv[0];
-    jamp_sv[114] += amp_sv[0];
-
-    // *** DIAGRAM 1106 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1106
-    // (none)
-
-    // Amplitude(s) for diagram number 1106
-    VVV1_0( w_fp[96], w_fp[1], w_fp[54], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1107 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1107
-    // (none)
-
-    // Amplitude(s) for diagram number 1107
-    FFV1_0( w_fp[78], w_fp[18], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1108 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1108
-    // (none)
-
-    // Amplitude(s) for diagram number 1108
-    VVV1_0( w_fp[0], w_fp[67], w_fp[54], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1109 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1109
-    // (none)
-
-    // Amplitude(s) for diagram number 1109
-    FFV1_0( w_fp[46], w_fp[2], w_fp[76], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[46], w_fp[2], w_fp[42], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[46], w_fp[2], w_fp[11], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1110 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1110
-    // (none)
-
-    // Amplitude(s) for diagram number 1110
-    FFV1_0( w_fp[104], w_fp[2], w_fp[68], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[34] += amp_sv[0];
-    jamp_sv[44] -= amp_sv[0];
-    jamp_sv[68] -= amp_sv[0];
-    jamp_sv[110] += amp_sv[0];
-
-    // *** DIAGRAM 1111 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1111
-    // (none)
-
-    // Amplitude(s) for diagram number 1111
-    FFV1_0( w_fp[104], w_fp[15], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1112 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1112
-    // (none)
-
-    // Amplitude(s) for diagram number 1112
-    FFV1_0( w_fp[58], w_fp[2], w_fp[101], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[10] += amp_sv[0];
-    jamp_sv[20] -= amp_sv[0];
-    jamp_sv[66] -= amp_sv[0];
-    jamp_sv[108] += amp_sv[0];
-
-    // *** DIAGRAM 1113 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1113
-    // (none)
-
-    // Amplitude(s) for diagram number 1113
-    VVV1_0( w_fp[101], w_fp[1], w_fp[23], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1114 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1114
-    // (none)
-
-    // Amplitude(s) for diagram number 1114
-    FFV1_0( w_fp[58], w_fp[15], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1115 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1115
-    // (none)
-
-    // Amplitude(s) for diagram number 1115
-    VVV1_0( w_fp[0], w_fp[68], w_fp[23], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1116 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1116
-    // (none)
-
-    // Amplitude(s) for diagram number 1116
-    FFV1_0( w_fp[38], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[38], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[38], w_fp[2], w_fp[111], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1117 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1117
-    // (none)
-
-    // Amplitude(s) for diagram number 1117
-    FFV1_0( w_fp[62], w_fp[2], w_fp[59], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[32] += amp_sv[0];
-    jamp_sv[38] -= amp_sv[0];
-    jamp_sv[62] -= amp_sv[0];
-    jamp_sv[86] += amp_sv[0];
-
-    // *** DIAGRAM 1118 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1118
-    // (none)
-
-    // Amplitude(s) for diagram number 1118
-    FFV1_0( w_fp[62], w_fp[17], w_fp[1], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1119 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1119
-    // (none)
-
-    // Amplitude(s) for diagram number 1119
-    FFV1_0( w_fp[60], w_fp[2], w_fp[98], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[8] += amp_sv[0];
-    jamp_sv[14] -= amp_sv[0];
-    jamp_sv[60] -= amp_sv[0];
-    jamp_sv[84] += amp_sv[0];
-
-    // *** DIAGRAM 1120 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1120
-    // (none)
-
-    // Amplitude(s) for diagram number 1120
-    VVV1_0( w_fp[98], w_fp[1], w_fp[20], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1121 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1121
-    // (none)
-
-    // Amplitude(s) for diagram number 1121
-    FFV1_0( w_fp[60], w_fp[17], w_fp[0], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1122 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1122
-    // (none)
-
-    // Amplitude(s) for diagram number 1122
-    VVV1_0( w_fp[0], w_fp[59], w_fp[20], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1123 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1123
-    // (none)
-
-    // Amplitude(s) for diagram number 1123
-    FFV1_0( w_fp[41], w_fp[2], w_fp[97], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[41], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[41], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1124 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1124
-    VVVV1P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[21] );
-    VVVV3P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[71] );
-    VVVV4P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[97] );
-
-    // Amplitude(s) for diagram number 1124
-    VVVV1_0( w_fp[21], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += amp_sv[0];
-    jamp_sv[1] -= amp_sv[0];
-    jamp_sv[6] -= amp_sv[0];
-    jamp_sv[7] += amp_sv[0];
-    jamp_sv[30] -= amp_sv[0];
-    jamp_sv[31] += amp_sv[0];
-    jamp_sv[54] += amp_sv[0];
-    jamp_sv[55] -= amp_sv[0];
-    jamp_sv[90] -= amp_sv[0];
-    jamp_sv[91] += amp_sv[0];
-    jamp_sv[93] += amp_sv[0];
-    jamp_sv[95] -= amp_sv[0];
-    jamp_sv[114] += amp_sv[0];
-    jamp_sv[115] -= amp_sv[0];
-    jamp_sv[117] -= amp_sv[0];
-    jamp_sv[119] += amp_sv[0];
-    VVVV3_0( w_fp[21], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += amp_sv[0];
-    jamp_sv[6] -= amp_sv[0];
-    jamp_sv[30] -= amp_sv[0];
-    jamp_sv[54] += amp_sv[0];
-    jamp_sv[72] -= amp_sv[0];
-    jamp_sv[74] += amp_sv[0];
-    jamp_sv[80] += amp_sv[0];
-    jamp_sv[86] -= amp_sv[0];
-    jamp_sv[96] -= amp_sv[0];
-    jamp_sv[98] += amp_sv[0];
-    jamp_sv[104] += amp_sv[0];
-    jamp_sv[110] -= amp_sv[0];
-    jamp_sv[114] += amp_sv[0];
-    jamp_sv[115] -= amp_sv[0];
-    jamp_sv[117] -= amp_sv[0];
-    jamp_sv[119] += amp_sv[0];
-    VVVV4_0( w_fp[21], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] += amp_sv[0];
-    jamp_sv[7] -= amp_sv[0];
-    jamp_sv[31] -= amp_sv[0];
-    jamp_sv[55] += amp_sv[0];
-    jamp_sv[72] -= amp_sv[0];
-    jamp_sv[74] += amp_sv[0];
-    jamp_sv[80] += amp_sv[0];
-    jamp_sv[86] -= amp_sv[0];
-    jamp_sv[90] += amp_sv[0];
-    jamp_sv[91] -= amp_sv[0];
-    jamp_sv[93] -= amp_sv[0];
-    jamp_sv[95] += amp_sv[0];
-    jamp_sv[96] -= amp_sv[0];
-    jamp_sv[98] += amp_sv[0];
-    jamp_sv[104] += amp_sv[0];
-    jamp_sv[110] -= amp_sv[0];
-    VVVV1_0( w_fp[71], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[6] -= amp_sv[0];
-    jamp_sv[7] += amp_sv[0];
-    jamp_sv[24] += amp_sv[0];
-    jamp_sv[25] -= amp_sv[0];
-    jamp_sv[30] -= amp_sv[0];
-    jamp_sv[31] += amp_sv[0];
-    jamp_sv[48] += amp_sv[0];
-    jamp_sv[49] -= amp_sv[0];
-    jamp_sv[91] += amp_sv[0];
-    jamp_sv[92] -= amp_sv[0];
-    jamp_sv[93] += amp_sv[0];
-    jamp_sv[94] -= amp_sv[0];
-    jamp_sv[115] -= amp_sv[0];
-    jamp_sv[116] += amp_sv[0];
-    jamp_sv[117] -= amp_sv[0];
-    jamp_sv[118] += amp_sv[0];
-    VVVV3_0( w_fp[71], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[6] -= amp_sv[0];
-    jamp_sv[24] += amp_sv[0];
-    jamp_sv[30] -= amp_sv[0];
-    jamp_sv[48] += amp_sv[0];
-    jamp_sv[74] += amp_sv[0];
-    jamp_sv[78] -= amp_sv[0];
-    jamp_sv[80] += amp_sv[0];
-    jamp_sv[84] -= amp_sv[0];
-    jamp_sv[98] += amp_sv[0];
-    jamp_sv[102] -= amp_sv[0];
-    jamp_sv[104] += amp_sv[0];
-    jamp_sv[108] -= amp_sv[0];
-    jamp_sv[115] -= amp_sv[0];
-    jamp_sv[116] += amp_sv[0];
-    jamp_sv[117] -= amp_sv[0];
-    jamp_sv[118] += amp_sv[0];
-    VVVV4_0( w_fp[71], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[7] -= amp_sv[0];
-    jamp_sv[25] += amp_sv[0];
-    jamp_sv[31] -= amp_sv[0];
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[74] += amp_sv[0];
-    jamp_sv[78] -= amp_sv[0];
-    jamp_sv[80] += amp_sv[0];
-    jamp_sv[84] -= amp_sv[0];
-    jamp_sv[91] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-    jamp_sv[93] -= amp_sv[0];
-    jamp_sv[94] += amp_sv[0];
-    jamp_sv[98] += amp_sv[0];
-    jamp_sv[102] -= amp_sv[0];
-    jamp_sv[104] += amp_sv[0];
-    jamp_sv[108] -= amp_sv[0];
-    VVVV1_0( w_fp[97], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] -= amp_sv[0];
-    jamp_sv[1] += amp_sv[0];
-    jamp_sv[24] += amp_sv[0];
-    jamp_sv[25] -= amp_sv[0];
-    jamp_sv[48] += amp_sv[0];
-    jamp_sv[49] -= amp_sv[0];
-    jamp_sv[54] -= amp_sv[0];
-    jamp_sv[55] += amp_sv[0];
-    jamp_sv[90] += amp_sv[0];
-    jamp_sv[92] -= amp_sv[0];
-    jamp_sv[94] -= amp_sv[0];
-    jamp_sv[95] += amp_sv[0];
-    jamp_sv[114] -= amp_sv[0];
-    jamp_sv[116] += amp_sv[0];
-    jamp_sv[118] += amp_sv[0];
-    jamp_sv[119] -= amp_sv[0];
-    VVVV3_0( w_fp[97], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] -= amp_sv[0];
-    jamp_sv[24] += amp_sv[0];
-    jamp_sv[48] += amp_sv[0];
-    jamp_sv[54] -= amp_sv[0];
-    jamp_sv[72] += amp_sv[0];
-    jamp_sv[78] -= amp_sv[0];
-    jamp_sv[84] -= amp_sv[0];
-    jamp_sv[86] += amp_sv[0];
-    jamp_sv[96] += amp_sv[0];
-    jamp_sv[102] -= amp_sv[0];
-    jamp_sv[108] -= amp_sv[0];
-    jamp_sv[110] += amp_sv[0];
-    jamp_sv[114] -= amp_sv[0];
-    jamp_sv[116] += amp_sv[0];
-    jamp_sv[118] += amp_sv[0];
-    jamp_sv[119] -= amp_sv[0];
-    VVVV4_0( w_fp[97], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] -= amp_sv[0];
-    jamp_sv[25] += amp_sv[0];
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[55] -= amp_sv[0];
-    jamp_sv[72] += amp_sv[0];
-    jamp_sv[78] -= amp_sv[0];
-    jamp_sv[84] -= amp_sv[0];
-    jamp_sv[86] += amp_sv[0];
-    jamp_sv[90] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-    jamp_sv[94] += amp_sv[0];
-    jamp_sv[95] -= amp_sv[0];
-    jamp_sv[96] += amp_sv[0];
-    jamp_sv[102] -= amp_sv[0];
-    jamp_sv[108] -= amp_sv[0];
-    jamp_sv[110] += amp_sv[0];
-
-    // *** DIAGRAM 1125 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1125
-    VVV1P0_1( w_fp[21], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[59] );
-    VVV1P0_1( w_fp[71], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[20] );
-    VVV1P0_1( w_fp[97], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[60] );
-
-    // Amplitude(s) for diagram number 1125
-    VVV1_0( w_fp[8], w_fp[6], w_fp[59], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += amp_sv[0];
-    jamp_sv[6] -= amp_sv[0];
-    jamp_sv[30] -= amp_sv[0];
-    jamp_sv[54] += amp_sv[0];
-    jamp_sv[72] -= amp_sv[0];
-    jamp_sv[74] += amp_sv[0];
-    jamp_sv[80] += amp_sv[0];
-    jamp_sv[86] -= amp_sv[0];
-    jamp_sv[96] -= amp_sv[0];
-    jamp_sv[98] += amp_sv[0];
-    jamp_sv[104] += amp_sv[0];
-    jamp_sv[110] -= amp_sv[0];
-    jamp_sv[114] += amp_sv[0];
-    jamp_sv[115] -= amp_sv[0];
-    jamp_sv[117] -= amp_sv[0];
-    jamp_sv[119] += amp_sv[0];
-    VVV1_0( w_fp[8], w_fp[6], w_fp[20], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[6] -= amp_sv[0];
-    jamp_sv[24] += amp_sv[0];
-    jamp_sv[30] -= amp_sv[0];
-    jamp_sv[48] += amp_sv[0];
-    jamp_sv[74] += amp_sv[0];
-    jamp_sv[78] -= amp_sv[0];
-    jamp_sv[80] += amp_sv[0];
-    jamp_sv[84] -= amp_sv[0];
-    jamp_sv[98] += amp_sv[0];
-    jamp_sv[102] -= amp_sv[0];
-    jamp_sv[104] += amp_sv[0];
-    jamp_sv[108] -= amp_sv[0];
-    jamp_sv[115] -= amp_sv[0];
-    jamp_sv[116] += amp_sv[0];
-    jamp_sv[117] -= amp_sv[0];
-    jamp_sv[118] += amp_sv[0];
-    VVV1_0( w_fp[8], w_fp[6], w_fp[60], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] -= amp_sv[0];
-    jamp_sv[24] += amp_sv[0];
-    jamp_sv[48] += amp_sv[0];
-    jamp_sv[54] -= amp_sv[0];
-    jamp_sv[72] += amp_sv[0];
-    jamp_sv[78] -= amp_sv[0];
-    jamp_sv[84] -= amp_sv[0];
-    jamp_sv[86] += amp_sv[0];
-    jamp_sv[96] += amp_sv[0];
-    jamp_sv[102] -= amp_sv[0];
-    jamp_sv[108] -= amp_sv[0];
-    jamp_sv[110] += amp_sv[0];
-    jamp_sv[114] -= amp_sv[0];
-    jamp_sv[116] += amp_sv[0];
-    jamp_sv[118] += amp_sv[0];
-    jamp_sv[119] -= amp_sv[0];
-
-    // *** DIAGRAM 1126 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1126
-    VVV1P0_1( w_fp[21], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[17] );
-    VVV1P0_1( w_fp[71], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[98] );
-    VVV1P0_1( w_fp[97], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[111] );
-
-    // Amplitude(s) for diagram number 1126
-    VVV1_0( w_fp[8], w_fp[5], w_fp[17], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] += amp_sv[0];
-    jamp_sv[7] -= amp_sv[0];
-    jamp_sv[31] -= amp_sv[0];
-    jamp_sv[55] += amp_sv[0];
-    jamp_sv[72] -= amp_sv[0];
-    jamp_sv[74] += amp_sv[0];
-    jamp_sv[80] += amp_sv[0];
-    jamp_sv[86] -= amp_sv[0];
-    jamp_sv[90] += amp_sv[0];
-    jamp_sv[91] -= amp_sv[0];
-    jamp_sv[93] -= amp_sv[0];
-    jamp_sv[95] += amp_sv[0];
-    jamp_sv[96] -= amp_sv[0];
-    jamp_sv[98] += amp_sv[0];
-    jamp_sv[104] += amp_sv[0];
-    jamp_sv[110] -= amp_sv[0];
-    VVV1_0( w_fp[8], w_fp[5], w_fp[98], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[7] -= amp_sv[0];
-    jamp_sv[25] += amp_sv[0];
-    jamp_sv[31] -= amp_sv[0];
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[74] += amp_sv[0];
-    jamp_sv[78] -= amp_sv[0];
-    jamp_sv[80] += amp_sv[0];
-    jamp_sv[84] -= amp_sv[0];
-    jamp_sv[91] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-    jamp_sv[93] -= amp_sv[0];
-    jamp_sv[94] += amp_sv[0];
-    jamp_sv[98] += amp_sv[0];
-    jamp_sv[102] -= amp_sv[0];
-    jamp_sv[104] += amp_sv[0];
-    jamp_sv[108] -= amp_sv[0];
-    VVV1_0( w_fp[8], w_fp[5], w_fp[111], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] -= amp_sv[0];
-    jamp_sv[25] += amp_sv[0];
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[55] -= amp_sv[0];
-    jamp_sv[72] += amp_sv[0];
-    jamp_sv[78] -= amp_sv[0];
-    jamp_sv[84] -= amp_sv[0];
-    jamp_sv[86] += amp_sv[0];
-    jamp_sv[90] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-    jamp_sv[94] += amp_sv[0];
-    jamp_sv[95] -= amp_sv[0];
-    jamp_sv[96] += amp_sv[0];
-    jamp_sv[102] -= amp_sv[0];
-    jamp_sv[108] -= amp_sv[0];
-    jamp_sv[110] += amp_sv[0];
-
-    // *** DIAGRAM 1127 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1127
-    // (none)
-
-    // Amplitude(s) for diagram number 1127
-    VVV1_0( w_fp[21], w_fp[8], w_fp[29], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += amp_sv[0];
-    jamp_sv[1] -= amp_sv[0];
-    jamp_sv[6] -= amp_sv[0];
-    jamp_sv[7] += amp_sv[0];
-    jamp_sv[30] -= amp_sv[0];
-    jamp_sv[31] += amp_sv[0];
-    jamp_sv[54] += amp_sv[0];
-    jamp_sv[55] -= amp_sv[0];
-    jamp_sv[90] -= amp_sv[0];
-    jamp_sv[91] += amp_sv[0];
-    jamp_sv[93] += amp_sv[0];
-    jamp_sv[95] -= amp_sv[0];
-    jamp_sv[114] += amp_sv[0];
-    jamp_sv[115] -= amp_sv[0];
-    jamp_sv[117] -= amp_sv[0];
-    jamp_sv[119] += amp_sv[0];
-    VVV1_0( w_fp[71], w_fp[8], w_fp[29], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[6] -= amp_sv[0];
-    jamp_sv[7] += amp_sv[0];
-    jamp_sv[24] += amp_sv[0];
-    jamp_sv[25] -= amp_sv[0];
-    jamp_sv[30] -= amp_sv[0];
-    jamp_sv[31] += amp_sv[0];
-    jamp_sv[48] += amp_sv[0];
-    jamp_sv[49] -= amp_sv[0];
-    jamp_sv[91] += amp_sv[0];
-    jamp_sv[92] -= amp_sv[0];
-    jamp_sv[93] += amp_sv[0];
-    jamp_sv[94] -= amp_sv[0];
-    jamp_sv[115] -= amp_sv[0];
-    jamp_sv[116] += amp_sv[0];
-    jamp_sv[117] -= amp_sv[0];
-    jamp_sv[118] += amp_sv[0];
-    VVV1_0( w_fp[97], w_fp[8], w_fp[29], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] -= amp_sv[0];
-    jamp_sv[1] += amp_sv[0];
-    jamp_sv[24] += amp_sv[0];
-    jamp_sv[25] -= amp_sv[0];
-    jamp_sv[48] += amp_sv[0];
-    jamp_sv[49] -= amp_sv[0];
-    jamp_sv[54] -= amp_sv[0];
-    jamp_sv[55] += amp_sv[0];
-    jamp_sv[90] += amp_sv[0];
-    jamp_sv[92] -= amp_sv[0];
-    jamp_sv[94] -= amp_sv[0];
-    jamp_sv[95] += amp_sv[0];
-    jamp_sv[114] -= amp_sv[0];
-    jamp_sv[116] += amp_sv[0];
-    jamp_sv[118] += amp_sv[0];
-    jamp_sv[119] -= amp_sv[0];
-
-    // *** DIAGRAM 1128 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1128
-    FFV1_2( w_fp[3], w_fp[21], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] );
-    FFV1_2( w_fp[3], w_fp[71], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[10] );
-    FFV1_2( w_fp[3], w_fp[97], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[68] );
-
-    // Amplitude(s) for diagram number 1128
-    FFV1_0( w_fp[16], w_fp[39], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[90] += amp_sv[0];
-    jamp_sv[91] -= amp_sv[0];
-    jamp_sv[93] -= amp_sv[0];
-    jamp_sv[95] += amp_sv[0];
-    FFV1_0( w_fp[10], w_fp[39], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[91] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-    jamp_sv[93] -= amp_sv[0];
-    jamp_sv[94] += amp_sv[0];
-    FFV1_0( w_fp[68], w_fp[39], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[90] -= amp_sv[0];
-    jamp_sv[92] += amp_sv[0];
-    jamp_sv[94] += amp_sv[0];
-    jamp_sv[95] -= amp_sv[0];
-
-    // *** DIAGRAM 1129 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1129
-    // (none)
-
-    // Amplitude(s) for diagram number 1129
-    FFV1_0( w_fp[3], w_fp[39], w_fp[17], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[39], w_fp[98], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[74] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[93] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[39], w_fp[111], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1130 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1130
-    // (none)
-
-    // Amplitude(s) for diagram number 1130
-    FFV1_0( w_fp[41], w_fp[39], w_fp[21], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[72] += amp_sv[0];
-    jamp_sv[74] -= amp_sv[0];
-    jamp_sv[80] -= amp_sv[0];
-    jamp_sv[86] += amp_sv[0];
-    FFV1_0( w_fp[41], w_fp[39], w_fp[71], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[74] -= amp_sv[0];
-    jamp_sv[78] += amp_sv[0];
-    jamp_sv[80] -= amp_sv[0];
-    jamp_sv[84] += amp_sv[0];
-    FFV1_0( w_fp[41], w_fp[39], w_fp[97], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[72] -= amp_sv[0];
-    jamp_sv[78] += amp_sv[0];
-    jamp_sv[84] += amp_sv[0];
-    jamp_sv[86] -= amp_sv[0];
-
-    // *** DIAGRAM 1131 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1131
-    // (none)
-
-    // Amplitude(s) for diagram number 1131
-    FFV1_0( w_fp[16], w_fp[47], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[114] += amp_sv[0];
-    jamp_sv[115] -= amp_sv[0];
-    jamp_sv[117] -= amp_sv[0];
-    jamp_sv[119] += amp_sv[0];
-    FFV1_0( w_fp[10], w_fp[47], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[115] -= amp_sv[0];
-    jamp_sv[116] += amp_sv[0];
-    jamp_sv[117] -= amp_sv[0];
-    jamp_sv[118] += amp_sv[0];
-    FFV1_0( w_fp[68], w_fp[47], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[114] -= amp_sv[0];
-    jamp_sv[116] += amp_sv[0];
-    jamp_sv[118] += amp_sv[0];
-    jamp_sv[119] -= amp_sv[0];
-
-    // *** DIAGRAM 1132 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1132
-    // (none)
-
-    // Amplitude(s) for diagram number 1132
-    FFV1_0( w_fp[3], w_fp[47], w_fp[59], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[47], w_fp[20], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[98] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[47], w_fp[60], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1133 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1133
-    // (none)
-
-    // Amplitude(s) for diagram number 1133
-    FFV1_0( w_fp[38], w_fp[47], w_fp[21], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[96] += amp_sv[0];
-    jamp_sv[98] -= amp_sv[0];
-    jamp_sv[104] -= amp_sv[0];
-    jamp_sv[110] += amp_sv[0];
-    FFV1_0( w_fp[38], w_fp[47], w_fp[71], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[98] -= amp_sv[0];
-    jamp_sv[102] += amp_sv[0];
-    jamp_sv[104] -= amp_sv[0];
-    jamp_sv[108] += amp_sv[0];
-    FFV1_0( w_fp[38], w_fp[47], w_fp[97], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[96] -= amp_sv[0];
-    jamp_sv[102] += amp_sv[0];
-    jamp_sv[108] += amp_sv[0];
-    jamp_sv[110] -= amp_sv[0];
-
-    // *** DIAGRAM 1134 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1134
-    FFV1_1( w_fp[2], w_fp[21], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] );
-    FFV1_1( w_fp[2], w_fp[71], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] );
-    FFV1_1( w_fp[2], w_fp[97], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[71] );
-
-    // Amplitude(s) for diagram number 1134
-    FFV1_0( w_fp[38], w_fp[23], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] += amp_sv[0];
-    jamp_sv[7] -= amp_sv[0];
-    jamp_sv[31] -= amp_sv[0];
-    jamp_sv[55] += amp_sv[0];
-    FFV1_0( w_fp[38], w_fp[21], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[7] -= amp_sv[0];
-    jamp_sv[25] += amp_sv[0];
-    jamp_sv[31] -= amp_sv[0];
-    jamp_sv[49] += amp_sv[0];
-    FFV1_0( w_fp[38], w_fp[71], w_fp[6], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] -= amp_sv[0];
-    jamp_sv[25] += amp_sv[0];
-    jamp_sv[49] += amp_sv[0];
-    jamp_sv[55] -= amp_sv[0];
-
-    // *** DIAGRAM 1135 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1135
-    // (none)
-
-    // Amplitude(s) for diagram number 1135
-    FFV1_0( w_fp[38], w_fp[2], w_fp[17], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[38], w_fp[2], w_fp[98], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[104] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[38], w_fp[2], w_fp[111], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1136 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1136
-    // (none)
-
-    // Amplitude(s) for diagram number 1136
-    FFV1_0( w_fp[41], w_fp[23], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += amp_sv[0];
-    jamp_sv[6] -= amp_sv[0];
-    jamp_sv[30] -= amp_sv[0];
-    jamp_sv[54] += amp_sv[0];
-    FFV1_0( w_fp[41], w_fp[21], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[6] -= amp_sv[0];
-    jamp_sv[24] += amp_sv[0];
-    jamp_sv[30] -= amp_sv[0];
-    jamp_sv[48] += amp_sv[0];
-    FFV1_0( w_fp[41], w_fp[71], w_fp[5], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] -= amp_sv[0];
-    jamp_sv[24] += amp_sv[0];
-    jamp_sv[48] += amp_sv[0];
-    jamp_sv[54] -= amp_sv[0];
-
-    // *** DIAGRAM 1137 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1137
-    // (none)
-
-    // Amplitude(s) for diagram number 1137
-    FFV1_0( w_fp[41], w_fp[2], w_fp[59], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[41], w_fp[2], w_fp[20], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[80] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[41], w_fp[2], w_fp[60], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1138 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1138
-    // (none)
-
-    // Amplitude(s) for diagram number 1138
-    FFV1_0( w_fp[3], w_fp[23], w_fp[29], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[21], w_fp[29], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[3], w_fp[71], w_fp[29], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1139 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1139
-    // (none)
-
-    // Amplitude(s) for diagram number 1139
-    FFV1_0( w_fp[16], w_fp[2], w_fp[29], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[10], w_fp[2], w_fp[29], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[117] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
-    FFV1_0( w_fp[68], w_fp[2], w_fp[29], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
-    jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0];
-
-    // *** DIAGRAM 1140 OF 1240 ***
-
-    // Wavefunction(s) for diagram number 1140
-    VVVV1P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[68] );
-    VVVV3P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[29] );
-    VVVV4P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[10] );
-
-    // Amplitude(s) for diagram number 1140
-    VVVV1_0( w_fp[68], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[2] += amp_sv[0];
-    jamp_sv[3] -= amp_sv[0];
-    jamp_sv[12] -= amp_sv[0];
-    jamp_sv[13] += amp_sv[0];
-    jamp_sv[36] -= amp_sv[0];
-    jamp_sv[37] += amp_sv[0];
-    jamp_sv[66] -= amp_sv[0];
-    jamp_sv[67] += amp_sv[0];
-    jamp_sv[69] += amp_sv[0];
-    jamp_sv[71] -= amp_sv[0];
-    jamp_sv[78] += amp_sv[0];
-    jamp_sv[79] -= amp_sv[0];
-    jamp_sv[108] += amp_sv[0];
-    jamp_sv[109] -= amp_sv[0];
-    jamp_sv[111] -= amp_sv[0];
-    jamp_sv[113] += amp_sv[0];
-    VVVV3_0( w_fp[68], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[2] += amp_sv[0];
-    jamp_sv[12] -= amp_sv[0];
-    jamp_sv[36] -= amp_sv[0];
-    jamp_sv[48] -= amp_sv[0];
-    jamp_sv[50] += amp_sv[0];
-    jamp_sv[56] += amp_sv[0];
-    jamp_sv[62] -= amp_sv[0];
-    jamp_sv[78] += amp_sv[0];
-    jamp_sv[97] -= amp_sv[0];
-    jamp_sv[100] += amp_sv[0];
-    jamp_sv[106] += amp_sv[0];
-    jamp_sv[108] += amp_sv[0];
-    jamp_sv[109] -= amp_sv[0];
-    jamp_sv[111] -= amp_sv[0];
-    jamp_sv[113] += amp_sv[0];
-    jamp_sv[116] -= amp_sv[0];
-    VVVV4_0( w_fp[68], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[3] += amp_sv[0];
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[37] -= amp_sv[0];
-    jamp_sv[48] -= amp_sv[0];
-    jamp_sv[50] += amp_sv[0];
-    jamp_sv[56] += amp_sv[0];
-    jamp_sv[62] -= amp_sv[0];
-    jamp_sv[66] += amp_sv[0];
-    jamp_sv[67] -= amp_sv[0];
-    jamp_sv[69] -= amp_sv[0];
-    jamp_sv[71] += amp_sv[0];
-    jamp_sv[79] += amp_sv[0];
-    jamp_sv[97] -= amp_sv[0];
-    jamp_sv[100] += amp_sv[0];
-    jamp_sv[106] += amp_sv[0];
-    jamp_sv[116] -= amp_sv[0];
-    VVVV1_0( w_fp[29], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[12] -= amp_sv[0];
-    jamp_sv[13] += amp_sv[0];
-    jamp_sv[26] += amp_sv[0];
-    jamp_sv[27] -= amp_sv[0];
-    jamp_sv[36] -= amp_sv[0];
-    jamp_sv[37] += amp_sv[0];
-    jamp_sv[67] += amp_sv[0];
-    jamp_sv[68] -= amp_sv[0];
-    jamp_sv[69] += amp_sv[0];
-    jamp_sv[70] -= amp_sv[0];
-    jamp_sv[72] += amp_sv[0];
-    jamp_sv[73] -= amp_sv[0];
-    jamp_sv[109] -= amp_sv[0];
-    jamp_sv[110] += amp_sv[0];
-    jamp_sv[111] -= amp_sv[0];
-    jamp_sv[112] += amp_sv[0];
-    VVVV3_0( w_fp[29], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[12] -= amp_sv[0];
-    jamp_sv[26] += amp_sv[0];
-    jamp_sv[36] -= amp_sv[0];
-    jamp_sv[50] += amp_sv[0];
-    jamp_sv[54] -= amp_sv[0];
-    jamp_sv[56] += amp_sv[0];
-    jamp_sv[60] -= amp_sv[0];
-    jamp_sv[72] += amp_sv[0];
-    jamp_sv[100] += amp_sv[0];
-    jamp_sv[103] -= amp_sv[0];
-    jamp_sv[106] += amp_sv[0];
-    jamp_sv[109] -= amp_sv[0];
-    jamp_sv[110] += amp_sv[0];
-    jamp_sv[111] -= amp_sv[0];
-    jamp_sv[112] += amp_sv[0];
-    jamp_sv[114] -= amp_sv[0];
-    VVVV4_0( w_fp[29], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[13] -= amp_sv[0];
-    jamp_sv[27] += amp_sv[0];
-    jamp_sv[37] -= amp_sv[0];
-    jamp_sv[50] += amp_sv[0];
-    jamp_sv[54] -= amp_sv[0];
-    jamp_sv[56] += amp_sv[0];
-    jamp_sv[60] -= amp_sv[0];
-    jamp_sv[67] -= amp_sv[0];
-    jamp_sv[68] += amp_sv[0];
-    jamp_sv[69] -= amp_sv[0];
-    jamp_sv[70] += amp_sv[0];
-    jamp_sv[73] += amp_sv[0];
-    jamp_sv[100] += amp_sv[0];
-    jamp_sv[103] -= amp_sv[0];
-    jamp_sv[106] += amp_sv[0];
-    jamp_sv[114] -= amp_sv[0];
-    VVVV1_0( w_fp[10], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[2] -= amp_sv[0];
-    jamp_sv[3] += amp_sv[0];
-    jamp_sv[26] += amp_sv[0];
-    jamp_sv[27] -= amp_sv[0];
-    jamp_sv[66] += amp_sv[0];
-    jamp_sv[68] -= amp_sv[0];
-    jamp_sv[70] -= amp_sv[0];
-    jamp_sv[71] += amp_sv[0];
-    jamp_sv[72] += amp_sv[0];
-    jamp_sv[73] -= amp_sv[0];
-    jamp_sv[78] -= amp_sv[0];
-    jamp_sv[79] += amp_sv[0];
-    jamp_sv[108] -= amp_sv[0];
-    jamp_sv[110] += amp_sv[0];
-    jamp_sv[112] += amp_sv[0];
-    jamp_sv[113] -= amp_sv[0];
-    VVVV3_0( w_fp[10], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
-    jamp_sv[2] -= amp_sv[0];
-    jamp_sv[26] += amp_sv[0];
-    jamp_sv[48] += amp_sv[0];
-    jamp_sv[54] -= amp_sv[0];
-    jamp_sv[60] -= amp_sv[0];
-    jamp_sv[62] += amp_sv[0];
-    jamp_sv[72] += amp_sv[0];
-    jamp_sv[78] -= amp_sv[0];
-    jamp_sv[97] += amp_sv[0];
-    jamp_sv[103] -= amp_sv[0];
-    jamp_sv[108] -= amp_sv[0];
-    jamp_sv[110] += amp_sv[0];
-    jamp_sv[112] +=
amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - VVVV4_0( w_fp[10], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - - // *** DIAGRAM 1141 OF 1240 *** - - // Wavefunction(s) for diagram number 1141 - VVV1P0_1( w_fp[68], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[16] ); - VVV1P0_1( w_fp[29], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[71] ); - VVV1P0_1( w_fp[10], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[21] ); - - // Amplitude(s) for diagram number 1141 - VVV1_0( w_fp[8], w_fp[6], w_fp[16], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[48] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[62] -= amp_sv[0]; - jamp_sv[78] += amp_sv[0]; - jamp_sv[97] -= amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[108] += amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - jamp_sv[116] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[71], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[12] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[21], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - - // *** DIAGRAM 1142 OF 1240 *** - - // Wavefunction(s) for diagram number 1142 - VVV1P0_1( w_fp[68], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[23] ); - VVV1P0_1( w_fp[29], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[60] ); - VVV1P0_1( w_fp[10], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[20] ); - - // Amplitude(s) for diagram number 1142 - VVV1_0( w_fp[8], w_fp[4], w_fp[23], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // 
Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] += amp_sv[0]; - jamp_sv[13] -= amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[48] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[62] -= amp_sv[0]; - jamp_sv[66] += amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[69] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[79] += amp_sv[0]; - jamp_sv[97] -= amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[116] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[60], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[13] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[69] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[20], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - - // *** DIAGRAM 1143 OF 1240 *** - - // Wavefunction(s) for diagram number 1143 - // (none) - - // Amplitude(s) for diagram number 1143 - VVV1_0( w_fp[68], w_fp[8], w_fp[27], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[78] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[108] += amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - VVV1_0( w_fp[29], w_fp[8], w_fp[27], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[12] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[73] -= amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - VVV1_0( w_fp[10], w_fp[8], w_fp[27], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates 
numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] -= amp_sv[0]; - jamp_sv[3] += amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[66] += amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[73] -= amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - jamp_sv[79] += amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - - // *** DIAGRAM 1144 OF 1240 *** - - // Wavefunction(s) for diagram number 1144 - FFV1_2( w_fp[3], w_fp[68], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[59] ); - FFV1_2( w_fp[3], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[111] ); - FFV1_2( w_fp[3], w_fp[10], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[98] ); - - // Amplitude(s) for diagram number 1144 - FFV1_0( w_fp[59], w_fp[33], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[66] += amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[69] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - FFV1_0( w_fp[111], w_fp[33], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[67] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[69] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - FFV1_0( w_fp[98], w_fp[33], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[66] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - - // *** DIAGRAM 1145 OF 1240 *** - - // Wavefunction(s) for diagram number 1145 - // (none) - - // Amplitude(s) for diagram number 1145 - FFV1_0( w_fp[3], w_fp[33], w_fp[23], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[60], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[50] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[20], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0]; - 
jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1146 OF 1240 *** - - // Wavefunction(s) for diagram number 1146 - // (none) - - // Amplitude(s) for diagram number 1146 - FFV1_0( w_fp[41], w_fp[33], w_fp[68], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[48] += amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - FFV1_0( w_fp[41], w_fp[33], w_fp[29], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[50] -= amp_sv[0]; - jamp_sv[54] += amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[60] += amp_sv[0]; - FFV1_0( w_fp[41], w_fp[33], w_fp[10], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[48] -= amp_sv[0]; - jamp_sv[54] += amp_sv[0]; - jamp_sv[60] += amp_sv[0]; - jamp_sv[62] -= amp_sv[0]; - - // *** DIAGRAM 1147 OF 1240 *** - - // Wavefunction(s) for diagram number 1147 - // (none) - - // Amplitude(s) for diagram number 1147 - FFV1_0( w_fp[59], w_fp[47], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[108] += amp_sv[0]; - jamp_sv[109] -= amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[113] += amp_sv[0]; - FFV1_0( w_fp[111], w_fp[47], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[109] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - FFV1_0( w_fp[98], w_fp[47], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[108] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[112] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - - // *** DIAGRAM 1148 OF 1240 *** - - // Wavefunction(s) for diagram number 1148 - // (none) - - // Amplitude(s) for diagram number 1148 - FFV1_0( w_fp[3], w_fp[47], w_fp[16], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[71], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[100] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0]; - 
jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[21], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1149 OF 1240 *** - - // Wavefunction(s) for diagram number 1149 - // (none) - - // Amplitude(s) for diagram number 1149 - FFV1_0( w_fp[46], w_fp[47], w_fp[68], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[97] += amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - FFV1_0( w_fp[46], w_fp[47], w_fp[29], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[100] -= amp_sv[0]; - jamp_sv[103] += amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - jamp_sv[114] += amp_sv[0]; - FFV1_0( w_fp[46], w_fp[47], w_fp[10], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[97] -= amp_sv[0]; - jamp_sv[103] += amp_sv[0]; - jamp_sv[114] += amp_sv[0]; - jamp_sv[116] -= amp_sv[0]; - - // *** DIAGRAM 1150 OF 1240 *** - - // Wavefunction(s) for diagram number 1150 - FFV1_1( w_fp[2], w_fp[68], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[17] ); - FFV1_1( w_fp[2], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[68] ); - FFV1_1( w_fp[2], w_fp[10], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[29] ); - - // Amplitude(s) for diagram number 1150 - FFV1_0( w_fp[46], w_fp[17], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] += amp_sv[0]; - jamp_sv[13] -= amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[79] += amp_sv[0]; - FFV1_0( w_fp[46], w_fp[68], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[13] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - FFV1_0( w_fp[46], w_fp[29], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - - // *** DIAGRAM 1151 OF 1240 *** - - // Wavefunction(s) for diagram number 1151 - // (none) - - // Amplitude(s) for diagram number 1151 - FFV1_0( w_fp[46], w_fp[2], w_fp[23], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // 
Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[60], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[106] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[20], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1152 OF 1240 *** - - // Wavefunction(s) for diagram number 1152 - // (none) - - // Amplitude(s) for diagram number 1152 - FFV1_0( w_fp[41], w_fp[17], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[78] += amp_sv[0]; - FFV1_0( w_fp[41], w_fp[68], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[12] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[36] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - FFV1_0( w_fp[41], w_fp[29], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] -= amp_sv[0]; - jamp_sv[26] += amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[78] -= amp_sv[0]; - - // *** DIAGRAM 1153 OF 1240 *** - - // Wavefunction(s) for diagram number 1153 - // (none) - - // Amplitude(s) for diagram number 1153 - FFV1_0( w_fp[41], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] 
); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1154 OF 1240 *** - - // Wavefunction(s) for diagram number 1154 - // (none) - - // Amplitude(s) for diagram number 1154 - FFV1_0( w_fp[3], w_fp[17], w_fp[27], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[68], w_fp[27], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[29], w_fp[27], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1155 OF 1240 *** - - // Wavefunction(s) for diagram number 1155 - // (none) - - // Amplitude(s) for diagram number 1155 - FFV1_0( w_fp[59], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[108] += cxtype( 0, 1 ) * 
amp_sv[0]; - jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[111], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[98], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1156 OF 1240 *** - - // Wavefunction(s) for diagram number 1156 - VVVV1P0_1( w_fp[0], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[98] ); - VVVV3P0_1( w_fp[0], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[27] ); - VVVV4P0_1( w_fp[0], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[111] ); - - // Amplitude(s) for diagram number 1156 - VVVV1_0( w_fp[98], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[19] += amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[84] += amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[102] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - VVVV3_0( w_fp[98], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] += amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[49] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[73] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[84] += amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[92] -= amp_sv[0]; - jamp_sv[102] += amp_sv[0]; - VVVV4_0( w_fp[98], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[5] += amp_sv[0]; - jamp_sv[19] -= amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[49] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[60] += amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[63] -= amp_sv[0]; - jamp_sv[65] += amp_sv[0]; - jamp_sv[68] -= 
amp_sv[0]; - jamp_sv[73] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[92] -= amp_sv[0]; - jamp_sv[103] += amp_sv[0]; - VVVV1_0( w_fp[27], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[18] -= amp_sv[0]; - jamp_sv[19] += amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[62] -= amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[97] -= amp_sv[0]; - VVVV3_0( w_fp[27], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[18] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - VVVV4_0( w_fp[27], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[19] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[63] -= amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - VVVV1_0( w_fp[111], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[60] += amp_sv[0]; - jamp_sv[62] -= amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[65] += amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[97] -= amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[103] += amp_sv[0]; - VVVV3_0( w_fp[111], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - VVVV4_0( 
w_fp[111], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[5] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - - // *** DIAGRAM 1157 OF 1240 *** - - // Wavefunction(s) for diagram number 1157 - VVV1P0_1( w_fp[98], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[59] ); - VVV1P0_1( w_fp[27], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[29] ); - VVV1P0_1( w_fp[111], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[68] ); - - // Amplitude(s) for diagram number 1157 - VVV1_0( w_fp[8], w_fp[5], w_fp[59], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] += amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[49] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[73] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[84] += amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[92] -= amp_sv[0]; - jamp_sv[102] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[29], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[18] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[68], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] -= amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - - // *** DIAGRAM 1158 OF 1240 *** - - // Wavefunction(s) for diagram number 1158 - VVV1P0_1( w_fp[98], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[17] ); - VVV1P0_1( w_fp[27], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[21] ); - VVV1P0_1( w_fp[111], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[71] ); - - // Amplitude(s) for diagram number 1158 - VVV1_0( w_fp[8], w_fp[4], w_fp[17], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[5] += 
amp_sv[0]; - jamp_sv[19] -= amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[49] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[60] += amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[63] -= amp_sv[0]; - jamp_sv[65] += amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[73] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[92] -= amp_sv[0]; - jamp_sv[103] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[21], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[19] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[63] -= amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[71], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[5] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - - // *** DIAGRAM 1159 OF 1240 *** - - // Wavefunction(s) for diagram number 1159 - // (none) - - // Amplitude(s) for diagram number 1159 - VVV1_0( w_fp[98], w_fp[8], w_fp[24], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[19] += amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[60] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[84] += amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - jamp_sv[102] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - VVV1_0( w_fp[27], w_fp[8], w_fp[24], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[18] -= amp_sv[0]; - jamp_sv[19] += amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[42] -= amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[62] -= amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[97] -= amp_sv[0]; - VVV1_0( w_fp[111], w_fp[8], w_fp[24], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - jamp_sv[28] += amp_sv[0]; - 
jamp_sv[29] -= amp_sv[0]; - jamp_sv[60] += amp_sv[0]; - jamp_sv[62] -= amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[65] += amp_sv[0]; - jamp_sv[84] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[97] -= amp_sv[0]; - jamp_sv[102] -= amp_sv[0]; - jamp_sv[103] += amp_sv[0]; - - // *** DIAGRAM 1160 OF 1240 *** - - // Wavefunction(s) for diagram number 1160 - FFV1_2( w_fp[3], w_fp[98], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); - FFV1_2( w_fp[3], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[20] ); - FFV1_2( w_fp[3], w_fp[111], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[60] ); - - // Amplitude(s) for diagram number 1160 - FFV1_0( w_fp[16], w_fp[33], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[60] += amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[63] -= amp_sv[0]; - jamp_sv[65] += amp_sv[0]; - FFV1_0( w_fp[20], w_fp[33], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[61] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[63] -= amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - FFV1_0( w_fp[60], w_fp[33], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[60] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - - // *** DIAGRAM 1161 OF 1240 *** - - // Wavefunction(s) for diagram number 1161 - // (none) - - // Amplitude(s) for diagram number 1161 - FFV1_0( w_fp[3], w_fp[33], w_fp[17], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[21], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[52] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[63] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[71], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[66] -= cxtype( 0, 1 ) * amp_sv[0]; - 
jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1162 OF 1240 *** - - // Wavefunction(s) for diagram number 1162 - // (none) - - // Amplitude(s) for diagram number 1162 - FFV1_0( w_fp[38], w_fp[33], w_fp[98], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[49] += amp_sv[0]; - jamp_sv[52] -= amp_sv[0]; - jamp_sv[58] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - FFV1_0( w_fp[38], w_fp[33], w_fp[27], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[52] -= amp_sv[0]; - jamp_sv[55] += amp_sv[0]; - jamp_sv[58] -= amp_sv[0]; - jamp_sv[66] += amp_sv[0]; - FFV1_0( w_fp[38], w_fp[33], w_fp[111], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[49] -= amp_sv[0]; - jamp_sv[55] += amp_sv[0]; - jamp_sv[66] += amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - - // *** DIAGRAM 1163 OF 1240 *** - - // Wavefunction(s) for diagram number 1163 - // (none) - - // Amplitude(s) for diagram number 1163 - FFV1_0( w_fp[16], w_fp[39], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[84] += amp_sv[0]; - jamp_sv[85] -= amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[89] += amp_sv[0]; - FFV1_0( w_fp[20], w_fp[39], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[85] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - FFV1_0( w_fp[60], w_fp[39], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[84] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[88] += amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - - // *** DIAGRAM 1164 OF 1240 *** - - // Wavefunction(s) for diagram number 1164 - // (none) - - // Amplitude(s) for diagram number 1164 - FFV1_0( w_fp[3], w_fp[39], w_fp[59], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[39], w_fp[29], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[76] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] += cxtype( 0, 
1 ) * amp_sv[0]; - jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[39], w_fp[68], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[90] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1165 OF 1240 *** - - // Wavefunction(s) for diagram number 1165 - // (none) - - // Amplitude(s) for diagram number 1165 - FFV1_0( w_fp[46], w_fp[39], w_fp[98], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[73] += amp_sv[0]; - jamp_sv[76] -= amp_sv[0]; - jamp_sv[82] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - FFV1_0( w_fp[46], w_fp[39], w_fp[27], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[76] -= amp_sv[0]; - jamp_sv[79] += amp_sv[0]; - jamp_sv[82] -= amp_sv[0]; - jamp_sv[90] += amp_sv[0]; - FFV1_0( w_fp[46], w_fp[39], w_fp[111], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[73] -= amp_sv[0]; - jamp_sv[79] += amp_sv[0]; - jamp_sv[90] += amp_sv[0]; - jamp_sv[92] -= amp_sv[0]; - - // *** DIAGRAM 1166 OF 1240 *** - - // Wavefunction(s) for diagram number 1166 - FFV1_1( w_fp[2], w_fp[98], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] ); - FFV1_1( w_fp[2], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[98] ); - FFV1_1( w_fp[2], w_fp[111], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[27] ); - - // Amplitude(s) for diagram number 1166 - FFV1_0( w_fp[46], w_fp[23], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[5] += amp_sv[0]; - jamp_sv[19] -= amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[103] += amp_sv[0]; - FFV1_0( w_fp[46], w_fp[98], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[19] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - FFV1_0( w_fp[46], w_fp[27], w_fp[5], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[5] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[103] -= amp_sv[0]; - - // *** DIAGRAM 1167 OF 1240 *** - - // Wavefunction(s) for diagram number 1167 - // (none) - - // Amplitude(s) for diagram number 1167 - FFV1_0( w_fp[46], w_fp[2], w_fp[17], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[19] += cxtype( 
0, 1 ) * amp_sv[0];
- jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[92] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[46], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[46], w_fp[2], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[73] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[79] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[97] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1168 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1168
- // (none)
-
- // Amplitude(s) for diagram number 1168
- FFV1_0( w_fp[38], w_fp[23], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[4] += amp_sv[0];
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[102] += amp_sv[0];
- FFV1_0( w_fp[38], w_fp[98], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[28] += amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[96] += amp_sv[0];
- FFV1_0( w_fp[38], w_fp[27], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[4] -= amp_sv[0];
- jamp_sv[28] += amp_sv[0];
- jamp_sv[96] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
-
- // *** DIAGRAM 1169 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1169
- // (none)
-
- // Amplitude(s) for diagram number 1169
- FFV1_0( w_fp[38], w_fp[2], w_fp[59], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[68] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[38], w_fp[2], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[58] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[38], w_fp[2], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[49] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[55] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1170 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1170
- // (none)
-
- // Amplitude(s) for diagram number 1170
- FFV1_0( w_fp[3], w_fp[23], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[103] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[98], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[27], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1171 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1171
- // (none)
-
- // Amplitude(s) for diagram number 1171
- FFV1_0( w_fp[16], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[60] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[89] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[20], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[87] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[60], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[65] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1172 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1172
- VVVV1P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[60] );
- VVVV3P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[24] );
- VVVV4P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[20] );
- FFV1_2( w_fp[3], w_fp[60], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] );
- FFV1_2( w_fp[3], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[27] );
- FFV1_2( w_fp[3], w_fp[20], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[98] );
-
- // Amplitude(s) for diagram number 1172
- FFV1_0( w_fp[16], w_fp[77], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[42] += amp_sv[0];
- jamp_sv[43] -= amp_sv[0];
- jamp_sv[45] -= amp_sv[0];
- jamp_sv[47] += amp_sv[0];
- FFV1_0( w_fp[27], w_fp[77], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[43] -= amp_sv[0];
- jamp_sv[44] += amp_sv[0];
- jamp_sv[45] -= amp_sv[0];
- jamp_sv[46] += amp_sv[0];
- FFV1_0( w_fp[98], w_fp[77], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[44] += amp_sv[0];
- jamp_sv[46] += amp_sv[0];
- jamp_sv[47] -= amp_sv[0];
-
- // *** DIAGRAM 1173 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1173
- VVV1P0_1( w_fp[60], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[23] );
- VVV1P0_1( w_fp[24], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[68] );
- VVV1P0_1( w_fp[20], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[29] );
-
- // Amplitude(s) for diagram number 1173
- FFV1_0( w_fp[3], w_fp[77], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[77], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[26] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[45] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[77], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1174 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1174
- // (none)
-
- // Amplitude(s) for diagram number 1174
- FFV1_0( w_fp[41], w_fp[77], w_fp[60], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[24] += amp_sv[0];
- jamp_sv[26] -= amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[38] += amp_sv[0];
- FFV1_0( w_fp[41], w_fp[77], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[26] -= amp_sv[0];
- jamp_sv[30] += amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[36] += amp_sv[0];
- FFV1_0( w_fp[41], w_fp[77], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[24] -= amp_sv[0];
- jamp_sv[30] += amp_sv[0];
- jamp_sv[36] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
-
- // *** DIAGRAM 1175 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1175
- FFV1_1( w_fp[2], w_fp[60], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[59] );
- FFV1_1( w_fp[2], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[71] );
- FFV1_1( w_fp[2], w_fp[20], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] );
-
- // Amplitude(s) for diagram number 1175
- FFV1_0( w_fp[52], w_fp[59], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[9] += amp_sv[0];
- jamp_sv[15] -= amp_sv[0];
- jamp_sv[61] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- FFV1_0( w_fp[52], w_fp[71], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[15] -= amp_sv[0];
- jamp_sv[51] += amp_sv[0];
- jamp_sv[61] -= amp_sv[0];
- jamp_sv[75] += amp_sv[0];
- FFV1_0( w_fp[52], w_fp[21], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[9] -= amp_sv[0];
- jamp_sv[51] += amp_sv[0];
- jamp_sv[75] += amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
-
- // *** DIAGRAM 1176 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1176
- // (none)
-
- // Amplitude(s) for diagram number 1176
- FFV1_0( w_fp[52], w_fp[2], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[52], w_fp[2], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[112] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[52], w_fp[2], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1177 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1177
- // (none)
-
- // Amplitude(s) for diagram number 1177
- FFV1_0( w_fp[52], w_fp[47], w_fp[60], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[99] += amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
- jamp_sv[112] -= amp_sv[0];
- jamp_sv[118] += amp_sv[0];
- FFV1_0( w_fp[52], w_fp[47], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[101] -= amp_sv[0];
- jamp_sv[109] += amp_sv[0];
- jamp_sv[112] -= amp_sv[0];
- jamp_sv[115] += amp_sv[0];
- FFV1_0( w_fp[52], w_fp[47], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[109] += amp_sv[0];
- jamp_sv[115] += amp_sv[0];
- jamp_sv[118] -= amp_sv[0];
-
- // *** DIAGRAM 1178 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1178
- // (none)
-
- // Amplitude(s) for diagram number 1178
- FFV1_0( w_fp[3], w_fp[59], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[71], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[21], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1179 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1179
- // (none)
-
- // Amplitude(s) for diagram number 1179
- FFV1_0( w_fp[16], w_fp[2], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[47] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[27], w_fp[2], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[45] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[98], w_fp[2], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[47] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1180 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1180
- // (none)
-
- // Amplitude(s) for diagram number 1180
- VVV1_0( w_fp[60], w_fp[72], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[9] += amp_sv[0];
- jamp_sv[14] += amp_sv[0];
- jamp_sv[15] -= amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[43] -= amp_sv[0];
- jamp_sv[45] -= amp_sv[0];
- jamp_sv[47] += amp_sv[0];
- jamp_sv[60] += amp_sv[0];
- jamp_sv[61] -= amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[103] += amp_sv[0];
- jamp_sv[105] += amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- VVV1_0( w_fp[24], w_fp[72], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[14] += amp_sv[0];
- jamp_sv[15] -= amp_sv[0];
- jamp_sv[43] -= amp_sv[0];
- jamp_sv[44] += amp_sv[0];
- jamp_sv[45] -= amp_sv[0];
- jamp_sv[46] += amp_sv[0];
- jamp_sv[50] -= amp_sv[0];
- jamp_sv[51] += amp_sv[0];
- jamp_sv[60] += amp_sv[0];
- jamp_sv[61] -= amp_sv[0];
- jamp_sv[74] -= amp_sv[0];
- jamp_sv[75] += amp_sv[0];
- jamp_sv[103] += amp_sv[0];
- jamp_sv[104] -= amp_sv[0];
- jamp_sv[105] += amp_sv[0];
- jamp_sv[106] -= amp_sv[0];
- VVV1_0( w_fp[20], w_fp[72], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[8] += amp_sv[0];
- jamp_sv[9] -= amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[44] += amp_sv[0];
- jamp_sv[46] += amp_sv[0];
- jamp_sv[47] -= amp_sv[0];
- jamp_sv[50] -= amp_sv[0];
- jamp_sv[51] += amp_sv[0];
- jamp_sv[74] -= amp_sv[0];
- jamp_sv[75] += amp_sv[0];
- jamp_sv[84] += amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[102] += amp_sv[0];
- jamp_sv[104] -= amp_sv[0];
- jamp_sv[106] -= amp_sv[0];
- jamp_sv[107] += amp_sv[0];
-
- // *** DIAGRAM 1181 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1181
- // (none)
-
- // Amplitude(s) for diagram number 1181
- VVVV1_0( w_fp[60], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[8] += amp_sv[0];
- jamp_sv[14] -= amp_sv[0];
- jamp_sv[24] -= amp_sv[0];
- jamp_sv[26] += amp_sv[0];
- jamp_sv[32] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[60] -= amp_sv[0];
- jamp_sv[84] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[101] += amp_sv[0];
- jamp_sv[102] += amp_sv[0];
- jamp_sv[103] -= amp_sv[0];
- jamp_sv[105] -= amp_sv[0];
- jamp_sv[107] += amp_sv[0];
- jamp_sv[112] += amp_sv[0];
- jamp_sv[118] -= amp_sv[0];
- VVVV3_0( w_fp[60], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[8] += amp_sv[0];
- jamp_sv[9] -= amp_sv[0];
- jamp_sv[14] -= amp_sv[0];
- jamp_sv[15] += amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[43] += amp_sv[0];
- jamp_sv[45] += amp_sv[0];
- jamp_sv[47] -= amp_sv[0];
- jamp_sv[60] -= amp_sv[0];
- jamp_sv[61] += amp_sv[0];
- jamp_sv[84] += amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[102] += amp_sv[0];
- jamp_sv[103] -= amp_sv[0];
- jamp_sv[105] -= amp_sv[0];
- jamp_sv[107] += amp_sv[0];
- VVVV4_0( w_fp[60], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[9] -= amp_sv[0];
- jamp_sv[15] += amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[26] -= amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[38] += amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[43] += amp_sv[0];
- jamp_sv[45] += amp_sv[0];
- jamp_sv[47] -= amp_sv[0];
- jamp_sv[61] += amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[99] += amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
- jamp_sv[112] -= amp_sv[0];
- jamp_sv[118] += amp_sv[0];
- VVVV1_0( w_fp[24], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[14] -= amp_sv[0];
- jamp_sv[26] += amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[32] += amp_sv[0];
- jamp_sv[36] -= amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[60] -= amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[101] += amp_sv[0];
- jamp_sv[103] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[105] -= amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[109] -= amp_sv[0];
- jamp_sv[112] += amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- VVVV3_0( w_fp[24], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[14] -= amp_sv[0];
- jamp_sv[15] += amp_sv[0];
- jamp_sv[43] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[45] += amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[60] -= amp_sv[0];
- jamp_sv[61] += amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[103] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[105] -= amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- VVVV4_0( w_fp[24], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[15] += amp_sv[0];
- jamp_sv[26] -= amp_sv[0];
- jamp_sv[30] += amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[36] += amp_sv[0];
- jamp_sv[43] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[45] += amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[61] += amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
- jamp_sv[109] += amp_sv[0];
- jamp_sv[112] -= amp_sv[0];
- jamp_sv[115] += amp_sv[0];
- VVVV1_0( w_fp[20], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[36] -= amp_sv[0];
- jamp_sv[38] += amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[99] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- jamp_sv[109] -= amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[118] += amp_sv[0];
- VVVV3_0( w_fp[20], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[9] += amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[47] += amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- VVVV4_0( w_fp[20], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[9] += amp_sv[0];
- jamp_sv[24] -= amp_sv[0];
- jamp_sv[30] += amp_sv[0];
- jamp_sv[36] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[47] += amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[109] += amp_sv[0];
- jamp_sv[115] += amp_sv[0];
- jamp_sv[118] -= amp_sv[0];
-
- // *** DIAGRAM 1182 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1182
- VVV1P0_1( w_fp[60], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[72] );
- VVV1P0_1( w_fp[24], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[60] );
- VVV1P0_1( w_fp[20], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[24] );
-
- // Amplitude(s) for diagram number 1182
- VVV1_0( w_fp[8], w_fp[6], w_fp[72], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[8] += amp_sv[0];
- jamp_sv[14] -= amp_sv[0];
- jamp_sv[24] -= amp_sv[0];
- jamp_sv[26] += amp_sv[0];
- jamp_sv[32] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[60] -= amp_sv[0];
- jamp_sv[84] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[101] += amp_sv[0];
- jamp_sv[102] += amp_sv[0];
- jamp_sv[103] -= amp_sv[0];
- jamp_sv[105] -= amp_sv[0];
- jamp_sv[107] += amp_sv[0];
- jamp_sv[112] += amp_sv[0];
- jamp_sv[118] -= amp_sv[0];
- VVV1_0( w_fp[8], w_fp[6], w_fp[60], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[14] -= amp_sv[0];
- jamp_sv[26] += amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[32] += amp_sv[0];
- jamp_sv[36] -= amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[60] -= amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[101] += amp_sv[0];
- jamp_sv[103] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[105] -= amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[109] -= amp_sv[0];
- jamp_sv[112] += amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- VVV1_0( w_fp[8], w_fp[6], w_fp[24], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[30] -= amp_sv[0];
- jamp_sv[36] -= amp_sv[0];
- jamp_sv[38] += amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
- jamp_sv[99] += amp_sv[0];
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- jamp_sv[109] -= amp_sv[0];
- jamp_sv[115] -= amp_sv[0];
- jamp_sv[118] += amp_sv[0];
-
- // *** DIAGRAM 1183 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1183
- // (none)
-
- // Amplitude(s) for diagram number 1183
- VVV1_0( w_fp[1], w_fp[8], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[9] -= amp_sv[0];
- jamp_sv[15] += amp_sv[0];
- jamp_sv[24] += amp_sv[0];
- jamp_sv[26] -= amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[38] += amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[43] += amp_sv[0];
- jamp_sv[45] += amp_sv[0];
- jamp_sv[47] -= amp_sv[0];
- jamp_sv[61] += amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[99] += amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
- jamp_sv[112] -= amp_sv[0];
- jamp_sv[118] += amp_sv[0];
- VVV1_0( w_fp[1], w_fp[8], w_fp[68], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[15] += amp_sv[0];
- jamp_sv[26] -= amp_sv[0];
- jamp_sv[30] += amp_sv[0];
- jamp_sv[32] -= amp_sv[0];
- jamp_sv[36] += amp_sv[0];
- jamp_sv[43] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[45] += amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[61] += amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[101] -= amp_sv[0];
- jamp_sv[109] += amp_sv[0];
- jamp_sv[112] -= amp_sv[0];
- jamp_sv[115] += amp_sv[0];
- VVV1_0( w_fp[1], w_fp[8], w_fp[29], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[9] += amp_sv[0];
- jamp_sv[24] -= amp_sv[0];
- jamp_sv[30] += amp_sv[0];
- jamp_sv[36] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[47] += amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[109] += amp_sv[0];
- jamp_sv[115] += amp_sv[0];
- jamp_sv[118] -= amp_sv[0];
-
- // *** DIAGRAM 1184 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1184
- // (none)
-
- // Amplitude(s) for diagram number 1184
- FFV1_0( w_fp[3], w_fp[47], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[107] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[118] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[47], w_fp[60], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[103] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[112] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[47], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[102] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[118] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1185 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1185
- // (none)
-
- // Amplitude(s) for diagram number 1185
- FFV1_0( w_fp[16], w_fp[47], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[102] += amp_sv[0];
- jamp_sv[103] -= amp_sv[0];
- jamp_sv[105] -= amp_sv[0];
- jamp_sv[107] += amp_sv[0];
- FFV1_0( w_fp[27], w_fp[47], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[103] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[105] -= amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- FFV1_0( w_fp[98], w_fp[47], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[102] -= amp_sv[0];
- jamp_sv[104] += amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
-
- // *** DIAGRAM 1186 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1186
- // (none)
-
- // Amplitude(s) for diagram number 1186
- FFV1_0( w_fp[41], w_fp[2], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[24] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[41], w_fp[2], w_fp[60], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[26] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[60] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[41], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[24] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[84] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1187 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1187
- // (none)
-
- // Amplitude(s) for diagram number 1187
- FFV1_0( w_fp[41], w_fp[59], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[8] += amp_sv[0];
- jamp_sv[14] -= amp_sv[0];
- jamp_sv[60] -= amp_sv[0];
- jamp_sv[84] += amp_sv[0];
- FFV1_0( w_fp[41], w_fp[71], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[14] -= amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[60] -= amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- FFV1_0( w_fp[41], w_fp[21], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[84] -= amp_sv[0];
-
- // *** DIAGRAM 1188 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1188
- VVVV1P0_1( w_fp[0], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[21] );
- VVVV3P0_1( w_fp[0], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[71] );
- VVVV4P0_1( w_fp[0], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[59] );
- FFV1_2( w_fp[3], w_fp[21], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[24] );
- FFV1_2( w_fp[3], w_fp[71], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[60] );
- FFV1_2( w_fp[3], w_fp[59], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[72] );
-
- // Amplitude(s) for diagram number 1188
- FFV1_0( w_fp[24], w_fp[77], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[36] += amp_sv[0];
- jamp_sv[37] -= amp_sv[0];
- jamp_sv[39] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- FFV1_0( w_fp[60], w_fp[77], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[37] -= amp_sv[0];
- jamp_sv[38] += amp_sv[0];
- jamp_sv[39] -= amp_sv[0];
- jamp_sv[40] += amp_sv[0];
- FFV1_0( w_fp[72], w_fp[77], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[36] -= amp_sv[0];
- jamp_sv[38] += amp_sv[0];
- jamp_sv[40] += amp_sv[0];
- jamp_sv[41] -= amp_sv[0];
-
- // *** DIAGRAM 1189 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1189
- VVV1P0_1( w_fp[21], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[98] );
- VVV1P0_1( w_fp[71], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[27] );
- VVV1P0_1( w_fp[59], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[16] );
-
- // Amplitude(s) for diagram number 1189
- FFV1_0( w_fp[3], w_fp[77], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[77], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[28] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[39] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[77], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[38] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[42] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1190 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1190
- // (none)
-
- // Amplitude(s) for diagram number 1190
- FFV1_0( w_fp[38], w_fp[77], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[25] += amp_sv[0];
- jamp_sv[28] -= amp_sv[0];
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[44] += amp_sv[0];
- FFV1_0( w_fp[38], w_fp[77], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[28] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- FFV1_0( w_fp[38], w_fp[77], w_fp[59], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[25] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
-
- // *** DIAGRAM 1191 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1191
- FFV1_1( w_fp[2], w_fp[21], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[29] );
- FFV1_1( w_fp[2], w_fp[71], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[68] );
- FFV1_1( w_fp[2], w_fp[59], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] );
-
- // Amplitude(s) for diagram number 1191
- FFV1_0( w_fp[52], w_fp[29], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[11] += amp_sv[0];
- jamp_sv[21] -= amp_sv[0];
- jamp_sv[67] -= amp_sv[0];
- jamp_sv[109] += amp_sv[0];
- FFV1_0( w_fp[52], w_fp[68], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[21] -= amp_sv[0];
- jamp_sv[53] += amp_sv[0];
- jamp_sv[67] -= amp_sv[0];
- jamp_sv[99] += amp_sv[0];
- FFV1_0( w_fp[52], w_fp[23], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[11] -= amp_sv[0];
- jamp_sv[53] += amp_sv[0];
- jamp_sv[99] += amp_sv[0];
- jamp_sv[109] -= amp_sv[0];
-
- // *** DIAGRAM 1192 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1192
- // (none)
-
- // Amplitude(s) for diagram number 1192
- FFV1_0( w_fp[52], w_fp[2], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[52], w_fp[2], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[88] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[52], w_fp[2], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[85] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[99] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1193 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1193
- // (none)
-
- // Amplitude(s) for diagram number 1193
- FFV1_0( w_fp[52], w_fp[39], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[75] += amp_sv[0];
- jamp_sv[77] -= amp_sv[0];
- jamp_sv[88] -= amp_sv[0];
- jamp_sv[94] += amp_sv[0];
- FFV1_0( w_fp[52], w_fp[39], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[77] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[88] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- FFV1_0( w_fp[52], w_fp[39], w_fp[59], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[94] -= amp_sv[0];
-
- // *** DIAGRAM 1194 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1194
- // (none)
-
- // Amplitude(s) for diagram number 1194
- FFV1_0( w_fp[3], w_fp[29], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[109] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[68], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[23], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[109] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1195 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1195
- // (none)
-
- // Amplitude(s) for diagram number 1195
- FFV1_0( w_fp[24], w_fp[2], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[36] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[41] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[60], w_fp[2], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[39] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[72], w_fp[2], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[36] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[41] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[78] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1196 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1196
- // (none)
-
- // Amplitude(s) for diagram number 1196
- VVV1_0( w_fp[21], w_fp[66], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[11] += amp_sv[0];
- jamp_sv[20] += amp_sv[0];
- jamp_sv[21] -= amp_sv[0];
- jamp_sv[36] += amp_sv[0];
- jamp_sv[37] -= amp_sv[0];
- jamp_sv[39] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- jamp_sv[66] += amp_sv[0];
- jamp_sv[67] -= amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[79] += amp_sv[0];
- jamp_sv[81] += amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
- jamp_sv[109] += amp_sv[0];
- VVV1_0( w_fp[71], w_fp[66], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[20] += amp_sv[0];
- jamp_sv[21] -= amp_sv[0];
- jamp_sv[37] -= amp_sv[0];
- jamp_sv[38] += amp_sv[0];
- jamp_sv[39] -= amp_sv[0];
- jamp_sv[40] += amp_sv[0];
- jamp_sv[52] -= amp_sv[0];
- jamp_sv[53] += amp_sv[0];
- jamp_sv[66] += amp_sv[0];
- jamp_sv[67] -= amp_sv[0];
- jamp_sv[79] += amp_sv[0];
- jamp_sv[80] -= amp_sv[0];
- jamp_sv[81] += amp_sv[0];
- jamp_sv[82] -= amp_sv[0];
- jamp_sv[98] -= amp_sv[0];
- jamp_sv[99] += amp_sv[0];
- VVV1_0( w_fp[59], w_fp[66], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[10] += amp_sv[0];
- jamp_sv[11] -= amp_sv[0];
- jamp_sv[36] -= amp_sv[0];
- jamp_sv[38] += amp_sv[0];
- jamp_sv[40] += amp_sv[0];
- jamp_sv[41] -= amp_sv[0];
- jamp_sv[52] -= amp_sv[0];
- jamp_sv[53] += amp_sv[0];
- jamp_sv[78] += amp_sv[0];
- jamp_sv[80] -= amp_sv[0];
- jamp_sv[82] -= amp_sv[0];
- jamp_sv[83] += amp_sv[0];
- jamp_sv[98] -= amp_sv[0];
- jamp_sv[99] += amp_sv[0];
- jamp_sv[108] += amp_sv[0];
- jamp_sv[109] -= amp_sv[0];
-
- // *** DIAGRAM 1197 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1197
- // (none)
-
- // Amplitude(s) for diagram number 1197
- VVVV1_0( w_fp[21], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[10] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[25] -= amp_sv[0];
- jamp_sv[28] += amp_sv[0];
- jamp_sv[34] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[77] += amp_sv[0];
- jamp_sv[78] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[83] += amp_sv[0];
- jamp_sv[88] += amp_sv[0];
- jamp_sv[94] -= amp_sv[0];
- jamp_sv[108] += amp_sv[0];
- VVVV3_0( w_fp[21], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[10] += amp_sv[0];
- jamp_sv[11] -= amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[21] += amp_sv[0];
- jamp_sv[36] -= amp_sv[0];
- jamp_sv[37] += amp_sv[0];
- jamp_sv[39] += amp_sv[0];
- jamp_sv[41] -= amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[67] += amp_sv[0];
- jamp_sv[78] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[83] += amp_sv[0];
- jamp_sv[108] += amp_sv[0];
- jamp_sv[109] -= amp_sv[0];
- VVVV4_0( w_fp[21], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[11] -= amp_sv[0];
- jamp_sv[21] += amp_sv[0];
- jamp_sv[25] += amp_sv[0];
- jamp_sv[28] -= amp_sv[0];
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[36] -= amp_sv[0];
- jamp_sv[37] += amp_sv[0];
- jamp_sv[39] += amp_sv[0];
- jamp_sv[41] -= amp_sv[0];
- jamp_sv[44] += amp_sv[0];
- jamp_sv[67] += amp_sv[0];
- jamp_sv[75] += amp_sv[0];
- jamp_sv[77] -= amp_sv[0];
- jamp_sv[88] -= amp_sv[0];
- jamp_sv[94] += amp_sv[0];
- jamp_sv[109] -= amp_sv[0];
- VVVV1_0( w_fp[71], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[28] += amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[34] += amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[77] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[88] += amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- VVVV3_0( w_fp[71], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[21] += amp_sv[0];
- jamp_sv[37] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[39] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[67] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- VVVV4_0( w_fp[71], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[21] += amp_sv[0];
- jamp_sv[28] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[37] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[39] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[67] += amp_sv[0];
- jamp_sv[77] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[88] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- VVVV1_0( w_fp[59], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[25] += amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[44] += amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[75] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[94] += amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
- VVVV3_0( w_fp[59], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[11] += amp_sv[0];
- jamp_sv[36] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
- jamp_sv[109] += amp_sv[0];
- VVVV4_0( w_fp[59], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[11] += amp_sv[0];
- jamp_sv[25] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[36] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[94] -= amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[109] += amp_sv[0];
-
- // *** DIAGRAM 1198 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1198
- VVV1P0_1( w_fp[21], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[66] );
- VVV1P0_1( w_fp[71], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[21] );
- VVV1P0_1( w_fp[59], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[71] );
-
- // Amplitude(s) for diagram number 1198
- VVV1_0( w_fp[8], w_fp[5], w_fp[66], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[10] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[25] -= amp_sv[0];
- jamp_sv[28] += amp_sv[0];
- jamp_sv[34] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[77] += amp_sv[0];
- jamp_sv[78] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[83] += amp_sv[0];
- jamp_sv[88] += amp_sv[0];
- jamp_sv[94] -= amp_sv[0];
- jamp_sv[108] += amp_sv[0];
- VVV1_0( w_fp[8], w_fp[5], w_fp[21], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[28] += amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[34] += amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[66] -= amp_sv[0];
- jamp_sv[77] += amp_sv[0];
- jamp_sv[79] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[81] -= amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[88] += amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- VVV1_0( w_fp[8], w_fp[5], w_fp[71], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[25] += amp_sv[0];
- jamp_sv[31] -= amp_sv[0];
- jamp_sv[42] -= amp_sv[0];
- jamp_sv[44] += amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[75] += amp_sv[0];
- jamp_sv[78] -= amp_sv[0];
- jamp_sv[80] += amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[85] -= amp_sv[0];
- jamp_sv[91] -= amp_sv[0];
- jamp_sv[94] += amp_sv[0];
- jamp_sv[98] += amp_sv[0];
- jamp_sv[108] -= amp_sv[0];
-
- // *** DIAGRAM 1199 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1199
- // (none)
-
- // Amplitude(s) for diagram number 1199
- VVV1_0( w_fp[1], w_fp[8], w_fp[98], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[11] -= amp_sv[0];
- jamp_sv[21] += amp_sv[0];
- jamp_sv[25] += amp_sv[0];
- jamp_sv[28] -= amp_sv[0];
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[36] -= amp_sv[0];
- jamp_sv[37] += amp_sv[0];
- jamp_sv[39] += amp_sv[0];
- jamp_sv[41] -= amp_sv[0];
- jamp_sv[44] += amp_sv[0];
- jamp_sv[67] += amp_sv[0];
- jamp_sv[75] += amp_sv[0];
- jamp_sv[77] -= amp_sv[0];
- jamp_sv[88] -= amp_sv[0];
- jamp_sv[94] += amp_sv[0];
- jamp_sv[109] -= amp_sv[0];
- VVV1_0( w_fp[1], w_fp[8], w_fp[27], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[21] += amp_sv[0];
- jamp_sv[28] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[34] -= amp_sv[0];
- jamp_sv[37] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[39] += amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[67] += amp_sv[0];
- jamp_sv[77] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[88] -= amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- VVV1_0( w_fp[1], w_fp[8], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[11] += amp_sv[0];
- jamp_sv[25] -= amp_sv[0];
- jamp_sv[31] += amp_sv[0];
- jamp_sv[36] += amp_sv[0];
- jamp_sv[38] -= amp_sv[0];
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- jamp_sv[42] += amp_sv[0];
- jamp_sv[44] -= amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[85] += amp_sv[0];
- jamp_sv[91] += amp_sv[0];
- jamp_sv[94] -= amp_sv[0];
- jamp_sv[99] -= amp_sv[0];
- jamp_sv[109] += amp_sv[0];
-
- // *** DIAGRAM 1200 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1200
- // (none)
-
- // Amplitude(s) for diagram number 1200
- FFV1_0( w_fp[3], w_fp[39], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[75] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[78] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[83] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[94] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[39], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[79] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[88] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[39], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[78] -= cxtype(
0, 1 ) * amp_sv[0]; - jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[85] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[94] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1201 OF 1240 *** - - // Wavefunction(s) for diagram number 1201 - // (none) - - // Amplitude(s) for diagram number 1201 - FFV1_0( w_fp[24], w_fp[39], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[78] += amp_sv[0]; - jamp_sv[79] -= amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[83] += amp_sv[0]; - FFV1_0( w_fp[60], w_fp[39], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[79] -= amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - FFV1_0( w_fp[72], w_fp[39], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[78] -= amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - - // *** DIAGRAM 1202 OF 1240 *** - - // Wavefunction(s) for diagram number 1202 - // (none) - - // Amplitude(s) for diagram number 1202 - FFV1_0( w_fp[38], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[25] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[108] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[28] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[66] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[25] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[42] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[108] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1203 OF 1240 *** - - // Wavefunction(s) for diagram number 1203 - // (none) - - // Amplitude(s) for diagram number 1203 - FFV1_0( w_fp[38], w_fp[29], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // 
Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[10] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[108] += amp_sv[0]; - FFV1_0( w_fp[38], w_fp[68], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[20] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[66] -= amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - FFV1_0( w_fp[38], w_fp[23], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[10] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - jamp_sv[108] -= amp_sv[0]; - - // *** DIAGRAM 1204 OF 1240 *** - - // Wavefunction(s) for diagram number 1204 - VVVV1P0_1( w_fp[0], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[23] ); - VVVV3P0_1( w_fp[0], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[68] ); - VVVV4P0_1( w_fp[0], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[29] ); - FFV1_2( w_fp[3], w_fp[23], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[71] ); - FFV1_2( w_fp[3], w_fp[68], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] ); - FFV1_2( w_fp[3], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[66] ); - - // Amplitude(s) for diagram number 1204 - FFV1_0( w_fp[71], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[30] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - FFV1_0( w_fp[21], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[31] -= amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - FFV1_0( w_fp[66], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[30] -= amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - - // *** DIAGRAM 1205 OF 1240 *** - - // Wavefunction(s) for diagram number 1205 - VVV1P0_1( w_fp[23], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[72] ); - VVV1P0_1( w_fp[68], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[60] ); - VVV1P0_1( w_fp[29], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[24] ); - - // Amplitude(s) for diagram number 1205 - FFV1_0( w_fp[3], w_fp[77], w_fp[72], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[77], w_fp[60], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and 
denominators_sv (#473) -#endif - jamp_sv[29] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[33] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[43] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1206 OF 1240 *** - - // Wavefunction(s) for diagram number 1206 - // (none) - - // Amplitude(s) for diagram number 1206 - FFV1_0( w_fp[46], w_fp[77], w_fp[23], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[27] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[46] += amp_sv[0]; - FFV1_0( w_fp[46], w_fp[77], w_fp[68], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[29] -= amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - FFV1_0( w_fp[46], w_fp[77], w_fp[29], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[27] -= amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - - // *** DIAGRAM 1207 OF 1240 *** - - // Wavefunction(s) for diagram number 1207 - FFV1_1( w_fp[2], w_fp[23], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[77] ); - FFV1_1( w_fp[2], w_fp[68], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); - FFV1_1( w_fp[2], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[27] ); - - // Amplitude(s) for diagram number 1207 - FFV1_0( w_fp[52], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[17] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - FFV1_0( w_fp[52], w_fp[16], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[23] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - FFV1_0( w_fp[52], w_fp[27], w_fp[4], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[17] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - jamp_sv[115] -= amp_sv[0]; - - // *** DIAGRAM 1208 OF 1240 *** - - // Wavefunction(s) for diagram number 1208 - // (none) - 
- // Amplitude(s) for diagram number 1208 - FFV1_0( w_fp[52], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[60], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[64] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[52], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1209 OF 1240 *** - - // Wavefunction(s) for diagram number 1209 - // (none) - - // Amplitude(s) for diagram number 1209 - FFV1_0( w_fp[52], w_fp[33], w_fp[23], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[51] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - FFV1_0( w_fp[52], w_fp[33], w_fp[68], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[53] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - FFV1_0( w_fp[52], w_fp[33], w_fp[29], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[51] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - - // *** DIAGRAM 1210 OF 1240 *** - - // Wavefunction(s) for diagram number 1210 - // (none) - - // Amplitude(s) for diagram number 1210 - FFV1_0( w_fp[3], w_fp[77], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0]; - 
jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[16], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[91] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[27], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[115] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1211 OF 1240 *** - - // Wavefunction(s) for diagram number 1211 - // (none) - - // Amplitude(s) for diagram number 1211 - FFV1_0( w_fp[71], w_fp[2], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[30] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[35] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[21], w_fp[2], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[31] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[33] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[66], w_fp[2], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[30] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[35] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1212 OF 1240 *** - - // Wavefunction(s) for diagram number 1212 - // (none) - - // Amplitude(s) for diagram number 1212 - VVV1_0( w_fp[23], w_fp[61], w_fp[8], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - 
jamp_sv[22] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[55] += amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[90] += amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - VVV1_0( w_fp[68], w_fp[61], w_fp[8], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[22] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[31] -= amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[55] += amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[58] -= amp_sv[0]; - jamp_sv[76] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[90] += amp_sv[0]; - jamp_sv[91] -= amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - VVV1_0( w_fp[29], w_fp[61], w_fp[8], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[30] -= amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[54] += amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[58] -= amp_sv[0]; - jamp_sv[59] += amp_sv[0]; - jamp_sv[76] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - jamp_sv[114] += amp_sv[0]; - jamp_sv[115] -= amp_sv[0]; - - // *** DIAGRAM 1213 OF 1240 *** - - // Wavefunction(s) for diagram number 1213 - // (none) - - // Amplitude(s) for diagram number 1213 - VVVV1_0( w_fp[23], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] += amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[54] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[59] += amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[114] += amp_sv[0]; - VVVV3_0( w_fp[23], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[30] -= amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[54] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[59] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[114] += amp_sv[0]; - jamp_sv[115] -= amp_sv[0]; - VVVV4_0( w_fp[23], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[17] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - 
jamp_sv[29] -= amp_sv[0]; - jamp_sv[30] -= amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[46] += amp_sv[0]; - jamp_sv[51] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[115] -= amp_sv[0]; - VVVV1_0( w_fp[68], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[22] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - VVVV3_0( w_fp[68], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[22] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - VVVV4_0( w_fp[68], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[23] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - VVVV1_0( w_fp[29], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[46] += amp_sv[0]; - jamp_sv[51] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - VVVV3_0( w_fp[29], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[58] += 
amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - VVVV4_0( w_fp[29], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[17] += amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - - // *** DIAGRAM 1214 OF 1240 *** - - // Wavefunction(s) for diagram number 1214 - VVV1P0_1( w_fp[23], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[61] ); - VVV1P0_1( w_fp[68], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[23] ); - VVV1P0_1( w_fp[29], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[68] ); - - // Amplitude(s) for diagram number 1214 - VVV1_0( w_fp[8], w_fp[4], w_fp[61], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] += amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[54] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[59] += amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[114] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[23], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[22] -= amp_sv[0]; - jamp_sv[29] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[64] += amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[68], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] -= amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[37] -= amp_sv[0]; - jamp_sv[43] -= amp_sv[0]; - jamp_sv[46] += amp_sv[0]; - jamp_sv[51] += amp_sv[0]; - jamp_sv[54] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[61] -= amp_sv[0]; - jamp_sv[67] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - - // *** DIAGRAM 1215 OF 1240 *** - - // Wavefunction(s) for diagram number 1215 - // (none) - - // Amplitude(s) for diagram number 1215 - VVV1_0( w_fp[1], w_fp[8], w_fp[72], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and 
denominators_sv (#473) -#endif - jamp_sv[17] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[27] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[30] -= amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[46] += amp_sv[0]; - jamp_sv[51] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[70] += amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[115] -= amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[60], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[23] += amp_sv[0]; - jamp_sv[29] -= amp_sv[0]; - jamp_sv[31] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[64] -= amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[91] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - VVV1_0( w_fp[1], w_fp[8], w_fp[24], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[17] += amp_sv[0]; - jamp_sv[27] -= amp_sv[0]; - jamp_sv[30] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[37] += amp_sv[0]; - jamp_sv[43] += amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[61] += amp_sv[0]; - jamp_sv[67] += amp_sv[0]; - jamp_sv[70] -= amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[115] += amp_sv[0]; - - // *** DIAGRAM 1216 OF 1240 *** - - // Wavefunction(s) for diagram number 1216 - // (none) - - // Amplitude(s) for diagram number 1216 - FFV1_0( w_fp[3], w_fp[33], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[51] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[23], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[55] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[64] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[33], w_fp[68], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[54] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[59] -= cxtype( 0, 1 ) * 
amp_sv[0]; - jamp_sv[61] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[67] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[70] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1217 OF 1240 *** - - // Wavefunction(s) for diagram number 1217 - // (none) - - // Amplitude(s) for diagram number 1217 - FFV1_0( w_fp[71], w_fp[33], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[54] += amp_sv[0]; - jamp_sv[55] -= amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[59] += amp_sv[0]; - FFV1_0( w_fp[21], w_fp[33], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[55] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - FFV1_0( w_fp[66], w_fp[33], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[54] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - - // *** DIAGRAM 1218 OF 1240 *** - - // Wavefunction(s) for diagram number 1218 - // (none) - - // Amplitude(s) for diagram number 1218 - FFV1_0( w_fp[46], w_fp[2], w_fp[61], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[27] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[23], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[29] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[90] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[46], w_fp[2], w_fp[68], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[27] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[37] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[43] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[114] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1219 OF 1240 *** - - // Wavefunction(s) for diagram number 1219 - // (none) - - // Amplitude(s) for diagram number 1219 - FFV1_0( w_fp[46], w_fp[77], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] += amp_sv[0]; - 
jamp_sv[22] -= amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[114] += amp_sv[0]; - FFV1_0( w_fp[46], w_fp[16], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[22] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[90] -= amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - FFV1_0( w_fp[46], w_fp[27], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[16] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[114] -= amp_sv[0]; - - // *** DIAGRAM 1220 OF 1240 *** - - // Wavefunction(s) for diagram number 1220 - // (none) - - // Amplitude(s) for diagram number 1220 - VVVV1_0( w_fp[0], w_fp[73], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[2] -= amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[38] += amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[86] -= amp_sv[0]; - jamp_sv[96] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[99] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[73], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[2] -= amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[19] += amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[73], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[18] -= amp_sv[0]; - jamp_sv[19] += amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[62] -= amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[97] -= amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - VVVV1_0( w_fp[0], w_fp[79], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] -= amp_sv[0]; - jamp_sv[6] += amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[12] += amp_sv[0]; - jamp_sv[38] += amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - jamp_sv[99] += amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - 
jamp_sv[113] -= amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[79], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] -= amp_sv[0]; - jamp_sv[6] += amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[12] += amp_sv[0]; - jamp_sv[19] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[79], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[19] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[62] -= amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[97] -= amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - VVVV1_0( w_fp[0], w_fp[80], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= amp_sv[0]; - jamp_sv[6] += amp_sv[0]; - jamp_sv[12] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[80], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= amp_sv[0]; - jamp_sv[6] += amp_sv[0]; - jamp_sv[12] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[18] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[87] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[80], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[18] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[86] -= amp_sv[0]; - jamp_sv[87] += amp_sv[0]; - jamp_sv[96] -= amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - - // *** DIAGRAM 1221 OF 1240 *** - - // Wavefunction(s) for diagram number 1221 - VVV1P0_1( w_fp[0], 
w_fp[73], COUPs[0], 1.0, 0., 0., w_fp[27] ); - VVV1P0_1( w_fp[0], w_fp[79], COUPs[0], 1.0, 0., 0., w_fp[1] ); - VVV1P0_1( w_fp[0], w_fp[80], COUPs[0], 1.0, 0., 0., w_fp[16] ); - - // Amplitude(s) for diagram number 1221 - VVV1_0( w_fp[8], w_fp[6], w_fp[27], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[2] -= amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[32] -= amp_sv[0]; - jamp_sv[38] += amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[86] -= amp_sv[0]; - jamp_sv[96] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[99] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[1], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] -= amp_sv[0]; - jamp_sv[6] += amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[12] += amp_sv[0]; - jamp_sv[38] += amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[62] += amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[97] += amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - jamp_sv[99] += amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - VVV1_0( w_fp[8], w_fp[6], w_fp[16], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= amp_sv[0]; - jamp_sv[6] += amp_sv[0]; - jamp_sv[12] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[32] += amp_sv[0]; - jamp_sv[56] -= amp_sv[0]; - jamp_sv[80] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - jamp_sv[96] += amp_sv[0]; - jamp_sv[98] -= amp_sv[0]; - jamp_sv[100] -= amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - - // *** DIAGRAM 1222 OF 1240 *** - - // Wavefunction(s) for diagram number 1222 - // (none) - - // Amplitude(s) for diagram number 1222 - VVV1_0( w_fp[73], w_fp[6], w_fp[56], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[2] -= amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[14] += amp_sv[0]; - jamp_sv[18] -= amp_sv[0]; - jamp_sv[19] += amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - jamp_sv[23] -= amp_sv[0]; - jamp_sv[33] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[87] -= amp_sv[0]; - jamp_sv[105] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[119] += amp_sv[0]; - VVV1_0( w_fp[79], w_fp[6], w_fp[56], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] -= amp_sv[0]; - jamp_sv[6] += amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[12] += amp_sv[0]; - jamp_sv[19] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[21] += amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[39] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[63] += 
amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - jamp_sv[113] -= amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - VVV1_0( w_fp[80], w_fp[6], w_fp[56], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= amp_sv[0]; - jamp_sv[6] += amp_sv[0]; - jamp_sv[12] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[18] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[23] += amp_sv[0]; - jamp_sv[33] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[87] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - jamp_sv[119] -= amp_sv[0]; - - // *** DIAGRAM 1223 OF 1240 *** - - // Wavefunction(s) for diagram number 1223 - // (none) - - // Amplitude(s) for diagram number 1223 - FFV1_0( w_fp[3], w_fp[47], w_fp[27], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[96] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[97] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[99] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[107] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[113] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[3], w_fp[47], w_fp[16], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[96] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[98] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[100] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[101] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[105] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[111] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[117] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[119] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1224 OF 1240 *** - - // Wavefunction(s) for diagram number 1224 - // (none) - - // Amplitude(s) for diagram number 1224 - FFV1_0( w_fp[3], w_fp[113], w_fp[73], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[96] += amp_sv[0]; - jamp_sv[97] -= amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[101] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[113], w_fp[79], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[97] -= amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - jamp_sv[99] -= amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - FFV1_0( w_fp[3], 
w_fp[113], w_fp[80], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[96] -= amp_sv[0]; - jamp_sv[98] += amp_sv[0]; - jamp_sv[100] += amp_sv[0]; - jamp_sv[101] -= amp_sv[0]; - - // *** DIAGRAM 1225 OF 1240 *** - - // Wavefunction(s) for diagram number 1225 - // (none) - - // Amplitude(s) for diagram number 1225 - FFV1_0( w_fp[41], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[32] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[1], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[38] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[62] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[41], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[32] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[56] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[80] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[86] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1226 OF 1240 *** - - // Wavefunction(s) for diagram number 1226 - // (none) - - // Amplitude(s) for diagram number 1226 - FFV1_0( w_fp[62], w_fp[2], w_fp[73], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[32] += amp_sv[0]; - jamp_sv[38] -= amp_sv[0]; - jamp_sv[62] -= amp_sv[0]; - jamp_sv[86] += amp_sv[0]; - FFV1_0( w_fp[62], w_fp[2], w_fp[79], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[38] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[62] -= amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - FFV1_0( w_fp[62], w_fp[2], w_fp[80], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[32] -= amp_sv[0]; - jamp_sv[56] += amp_sv[0]; - jamp_sv[80] += amp_sv[0]; - jamp_sv[86] -= amp_sv[0]; - - // *** DIAGRAM 1227 OF 1240 *** - - // Wavefunction(s) for diagram number 1227 - // (none) - - // Amplitude(s) for diagram number 1227 - VVVV1_0( w_fp[0], w_fp[57], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); -#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[44] += amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[75] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[57], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[57], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[12] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[73] -= amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - VVVV1_0( w_fp[0], w_fp[81], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[18] += amp_sv[0]; - jamp_sv[44] += amp_sv[0]; - jamp_sv[58] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[75] += amp_sv[0]; - jamp_sv[76] -= amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[87] += amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[81], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[18] += amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[87] += amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[81], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[13] += 
amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[73] -= amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[75] -= amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - VVVV1_0( w_fp[0], w_fp[82], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[18] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[58] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[76] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[87] += amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[82], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[12] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[18] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[87] += amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[82], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[12] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[74] += amp_sv[0]; - jamp_sv[76] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - - // *** DIAGRAM 1228 OF 1240 *** - - // Wavefunction(s) for diagram number 1228 - VVV1P0_1( w_fp[0], w_fp[57], COUPs[0], 1.0, 0., 0., w_fp[62] ); - VVV1P0_1( w_fp[0], w_fp[81], COUPs[0], 1.0, 0., 0., w_fp[80] ); - VVV1P0_1( w_fp[0], w_fp[82], COUPs[0], 1.0, 0., 0., w_fp[79] ); - - // Amplitude(s) for diagram number 1228 - VVV1_0( w_fp[8], w_fp[5], w_fp[62], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[34] -= amp_sv[0]; - jamp_sv[44] += amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[72] -= amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[75] += amp_sv[0]; - jamp_sv[77] -= amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[80], COUPs[0], 1.0, &_fp[0] ); -#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[18] += amp_sv[0]; - jamp_sv[44] += amp_sv[0]; - jamp_sv[58] -= amp_sv[0]; - jamp_sv[68] += amp_sv[0]; - jamp_sv[73] += amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[75] += amp_sv[0]; - jamp_sv[76] -= amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[87] += amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[5], w_fp[79], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[18] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[34] += amp_sv[0]; - jamp_sv[58] -= amp_sv[0]; - jamp_sv[72] += amp_sv[0]; - jamp_sv[74] -= amp_sv[0]; - jamp_sv[76] -= amp_sv[0]; - jamp_sv[77] += amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[87] += amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[104] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - - // *** DIAGRAM 1229 OF 1240 *** - - // Wavefunction(s) for diagram number 1229 - // (none) - - // Amplitude(s) for diagram number 1229 - VVV1_0( w_fp[57], w_fp[5], w_fp[56], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[12] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[17] -= amp_sv[0]; - jamp_sv[20] += amp_sv[0]; - jamp_sv[35] -= amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[81] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[95] += amp_sv[0]; - jamp_sv[111] -= amp_sv[0]; - VVV1_0( w_fp[81], w_fp[5], w_fp[56], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[15] += amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[18] += amp_sv[0]; - jamp_sv[45] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[87] += amp_sv[0]; - jamp_sv[89] -= amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - VVV1_0( w_fp[82], w_fp[5], w_fp[56], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[12] += amp_sv[0]; - jamp_sv[14] -= amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[17] += amp_sv[0]; - jamp_sv[18] += amp_sv[0]; - jamp_sv[20] -= amp_sv[0]; - jamp_sv[35] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[81] -= amp_sv[0]; - jamp_sv[87] += amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[95] -= amp_sv[0]; - jamp_sv[105] -= amp_sv[0]; - jamp_sv[111] += amp_sv[0]; - - // *** DIAGRAM 1230 OF 1240 *** - - // Wavefunction(s) for diagram number 1230 - // (none) - - // Amplitude(s) for diagram number 1230 - FFV1_0( 
w_fp[3], w_fp[39], w_fp[62], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[72] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[77] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[81] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[95] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[39], w_fp[80], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[73] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[75] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[83] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[89] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[39], w_fp[79], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[72] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[74] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[76] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[77] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[81] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[87] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[93] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[95] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1231 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1231
- // (none)
-
- // Amplitude(s) for diagram number 1231
- FFV1_0( w_fp[3], w_fp[102], w_fp[57], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[72] += amp_sv[0];
- jamp_sv[73] -= amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[77] += amp_sv[0];
- FFV1_0( w_fp[3], w_fp[102], w_fp[81], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[73] -= amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[75] -= amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- FFV1_0( w_fp[3], w_fp[102], w_fp[82], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[72] -= amp_sv[0];
- jamp_sv[74] += amp_sv[0];
- jamp_sv[76] += amp_sv[0];
- jamp_sv[77] -= amp_sv[0];
-
- // *** DIAGRAM 1232 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1232
- // (none)
-
- // Amplitude(s) for diagram number 1232
- FFV1_0( w_fp[38], w_fp[2], w_fp[62], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[34] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[110]
+= cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[80], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[44] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[68] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0]; - FFV1_0( w_fp[38], w_fp[2], w_fp[79], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[34] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[58] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[104] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[110] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 1233 OF 1240 *** - - // Wavefunction(s) for diagram number 1233 - // (none) - - // Amplitude(s) for diagram number 1233 - FFV1_0( w_fp[104], w_fp[2], w_fp[57], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[34] += amp_sv[0]; - jamp_sv[44] -= amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[110] += amp_sv[0]; - FFV1_0( w_fp[104], w_fp[2], w_fp[81], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[44] -= amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[68] -= amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - FFV1_0( w_fp[104], w_fp[2], w_fp[82], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[34] -= amp_sv[0]; - jamp_sv[58] += amp_sv[0]; - jamp_sv[104] += amp_sv[0]; - jamp_sv[110] -= amp_sv[0]; - - // *** DIAGRAM 1234 OF 1240 *** - - // Wavefunction(s) for diagram number 1234 - // (none) - - // Amplitude(s) for diagram number 1234 - VVVV1_0( w_fp[0], w_fp[55], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[46] += amp_sv[0]; - jamp_sv[48] -= amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[51] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[116] -= amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[55], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[6] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[16] 
-= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[55], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[6] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[11] -= amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[41] -= amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[49] -= amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[92] -= amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - jamp_sv[117] -= amp_sv[0]; - VVVV1_0( w_fp[0], w_fp[83], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[5] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[19] += amp_sv[0]; - jamp_sv[46] += amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[51] += amp_sv[0]; - jamp_sv[52] -= amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[82] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[83], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[5] -= amp_sv[0]; - jamp_sv[7] += amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[19] += amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[83], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[7] += amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[9] += amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[46] -= amp_sv[0]; - jamp_sv[47] += amp_sv[0]; - jamp_sv[49] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[51] -= amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[92] -= amp_sv[0]; - jamp_sv[93] += amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - VVVV1_0( w_fp[0], w_fp[84], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[19] += amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[52] -= amp_sv[0]; - jamp_sv[53] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[71] -= 
amp_sv[0]; - jamp_sv[82] -= amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - jamp_sv[116] += amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[84], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] -= amp_sv[0]; - jamp_sv[6] += amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[19] += amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[57] -= amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[71] -= amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[84], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[6] += amp_sv[0]; - jamp_sv[8] -= amp_sv[0]; - jamp_sv[10] -= amp_sv[0]; - jamp_sv[11] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[41] += amp_sv[0]; - jamp_sv[48] -= amp_sv[0]; - jamp_sv[50] += amp_sv[0]; - jamp_sv[52] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[82] += amp_sv[0]; - jamp_sv[83] -= amp_sv[0]; - jamp_sv[106] += amp_sv[0]; - jamp_sv[107] -= amp_sv[0]; - jamp_sv[116] -= amp_sv[0]; - jamp_sv[117] += amp_sv[0]; - - // *** DIAGRAM 1235 OF 1240 *** - - // Wavefunction(s) for diagram number 1235 - VVV1P0_1( w_fp[0], w_fp[55], COUPs[0], 1.0, 0., 0., w_fp[104] ); - VVV1P0_1( w_fp[0], w_fp[83], COUPs[0], 1.0, 0., 0., w_fp[82] ); - VVV1P0_1( w_fp[0], w_fp[84], COUPs[0], 1.0, 0., 0., w_fp[81] ); - - // Amplitude(s) for diagram number 1235 - VVV1_0( w_fp[8], w_fp[4], w_fp[104], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[22] += amp_sv[0]; - jamp_sv[40] -= amp_sv[0]; - jamp_sv[46] += amp_sv[0]; - jamp_sv[48] -= amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[51] += amp_sv[0]; - jamp_sv[53] -= amp_sv[0]; - jamp_sv[57] += amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[71] += amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[116] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[82], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[5] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[16] -= amp_sv[0]; - jamp_sv[19] += amp_sv[0]; - jamp_sv[46] += amp_sv[0]; - jamp_sv[49] += amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[51] += amp_sv[0]; - jamp_sv[52] -= amp_sv[0]; - jamp_sv[59] -= amp_sv[0]; - jamp_sv[63] += amp_sv[0]; - jamp_sv[65] -= amp_sv[0]; - jamp_sv[69] += amp_sv[0]; - jamp_sv[82] -= amp_sv[0]; - jamp_sv[92] += amp_sv[0]; - jamp_sv[106] -= amp_sv[0]; - VVV1_0( w_fp[8], w_fp[4], w_fp[81], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] -= amp_sv[0]; - jamp_sv[13] += amp_sv[0]; - jamp_sv[19] += amp_sv[0]; - jamp_sv[22] -= amp_sv[0]; - jamp_sv[40] += amp_sv[0]; - jamp_sv[48] += amp_sv[0]; - jamp_sv[50] -= amp_sv[0]; - jamp_sv[52] -= amp_sv[0]; 
- jamp_sv[53] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[69] += amp_sv[0];
- jamp_sv[71] -= amp_sv[0];
- jamp_sv[82] -= amp_sv[0];
- jamp_sv[106] -= amp_sv[0];
- jamp_sv[116] += amp_sv[0];
-
- // *** DIAGRAM 1236 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1236
- // (none)
-
- // Amplitude(s) for diagram number 1236
- VVV1_0( w_fp[55], w_fp[4], w_fp[56], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[3] += amp_sv[0];
- jamp_sv[5] -= amp_sv[0];
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
- jamp_sv[9] += amp_sv[0];
- jamp_sv[11] -= amp_sv[0];
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[22] += amp_sv[0];
- jamp_sv[41] -= amp_sv[0];
- jamp_sv[47] += amp_sv[0];
- jamp_sv[57] += amp_sv[0];
- jamp_sv[59] -= amp_sv[0];
- jamp_sv[65] -= amp_sv[0];
- jamp_sv[71] += amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[117] -= amp_sv[0];
- VVV1_0( w_fp[83], w_fp[4], w_fp[56], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[5] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[9] += amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[13] += amp_sv[0];
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[19] += amp_sv[0];
- jamp_sv[47] += amp_sv[0];
- jamp_sv[59] -= amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[65] -= amp_sv[0];
- jamp_sv[69] += amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[93] += amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- VVV1_0( w_fp[84], w_fp[4], w_fp[56], COUPs[0], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[3] -= amp_sv[0];
- jamp_sv[6] += amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[11] += amp_sv[0];
- jamp_sv[13] += amp_sv[0];
- jamp_sv[19] += amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[41] += amp_sv[0];
- jamp_sv[57] -= amp_sv[0];
- jamp_sv[63] += amp_sv[0];
- jamp_sv[69] += amp_sv[0];
- jamp_sv[71] -= amp_sv[0];
- jamp_sv[83] -= amp_sv[0];
- jamp_sv[107] -= amp_sv[0];
- jamp_sv[117] += amp_sv[0];
-
- // *** DIAGRAM 1237 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1237
- // (none)
-
- // Amplitude(s) for diagram number 1237
- FFV1_0( w_fp[3], w_fp[33], w_fp[104], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[48] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[53] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[57] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[71] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[33], w_fp[82], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[49] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[51] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[59] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[65] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[3], w_fp[33], w_fp[81], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[48] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[50] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[52] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[53] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[57] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[63] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[69] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[71] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1238 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1238
- // (none)
-
- // Amplitude(s) for diagram number 1238
- FFV1_0( w_fp[3], w_fp[114], w_fp[55], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[48] += amp_sv[0];
- jamp_sv[49] -= amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[53] += amp_sv[0];
- FFV1_0( w_fp[3], w_fp[114], w_fp[83], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[49] -= amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[51] -= amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- FFV1_0( w_fp[3], w_fp[114], w_fp[84], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[48] -= amp_sv[0];
- jamp_sv[50] += amp_sv[0];
- jamp_sv[52] += amp_sv[0];
- jamp_sv[53] -= amp_sv[0];
-
- // *** DIAGRAM 1239 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1239
- // (none)
-
- // Amplitude(s) for diagram number 1239
- FFV1_0( w_fp[46], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[40] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[116] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[46], w_fp[2], w_fp[82], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[46] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[92] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
- FFV1_0( w_fp[46], w_fp[2], w_fp[81], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[40] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[82] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[106] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[116] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 1240 OF 1240 ***
-
- // Wavefunction(s) for diagram number 1240
- // (none)
-
- // Amplitude(s) for diagram number 1240
- FFV1_0( w_fp[99], w_fp[2], w_fp[55], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[40] += amp_sv[0];
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[92] -= amp_sv[0];
- jamp_sv[116] += amp_sv[0];
- FFV1_0( w_fp[99], w_fp[2], w_fp[83], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[46] -= amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[92] -= amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- FFV1_0( w_fp[99], w_fp[2], w_fp[84], COUPs[1], 1.0, &_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
-#endif
- jamp_sv[40] -= amp_sv[0];
- jamp_sv[82] += amp_sv[0];
- jamp_sv[106] += amp_sv[0];
- jamp_sv[116] -= amp_sv[0];
-
- // *** COLOR CHOICE BELOW ***
- // Store the leading color flows for choice of color
- if( jamp2_sv ) // disable color choice if nullptr
- for( int icol = 0; icol < ncolor; icol++ )
- jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831
-
- // *** COLOR MATRIX BELOW ***
- // (This method used to be called CPPProcess::matrix_1_gg_ttxggg()?)
-
- // The color denominators (initialize all array elements, with ncolor=120)
- // [NB do keep 'static' for these constexpr arrays, see issue #283]
- static constexpr fptype2 denom[ncolor] = { 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324 }; // 1-D array[120]
-
- // The color matrix (initialize all array elements, with ncolor=120)
- // [NB do keep 'static' for these constexpr arrays, see issue #283]
- static constexpr fptype2 cf[ncolor][ncolor] = {
- { 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136 },
- { -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496,
-62, 505, 514, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116 }, - { -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116 }, - { 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44 }, - { 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44 }, - { 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514 }, - { -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116 }, - { 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, -80, 10, -8, 1, -62, -71, 
496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442 }, - { 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44 }, - { -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28 }, - { -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53 }, - { -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62 }, - { 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44 }, - { -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53 }, - { 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 
64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514 }, - { -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62 }, - { 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100 }, - { 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10 }, - { -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28 }, - { -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62 }, - { -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, 
-62, 19, -53, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62 }, - { 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10 }, - { 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10 }, - { -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1 }, - { -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116 }, - { 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442 }, - { 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 
1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442 }, - { -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134 }, - { -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134 }, - { -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505 }, - { 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44 }, - { -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134 }, - { -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, 
-116, -53, -62, -44, 28 }, - { 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224 }, - { 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62 }, - { 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496 }, - { -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53 }, - { 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19 }, - { -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62 }, - { 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, -80, 496, 
-8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496 }, - { -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10 }, - { -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80 }, - { 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62 }, - { 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71 }, - { 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10 }, - { -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 
505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80 }, - { -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1 }, - { 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8 }, - { 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44 }, - { -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134 }, - { -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53 }, - { 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, 
19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62 }, - { 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19 }, - { 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71 }, - { 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514 }, - { -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505 }, - { -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62 }, - { 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496 }, - { 10, -80, 19, -53, 496, -62, 1, -8, 
-71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71 }, - { 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568 }, - { 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100 }, - { -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10 }, - { 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10 }, - { -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80 }, - { 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 
514, -62, 442, -116, -53, -62, -44, 28, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80 }, - { 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640 }, - { -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10 }, - { 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1 }, - { -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1 }, - { -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8 }, - { -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, 
505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8 }, - { -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64 }, - { -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28 }, - { 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62 }, - { -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62 }, - { 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10 }, - { -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, 
-53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10 }, - { -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1 }, - { -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62 }, - { 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71 }, - { 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10 }, - { -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80 }, - { 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1 }, - { -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 
442, -62, -53, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8 }, - { 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10 }, - { -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1 }, - { -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1 }, - { 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8 }, - { -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8 }, - { -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -62, 28, 10, 1, -62, 10, 
-71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64 }, - { 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80 }, - { -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8 }, - { -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8 }, - { 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64 }, - { 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64 }, - { -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 
64, 64, -512, 640, 64, 64, -512, -512, 4096, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512 }, - { 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224 }, - { 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496 }, - { 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496 }, - { -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80 }, - { -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80 }, - { 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8 }, - { 10, 
-80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496 }, - { 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568 }, - { -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80 }, - { 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640 }, - { -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8 }, - { -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64 }, - { -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, 
-80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80 }, - { -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8 }, - { 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8 }, - { -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64 }, - { 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64 }, - { -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512 }, - { 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, 
-62, 10, 28, -62, 1, 10, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640 }, - { -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64 }, - { -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64 }, - { -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512 }, - { -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512 }, - { 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096 } }; // 2-D array[120][120] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; 
jcol < ncolor; jcol++ )
- value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol];
- }
- }
- fptype2 value[ncolor][ncolor];
- };
- static constexpr auto cf2 = TriangularNormalizedColorMatrix();
-#endif
-
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
- if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages
- {
- // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV
- for( int icol = 0; icol < ncolor; icol++ )
- jamp_sv_previous[icol] = jamp_sv[icol];
- MEs_previous = MEs;
- continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages
- }
- fptype_sv deltaMEs_previous = { 0 };
-#endif
-
- // Sum and square the color flows to get the matrix element
- // (compute |M|^2 by squaring |M|, taking into account colours)
- fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes
-
- // Use the property that M is a real matrix (see #475):
- // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB
- // In addition, on C++ use the property that M is symmetric (see #475),
- // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time:
- // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix.
- // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
- fptype2_sv jampR_sv[ncolor] = { 0 };
- fptype2_sv jampI_sv[ncolor] = { 0 };
- for( int icol = 0; icol < ncolor; icol++ )
- {
- jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) );
- jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) );
- }
-#endif
- for( int icol = 0; icol < ncolor; icol++ )
- {
- //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol );
-#ifndef MGONGPUCPP_GPUIMPL
- // === C++ START ===
- // Diagonal terms
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
- fptype2_sv& jampRi_sv = jampR_sv[icol];
- fptype2_sv& jampIi_sv = jampI_sv[icol];
+ // ---------------
+ // --- MOMENTA ---
+ // ---------------
+#ifdef MGONGPUCPP_GPUIMPL
+ // CUDA diagram kernels take input/output buffers with momenta for all events
+ const fptype* momenta = allmomenta;
#else
- fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) );
- fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) );
+ // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector
+ const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 );
#endif
- fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv;
- fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv;
- // Off-diagonal terms
- for( int jcol = icol + 1; jcol < ncolor; jcol++ )
- {
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
- fptype2_sv& jampRj_sv = jampR_sv[jcol];
- fptype2_sv& jampIj_sv = jampI_sv[jcol];
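For reference, the constexpr colour-matrix normalization deleted above can be illustrated standalone: diagonal entries are pre-divided by denom, off-diagonal entries are pre-multiplied by 2 (using the symmetry of cf), so a runtime loop over the upper triangle alone evaluates A.(cf/denom).A. A minimal C++14 sketch, assuming a hypothetical 2-colour matrix with uniform denominators (toy values, not the generated cf/denom above):

#include <cstdio>
constexpr int ncolor = 2;
constexpr double denom[ncolor] = { 3., 3. };
constexpr double cf[ncolor][ncolor] = { { 16., -2. }, { -2., 16. } };
struct TriangularNormalized
{
  constexpr TriangularNormalized()
    : value()
  {
    for( int i = 0; i < ncolor; i++ )
    {
      value[i][i] = cf[i][i] / denom[i]; // diagonal term: cf/denom
      for( int j = i + 1; j < ncolor; j++ )
        value[i][j] = 2 * cf[i][j] / denom[i]; // upper triangle: fold in the factor 2 at compile time
    }
  }
  double value[ncolor][ncolor];
};
static constexpr auto cf2 = TriangularNormalized();
int main()
{
  const double A[ncolor] = { 1., 0.5 }; // e.g. the real parts of the jamps (imaginary parts are summed identically)
  double ama = 0.;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztemp = cf2.value[i][i] * A[i];
    for( int j = i + 1; j < ncolor; j++ ) ztemp += cf2.value[i][j] * A[j];
    ama += A[i] * ztemp; // accumulates the 'AMA' term of AMA + BMB
  }
  printf( "AMA = %f\n", ama ); // prints 6.0, i.e. A.(cf/denom).A
  return 0;
}

+ // -------------
+ // --- JAMPS ---
+ // -------------
+ // (Note: no need to 'reset color flows' i.e.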
zero allJamps: this is done in sigmaKin and sigmaKin_getGoodHel)
+#ifdef MGONGPUCPP_GPUIMPL
+ // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument
+ // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol])
+ fptype* jamps = allJamps;
#else
- fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) );
- fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) );
+ // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument
+ // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol])
+ fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) );
#endif
- ztempR_sv += cf2.value[icol][jcol] * jampRj_sv;
- ztempI_sv += cf2.value[icol][jcol] * jampIj_sv;
- }
- fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
- deltaMEs_previous += fpvsplit0( deltaMEs2 );
- deltaMEs += fpvsplit1( deltaMEs2 );
+
+ // ------------------
+ // --- CHANNELIDS ---
+ // ------------------
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+#ifdef MGONGPUCPP_GPUIMPL
+ // CUDA diagram kernels take input/output buffers with channelIDs for all events
+ const unsigned int* channelIds = allChannelIds;
#else
- deltaMEs += deltaMEs2;
+ // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector
+ const unsigned int* channelIds = &channelId;
#endif
- // === C++ END ===
#else
- // === CUDA START ===
- fptype2_sv ztempR_sv = { 0 };
- fptype2_sv ztempI_sv = { 0 };
- for( int jcol = 0; jcol < ncolor; jcol++ )
- {
- fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] );
- fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] );
- ztempR_sv += cf[icol][jcol] * jampRj_sv;
- ztempI_sv += cf[icol][jcol] * jampIj_sv;
- }
- deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol];
- // === CUDA END ===
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+ const unsigned int* channelIds = nullptr;
#endif
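When MGONGPU_SUPPORTS_MULTICHANNEL is not defined, the same diagramXXX interface is thus kept, but the three multichannel pointers are passed as nullptr and only sanity-checked. A minimal sketch of a kernel body honouring that convention (hypothetical, simplified names and signature, not the code generated by this patch):

#include <cassert>
typedef double fptype; // hypothetical stand-in for the plugin's floating-point type
// Simplified stand-in for one generated diagram kernel: with multichannel disabled,
// channelIds/numerators/denominators arrive as nullptr and are never dereferenced.
inline void diagramN( fptype* jamps, const unsigned int* channelIds, fptype* numerators, fptype* denominators )
{
  if( channelIds == nullptr )
    assert( numerators == nullptr && denominators == nullptr ); // uniform-interface sanity check
  jamps[0] += 1.; // placeholder for this diagram's colour-flow contribution
}
int main()
{
  fptype jamps[1] = { 0. };
  diagramN( jamps, nullptr, nullptr, nullptr ); // multichannel disabled: all three pointers are nullptr
  return 0;
}

- }
- // *** STORE THE RESULTS ***
-
- // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s)
- fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
- MEs_sv += deltaMEs; // fix #435
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
- fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous );
- MEs_sv_previous += deltaMEs_previous;
-#endif
- /*
+ // -------------------------------
+ // --- NUMERATORS/DENOMINATORS ---
+ // -------------------------------
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
#ifdef MGONGPUCPP_GPUIMPL
- if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv );
+ // CUDA diagram kernels take input/output buffers with numerators/denominators for all events
+ fptype* numerators = allNumerators;
+ fptype* denominators = allDenominators;
#else
-#ifdef MGONGPU_CPPSIMD
- if( cNGoodHel > 0 )
- for( int ieppV = 0; ieppV < neppV; ieppV++ )
- printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n",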
ievt0 + ieppV, ihel, MEs_sv[ieppV] );
+ // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector
+ fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+ fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
+#endif
#else
- if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv );
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+ fptype* numerators = nullptr;
+ fptype* denominators = nullptr;
#endif
+
+ // ------------------------
+ // --- FEYNMAN DIAGRAMS ---
+ // ------------------------
+
+ // *** DIAGRAMS 1 TO 1240 ***
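Note the asymmetry in the launches below: only diagram1 also receives momenta and ihel, i.e. it additionally computes the external wavefunctions into wfs for the requested helicity, which diagram2 onwards then reuse. Schematically, in plain C++ with hypothetical helper names (an outline of the call sequence, not the generated code):

#include <cstdio>
// Hypothetical stubs, only to outline the sequence of the diagram kernels:
static void computeExternalWavefunctions( double* wfs, const double* momenta, int ihel )
{
  wfs[0] = momenta[0] + ihel; // pretend wavefunction derived from momenta and helicity
}
static void addDiagramContribution( double* jamps, const double* wfs )
{
  jamps[0] += wfs[0]; // pretend colour-flow contribution reusing the cached wavefunctions
}
int main()
{
  double wfs[1], jamps[1] = { 0. };
  const double momenta[1] = { 0.5 };
  const int ihel = 0;
  computeExternalWavefunctions( wfs, momenta, ihel ); // folded into diagram1, hence its extra arguments
  for( int d = 2; d <= 1240; d++ )
    addDiagramContribution( jamps, wfs ); // diagram2..diagram1240 take no momenta/ihel
  printf( "jamps[0] = %f\n", jamps[0] );
  return 0;
}

+#ifdef MGONGPUCPP_GPUIMPL
+ gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel );
+ gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram8, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram9, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram10, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram11, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram12, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram13, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram14, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram15, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram16, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram17, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram18, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram19, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram20, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds,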
couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram21, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram22, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram23, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram24, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram25, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram26, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram27, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram28, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram29, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram30, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram31, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram32, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram33, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram34, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram35, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram36, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram37, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram38, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram39, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram40, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram41, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram42, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram43, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram44, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram45, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram46, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram47, gpublocks, 
gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram48, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram49, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram50, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram51, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram52, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram53, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram54, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram55, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram56, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram57, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram58, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram59, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram60, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram61, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram62, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram63, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram64, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram65, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram66, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram67, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram68, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram69, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram70, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram71, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram72, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram73, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram74, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram75, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram76, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram77, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram78, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram79, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram80, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram81, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram82, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram83, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram84, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram85, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram86, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram87, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram88, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram89, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram90, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram91, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram92, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram93, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram94, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram95, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram96, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram97, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram98, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram99, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram100, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram101, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram102, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram103, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram104, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram105, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram106, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram107, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram108, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram109, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram110, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram111, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram112, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram113, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram114, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram115, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram116, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram117, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram118, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram119, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram120, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram121, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram122, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram123, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram124, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram125, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram126, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram127, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram128, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram129, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram130, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram131, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram132, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram133, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram134, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram135, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram136, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram137, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram138, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram139, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram140, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram141, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram142, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram143, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram144, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram145, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram146, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram147, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram148, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram149, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram150, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram151, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram152, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram153, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram154, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram155, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram156, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram157, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram158, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram159, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram160, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram161, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram162, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram163, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram164, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram165, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram166, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram167, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram168, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram169, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram170, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram171, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram172, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram173, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram174, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram175, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram176, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram177, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram178, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram179, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram180, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram181, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram182, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram183, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram184, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram185, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram186, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram187, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram188, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram189, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram190, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram191, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram192, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram193, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram194, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram195, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram196, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram197, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram198, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram199, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram200, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram201, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram202, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram203, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram204, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram205, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram206, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram207, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram208, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram209, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram210, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram211, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram212, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram213, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram214, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram215, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram216, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram217, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram218, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram219, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram220, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram221, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram222, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram223, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram224, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram225, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram226, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram227, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram228, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram229, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram230, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram231, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram232, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram233, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram234, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram235, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram236, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram237, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram238, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram239, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram240, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram241, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram242, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram243, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram244, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram245, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram246, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram247, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram248, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram249, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram250, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram251, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram252, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram253, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram254, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram255, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram256, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram257, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram258, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram259, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram260, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram261, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram262, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram263, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram264, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram265, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram266, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram267, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram268, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram269, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram270, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram271, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram272, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram273, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram274, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram275, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram276, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram277, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram278, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram279, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram280, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram281, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram282, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram283, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram284, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram285, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram286, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram287, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram288, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram289, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram290, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram291, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram292, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram293, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram294, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram295, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram296, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram297, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram298, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram299, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram300, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram301, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram302, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram303, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram304, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram305, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram306, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram307, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram308, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram309, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram310, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram311, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram312, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram313, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram314, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram315, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram316, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram317, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram318, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram319, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram320, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram321, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram322, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram323, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram324, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram325, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram326, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram327, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram328, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram329, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram330, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram331, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram332, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram333, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram334, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram335, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram336, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram337, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram338, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram339, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram340, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram341, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram342, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram343, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram344, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram345, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram346, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram347, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram348, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram349, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram350, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram351, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram352, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram353, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram354, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram355, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram356, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram357, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram358, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram359, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram360, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram361, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram362, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram363, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram364, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram365, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram366, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram367, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram368, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram369, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram370, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram371, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram372, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram373, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram374, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram375, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram376, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram377, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram378, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram379, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram380, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram381, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram382, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram383, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram384, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram385, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram386, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram387, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram388, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram389, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram390, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram391, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram392, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram393, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram394, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram395, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram396, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram397, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram398, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram399, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram400, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram401, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram402, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram403, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram404, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram405, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram406, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram407, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram408, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram409, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram410, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram411, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram412, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram413, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram414, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram415, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram416, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram417, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram418, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram419, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram420, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram421, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram422, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram423, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram424, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram425, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram426, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram427, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram428, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram429, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram430, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram431, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram432, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram433, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram434, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram435, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram436, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram437, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram438, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram439, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram440, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram441, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram442, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram443, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram444, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram445, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram446, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram447, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram448, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram449, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram450, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram451, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram452, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram453, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram454, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram455, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram456, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram457, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram458, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram459, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram460, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram461, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram462, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram463, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram464, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram465, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram466, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram467, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram468, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram469, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram470, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram471, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram472, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram473, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram474, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram475, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram476, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram477, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram478, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram479, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram480, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram481, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram482, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram483, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram484, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram485, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram486, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram487, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram488, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram489, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram490, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram491, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram492, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram493, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram494, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram495, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram496, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram497, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram498, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram499, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram500, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram501, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram502, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram503, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram504, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram505, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram506, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram507, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram508, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram509, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram510, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram511, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram512, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram513, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram514, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram515, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram516, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram517, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram518, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram519, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram520, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram521, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram522, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram523, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram524, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram525, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram526, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram527, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram528, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram529, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram530, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram531, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram532, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram533, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram534, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram535, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram536, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram537, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram538, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram539, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram540, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram541, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram542, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram543, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram544, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram545, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram546, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram547, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram548, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram549, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram550, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram551, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram552, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram553, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram554, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram555, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram556, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram557, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram558, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram559, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram560, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram561, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram562, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram563, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram564, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram565, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram566, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram567, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram568, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram569, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram570, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram571, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram572, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram573, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram574, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram575, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram576, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+    gpuLaunchKernelStream( diagram577, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds,
couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram578, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram579, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram580, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram581, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram582, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram583, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram584, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram585, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram586, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram587, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram588, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram589, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram590, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram591, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram592, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram593, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram594, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram595, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram596, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram597, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram598, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram599, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram600, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram601, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram602, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram603, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( 
diagram604, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram605, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram606, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram607, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram608, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram609, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram610, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram611, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram612, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram613, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram614, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram615, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram616, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram617, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram618, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram619, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram620, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram621, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram622, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram623, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram624, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram625, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram626, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram627, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram628, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram629, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram630, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, 
couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram631, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram632, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram633, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram634, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram635, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram636, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram637, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram638, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram639, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram640, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram641, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram642, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram643, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram644, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram645, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram646, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram647, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram648, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram649, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram650, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram651, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram652, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram653, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram654, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram655, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram656, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( 
diagram657, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram658, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram659, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram660, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram661, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram662, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram663, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram664, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram665, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram666, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram667, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram668, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram669, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram670, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram671, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram672, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram673, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram674, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram675, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram676, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram677, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram678, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram679, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram680, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram681, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram682, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram683, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, 
couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram684, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram685, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram686, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram687, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram688, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram689, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram690, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram691, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram692, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram693, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram694, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram695, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram696, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram697, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram698, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram699, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram700, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram701, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram702, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram703, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram704, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram705, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram706, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram707, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram708, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram709, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( 
diagram710, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram711, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram712, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram713, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram714, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram715, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram716, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram717, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram718, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram719, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram720, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram721, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram722, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram723, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram724, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram725, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram726, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram727, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram728, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram729, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram730, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram731, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram732, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram733, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram734, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram735, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram736, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, 
couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram737, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram738, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram739, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram740, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram741, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram742, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram743, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram744, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram745, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram746, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram747, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram748, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram749, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram750, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram751, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram752, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram753, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram754, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram755, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram756, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram757, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram758, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram759, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram760, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram761, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram762, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( 
diagram763, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram764, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram765, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram766, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram767, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram768, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram769, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram770, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram771, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram772, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram773, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram774, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram775, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram776, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram777, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram778, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram779, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram780, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram781, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram782, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram783, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram784, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram785, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram786, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram787, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram788, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram789, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, 
couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram790, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram791, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram792, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram793, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram794, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram795, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram796, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram797, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram798, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram799, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram800, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram801, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram802, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram803, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram804, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram805, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram806, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram807, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram808, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram809, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram810, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram811, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram812, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram813, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram814, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram815, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( 
diagram816, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram817, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram818, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram819, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram820, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram821, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram822, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram823, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram824, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram825, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram826, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram827, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram828, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram829, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram830, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram831, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram832, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram833, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram834, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram835, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram836, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram837, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram838, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram839, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram840, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram841, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram842, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, 
couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram843, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram844, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram845, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram846, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram847, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram848, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram849, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram850, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram851, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram852, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram853, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram854, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram855, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram856, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram857, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram858, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram859, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram860, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram861, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram862, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram863, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram864, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram865, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram866, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram867, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram868, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( 
diagram869, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram870, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram871, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram872, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram873, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram874, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram875, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram876, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram877, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram878, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram879, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram880, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram881, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram882, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram883, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram884, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram885, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram886, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram887, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram888, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram889, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram890, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram891, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram892, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram893, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram894, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram895, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, 
couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram896, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram897, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram898, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram899, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram900, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram901, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram902, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram903, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram904, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram905, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram906, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram907, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram908, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram909, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram910, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram911, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram912, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram913, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram914, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram915, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram916, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram917, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram918, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram919, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram920, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram921, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( 
diagram922, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram923, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram924, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram925, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram926, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram927, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram928, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram929, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram930, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram931, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram932, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram933, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram934, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram935, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram936, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram937, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram938, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram939, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram940, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram941, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram942, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram943, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram944, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram945, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram946, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram947, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram948, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, 
couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram949, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram950, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram951, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram952, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram953, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram954, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram955, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram956, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram957, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram958, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram959, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram960, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram961, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram962, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram963, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram964, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram965, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram966, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram967, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram968, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram969, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram970, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram971, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram972, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram973, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram974, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram975, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram976, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram977, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram978, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram979, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram980, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram981, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram982, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram983, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram984, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram985, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram986, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram987, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram988, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram989, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram990, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram991, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram992, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram993, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram994, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram995, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram996, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram997, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram998, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram999, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1000, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1001, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1002, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1003, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1004, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1005, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1006, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1007, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1008, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1009, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1010, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1011, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1012, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1013, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1014, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1015, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1016, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1017, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1018, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1019, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1020, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1021, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1022, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1023, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1024, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1025, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1026, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1027, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1028, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1029, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1030, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1031, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1032, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1033, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1034, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1035, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1036, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1037, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1038, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1039, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1040, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1041, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1042, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1043, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1044, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1045, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1046, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1047, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1048, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1049, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1050, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1051, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1052, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1053, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1054, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1055, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1056, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1057, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1058, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1059, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1060, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1061, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1062, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1063, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1064, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1065, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1066, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1067, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1068, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1069, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1070, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1071, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1072, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1073, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1074, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1075, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1076, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1077, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1078, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1079, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1080, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1081, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1082, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1083, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1084, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1085, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1086, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1087, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1088, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1089, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1090, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1091, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1092, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1093, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1094, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1095, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1096, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1097, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1098, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1099, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1100, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1101, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1102, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1103, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1104, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1105, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1106, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1107, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1108, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1109, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1110, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1111, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1112, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1113, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1114, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1115, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1116, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1117, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1118, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1119, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1120, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1121, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1122, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1123, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1124, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1125, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1126, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1127, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1128, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1129, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1130, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1131, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1132, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1133, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1134, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1135, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1136, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1137, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1138, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1139, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1140, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1141, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1142, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1143, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1144, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1145, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1146, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1147, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1148, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1149, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1150, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1151, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1152, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1153, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1154, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1155, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1156, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1157, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1158, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1159, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1160, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1161, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1162, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1163, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1164, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1165, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1166, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1167, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1168, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1169, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1170, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1171, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1172, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1173, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1174, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1175, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1176, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1177, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1178, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1179, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1180, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1181, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1182, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1183, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1184, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1185, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1186, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1187, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1188, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1189, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1190, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1191, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1192, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1193, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1194, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1195, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1196, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1197, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1198, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1199, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1200, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1201, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1202, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1203, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1204, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1205, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1206, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1207, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1208, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1209, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1210, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1211, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1212, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1213, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1214, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1215, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1216, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1217, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1218, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1219, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1220, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1221, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1222, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1223, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1224, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1225, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1226, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1227, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1228, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1229, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1230, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1231, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1232, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1233, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1234, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1235, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1236, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1237, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1238, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1239, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+ gpuLaunchKernelStream( diagram1240, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+#else
+ diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel );
+ diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram8( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram9( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram10( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram11( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram12( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram13( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram14( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram15( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram16( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram17( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram18( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram19( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram20( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram21( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram22( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram23( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram24( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram25( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram26( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram27( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram28( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram29( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram30( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram31( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram32( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram33( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram34( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram35( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram36( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram37( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram38( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram39( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram40( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram41( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram42( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram43( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram44( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram45( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram46( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram47( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram48( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram49( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram50( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram51( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram52( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram53( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram54( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram55( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram56( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram57( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram58( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram59( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram60( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram61( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram62( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram63( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram64( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram65( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram66( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram67( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram68( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram69( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram70( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram71( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram72( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram73( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram74( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram75( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram76( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram77( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram78( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram79( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram80( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram81( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram82( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram83( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram84( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram85( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram86( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram87( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram88( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram89( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram90( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram91( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram92( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram93( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram94( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram95( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram96( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram97( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram98( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram99( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram100( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram101( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram102( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram103( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram104( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram105( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram106( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram107( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram108( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram109( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram110( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram111( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram112( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram113( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram114( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram115( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram116( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram117( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram118( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram119( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram120( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram121( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram122( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram123( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram124( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram125( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram126( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram127( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram128( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram129( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram130( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram131( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram132( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram133( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram134( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram135( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram136( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram137( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram138( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram139( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram140( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram141( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram142( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram143( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram144( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram145( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram146( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram147( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram148( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram149( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram150( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram151( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram152( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram153( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram154( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram155( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram156( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram157( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram158( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram159( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram160( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram161( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram162( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram163( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram164( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram165( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram166( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram167( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram168( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram169( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram170( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram171( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram172( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram173( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram174( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram175( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram176( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram177( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram178( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram179( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram180( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram181( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram182( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram183( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram184( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram185( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram186( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram187( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram188( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram189( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram190( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram191( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram192( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram193( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram194( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram195( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram196( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram197( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram198( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram199( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram200( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram201( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram202( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram203( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram204( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram205( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram206( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram207( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram208( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram209( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram210( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram211( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram212( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram213( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram214( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram215( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram216( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram217( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram218( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram219( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram220( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram221( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram222( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram223( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram224( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram225( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram226( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram227( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram228( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram229( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram230( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram231( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram232( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram233( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram234( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram235( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram236( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram237( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram238( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram239( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram240( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram241( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram242( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram243( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram244( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram245( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram246( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram247( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram248( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram249( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram250( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram251( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram252( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram253( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram254( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram255( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram256( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram257( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram258( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram259( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram260( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram261( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram262( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram263( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram264( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram265( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram266( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram267( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram268( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram269( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram270( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram271( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram272( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram273( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram274( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram275( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram276( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram277( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram278( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram279( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram280( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram281( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram282( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram283( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram284( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram285( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram286( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram287( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram288( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram289( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram290( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram291( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram292( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram293( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram294( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram295( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram296( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram297( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram298( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram299( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram300( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram301( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram302( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram303( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram304( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram305( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram306( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram307( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram308( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram309( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram310( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram311( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram312( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram313( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram314( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram315( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram316( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram317( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram318( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram319( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram320( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram321( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram322( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram323( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram324( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram325( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram326( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram327( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram328( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram329( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram330( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram331( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram332( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram333( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram334( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram335( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram336( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram337( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram338( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram339( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram340( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram341( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram342( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram343( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram344( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram345( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram346( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram347( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram348( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram349( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram350( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram351( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram352( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram353( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram354( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram355( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram356( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram357( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram358( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram359( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram360( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram361( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram362( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram363( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram364( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram365( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram366( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram367( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram368( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram369( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram370( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram371( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram372( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram373( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram374( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram375( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram376( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram377( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram378( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram379( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram380( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram381( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram382( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram383( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram384( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram385( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram386( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram387( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram388( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram389( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram390( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram391( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram392( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram393( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram394( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram395( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram396( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram397( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram398( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram399( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram400( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram401( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram402( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram403( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram404( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram405( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram406( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram407( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram408( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram409( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram410( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram411( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram412( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram413( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram414( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram415( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram416( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram417( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram418( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram419( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram420( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram421( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram422( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram423( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram424( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram425( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram426( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram427( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram428( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram429( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram430( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram431( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram432( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram433( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram434( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram435( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram436( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram437( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram438( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram439( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram440( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram441( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram442( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram443( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram444( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram445( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram446( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram447( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram448( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram449( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram450( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram451( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram452( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram453( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram454( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram455( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram456( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram457( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram458( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram459( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram460( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram461( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram462( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram463( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram464( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram465( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram466( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram467( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram468( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram469( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram470( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram471( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram472( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram473( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram474( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram475( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram476( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram477( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram478( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram479( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram480( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram481( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram482( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram483( wfs,
jamps, channelIds, COUPs, numerators, denominators ); + diagram484( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram485( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram486( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram487( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram488( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram489( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram490( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram491( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram492( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram493( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram494( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram495( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram496( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram497( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram498( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram499( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram500( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram501( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram502( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram503( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram504( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram505( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram506( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram507( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram508( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram509( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram510( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram511( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram512( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram513( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram514( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram515( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram516( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram517( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram518( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram519( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram520( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram521( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram522( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram523( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram524( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram525( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram526( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram527( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram528( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram529( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram530( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram531( wfs, jamps, channelIds, COUPs, numerators, denominators 
); + diagram532( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram533( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram534( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram535( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram536( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram537( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram538( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram539( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram540( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram541( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram542( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram543( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram544( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram545( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram546( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram547( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram548( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram549( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram550( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram551( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram552( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram553( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram554( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram555( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram556( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram557( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram558( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram559( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram560( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram561( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram562( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram563( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram564( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram565( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram566( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram567( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram568( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram569( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram570( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram571( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram572( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram573( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram574( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram575( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram576( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram577( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram578( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram579( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram580( wfs, jamps, channelIds, COUPs, 
numerators, denominators ); + diagram581( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram582( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram583( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram584( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram585( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram586( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram587( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram588( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram589( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram590( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram591( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram592( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram593( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram594( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram595( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram596( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram597( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram598( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram599( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram600( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram601( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram602( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram603( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram604( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram605( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram606( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram607( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram608( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram609( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram610( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram611( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram612( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram613( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram614( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram615( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram616( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram617( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram618( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram619( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram620( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram621( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram622( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram623( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram624( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram625( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram626( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram627( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram628( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram629( wfs, 
jamps, channelIds, COUPs, numerators, denominators ); + diagram630( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram631( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram632( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram633( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram634( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram635( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram636( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram637( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram638( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram639( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram640( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram641( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram642( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram643( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram644( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram645( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram646( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram647( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram648( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram649( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram650( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram651( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram652( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram653( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram654( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram655( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram656( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram657( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram658( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram659( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram660( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram661( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram662( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram663( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram664( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram665( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram666( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram667( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram668( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram669( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram670( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram671( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram672( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram673( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram674( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram675( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram676( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram677( wfs, jamps, channelIds, COUPs, numerators, denominators 
); + diagram678( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram679( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram680( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram681( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram682( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram683( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram684( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram685( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram686( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram687( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram688( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram689( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram690( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram691( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram692( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram693( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram694( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram695( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram696( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram697( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram698( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram699( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram700( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram701( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram702( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram703( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram704( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram705( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram706( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram707( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram708( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram709( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram710( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram711( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram712( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram713( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram714( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram715( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram716( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram717( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram718( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram719( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram720( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram721( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram722( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram723( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram724( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram725( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram726( wfs, jamps, channelIds, COUPs, 
numerators, denominators ); + diagram727( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram728( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram729( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram730( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram731( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram732( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram733( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram734( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram735( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram736( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram737( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram738( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram739( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram740( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram741( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram742( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram743( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram744( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram745( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram746( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram747( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram748( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram749( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram750( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram751( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram752( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram753( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram754( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram755( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram756( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram757( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram758( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram759( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram760( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram761( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram762( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram763( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram764( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram765( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram766( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram767( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram768( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram769( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram770( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram771( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram772( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram773( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram774( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram775( wfs, 
jamps, channelIds, COUPs, numerators, denominators ); + diagram776( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram777( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram778( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram779( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram780( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram781( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram782( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram783( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram784( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram785( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram786( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram787( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram788( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram789( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram790( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram791( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram792( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram793( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram794( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram795( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram796( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram797( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram798( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram799( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram800( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram801( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram802( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram803( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram804( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram805( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram806( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram807( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram808( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram809( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram810( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram811( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram812( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram813( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram814( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram815( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram816( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram817( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram818( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram819( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram820( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram821( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram822( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram823( wfs, jamps, channelIds, COUPs, numerators, denominators 
); + diagram824( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram825( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram826( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram827( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram828( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram829( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram830( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram831( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram832( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram833( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram834( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram835( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram836( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram837( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram838( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram839( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram840( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram841( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram842( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram843( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram844( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram845( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram846( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram847( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram848( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram849( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram850( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram851( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram852( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram853( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram854( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram855( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram856( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram857( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram858( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram859( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram860( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram861( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram862( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram863( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram864( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram865( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram866( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram867( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram868( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram869( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram870( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram871( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram872( wfs, jamps, channelIds, COUPs, 
numerators, denominators ); + diagram873( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram874( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram875( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram876( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram877( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram878( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram879( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram880( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram881( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram882( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram883( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram884( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram885( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram886( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram887( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram888( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram889( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram890( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram891( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram892( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram893( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram894( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram895( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram896( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram897( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram898( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram899( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram900( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram901( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram902( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram903( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram904( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram905( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram906( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram907( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram908( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram909( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram910( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram911( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram912( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram913( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram914( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram915( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram916( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram917( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram918( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram919( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram920( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram921( wfs, 
jamps, channelIds, COUPs, numerators, denominators ); + diagram922( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram923( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram924( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram925( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram926( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram927( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram928( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram929( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram930( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram931( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram932( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram933( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram934( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram935( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram936( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram937( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram938( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram939( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram940( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram941( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram942( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram943( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram944( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram945( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram946( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram947( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram948( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram949( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram950( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram951( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram952( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram953( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram954( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram955( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram956( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram957( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram958( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram959( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram960( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram961( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram962( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram963( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram964( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram965( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram966( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram967( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram968( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram969( wfs, jamps, channelIds, COUPs, numerators, denominators 
); + diagram970( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram971( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram972( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram973( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram974( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram975( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram976( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram977( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram978( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram979( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram980( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram981( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram982( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram983( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram984( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram985( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram986( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram987( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram988( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram989( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram990( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram991( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram992( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram993( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram994( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram995( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram996( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram997( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram998( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram999( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1000( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1001( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1002( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1003( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1004( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1005( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1006( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1007( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1008( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1009( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1010( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1011( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1012( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1013( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1014( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1015( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1016( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1017( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1018( wfs, jamps, 
channelIds, COUPs, numerators, denominators ); + diagram1019( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1020( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1021( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1022( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1023( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1024( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1025( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1026( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1027( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1028( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1029( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1030( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1031( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1032( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1033( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1034( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1035( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1036( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1037( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1038( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1039( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1040( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1041( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1042( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1043( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1044( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1045( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1046( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1047( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1048( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1049( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1050( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1051( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1052( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1053( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1054( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1055( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1056( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1057( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1058( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1059( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1060( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1061( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1062( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1063( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1064( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1065( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1066( wfs, jamps, 
channelIds, COUPs, numerators, denominators ); + diagram1067( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1068( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1069( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1070( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1071( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1072( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1073( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1074( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1075( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1076( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1077( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1078( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1079( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1080( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1081( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1082( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1083( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1084( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1085( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1086( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1087( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1088( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1089( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1090( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1091( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1092( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1093( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1094( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1095( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1096( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1097( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1098( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1099( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1100( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1101( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1102( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1103( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1104( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1105( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1106( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1107( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1108( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1109( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1110( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1111( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1112( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1113( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1114( wfs, jamps, 
channelIds, COUPs, numerators, denominators ); + diagram1115( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1116( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1117( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1118( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1119( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1120( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1121( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1122( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1123( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1124( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1125( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1126( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1127( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1128( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1129( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1130( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1131( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1132( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1133( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1134( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1135( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1136( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1137( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1138( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1139( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1140( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1141( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1142( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1143( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1144( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1145( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1146( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1147( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1148( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1149( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1150( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1151( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1152( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1153( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1154( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1155( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1156( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1157( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1158( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1159( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1160( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1161( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1162( wfs, jamps, 
channelIds, COUPs, numerators, denominators ); + diagram1163( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1164( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1165( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1166( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1167( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1168( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1169( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1170( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1171( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1172( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1173( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1174( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1175( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1176( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1177( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1178( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1179( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1180( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1181( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1182( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1183( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1184( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1185( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1186( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1187( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1188( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1189( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1190( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1191( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1192( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1193( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1194( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1195( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1196( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1197( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1198( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1199( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1200( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1201( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1202( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1203( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1204( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1205( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1206( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1207( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1208( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1209( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1210( wfs, jamps, 
channelIds, COUPs, numerators, denominators ); + diagram1211( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1212( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1213( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1214( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1215( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1216( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1217( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1218( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1219( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1220( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1221( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1222( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1223( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1224( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1225( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1226( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1227( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1228( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1229( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1230( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1231( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1232( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1233( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1234( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1235( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1236( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1237( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1238( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1239( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram1240( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -32273,7 +3036,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -32309,6 +3076,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -32352,6 +3123,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO 
); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -32454,26 +3229,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings, bsmIndepParam ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -32481,25 +3256,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) + assert( nevt >= neppV ); + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) + + //-------------------------------------------------------------------------- + + __global__ void + add_and_select_hel( int* allselhel, // output: helicity selection[nevt] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + // ME sum over all good helicities for this event + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using
J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -32644,13 +3623,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 1536 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -32662,17 +3635,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -32698,93 +3674,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR
channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamps for each helicity + // In multichannel mode, also compute the running sums over helicities of the numerators and denominators for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } - } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) Then, in multichannel mode, also compute the running sums over helicities of the squared jamp2s within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR!
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ?
&( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -32826,7 +3772,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -32849,7 +3795,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -32858,25 +3804,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -32886,8 +3838,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -32903,11 +3857,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -33009,14 +3964,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h index 2eb1e066ff..6b99d481e4 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 
2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include <vector> @@ -75,17 +76,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 128; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 1240; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 120; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 121; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 7; // CPPProcess::npar (nexternal was nioparticles) //static const int nwavefuncs = 6; // (?!?!
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 1890; //static const int ncomb = 128; // CPPProcess::ncomb @@ -122,23 +123,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -152,34 +156,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp:
allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/color_sum.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/color_sum.cc new file mode 100644 index 0000000000..c027c38503 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/color_sum.cc @@ -0,0 +1,501 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin.
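[Context for the new color_sum.cc file that follows: the color sum is the standard MG5aMC quadratic form over the QCD partial amplitudes, i.e. for one helicity |M|^2 = sum_{i,j} Re( conj(jamp[i]) * colorMatrix[i][j] * jamp[j] ) / colorDenom[i]; the GPU path can evaluate it either in plain kernels or via cuBLAS/hipBLAS, presumably using the normalized combination colorMatrix[i][j]/colorDenom[i] that createNormalizedColorMatrix() copies to device memory. A minimal standalone sketch of the reduction, using std::complex<double> and a hypothetical 2-color toy matrix instead of the plugin's fptype_sv types and the 120x120 arrays defined below:

    // Toy color-sum reduction: |M|^2 = Re( jamp^dagger * (colorMatrix/colorDenom) * jamp ).
    // All numeric values here are hypothetical illustrations, not taken from this process.
    #include <complex>
    #include <cstdio>
    int main()
    {
      constexpr int ncolor = 2;                                // toy value (this process has ncolor=120)
      const double colorDenom[ncolor] = { 3, 3 };              // hypothetical per-row denominators
      const double colorMatrix[ncolor][ncolor] = { { 16, -2 }, // hypothetical symmetric color matrix
                                                   { -2, 16 } };
      const std::complex<double> jamp[ncolor] = { { 1., 2. }, { 3., -1. } }; // example partial amplitudes
      double me2 = 0; // |M|^2 for this helicity, before the average over helicities/colors
      for( int icol = 0; icol < ncolor; icol++ )
      {
        std::complex<double> ztemp = 0; // ztemp = sum_j colorMatrix[icol][j] * jamp[j]
        for( int jcol = 0; jcol < ncolor; jcol++ )
          ztemp += colorMatrix[icol][jcol] * jamp[jcol];
        me2 += ( std::conj( jamp[icol] ) * ztemp ).real() / colorDenom[icol]; // accumulate Re part
      }
      printf( "|M|^2 = %f\n", me2 );
      return 0;
    }

In the real file this reduction is applied to all events and all good helicities at once, which is why the jamps are laid out as per-helicity super-buffers and why the BLAS path can express the sum as a matrix product.]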
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=120) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324, 324 }; // 1-D array[120] + + // The color matrix (initialize all array elements, with ncolor=120) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136 }, + { -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116 }, + { -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116 }, + { 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 1, -8, 
-71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44 }, + { 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44 }, + { 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514 }, + { -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116 }, + { 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442 }, + { 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44 }, + { -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28 }, + { -8, 64, 568, 496, 640, -80, 64, -512, 
640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53 }, + { -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 28, -44, 442, -116, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62 }, + { 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44 }, + { -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53 }, + { 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514 }, + { -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62 }, + { 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 
496, -62, -62, 496, -134, -44, -224, 28, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100 }, + { 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10 }, + { -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28 }, + { -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62 }, + { -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62 }, + { 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10 }, + { 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, 
1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10 }, + { -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1 }, + { -512, 64, 64, -8, -8, -80, 64, -8, -8, 1, 1, 10, -8, 1, -80, 10, -71, -62, 1, 10, 10, -62, -62, 28, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116 }, + { 64, -512, -8, -80, 64, -8, -8, 64, 1, 10, -8, 1, 1, 10, 10, -62, -62, 28, -8, 1, -80, 10, -71, -62, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442 }, + { 64, -8, -512, 64, -80, -8, -8, 1, -80, 10, -71, -62, 64, -8, -8, 1, 1, 10, 10, 1, -62, 28, 10, -62, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442 }, + { -8, -80, 64, -512, -8, 64, 1, 10, 10, -62, -62, 28, -8, 64, 1, 10, -8, 1, 1, -8, -71, -62, -80, 10, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134 }, + { -8, 64, -80, -8, -512, 64, 1, -8, -71, -62, -80, 10, 10, 1, -62, 28, 10, -62, 64, -8, -8, 1, 1, 10, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, 
-62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134 }, + { -80, -8, -8, 64, 64, -512, 10, 1, -62, 28, 10, -62, 1, -8, -71, -62, -80, 10, -8, 64, 1, 10, -8, 1, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505 }, + { 64, -8, -8, 1, 1, 10, 640, -80, -80, 10, 10, 100, 568, -71, 496, -62, 505, 514, -71, 19, -62, -53, -134, -44, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44 }, + { -8, 64, 1, 10, -8, 1, -80, 640, 10, 100, -80, 10, -71, 19, -62, -53, -134, -44, 568, -71, 496, -62, 505, 514, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134 }, + { -8, 1, -80, 10, -71, -62, -80, 10, 496, -62, 19, -53, 496, -62, -224, 28, -134, -44, 505, -134, -134, 442, 442, -116, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28 }, + { 1, 10, 10, -62, -62, 28, 10, 100, -62, 514, -53, -44, -62, -53, 28, -44, 442, -116, 514, -44, -44, -116, -116, 136, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224 }, + { 1, -8, -71, -62, -80, 10, 10, -80, 19, -53, 496, -62, 505, -134, -134, 442, 442, -116, 496, -62, -224, 28, -134, -44, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62 }, + { 10, 1, -62, 28, 10, -62, 100, 10, -53, -44, -62, 514, 514, -44, -44, -116, -116, 136, -62, -53, 
28, -44, 442, -116, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496 }, + { -8, 1, 64, -8, 10, 1, 568, -71, 496, -62, 505, 514, 640, -80, -80, 10, 10, 100, 19, -71, -134, -44, -62, -53, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53 }, + { 1, 10, -8, 64, 1, -8, -71, 19, -62, -53, -134, -44, -80, 640, 10, 100, -80, 10, -71, 568, 505, 514, 496, -62, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19 }, + { -80, 10, -8, 1, -62, -71, 496, -62, -224, 28, -134, -44, -80, 10, 496, -62, 19, -53, -134, 505, 442, -116, -134, 442, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62 }, + { 10, -62, 1, 10, 28, -62, -62, -53, 28, -44, 442, -116, 10, 100, -62, 514, -53, -44, -44, 514, -116, 136, -44, -116, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496 }, + { -71, -62, 1, -8, 10, -80, 505, -134, -134, 442, 442, -116, 10, -80, 19, -53, 496, -62, -62, 496, -134, -44, -224, 28, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10 }, + { -62, 28, 10, 1, -62, 10, 514, -44, -44, -116, -116, 136, 100, 10, -53, -44, -62, 514, -53, -62, 442, -116, 28, -44, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, 505, -134, -134, 442, 
442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80 }, + { 1, -8, 10, 1, 64, -8, -71, 568, 505, 514, 496, -62, 19, -71, -134, -44, -62, -53, 640, -80, -80, 10, 10, 100, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62 }, + { 10, 1, 1, -8, -8, 64, 19, -71, -134, -44, -62, -53, -71, 568, 505, 514, 496, -62, -80, 640, 10, 100, -80, 10, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71 }, + { 10, -80, -62, -71, -8, 1, -62, 496, -134, -44, -224, 28, -134, 505, 442, -116, -134, 442, -80, 10, 496, -62, 19, -53, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10 }, + { -62, 10, 28, -62, 1, 10, -53, -62, 442, -116, 28, -44, -44, 514, -116, 136, -44, -116, 10, 100, -62, 514, -53, -44, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80 }, + { -62, -71, 10, -80, 1, -8, -134, 505, 442, -116, -134, 442, -62, 496, -134, -44, -224, 28, 10, -80, 19, -53, 496, -62, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1 }, + { 28, -62, -62, 10, 10, 1, -44, 514, -116, 136, -44, -116, -53, -62, 442, -116, 28, -44, 100, 10, -53, -44, -62, 514, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, 
-71, 568, -62, 496, 10, 100, -80, 640, 10, -80, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8 }, + { 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, 640, -80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44 }, + { -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134 }, + { -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53 }, + { 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62 }, + { 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19 }, + { 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71 }, + { 640, 
-80, -80, 10, 10, 100, 64, -8, -8, 1, 1, 10, 496, -62, 568, -71, 514, 505, -62, -53, -71, 19, -44, -134, 64, -8, -8, 1, 1, 10, -512, 64, 64, -8, -8, -80, -80, 10, -8, 1, -62, -71, 10, -62, 1, 10, 28, -62, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514 }, + { -80, 640, 10, 100, -80, 10, -8, 64, 1, 10, -8, 1, -62, -53, -71, 19, -44, -134, 496, -62, 568, -71, 514, 505, -8, 64, 1, 10, -8, 1, 64, -512, -8, -80, 64, -8, 10, -62, 1, 10, 28, -62, -80, 10, -8, 1, -62, -71, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505 }, + { -80, 10, 496, -62, 19, -53, -8, 1, -80, 10, -71, -62, -224, 28, 496, -62, -44, -134, -134, 442, 505, -134, -116, 442, -8, 1, -80, 10, -71, -62, 64, -8, -512, 64, -80, -8, -8, 1, 64, -8, 10, 1, -62, 28, 10, 1, -62, 10, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62 }, + { 10, 100, -62, 514, -53, -44, 1, 10, 10, -62, -62, 28, 28, -44, -62, -53, -116, 442, -44, -116, 514, -44, 136, -116, 1, 10, 10, -62, -62, 28, -8, -80, 64, -512, -8, 64, 1, 10, -8, 64, 1, -8, -71, -62, 1, -8, 10, -80, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496 }, + { 10, -80, 19, -53, 496, -62, 1, -8, -71, -62, -80, 10, -134, 442, 505, -134, -116, 442, -224, 28, 496, -62, -44, -134, 1, -8, -71, -62, -80, 10, -8, 64, -80, -8, -512, 64, -62, 28, 10, 1, -62, 10, -8, 1, 64, -8, 10, 1, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71 }, + { 100, 10, -53, -44, -62, 514, 10, 1, -62, 28, 10, -62, -44, -116, 514, -44, 136, -116, 28, -44, -62, -53, -116, 442, 10, 1, -62, 28, 10, -62, -80, -8, -8, 64, 64, -512, -71, -62, 1, -8, 10, -80, 1, 10, -8, 64, 1, -8, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568 }, + { 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, 
-71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100 }, + { -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10 }, + { 496, -62, -224, 28, -134, -44, -80, 10, -8, 1, -62, -71, 496, -62, -80, 10, -53, 19, 442, -116, -134, 505, 442, -134, 568, -71, 496, -62, 505, 514, -8, 1, 64, -8, 10, 1, -80, 10, 640, -80, 100, 10, -134, -44, 19, -71, -53, -62, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10 }, + { -62, -53, 28, -44, 442, -116, 10, -62, 1, 10, 28, -62, -62, 514, 10, 100, -44, -53, -116, 136, -44, 514, -116, -44, -71, 19, -62, -53, -134, -44, 1, 10, -8, 64, 1, -8, 10, 100, -80, 640, 10, -80, 505, 514, -71, 568, -62, 496, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80 }, + { 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80 }, + { 514, -44, -44, -116, -116, 136, -62, 28, 10, 1, -62, 10, -53, -44, 100, 10, 514, -62, 442, -116, -53, -62, -44, 28, 505, -134, -134, 442, 442, -116, -71, -62, 1, -8, 10, -80, 19, -53, 10, -80, -62, 496, -134, -44, -62, 496, 28, -224, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640 }, + { -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, 
-512, -512, 64, 64, 640, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10 }, + { 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1 }, + { -62, 496, -134, -44, -224, 28, 10, -80, -62, -71, -8, 1, 442, -116, -134, 505, 442, -134, 496, -62, -80, 10, -53, 19, -71, 568, 505, 514, 496, -62, 1, -8, 10, 1, 64, -8, -134, -44, 19, -71, -53, -62, -80, 10, 640, -80, 100, 10, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1 }, + { -53, -62, 442, -116, 28, -44, -62, 10, 28, -62, 1, 10, -116, 136, -44, 514, -116, -44, -62, 514, 10, 100, -44, -53, 19, -71, -134, -44, -62, -53, 10, 1, 1, -8, -8, 64, 505, 514, -71, 568, -62, 496, 10, 100, -80, 640, 10, -80, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8 }, + { -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8 }, + { -44, 514, -116, 136, -44, -116, 28, -62, -62, 10, 10, 1, 442, -116, -53, -62, -44, 28, -53, -44, 100, 10, 514, -62, -134, 505, 442, -116, -134, 442, -62, -71, 10, -80, 1, -8, -134, -44, -62, 496, 28, -224, 19, -53, 10, -80, -62, 496, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64 }, + { -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224, 514, -62, -44, -53, 10, 100, -44, 
28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28 }, + { 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 640, -80, 568, 496, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62 }, + { -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62 }, + { 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 10, -8, 1, -62, 496, 514, 505, 568, -71, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10 }, + { -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10 }, + { -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1 }, + { -80, 10, 640, -80, 100, 10, 496, -62, 568, -71, 514, 505, 64, -8, -8, 1, 1, 10, -53, -62, -44, -134, -71, 19, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62 }, + { 10, 100, -80, 640, 10, -80, -62, -53, -71, 19, -44, -134, -8, 64, 1, 
10, -8, 1, -62, 496, 514, 505, 568, -71, 1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71 }, + { 496, -62, -80, 10, -53, 19, -224, 28, 496, -62, -44, -134, -8, 1, -80, 10, -71, -62, 442, -134, -116, 442, 505, -134, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10 }, + { -62, 514, 10, 100, -44, -53, 28, -44, -62, -53, -116, 442, 1, 10, 10, -62, -62, 28, -116, -44, 136, -116, 514, -44, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80 }, + { 19, -53, 10, -80, -62, 496, -134, 442, 505, -134, -116, 442, 1, -8, -71, -62, -80, 10, 28, -224, -44, -134, 496, -62, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1 }, + { -53, -44, 100, 10, 514, -62, -44, -116, 514, -44, 136, -116, 10, 1, -62, 28, 10, -62, -44, 28, -116, 442, -62, -53, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8 }, + { 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, -8, 1, 64, -8, 10, 1, -80, 10, -8, 1, -62, -71, -512, 64, 64, -8, -8, -80, -62, 10, 28, -62, 1, 10, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10 }, + { -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, 
1, 10, -8, 64, 1, -8, 10, -62, 1, 10, 28, -62, 64, -512, -8, -80, 64, -8, 10, -80, -62, -71, -8, 1, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1 }, + { -224, 28, 496, -62, -44, -134, 496, -62, -80, 10, -53, 19, -80, 10, -8, 1, -62, -71, -116, 442, 442, -134, -134, 505, 496, -62, 568, -71, 514, 505, -80, 10, 640, -80, 100, 10, -8, 1, 64, -8, 10, 1, -44, -134, -53, -62, 19, -71, -80, 10, -8, 1, -62, -71, -8, 1, 64, -8, 10, 1, 64, -8, -512, 64, -80, -8, 28, -62, -62, 10, 10, 1, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1 }, + { 28, -44, -62, -53, -116, 442, -62, 514, 10, 100, -44, -53, 10, -62, 1, 10, 28, -62, 136, -116, -116, -44, -44, 514, -62, -53, -71, 19, -44, -134, 10, 100, -80, 640, 10, -80, 1, 10, -8, 64, 1, -8, 514, 505, -62, 496, -71, 568, 10, -62, 1, 10, 28, -62, 1, 10, -8, 64, 1, -8, -8, -80, 64, -512, -8, 64, -62, -71, 10, -80, 1, -8, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8 }, + { -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, -71, -62, 1, -8, 10, -80, -62, 28, 10, 1, -62, 10, -8, 64, -80, -8, -512, 64, 1, -8, 10, 1, 64, -8, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8 }, + { -44, -116, 514, -44, 136, -116, -53, -44, 100, 10, 514, -62, -62, 28, 10, 1, -62, 10, -116, 442, -44, 28, -53, -62, -134, 442, 505, -134, -116, 442, 19, -53, 10, -80, -62, 496, -71, -62, 1, -8, 10, -80, -44, -134, 28, -224, -62, 496, -62, 28, 10, 1, -62, 10, -71, -62, 1, -8, 10, -80, -80, -8, -8, 64, 64, -512, 10, 1, 1, -8, -8, 64, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64 }, + { 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80 }, + { -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, -80, -8, -8, 64, 64, -512, 496, -80, 
-224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8 }, + { -134, -44, -62, 496, 28, -224, 442, -116, -134, 505, 442, -134, 10, -80, -62, -71, -8, 1, -62, 496, -53, 19, -80, 10, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, -80, 640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8 }, + { 442, -116, -53, -62, -44, 28, -116, 136, -44, 514, -116, -44, -62, 10, 28, -62, 1, 10, 514, -62, -44, -53, 10, 100, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64 }, + { 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, 505, 514, -71, 568, -62, 496, -134, -44, 19, -71, -53, -62, 1, -8, 10, 1, 64, -8, 10, -80, 100, 10, 640, -80, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64 }, + { -116, 136, -44, 514, -116, -44, 442, -116, -53, -62, -44, 28, 28, -62, -62, 10, 10, 1, -44, -53, 514, -62, 100, 10, 442, -116, -134, 505, 442, -134, -134, -44, -62, 496, 28, -224, -62, -71, 10, -80, 1, -8, -53, 19, -62, 496, 10, -80, -134, -44, 19, -71, -53, -62, 505, 514, -71, 568, -62, 496, 10, 1, 1, -8, -8, 64, 100, 10, 10, -80, -80, 640, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512 }, + { 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, 4096, -512, -512, 64, 64, 640, -512, 64, 64, -8, -8, -80, 64, -8, 640, -80, 568, 496, -8, -80, -80, 496, 496, -224 }, + { 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, -512, 4096, 64, 640, -512, 64, 64, -512, -8, -80, 64, -8, -8, -80, -80, 496, 496, -224, 64, -8, 
640, -80, 568, 496 }, + { 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, -512, 64, 4096, -512, 640, 64, 64, -8, 640, -80, 568, 496, -512, 64, 64, -8, -8, -80, -80, -8, 496, -224, -80, 496 }, + { -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, 64, 640, -512, 4096, 64, -512, -8, -80, -80, 496, 496, -224, 64, -512, -8, -80, 64, -8, -8, 64, 568, 496, 640, -80 }, + { -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, 64, -512, 640, 64, 4096, -512, -8, 64, 568, 496, 640, -80, -80, -8, 496, -224, -80, 496, -512, 64, 64, -8, -8, -80 }, + { 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, 640, 64, 64, -512, -512, 4096, -80, -8, 496, -224, -80, 496, -8, 64, 568, 496, 640, -80, 64, -512, -8, -80, 64, -8 }, + { 10, -80, 100, 10, 640, -80, -62, 496, 514, 505, 568, -71, -53, -62, -44, -134, -71, 19, 64, -8, -8, 1, 1, 10, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, -512, 64, 64, -8, -8, -80, 4096, -512, -512, 64, 64, 640, 640, -80, 64, -8, 496, 568, -80, 496, -8, -80, -224, 496 }, + { 100, 10, 10, -80, -80, 640, -53, -62, -44, -134, -71, 19, -62, 496, 514, 505, 568, -71, -8, 64, 1, 10, -8, 1, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, 64, -512, -8, -80, 64, -8, -512, 4096, 64, 640, -512, 64, -80, 496, -8, -80, -224, 496, 640, -80, 64, -8, 496, 568 }, + { -62, 496, -53, 19, -80, 10, 28, -224, -44, -134, 496, -62, 442, -134, -116, 442, 505, -134, -8, 1, -80, 10, -71, -62, 10, -80, -62, 
-71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, 64, -8, 640, -80, 568, 496, -512, 64, 4096, -512, 640, 64, 64, -8, -512, 64, -80, -8, 496, -224, -80, -8, 496, -80 }, + { 514, -62, -44, -53, 10, 100, -44, 28, -116, 442, -62, -53, -116, -44, 136, -116, 514, -44, 1, 10, 10, -62, -62, 28, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, -8, -80, -80, 496, 496, -224, 64, 640, -512, 4096, 64, -512, -8, -80, 64, -512, -8, 64, 568, 496, -8, 64, -80, 640 }, + { -53, 19, -62, 496, 10, -80, 442, -134, -116, 442, 505, -134, 28, -224, -44, -134, 496, -62, 1, -8, -71, -62, -80, 10, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, -8, 64, 568, 496, 640, -80, 64, -512, 640, 64, 4096, -512, 496, -224, -80, -8, 496, -80, 64, -8, -512, 64, -80, -8 }, + { -44, -53, 514, -62, 100, 10, -116, -44, 136, -116, 514, -44, -44, 28, -116, 442, -62, -53, 10, 1, -62, 28, 10, -62, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, -80, -8, 496, -224, -80, 496, 640, 64, 64, -512, -512, 4096, 568, 496, -8, 64, -80, 640, -8, -80, 64, -512, -8, 64 }, + { -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, 64, -8, -512, 64, -80, -8, 640, -80, 64, -8, 496, 568, 4096, -512, -512, 64, 64, 640, 496, -80, -224, 496, -8, -80 }, + { -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, -8, -80, 64, -512, -8, 64, -80, 496, -8, -80, -224, 496, -512, 4096, 64, 640, -512, 64, -80, 640, 496, 568, 64, -8 }, + { 28, -224, -44, -134, 496, -62, -62, 496, -53, 19, -80, 10, -116, 442, 442, -134, -134, 505, -80, 10, -8, 1, -62, -71, -62, 496, 514, 505, 568, -71, 10, -80, 100, 10, 640, -80, -44, -134, -53, -62, 19, -71, -8, 1, 64, -8, 10, 1, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 
10, 10, 1, 64, -8, -512, 64, -80, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 640, -80, 64, -8, 496, 568, 64, -8, -512, 64, -80, -8, -512, 64, 4096, -512, 640, 64, -224, 496, 496, -80, -80, -8 }, + { -44, 28, -116, 442, -62, -53, 514, -62, -44, -53, 10, 100, 136, -116, -116, -44, -44, 514, 10, -62, 1, 10, 28, -62, -53, -62, -44, -134, -71, 19, 100, 10, 10, -80, -80, 640, 514, 505, -62, 496, -71, 568, 1, 10, -8, 64, 1, -8, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -80, 496, -8, -80, -224, 496, -8, -80, 64, -512, -8, 64, 64, 640, -512, 4096, 64, -512, 496, 568, -80, 640, -8, 64 }, + { 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, 568, 496, -8, 64, -80, 640, 496, -224, -80, -8, 496, -80, 64, -512, 640, 64, 4096, -512, -8, 64, -80, -8, -512, 64 }, + { -116, -44, 136, -116, 514, -44, -44, -53, 514, -62, 100, 10, -116, 442, -44, 28, -53, -62, -62, 28, 10, 1, -62, 10, 442, -134, -116, 442, 505, -134, -53, 19, -62, 496, 10, -80, -44, -134, 28, -224, -62, 496, -71, -62, 1, -8, 10, -80, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, 496, -224, -80, -8, 496, -80, 568, 496, -8, 64, -80, 640, 640, 64, 64, -512, -512, 4096, -80, -8, -8, 64, 64, -512 }, + { 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, 1, -8, 10, 1, 64, -8, 10, -80, -62, -71, -8, 1, -62, 10, 28, -62, 1, 10, -512, 64, 64, -8, -8, -80, -8, 64, -80, -8, -512, 64, -80, 640, 496, 568, 64, -8, 496, -80, -224, 496, -8, -80, 4096, -512, -512, 64, 64, 640 }, + { -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, 10, 1, 1, -8, -8, 64, -62, 10, 28, -62, 1, 10, 10, -80, -62, -71, -8, 1, 64, -512, -8, -80, 64, -8, -80, -8, -8, 64, 64, -512, 496, -80, -224, 496, -8, -80, -80, 640, 496, 568, 64, -8, -512, 4096, 64, 640, -512, 64 }, + { -44, -134, 28, -224, -62, 496, -116, 442, 442, -134, -134, 505, -62, 496, -53, 19, -80, 10, 10, -80, -62, -71, -8, 1, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 10, -80, -62, -71, -8, 1, 1, -8, 10, 1, 64, -8, 28, -62, -62, 10, 10, 1, 64, -8, -512, 64, -80, -8, -80, 
640, 496, 568, 64, -8, -8, 64, -80, -8, -512, 64, -224, 496, 496, -80, -80, -8, -512, 64, 4096, -512, 640, 64 }, + { -116, 442, -44, 28, -53, -62, 136, -116, -116, -44, -44, 514, 514, -62, -44, -53, 10, 100, -62, 10, 28, -62, 1, 10, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -62, 10, 28, -62, 1, 10, 10, 1, 1, -8, -8, 64, -62, -71, 10, -80, 1, -8, -8, -80, 64, -512, -8, 64, 496, -80, -224, 496, -8, -80, -80, -8, -8, 64, 64, -512, 496, 568, -80, 640, -8, 64, 64, 640, -512, 4096, 64, -512 }, + { -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512 }, + { 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096 } }; // 2-D array[120][120]
+
+#ifdef MGONGPUCPP_GPUIMPL
+  // The normalized color matrix (divide each column by denom)
+  template<typename T>
+  struct NormalizedColorMatrix
+  {
+    constexpr __host__ __device__ NormalizedColorMatrix()
+      : value()
+    {
+      for( int icol = 0; icol < ncolor; icol++ )
+        for( int jcol = 0; jcol < ncolor; jcol++ )
+          value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol];
+    }
+    T value[ncolor * ncolor];
+  };
+  // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas)
+  static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor];
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  void createNormalizedColorMatrix()
+  {
+    static bool first = true;
+    if( first )
+    {
+      first = false;
+      constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2;
+      gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) );
+    }
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifndef MGONGPUCPP_GPUIMPL
+  void
+  color_sum_cpu( fptype* allMEs,              // output: allMEs[nevt], add |M|^2 for this specific helicity
+                 const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity
+                 const int ievt0 )            // input: first event number in current C++ event page (for CUDA, ievt depends on threadid)
+  {
+    // Pre-compute a constexpr triangular color matrix properly normalized #475
+    struct TriangularNormalizedColorMatrix
+    {
+      // See https://stackoverflow.com/a/34465458
+      __host__ __device__ constexpr TriangularNormalizedColorMatrix()
+        : value()
+      {
+        for( int icol = 0; icol < ncolor; icol++ )
+        {
+          // Diagonal terms
+          value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol];
+          // Off-diagonal terms
+          for( int jcol = icol + 1; jcol < ncolor; jcol++ )
+            value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol];
+        }
+      }
+      fptype2 value[ncolor][ncolor];
+    };
+    static constexpr auto cf2 = TriangularNormalizedColorMatrix();
+    // Use the property that M is a real matrix (see #475):
+    // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA + iAMB - iBMA + BMB = AMA + BMB
+    // (the two imaginary terms cancel because M is also symmetric, i.e. AMB = BMA).
+    // In addition, on C++ use the property that M is symmetric (see #475),
+    // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time:
+    // we gain (not a factor 2...) in speed here as we only loop over the upper triangular part of the matrix.
+    // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
+    fptype_sv deltaMEs = { 0 };
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+    fptype_sv deltaMEs_next = { 0 };
+    // Mixed mode: merge two neppV vectors into one neppV2 vector
+    fptype2_sv jampR_sv[ncolor];
+    fptype2_sv jampI_sv[ncolor];
+    for( int icol = 0; icol < ncolor; icol++ )
+    {
+      jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) );
+      jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) );
+    }
+#else
+    const cxtype_sv* jamp_sv = allJamp_sv;
+#endif
+    // Loop over icol
+    for( int icol = 0; icol < ncolor; icol++ )
+    {
+      // Diagonal terms
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+      fptype2_sv& jampRi_sv = jampR_sv[icol];
+      fptype2_sv& jampIi_sv = jampI_sv[icol];
+#else
+      fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) );
+      fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) );
+#endif
+      fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv;
+      fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv;
+      // Loop over jcol
+      for( int jcol = icol + 1; jcol < ncolor; jcol++ )
+      {
+        // Off-diagonal terms
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+        fptype2_sv& jampRj_sv = jampR_sv[jcol];
+        fptype2_sv& jampIj_sv = jampI_sv[jcol];
+#else
+        fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) );
+        fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) );
+#endif
+        ztempR_sv += cf2.value[icol][jcol] * jampRj_sv;
+        ztempI_sv += cf2.value[icol][jcol] * jampIj_sv;
+      }
+      fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+      deltaMEs += fpvsplit0( deltaMEs2 );
+      deltaMEs_next += fpvsplit1( deltaMEs2 );
+#else
+      deltaMEs += deltaMEs2;
+#endif
+    }
+    // *** STORE THE RESULTS ***
+    using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events
+    fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 );
+    // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
+    fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
+    MEs_sv += deltaMEs; // fix #435
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+    fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV );
+    fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next );
+    MEs_sv_next += deltaMEs_next;
+#endif
+  }
+#endif
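The triangular rewrite in color_sum_cpu above is easy to check in isolation. Below is a minimal standalone C++ sketch, not part of the patch, using a hypothetical 3x3 symmetric matrix M and amplitude vector J (the colorDenom normalization is omitted); it verifies that diagonal terms plus doubled upper-triangular terms reproduce the full quadratic form conj(J)^T M J:

// Standalone check of the triangular trick: for a real symmetric M, the full
// quadratic form conj(J)^T M J is real and equals re(J)^T M re(J) + im(J)^T M im(J),
// and the off-diagonal sum can be folded as 2*M[i][j] with j > i only.
#include <cassert>
#include <cmath>
#include <complex>

int main()
{
  constexpr int n = 3; // hypothetical size, not the generated ncolor = 120
  const double M[n][n] = { { 4, 1, 2 }, { 1, 5, 3 }, { 2, 3, 6 } }; // real symmetric
  const std::complex<double> J[n] = { { 1, 2 }, { -3, 0.5 }, { 2, -1 } };
  // Full quadratic form (its imaginary part cancels because M is symmetric)
  std::complex<double> full = 0;
  for( int i = 0; i < n; i++ )
    for( int j = 0; j < n; j++ )
      full += std::conj( J[i] ) * M[i][j] * J[j];
  // Triangular evaluation on real and imaginary parts separately
  double tri = 0;
  for( int i = 0; i < n; i++ )
  {
    double ztR = M[i][i] * J[i].real();
    double ztI = M[i][i] * J[i].imag();
    for( int j = i + 1; j < n; j++ )
    {
      ztR += 2 * M[i][j] * J[j].real(); // doubled off-diagonal terms
      ztI += 2 * M[i][j] * J[j].imag();
    }
    tri += J[i].real() * ztR + J[i].imag() * ztI;
  }
  assert( std::abs( full.imag() ) < 1e-12 );
  assert( std::abs( full.real() - tri ) < 1e-9 );
  return 0;
}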
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  __global__ void
+  color_sum_kernel( fptype* allMEs,          // output: allMEs[nevt], add |M|^2 for this specific helicity
+                    const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity
+  {
+    using J_ACCESS = DeviceAccessJamp;
+    fptype jampR[ncolor];
+    fptype jampI[ncolor];
+    for( int icol = 0; icol < ncolor; icol++ )
+    {
+      cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol );
+      jampR[icol] = jamp.real();
+      jampI[icol] = jamp.imag();
+    }
+    // Loop over icol
+    fptype deltaMEs = { 0 };
+    for( int icol = 0; icol < ncolor; icol++ )
+    {
+      fptype2 ztempR = { 0 };
+      fptype2 ztempI = { 0 };
+      // Loop over jcol
+      for( int jcol = 0; jcol < ncolor; jcol++ )
+      {
+        fptype2 jampRj = jampR[jcol];
+        fptype2 jampIj = jampI[jcol];
+        ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix
+        ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix
+      }
+      deltaMEs += ztempR * jampR[icol];
+      deltaMEs += ztempI * jampI[icol];
+    }
+    // *** STORE THE RESULTS ***
+    using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events
+    // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
+    E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435
+  }
+#endif
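color_sum_kernel reads the color matrix from the device symbol s_pNormalizedColorMatrix2 that createNormalizedColorMatrix uploads once at startup. The same build-a-constexpr-table-then-upload pattern can be sketched standalone as follows; the names, the 2x2 size, and the numbers are hypothetical, and cudaMemcpyToSymbol/cudaMemcpyFromSymbol stand in for the plugin's gpuMemcpyToSymbol wrapper:

#include <cstdio>
#include <cuda_runtime.h>

constexpr int N = 2;               // hypothetical size (the generated code uses ncolor)
__device__ float d_normMat[N * N]; // device-resident table read directly by kernels

struct NormTable
{
  constexpr NormTable() : value()
  {
    const int mat[N][N] = { { 6, 2 }, { 2, 6 } }; // hypothetical integer matrix
    const int denom[N] = { 3, 3 };                // hypothetical per-row denominators
    for( int i = 0; i < N; i++ )
      for( int j = 0; j < N; j++ )
        value[i * N + j] = float( mat[i][j] ) / float( denom[i] ); // normalize at compile time
  }
  float value[N * N];
};

void uploadNormTable() // mirrors createNormalizedColorMatrix: copy to the symbol once
{
  static bool first = true;
  if( !first ) return;
  first = false;
  static constexpr NormTable t;
  cudaMemcpyToSymbol( d_normMat, t.value, N * N * sizeof( float ) );
}

int main()
{
  uploadNormTable();
  float host[N * N] = {};
  cudaMemcpyFromSymbol( host, d_normMat, N * N * sizeof( float ) ); // read back to verify
  std::printf( "normMat[0][0] = %g (expect 2)\n", host[0] );
  return 0;
}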
However, just in case this is better for performance, the same striding as in compute_jamps and cuBLAS is used here
+ for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ )
+ for( int icol = 0; icol < ncolor; icol++ )
+ allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding
+ }
+#endif
+#endif
+#endif
+
+ //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+#ifndef MGONGPU_HAS_NO_BLAS
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+ __global__ void
+ convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity
+ const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity
+ {
+ const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+ allMEs[ievt] = allMEsFpt2[ievt];
+ }
+#endif
+#endif
+#endif
+
+ //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#ifndef MGONGPU_HAS_NO_BLAS
+ void
+ color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity
+ const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity
+ fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+ gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m)
+#else
+ gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m)
+#endif
+ gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+ const int gpublocks, // input: cuda gpublocks
+ const int gputhreads ) // input: cuda gputhreads
+ {
+ const int nevt = gpublocks * gputhreads;
+
+ // Get the address associated with the normalized color matrix in device memory
+ static fptype2* devNormColMat = nullptr;
+ if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 );
+
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+ // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity
+ fptype2* allZtempBoth = allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer
+ fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer
+ fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer
+ // Convert jamps from double to float
+ gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps );
+ // Real and imaginary components
+ const fptype2* allJampsReal = allJampsFpt2;
+ const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt;
+#else
+ static_assert( std::is_same<fptype2, fptype>::value ); // sanity check
+ // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer
+ fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer
+ fptype2* allMEsFpt2 = allMEs;
+ // Real and imaginary components
+ const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical)
+ const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical)
+#endif
+ // Real and imaginary components
+ fptype2* allZtempReal = allZtempBoth;
+ fptype2* allZtempImag = allZtempBoth + ncolor * nevt;
+
+ // Note the new striding for cuBLAS, which comes from
DeviceAccessJamp:
+ // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1"
+ // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1"
+
+ // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag
+ // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp
+ fptype2 alpha1 = 1;
+ fptype2 beta1 = 0;
+ const int ncolorM = ncolor;
+ const int nevtN = nevt;
+ const int ncolorK = ncolor;
+ checkGpuBlas( gpuBlasTgemm( *pBlasHandle,
+ GPUBLAS_OP_N, // do not transpose ColMat
+ GPUBLAS_OP_T, // transpose JampsV (new1)
+ ncolorM, nevtN, ncolorK,
+ &alpha1,
+ devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK
+ allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1)
+ &beta1,
+ allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN
+ checkGpuBlas( gpuBlasTgemm( *pBlasHandle,
+ GPUBLAS_OP_N, // do not transpose ColMat
+ GPUBLAS_OP_T, // transpose JampsV (new1)
+ ncolorM, nevtN, ncolorK,
+ &alpha1,
+ devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK
+ allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1)
+ &beta1,
+ allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN
+
+ // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] with Ztemp[ncolor][ievt]
+ // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Ztemp dot JampsVector ) + beta * ME
+ // Use gpuBlasTgemmStridedBatched (e.g. cublasSgemmStridedBatched) to perform these batched dot products in one call
+ fptype2 alpha2 = 1;
+ fptype2 beta2 = 1;
+ checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle,
+ GPUBLAS_OP_N, // do not transpose JampsV (new1)
+ GPUBLAS_OP_N, // do not transpose Ztemp
+ 1, 1, ncolor, // result is 1x1 (dot product)
+ &alpha2,
+ allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1)
+ allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column
+ &beta2,
+ allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt)
+ nevt ) ); // there are nevt "batches"
+ checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle,
+ GPUBLAS_OP_N, // do not transpose JampsV (new1)
+ GPUBLAS_OP_N, // do not transpose Ztemp
+ 1, 1, ncolor, // result is 1x1 (dot product)
+ &alpha2,
+ allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1)
+ allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column
+ &beta2,
+ allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e.
for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/color_sum.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/diagram_boilerplate.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/diagrams.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/diagrams.h new file mode 100644 index 0000000000..a35fbac6a1 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/diagrams.h @@ -0,0 +1,51386 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
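+
+// [Editor's note] Illustrative, self-contained sketch (not part of the generated code in this
+// commit): it shows the same two-step BLAS pattern used by color_sum_blas in the color_sum
+// implementation above, reduced to plain float buffers and the explicit single-precision cuBLAS
+// API. It computes s[ievt] = x^T * M * x for nevt vectors x of length nc, stored "new1"-style
+// (element (ievt,icol) at dX[icol*nevt+ievt]). All names below (nc, nevt, dM/dX/dZ/dS,
+// batchedQuadraticForms) are hypothetical and chosen for this example only.
+//
+// #include <cublas_v2.h>
+// #include <cassert>
+// inline void
+// batchedQuadraticForms( cublasHandle_t handle,
+//                        const float* dM, // device: nc x nc normalized color matrix (column-major, symmetric)
+//                        const float* dX, // device: nevt x nc jamp components, dX[icol*nevt+ievt]
+//                        float* dZ,       // device scratch: nc x nevt, receives Z = M * X^T
+//                        float* dS,       // device output: nevt quadratic forms
+//                        const int nc, const int nevt )
+// {
+//   const float one = 1, zero = 0;
+//   // Step 1: Z(nc x nevt) = M(nc x nc) * X^T(nc x nevt); X is transposed inside the gemm
+//   cublasStatus_t status = cublasSgemm( handle, CUBLAS_OP_N, CUBLAS_OP_T, nc, nevt, nc,
+//                                        &one, dM, nc, dX, nevt, &zero, dZ, nc );
+//   assert( status == CUBLAS_STATUS_SUCCESS );
+//   // Step 2: nevt batched 1x1 gemms (i.e. dot products): s[ievt] = x_ievt . z_ievt
+//   // Batch ievt reads x_ievt as a 1 x nc row of X (lda=nevt, batches offset by strideA=1)
+//   // and z_ievt as the ievt-th nc x 1 column of Z (strideB=nc); note that color_sum_blas
+//   // uses beta=1 here instead, so that |M|^2 accumulates over helicities directly into the MEs buffer
+//   status = cublasSgemmStridedBatched( handle, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, nc,
+//                                       &one, dX, nevt, 1, dZ, nc, nc, &zero, dS, 1, 1, nevt );
+//   assert( status == CUBLAS_STATUS_SUCCESS );
+// }
+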
+
+/* clang-format off */
+
+ //--------------------------------------------------------------------------
+
+ // Each diagramN kernel below computes the wavefunction(s) and amplitude(s) of one Feynman
+ // diagram and adds or subtracts the amplitude into the relevant color-flow jamps;
+ // the normalized color matrix is only applied later, in color_sum.
+ __global__ void
+ diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ const fptype* momenta, // input: momenta[npar*4*nevtORneppV]
+ const int ihel ) // input: helicity (0 to ncomb)
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+#ifdef MGONGPUCPP_GPUIMPL
+ using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events
+#else
+ using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events
+#endif
+ // *** DIAGRAM 1 OF 1240 ***
+ // Wavefunction(s) for diagram number 1
+ vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 );
+ vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 );
+ oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 );
+ ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 );
+ vxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 );
+ vxxxxx( momenta, 0., cHel[ihel][5], +1, w_fp[5], 5 );
+ vxxxxx( momenta, 0., cHel[ihel][6], +1, w_fp[6], 6 );
+ VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[7] );
+ FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[8] );
+ VVV1P0_1( w_fp[7], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[9] );
+ VVV1P0_1( w_fp[8], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[10] );
+ // Amplitude(s) for diagram number 1
+ VVV1_0( w_fp[9], w_fp[10], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output
jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 1240 *** + // Wavefunction(s) for diagram number 2 + VVV1P0_1( w_fp[8], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[11] ); + // Amplitude(s) for diagram number 2 + VVV1_0( w_fp[9], w_fp[11], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 1240 *** + // Wavefunction(s) for diagram number 3 + // (none) + // Amplitude(s) for diagram number 3 + VVVV1_0( w_fp[8], w_fp[5], w_fp[6], w_fp[9], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates 
numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + VVVV3_0( w_fp[8], w_fp[5], w_fp[6], w_fp[9], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + VVVV4_0( w_fp[8], w_fp[5], w_fp[6], w_fp[9], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to 
#diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 1240 *** + // Wavefunction(s) for diagram number 4 + VVV1P0_1( w_fp[7], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[12] ); + VVV1P0_1( w_fp[8], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[13] ); + // Amplitude(s) for diagram number 4 + VVV1_0( w_fp[12], w_fp[13], w_fp[6], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 1240 *** + // Wavefunction(s) for diagram number 5 + // (none) + // Amplitude(s) for diagram number 5 + VVV1_0( w_fp[12], w_fp[11], w_fp[4], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( 
jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 6 OF 1240 *** + // Wavefunction(s) for diagram number 6 + // (none) + // Amplitude(s) for diagram number 6 + VVVV1_0( w_fp[8], w_fp[4], w_fp[6], w_fp[12], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + VVVV3_0( w_fp[8], w_fp[4], w_fp[6], w_fp[12], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + VVVV4_0( w_fp[8], w_fp[4], w_fp[6], w_fp[12], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram7( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 7 OF 1240 *** + // Wavefunction(s) for diagram number 7 + VVV1P0_1( w_fp[7], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[14] ); + // Amplitude(s) for diagram number 7 + VVV1_0( w_fp[14], w_fp[13], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram8( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 8 OF 1240 *** + // Wavefunction(s) for diagram number 8 + // (none) + // Amplitude(s) for diagram number 8 + VVV1_0( w_fp[14], w_fp[10], w_fp[4], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram9( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else 
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 9 OF 1240 *** + // Wavefunction(s) for diagram number 9 + // (none) + // Amplitude(s) for diagram number 9 + VVVV1_0( w_fp[8], w_fp[4], w_fp[5], w_fp[14], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + VVVV3_0( w_fp[8], w_fp[4], w_fp[5], w_fp[14], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + VVVV4_0( w_fp[8], w_fp[4], w_fp[5], w_fp[14], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram10( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 10 OF 1240 *** + // Wavefunction(s) for diagram number 10 + VVVV1P0_1( w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[15] ); + VVVV3P0_1( w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[16] ); + VVVV4P0_1( w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[17] ); + // Amplitude(s) for diagram number 10 + VVV1_0( w_fp[8], w_fp[6], w_fp[15], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[6], w_fp[16], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 
) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[6], w_fp[17], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram11( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 11 OF 1240 *** + // Wavefunction(s) for diagram number 11 + VVVV1P0_1( w_fp[7], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[18] ); + VVVV3P0_1( w_fp[7], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[19] ); + VVVV4P0_1( w_fp[7], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[20] ); + // Amplitude(s) for diagram number 11 + VVV1_0( w_fp[8], w_fp[5], w_fp[18], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) 
-= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[5], w_fp[19], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[5], w_fp[20], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram12( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this 
event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 12 OF 1240 *** + // Wavefunction(s) for diagram number 12 + VVVV1P0_1( w_fp[7], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[21] ); + VVVV3P0_1( w_fp[7], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[22] ); + VVVV4P0_1( w_fp[7], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[23] ); + // Amplitude(s) for diagram number 12 + VVV1_0( w_fp[8], w_fp[4], w_fp[21], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[4], w_fp[22], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[4], w_fp[23], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 
60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram13( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 13 OF 1240 *** + // Wavefunction(s) for diagram number 13 + VVV1P0_1( w_fp[4], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[24] ); + // Amplitude(s) for diagram number 13 + VVVV1_0( w_fp[7], w_fp[8], w_fp[24], w_fp[6], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + VVVV3_0( w_fp[7], w_fp[8], w_fp[24], w_fp[6], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + VVVV4_0( w_fp[7], w_fp[8], w_fp[24], w_fp[6], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram14( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 14 OF 1240 *** + // Wavefunction(s) for diagram number 14 + VVV1P0_1( w_fp[7], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[25] ); + // Amplitude(s) for diagram number 14 + VVV1_0( w_fp[24], w_fp[6], w_fp[25], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += 
+ J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram15( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 15 OF 1240 ***
+ // Wavefunction(s) for diagram number 15
+ VVV1P0_1( w_fp[7], w_fp[24], COUPs[0], 1.0, 0., 0., w_fp[26] );
+ // Amplitude(s) for diagram number 15
+ VVV1_0( w_fp[8], w_fp[6], w_fp[26], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram16( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 16 OF 1240 ***
+ // Wavefunction(s) for diagram number 16
+ // (none)
+ // Amplitude(s) for diagram number 16
+ VVV1_0( w_fp[8], w_fp[24], w_fp[14], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram17( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 17 OF 1240 ***
+ // Wavefunction(s) for diagram number 17
+ VVV1P0_1( w_fp[4], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[27] );
+ // Amplitude(s) for diagram number 17
+ VVVV1_0( w_fp[7], w_fp[8], w_fp[27], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+ VVVV3_0( w_fp[7], w_fp[8], w_fp[27], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+ VVVV4_0( w_fp[7], w_fp[8], w_fp[27], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram18( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 18 OF 1240 ***
+ // Wavefunction(s) for diagram number 18
+ // (none)
+ // Amplitude(s) for diagram number 18
+ VVV1_0( w_fp[27], w_fp[5], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram19( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 19 OF 1240 ***
+ // Wavefunction(s) for diagram number 19
+ VVV1P0_1( w_fp[7], w_fp[27], COUPs[0], 1.0, 0., 0., w_fp[28] );
+ // Amplitude(s) for diagram number 19
+ VVV1_0( w_fp[8], w_fp[5], w_fp[28], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram20( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 20 OF 1240 ***
+ // Wavefunction(s) for diagram number 20
+ // (none)
+ // Amplitude(s) for diagram number 20
+ VVV1_0( w_fp[8], w_fp[27], w_fp[12], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+ }
+
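// ------------------------------------------------------------------------------------------------
// [Editorial sketch, not part of the patch] Every jamps update in these kernels goes through
// J_ACCESS::kernelAccessIcol( jamps, icol ), which selects this event's (or event page's) slot for
// color flow icol inside the jamps[ncolor*2*nevtORneppV] buffer declared in each kernel signature.
// A minimal scalar C++ analogue of such an accessor, under an assumed real/imaginary-split SoA
// layout (the name, the explicit ievt/nevt arguments and the exact layout are assumptions; the
// actual accessor class presumably derives the event index internally, e.g. from the GPU thread):
//
//   inline cxtype_ref kernelAccessIcol( fptype* jamps, const int icol, const int ievt, const int nevt )
//   {
//     fptype& re = jamps[( icol * 2 + 0 ) * nevt + ievt]; // real part of jamp[icol] for this event
//     fptype& im = jamps[( icol * 2 + 1 ) * nevt + ievt]; // imaginary part of jamp[icol]
//     return cxtype_ref( re, im );                        // writable complex reference, so += works
//   }
// ------------------------------------------------------------------------------------------------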
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram21( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 21 OF 1240 ***
+ // Wavefunction(s) for diagram number 21
+ VVV1P0_1( w_fp[5], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[29] );
+ // Amplitude(s) for diagram number 21
+ VVVV1_0( w_fp[7], w_fp[8], w_fp[4], w_fp[29], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+ VVVV3_0( w_fp[7], w_fp[8], w_fp[4], w_fp[29], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+ VVVV4_0( w_fp[7], w_fp[8], w_fp[4], w_fp[29], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram22( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 22 OF 1240 ***
+ // Wavefunction(s) for diagram number 22
+ // (none)
+ // Amplitude(s) for diagram number 22
+ VVV1_0( w_fp[4], w_fp[29], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram23( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 23 OF 1240 ***
+ // Wavefunction(s) for diagram number 23
+ // (none)
+ // Amplitude(s) for diagram number 23
+ VVV1_0( w_fp[8], w_fp[29], w_fp[9], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram24( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 24 OF 1240 ***
+ // Wavefunction(s) for diagram number 24
+ VVV1P0_1( w_fp[7], w_fp[29], COUPs[0], 1.0, 0., 0., w_fp[25] );
+ // Amplitude(s) for diagram number 24
+ VVV1_0( w_fp[8], w_fp[4], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram25( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 25 OF 1240 ***
+ // Wavefunction(s) for diagram number 25
+ VVVV1P0_1( w_fp[4], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[30] );
+ VVVV3P0_1( w_fp[4], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[31] );
+ VVVV4P0_1( w_fp[4], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[32] );
+ // Amplitude(s) for diagram number 25
+ VVV1_0( w_fp[7], w_fp[8], w_fp[30], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+ VVV1_0( w_fp[7], w_fp[8], w_fp[31], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+ VVV1_0( w_fp[7], w_fp[8], w_fp[32], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram26( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 26 OF 1240 ***
+ // Wavefunction(s) for diagram number 26
+ FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[33] );
+ FFV1_2( w_fp[3], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[34] );
+ FFV1_1( w_fp[33], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[35] );
+ // Amplitude(s) for diagram number 26
+ FFV1_0( w_fp[34], w_fp[35], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram27( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 27 OF 1240 ***
+ // Wavefunction(s) for diagram number 27
+ FFV1_1( w_fp[33], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[36] );
+ // Amplitude(s) for diagram number 27
+ FFV1_0( w_fp[34], w_fp[36], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram28( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 28 OF 1240 ***
+ // Wavefunction(s) for diagram number 28
+ FFV1P0_3( w_fp[3], w_fp[33], COUPs[1], 1.0, 0., 0., w_fp[37] );
+ // Amplitude(s) for diagram number 28
+ VVV1_0( w_fp[12], w_fp[37], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram29( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 29 OF 1240 ***
+ // Wavefunction(s) for diagram number 29
+ // (none)
+ // Amplitude(s) for diagram number 29
+ FFV1_0( w_fp[3], w_fp[36], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram30( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 30 OF 1240 ***
+ // Wavefunction(s) for diagram number 30
+ // (none)
+ // Amplitude(s) for diagram number 30
+ VVV1_0( w_fp[14], w_fp[37], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0];
+ }
+
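// ------------------------------------------------------------------------------------------------
// [Editorial sketch, not part of the patch] The recurring "#ifdef MGONGPU_SUPPORTS_MULTICHANNEL"
// placeholder comments in these kernels refer to the single-diagram-enhancement update (#473). In
// cudacpp code generated with multichannel support, the update after each amplitude call has
// typically looked like the following (shown here for diagram 30; the decoding of the per-event
// channelId and the numerators_sv/denominators_sv views is assumed to happen in the boilerplate):
//
//   #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
//   if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] );  // this diagram's |amp|^2 only
//   if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); // |amp|^2 summed over all diagrams
//   #endif
// ------------------------------------------------------------------------------------------------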
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram31( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 31 OF 1240 ***
+ // Wavefunction(s) for diagram number 31
+ // (none)
+ // Amplitude(s) for diagram number 31
+ FFV1_0( w_fp[3], w_fp[35], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram32( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 32 OF 1240 ***
+ // Wavefunction(s) for diagram number 32
+ // (none)
+ // Amplitude(s) for diagram number 32
+ FFV1_0( w_fp[3], w_fp[33], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0];
+ FFV1_0( w_fp[3], w_fp[33], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0];
+ FFV1_0( w_fp[3], w_fp[33], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram33( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 33 OF 1240 ***
+ // Wavefunction(s) for diagram number 33
+ FFV1_2( w_fp[3], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[38] );
+ FFV1_1( w_fp[33], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[39] );
+ // Amplitude(s) for diagram number 33
+ FFV1_0( w_fp[38], w_fp[39], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram34( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 34 OF 1240 ***
+ // Wavefunction(s) for diagram number 34
+ FFV1_2( w_fp[38], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[40] );
+ // Amplitude(s) for diagram number 34
+ FFV1_0( w_fp[40], w_fp[33], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram35( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 35 OF 1240 ***
+ // Wavefunction(s) for diagram number 35
+ // (none)
+ // Amplitude(s) for diagram number 35
+ FFV1_0( w_fp[38], w_fp[33], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram36( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 36 OF 1240 ***
+ // Wavefunction(s) for diagram number 36
+ FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[41] );
+ // Amplitude(s) for diagram number 36
+ FFV1_0( w_fp[41], w_fp[39], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram37( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 37 OF 1240 ***
+ // Wavefunction(s) for diagram number 37
+ FFV1_2( w_fp[41], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[42] );
+ // Amplitude(s) for diagram number 37
+ FFV1_0( w_fp[42], w_fp[33], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
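// ------------------------------------------------------------------------------------------------
// [Editorial sketch, not part of the patch] Splitting the 1240 diagrams into one __global__ kernel
// each suggests a host-side driver that launches them in sequence over the same wfs/jamps buffers,
// with the same uniform argument list on both backends. A hypothetical sketch for two of the
// diagrams in this hunk (the gpuLaunchKernel-style macro, the grid variables and the actual call
// site are assumptions; the real driver is elsewhere in this patch):
//
//   #ifdef MGONGPUCPP_GPUIMPL
//   gpuLaunchKernel( diagram36, gpublocks, gputhreads, wfs, jamps, channelIds, couplings, numerators, denominators );
//   gpuLaunchKernel( diagram37, gpublocks, gputhreads, wfs, jamps, channelIds, couplings, numerators, denominators );
//   #else
//   diagram36( wfs, jamps, channelIds, COUPs, numerators, denominators ); // one SIMD event page
//   diagram37( wfs, jamps, channelIds, COUPs, numerators, denominators );
//   #endif
// ------------------------------------------------------------------------------------------------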
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram38( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 38 OF 1240 ***
+ // Wavefunction(s) for diagram number 38
+ // (none)
+ // Amplitude(s) for diagram number 38
+ FFV1_0( w_fp[41], w_fp[33], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram39( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 39 OF 1240 ***
+ // Wavefunction(s) for diagram number 39
+ // (none)
+ // Amplitude(s) for diagram number 39
+ FFV1_0( w_fp[3], w_fp[39], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram40( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+  __global__ void
+  diagram40( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 40 OF 1240 ***
+    // Wavefunction(s) for diagram number 40
+    // (none)
+    // Amplitude(s) for diagram number 40
+    FFV1_0( w_fp[34], w_fp[33], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram41( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 41 OF 1240 ***
+    // Wavefunction(s) for diagram number 41
+    // (none)
+    // Amplitude(s) for diagram number 41
+    FFV1_0( w_fp[3], w_fp[33], w_fp[25], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram42( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 42 OF 1240 ***
+    // Wavefunction(s) for diagram number 42
+    FFV1_1( w_fp[2], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[39] );
+    FFV1_1( w_fp[39], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[43] );
+    // Amplitude(s) for diagram number 42
+    FFV1_0( w_fp[34], w_fp[43], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram43( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 43 OF 1240 ***
+    // Wavefunction(s) for diagram number 43
+    FFV1_1( w_fp[39], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[44] );
+    // Amplitude(s) for diagram number 43
+    FFV1_0( w_fp[34], w_fp[44], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram44( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 44 OF 1240 ***
+    // Wavefunction(s) for diagram number 44
+    FFV1P0_3( w_fp[3], w_fp[39], COUPs[1], 1.0, 0., 0., w_fp[45] );
+    // Amplitude(s) for diagram number 44
+    VVV1_0( w_fp[9], w_fp[45], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram45( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 45 OF 1240 ***
+    // Wavefunction(s) for diagram number 45
+    // (none)
+    // Amplitude(s) for diagram number 45
+    FFV1_0( w_fp[3], w_fp[44], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
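+  // The cxtype( 0, 1 ) factor seen in the jamp updates above is the imaginary unit: each colour-flow
+  // amplitude jamp receives the diagram amplitude multiplied by +/-i or +/-1 according to the colour
+  // decomposition of the vertex, which is why the updates come in paired +=/-= lines on fixed indices.
+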
+  __global__ void
+  diagram46( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 46 OF 1240 ***
+    // Wavefunction(s) for diagram number 46
+    // (none)
+    // Amplitude(s) for diagram number 46
+    VVV1_0( w_fp[14], w_fp[45], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram47( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 47 OF 1240 ***
+    // Wavefunction(s) for diagram number 47
+    // (none)
+    // Amplitude(s) for diagram number 47
+    FFV1_0( w_fp[3], w_fp[43], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram48( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 48 OF 1240 ***
+    // Wavefunction(s) for diagram number 48
+    // (none)
+    // Amplitude(s) for diagram number 48
+    FFV1_0( w_fp[3], w_fp[39], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[39], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[39], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
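+  // In diagram 48 above, a single diagram contributes three FFV1_0 amplitude calls: w_fp[18], w_fp[19]
+  // and w_fp[20] are presumably the three colour structures of the split four-gluon vertex (computed by
+  // an earlier diagram, not shown in this hunk). Each call overwrites amp_sv[0], which is why the jamp
+  // updates for one amplitude must all run before the next FFV1_0 call.
+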
+  __global__ void
+  diagram49( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 49 OF 1240 ***
+    // Wavefunction(s) for diagram number 49
+    FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[46] );
+    FFV1_1( w_fp[39], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[47] );
+    // Amplitude(s) for diagram number 49
+    FFV1_0( w_fp[46], w_fp[47], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram50( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 50 OF 1240 ***
+    // Wavefunction(s) for diagram number 50
+    FFV1_2( w_fp[46], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[48] );
+    // Amplitude(s) for diagram number 50
+    FFV1_0( w_fp[48], w_fp[39], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram51( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 51 OF 1240 ***
+    // Wavefunction(s) for diagram number 51
+    // (none)
+    // Amplitude(s) for diagram number 51
+    FFV1_0( w_fp[46], w_fp[39], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram52( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 52 OF 1240 ***
+    // Wavefunction(s) for diagram number 52
+    // (none)
+    // Amplitude(s) for diagram number 52
+    FFV1_0( w_fp[41], w_fp[47], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram53( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 53 OF 1240 ***
+    // Wavefunction(s) for diagram number 53
+    // (none)
+    // Amplitude(s) for diagram number 53
+    FFV1_0( w_fp[42], w_fp[39], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram54( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 54 OF 1240 ***
+    // Wavefunction(s) for diagram number 54
+    // (none)
+    // Amplitude(s) for diagram number 54
+    FFV1_0( w_fp[41], w_fp[39], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
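+  // Diagrams 51-54 above declare "(none)" under "Wavefunction(s)": they reuse wavefunctions already
+  // stored in the wfs buffer by earlier diagrams and only add a new amplitude, so their cost is a
+  // single FFV1_0 call plus the jamp updates.
+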
+  __global__ void
+  diagram55( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 55 OF 1240 ***
+    // Wavefunction(s) for diagram number 55
+    // (none)
+    // Amplitude(s) for diagram number 55
+    FFV1_0( w_fp[3], w_fp[47], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram56( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 56 OF 1240 ***
+    // Wavefunction(s) for diagram number 56
+    // (none)
+    // Amplitude(s) for diagram number 56
+    FFV1_0( w_fp[34], w_fp[39], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram57( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 57 OF 1240 ***
+    // Wavefunction(s) for diagram number 57
+    // (none)
+    // Amplitude(s) for diagram number 57
+    FFV1_0( w_fp[3], w_fp[39], w_fp[28], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram58( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 58 OF 1240 ***
+    // Wavefunction(s) for diagram number 58
+    FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[47] );
+    FFV1_1( w_fp[47], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[49] );
+    // Amplitude(s) for diagram number 58
+    FFV1_0( w_fp[34], w_fp[49], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram59( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 59 OF 1240 ***
+    // Wavefunction(s) for diagram number 59
+    FFV1_1( w_fp[47], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[50] );
+    // Amplitude(s) for diagram number 59
+    FFV1_0( w_fp[34], w_fp[50], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram60( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 60 OF 1240 ***
+    // Wavefunction(s) for diagram number 60
+    FFV1P0_3( w_fp[3], w_fp[47], COUPs[1], 1.0, 0., 0., w_fp[51] );
+    // Amplitude(s) for diagram number 60
+    VVV1_0( w_fp[9], w_fp[51], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
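+  // Note that w_fp[51], computed by FFV1P0_3 in diagram 60 above, is read again by diagram 62 below
+  // without being recomputed: the wfs buffer passed to every diagramXXX kernel carries wavefunctions
+  // across diagrams, which implies the per-diagram kernels are meant to run in diagram order.
+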
+  __global__ void
+  diagram61( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 61 OF 1240 ***
+    // Wavefunction(s) for diagram number 61
+    // (none)
+    // Amplitude(s) for diagram number 61
+    FFV1_0( w_fp[3], w_fp[50], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram62( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 62 OF 1240 ***
+    // Wavefunction(s) for diagram number 62
+    // (none)
+    // Amplitude(s) for diagram number 62
+    VVV1_0( w_fp[12], w_fp[51], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram63( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 63 OF 1240 ***
+    // Wavefunction(s) for diagram number 63
+    // (none)
+    // Amplitude(s) for diagram number 63
+    FFV1_0( w_fp[3], w_fp[49], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram64( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 64 OF 1240 ***
+    // Wavefunction(s) for diagram number 64
+    // (none)
+    // Amplitude(s) for diagram number 64
+    FFV1_0( w_fp[3], w_fp[47], w_fp[15], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[47], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[47], w_fp[17], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram65( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 65 OF 1240 ***
+    // Wavefunction(s) for diagram number 65
+    FFV1_1( w_fp[47], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[52] );
+    // Amplitude(s) for diagram number 65
+    FFV1_0( w_fp[46], w_fp[52], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram66( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 66 OF 1240 ***
+    // Wavefunction(s) for diagram number 66
+    // (none)
+    // Amplitude(s) for diagram number 66
+    FFV1_0( w_fp[48], w_fp[47], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram67( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 67 OF 1240 ***
+    // Wavefunction(s) for diagram number 67
+    // (none)
+    // Amplitude(s) for diagram number 67
+    FFV1_0( w_fp[46], w_fp[47], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram68( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 68 OF 1240 ***
+    // Wavefunction(s) for diagram number 68
+    // (none)
+    // Amplitude(s) for diagram number 68
+    FFV1_0( w_fp[38], w_fp[52], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram69( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 69 OF 1240 ***
+    // Wavefunction(s) for diagram number 69
+    // (none)
+    // Amplitude(s) for diagram number 69
+    FFV1_0( w_fp[40], w_fp[47], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram70( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 70 OF 1240 ***
+    // Wavefunction(s) for diagram number 70
+    // (none)
+    // Amplitude(s) for diagram number 70
+    FFV1_0( w_fp[38], w_fp[47], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram71( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 71 OF 1240 ***
+    // Wavefunction(s) for diagram number 71
+    // (none)
+    // Amplitude(s) for diagram number 71
+    FFV1_0( w_fp[3], w_fp[52], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram72( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 72 OF 1240 ***
+    // Wavefunction(s) for diagram number 72
+    // (none)
+    // Amplitude(s) for diagram number 72
+    FFV1_0( w_fp[34], w_fp[47], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 73 OF 1240 *** + // Wavefunction(s) for diagram number 73 + // (none) + // Amplitude(s) for diagram number 73 + FFV1_0( w_fp[3], w_fp[47], w_fp[26], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram74( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 74 OF 1240 *** + // Wavefunction(s) for diagram number 74 + FFV1_1( w_fp[2], w_fp[7], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[52] ); + FFV1_2( w_fp[46], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[7] ); + // Amplitude(s) for diagram number 74 + FFV1_0( w_fp[7], w_fp[52], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram75( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** 
COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 75 OF 1240 *** + // Wavefunction(s) for diagram number 75 + FFV1_2( w_fp[46], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[53] ); + // Amplitude(s) for diagram number 75 + FFV1_0( w_fp[53], w_fp[52], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram76( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 76 OF 1240 *** + // Wavefunction(s) for diagram number 76 + FFV1P0_3( w_fp[46], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[54] ); + // Amplitude(s) for diagram number 76 + VVV1_0( w_fp[12], w_fp[54], w_fp[6], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram77( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to 
disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 77 OF 1240 *** + // Wavefunction(s) for diagram number 77 + // (none) + // Amplitude(s) for diagram number 77 + FFV1_0( w_fp[53], w_fp[2], w_fp[12], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram78( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 78 OF 1240 *** + // Wavefunction(s) for diagram number 78 + // (none) + // Amplitude(s) for diagram number 78 + VVV1_0( w_fp[14], w_fp[54], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram79( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const 
unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 79 OF 1240 *** + // Wavefunction(s) for diagram number 79 + // (none) + // Amplitude(s) for diagram number 79 + FFV1_0( w_fp[7], w_fp[2], w_fp[14], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram80( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 80 OF 1240 *** + // Wavefunction(s) for diagram number 80 + // (none) + // Amplitude(s) for diagram number 80 + FFV1_0( w_fp[46], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[46], w_fp[2], w_fp[22], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated 
with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[46], w_fp[2], w_fp[23], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram81( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 81 OF 1240 *** + // Wavefunction(s) for diagram number 81 + // (none) + // Amplitude(s) for diagram number 81 + FFV1_0( w_fp[46], w_fp[52], w_fp[29], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram82( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, 
// input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 82 OF 1240 *** + // Wavefunction(s) for diagram number 82 + // (none) + // Amplitude(s) for diagram number 82 + FFV1_0( w_fp[48], w_fp[2], w_fp[29], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram83( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 83 OF 1240 *** + // Wavefunction(s) for diagram number 83 + // (none) + // Amplitude(s) for diagram number 83 + FFV1_0( w_fp[46], w_fp[2], w_fp[25], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram84( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or 
SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 84 OF 1240 *** + // Wavefunction(s) for diagram number 84 + FFV1_2( w_fp[38], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[25] ); + // Amplitude(s) for diagram number 84 + FFV1_0( w_fp[25], w_fp[52], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram85( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 85 OF 1240 *** + // Wavefunction(s) for diagram number 85 + FFV1_2( w_fp[38], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[48] ); + // Amplitude(s) for diagram number 85 + FFV1_0( w_fp[48], w_fp[52], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram86( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page 
+#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 86 OF 1240 *** + // Wavefunction(s) for diagram number 86 + FFV1P0_3( w_fp[38], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[23] ); + // Amplitude(s) for diagram number 86 + VVV1_0( w_fp[9], w_fp[23], w_fp[6], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram87( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 87 OF 1240 *** + // Wavefunction(s) for diagram number 87 + // (none) + // Amplitude(s) for diagram number 87 + FFV1_0( w_fp[48], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram88( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* 
couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 88 OF 1240 *** + // Wavefunction(s) for diagram number 88 + // (none) + // Amplitude(s) for diagram number 88 + VVV1_0( w_fp[14], w_fp[23], w_fp[4], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram89( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 89 OF 1240 *** + // Wavefunction(s) for diagram number 89 + // (none) + // Amplitude(s) for diagram number 89 + FFV1_0( w_fp[25], w_fp[2], w_fp[14], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram90( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for 
GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 90 OF 1240 *** + // Wavefunction(s) for diagram number 90 + // (none) + // Amplitude(s) for diagram number 90 + FFV1_0( w_fp[38], w_fp[2], w_fp[18], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[38], w_fp[2], w_fp[19], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[38], w_fp[2], w_fp[20], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram91( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output 
jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 91 OF 1240 *** + // Wavefunction(s) for diagram number 91 + // (none) + // Amplitude(s) for diagram number 91 + FFV1_0( w_fp[38], w_fp[52], w_fp[27], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram92( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 92 OF 1240 *** + // Wavefunction(s) for diagram number 92 + // (none) + // Amplitude(s) for diagram number 92 + FFV1_0( w_fp[40], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram93( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent 
couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 93 OF 1240 *** + // Wavefunction(s) for diagram number 93 + // (none) + // Amplitude(s) for diagram number 93 + FFV1_0( w_fp[38], w_fp[2], w_fp[28], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram94( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 94 OF 1240 *** + // Wavefunction(s) for diagram number 94 + FFV1_2( w_fp[41], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[28] ); + // Amplitude(s) for diagram number 94 + FFV1_0( w_fp[28], w_fp[52], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram95( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to 
#diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 95 OF 1240 *** + // Wavefunction(s) for diagram number 95 + FFV1_2( w_fp[41], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[40] ); + // Amplitude(s) for diagram number 95 + FFV1_0( w_fp[40], w_fp[52], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram96( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 96 OF 1240 *** + // Wavefunction(s) for diagram number 96 + FFV1P0_3( w_fp[41], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[20] ); + // Amplitude(s) for diagram number 96 + VVV1_0( w_fp[9], w_fp[20], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram97( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* 
jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 97 OF 1240 *** + // Wavefunction(s) for diagram number 97 + // (none) + // Amplitude(s) for diagram number 97 + FFV1_0( w_fp[40], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram98( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 98 OF 1240 *** + // Wavefunction(s) for diagram number 98 + // (none) + // Amplitude(s) for diagram number 98 + VVV1_0( w_fp[12], w_fp[20], w_fp[4], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- 
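+
+  // Note on the common boilerplate: every diagramXXX kernel above takes the same uniform
+  // argument list and starts by including "diagram_boilerplate.h". Based on the interface
+  // comments, that include is expected to expand to something like the sketch below (the
+  // amp_sv/amp_fp declarations are illustrative assumptions, not the verbatim header):
+  //   #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+  //     assert( channelIds == nullptr && numerators == nullptr && denominators == nullptr ); // sanity check
+  //   #endif
+  //   cxtype_sv amp_sv[1] = {}; // amplitude of one diagram (scalar on GPU, SIMD vector in C++)
+  //   fptype* amp_fp = reinterpret_cast<fptype*>( amp_sv ); // fptype view of amp_sv, passed to the FFV/VVV calls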
+ + __global__ void + diagram99( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 99 OF 1240 *** + // Wavefunction(s) for diagram number 99 + // (none) + // Amplitude(s) for diagram number 99 + FFV1_0( w_fp[28], w_fp[2], w_fp[12], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram100( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 100 OF 1240 *** + // Wavefunction(s) for diagram number 100 + // (none) + // Amplitude(s) for diagram number 100 + FFV1_0( w_fp[41], w_fp[2], w_fp[15], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += 
cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[41], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[41], w_fp[2], w_fp[17], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram101( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 101 OF 1240 *** + // Wavefunction(s) for diagram number 101 + // (none) + // Amplitude(s) for diagram number 101 + FFV1_0( w_fp[41], w_fp[52], w_fp[24], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram102( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* 
channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 102 OF 1240 *** + // Wavefunction(s) for diagram number 102 + // (none) + // Amplitude(s) for diagram number 102 + FFV1_0( w_fp[42], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram103( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 103 OF 1240 *** + // Wavefunction(s) for diagram number 103 + // (none) + // Amplitude(s) for diagram number 103 + FFV1_0( w_fp[41], w_fp[2], w_fp[26], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram104( fptype* wfs, // 
+  __global__ void
+  diagram104( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 104 OF 1240 ***
+    // Wavefunction(s) for diagram number 104
+    FFV1_2( w_fp[3], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[26] );
+    // Amplitude(s) for diagram number 104
+    FFV1_0( w_fp[26], w_fp[52], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
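Note on the helas kernel naming used from diagram 104 onwards (the ALOHA convention): FFV1_0 computes only the amplitude of a fermion-fermion-vector vertex, while FFV1_1, FFV1_2 and FFV1P0_3 instead return the off-shell wavefunction of leg 1, 2 or 3; cIPD[0]/cIPD[1] are the internal mass and width of the off-shell fermion, and the literal "0., 0." in the P0 vector variants encode the massless gluon propagator. Prototypes consistent with these call sites (a sketch; the exact declarations live in the generated HelAmps header):

    __device__ void FFV1_0( const fptype F1[], const fptype F2[], const fptype V3[],
                            const fptype COUP[], const double Ccoeff,
                            fptype vertex[] );        // amplitude only
    __device__ void FFV1_2( const fptype F1[], const fptype V3[],
                            const fptype COUP[], const double Ccoeff,
                            const fptype M2, const fptype W2,
                            fptype F2out[] );         // off-shell fermion on leg 2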
+  __global__ void
+  diagram105( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 105 OF 1240 ***
+    // Wavefunction(s) for diagram number 105
+    VVV1P0_1( w_fp[24], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[42] );
+    // Amplitude(s) for diagram number 105
+    FFV1_0( w_fp[3], w_fp[52], w_fp[42], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram106( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 106 OF 1240 ***
+    // Wavefunction(s) for diagram number 106
+    FFV1_1( w_fp[2], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[17] );
+    // Amplitude(s) for diagram number 106
+    FFV1_0( w_fp[34], w_fp[17], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
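Note on the "#473" placeholder repeated after every amplitude: in a build generated with multichannel support, that spot carries the single-diagram-enhancement bookkeeping. A sketch of the update pattern used for this purpose in the CUDACPP code base (the channel number 105 is only an example; numerators_sv/denominators_sv are the running sums behind the numerators/denominators arguments):

    #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 105 ) numerators_sv += cxabs2( amp_sv[0] ); // only the selected channel feeds the numerator
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); // all diagrams feed the denominator (0 disables SDE)
    #endif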
+  __global__ void
+  diagram107( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 107 OF 1240 ***
+    // Wavefunction(s) for diagram number 107
+    // (none)
+    // Amplitude(s) for diagram number 107
+    FFV1_0( w_fp[34], w_fp[2], w_fp[42], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram108( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 108 OF 1240 ***
+    // Wavefunction(s) for diagram number 108
+    // (none)
+    // Amplitude(s) for diagram number 108
+    FFV1_0( w_fp[3], w_fp[17], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram109( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 109 OF 1240 ***
+    // Wavefunction(s) for diagram number 109
+    // (none)
+    // Amplitude(s) for diagram number 109
+    FFV1_0( w_fp[26], w_fp[2], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram110( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 110 OF 1240 ***
+    // Wavefunction(s) for diagram number 110
+    FFV1_2( w_fp[3], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] );
+    // Amplitude(s) for diagram number 110
+    FFV1_0( w_fp[14], w_fp[52], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram111( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 111 OF 1240 ***
+    // Wavefunction(s) for diagram number 111
+    VVV1P0_1( w_fp[27], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[16] );
+    // Amplitude(s) for diagram number 111
+    FFV1_0( w_fp[3], w_fp[52], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram112( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 112 OF 1240 ***
+    // Wavefunction(s) for diagram number 112
+    FFV1_1( w_fp[2], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[15] );
+    // Amplitude(s) for diagram number 112
+    FFV1_0( w_fp[34], w_fp[15], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram113( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 113 OF 1240 ***
+    // Wavefunction(s) for diagram number 113
+    // (none)
+    // Amplitude(s) for diagram number 113
+    FFV1_0( w_fp[34], w_fp[2], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram114( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 114 OF 1240 ***
+    // Wavefunction(s) for diagram number 114
+    // (none)
+    // Amplitude(s) for diagram number 114
+    FFV1_0( w_fp[3], w_fp[15], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
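Note on diagrams 111 and 113 above: VVV1P0_1 fuses the gluon wavefunctions w_fp[27] and w_fp[5] through the triple-gluon vertex into the off-shell gluon w_fp[16], which is then closed twice, once per fermion line:

    VVV1P0_1( w_fp[27], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[16] );    // g(27) g(5) -> off-shell gluon (16)
    FFV1_0( w_fp[3], w_fp[52], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );  // diagram 111 closes it on one fermion pair
    FFV1_0( w_fp[34], w_fp[2], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );  // diagram 113 reuses the cached current

Computing the internal current once into the shared w_fp buffer is what makes the split of the 1240 diagrams into separate kernels order-sensitive: diagram 113 may only run after diagram 111.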
+  __global__ void
+  diagram115( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 115 OF 1240 ***
+    // Wavefunction(s) for diagram number 115
+    // (none)
+    // Amplitude(s) for diagram number 115
+    FFV1_0( w_fp[14], w_fp[2], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram116( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 116 OF 1240 ***
+    // Wavefunction(s) for diagram number 116
+    FFV1_2( w_fp[3], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] );
+    // Amplitude(s) for diagram number 116
+    FFV1_0( w_fp[12], w_fp[52], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram117( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 117 OF 1240 ***
+    // Wavefunction(s) for diagram number 117
+    VVV1P0_1( w_fp[4], w_fp[29], COUPs[0], 1.0, 0., 0., w_fp[19] );
+    // Amplitude(s) for diagram number 117
+    FFV1_0( w_fp[3], w_fp[52], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
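Note on the coupling arguments that split every signature: the GPU build receives one flat array with the event-dependent couplings of all events, while the C++ build receives ready-made COUPs pointers for the current event page. The boilerplate presumably unifies the two views; a sketch of the GPU-side adaptation, mirroring the access-class pattern used elsewhere in the plugin (the ndcoup/nxcoup constants and the handling of the constant independent couplings are elided here):

    #ifdef MGONGPUCPP_GPUIMPL
      const fptype* COUPs[nxcoup];
      for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) // dependent couplings: resolve the per-event offset
        COUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup );
    #endif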
+  __global__ void
+  diagram118( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 118 OF 1240 ***
+    // Wavefunction(s) for diagram number 118
+    FFV1_1( w_fp[2], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[18] );
+    // Amplitude(s) for diagram number 118
+    FFV1_0( w_fp[34], w_fp[18], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram119( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 119 OF 1240 ***
+    // Wavefunction(s) for diagram number 119
+    // (none)
+    // Amplitude(s) for diagram number 119
+    FFV1_0( w_fp[34], w_fp[2], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram120( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 120 OF 1240 ***
+    // Wavefunction(s) for diagram number 120
+    // (none)
+    // Amplitude(s) for diagram number 120
+    FFV1_0( w_fp[3], w_fp[18], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram121( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 121 OF 1240 ***
+    // Wavefunction(s) for diagram number 121
+    // (none)
+    // Amplitude(s) for diagram number 121
+    FFV1_0( w_fp[12], w_fp[2], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram122( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 122 OF 1240 ***
+    // Wavefunction(s) for diagram number 122
+    // (none)
+    // Amplitude(s) for diagram number 122
+    FFV1_0( w_fp[3], w_fp[52], w_fp[30], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[52], w_fp[31], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[52], w_fp[32], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
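Note on diagram 122: one kernel, three amplitude calls. The currents w_fp[30], w_fp[31] and w_fp[32] are computed earlier in the file, presumably by the three color structures of the four-gluon vertex (VVVV1/VVVV3/VVVV4 in the ALOHA naming; those calls are not part of this hunk), and each is closed on the same fermion pair but scatters into a different set of color flows, hence the three distinct jamps blocks. In sketch form, with hypothetical inputs:

    // Assumed earlier in the file (not in this hunk; the w_fp inputs and the COUPs index are hypothetical)
    VVVV1P0_1( w_fp[0], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[30] );
    VVVV3P0_1( w_fp[0], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[31] );
    VVVV4P0_1( w_fp[0], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[32] );

Diagram 123, just below, repeats the same three contractions on the other fermion pair (w_fp[34], w_fp[2]).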
+  __global__ void
+  diagram123( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 123 OF 1240 ***
+    // Wavefunction(s) for diagram number 123
+    // (none)
+    // Amplitude(s) for diagram number 123
+    FFV1_0( w_fp[34], w_fp[2], w_fp[30], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[34], w_fp[2], w_fp[31], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[34], w_fp[2], w_fp[32], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram124( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 124 OF 1240 ***
+    // Wavefunction(s) for diagram number 124
+    FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[34] );
+    FFV1_2( w_fp[3], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[52] );
+    FFV1_1( w_fp[34], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] );
+    FFV1_2( w_fp[52], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[22] );
+    // Amplitude(s) for diagram number 124
+    FFV1_0( w_fp[22], w_fp[9], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram125( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 125 OF 1240 ***
+    // Wavefunction(s) for diagram number 125
+    FFV1_2( w_fp[52], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] );
+    // Amplitude(s) for diagram number 125
+    FFV1_0( w_fp[21], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram126( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 126 OF 1240 ***
+    // Wavefunction(s) for diagram number 126
+    FFV1_1( w_fp[34], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[55] );
+    FFV1_2( w_fp[52], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[56] );
+    // Amplitude(s) for diagram number 126
+    FFV1_0( w_fp[56], w_fp[55], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram127( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 127 OF 1240 ***
+    // Wavefunction(s) for diagram number 127
+    // (none)
+    // Amplitude(s) for diagram number 127
+    FFV1_0( w_fp[21], w_fp[55], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram128( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 128 OF 1240 ***
+    // Wavefunction(s) for diagram number 128
+    FFV1_1( w_fp[34], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[57] );
+    // Amplitude(s) for diagram number 128
+    FFV1_0( w_fp[56], w_fp[57], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram129( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 129 OF 1240 ***
+    // Wavefunction(s) for diagram number 129
+    // (none)
+    // Amplitude(s) for diagram number 129
+    FFV1_0( w_fp[22], w_fp[57], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram130( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 130 OF 1240 ***
+    // Wavefunction(s) for diagram number 130
+    FFV1P0_3( w_fp[52], w_fp[34], COUPs[1], 1.0, 0., 0., w_fp[58] );
+    // Amplitude(s) for diagram number 130
+    VVV1_0( w_fp[24], w_fp[6], w_fp[58], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
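Note on diagrams 130, 133 and 136: the inverse factorization of the gluon-fusion pattern above. FFV1P0_3 first collapses the dressed quark pair into a single off-shell gluon current w_fp[58], and three different gluon pairs are then attached to it through the plain VVV1_0 amplitude:

    FFV1P0_3( w_fp[52], w_fp[34], COUPs[1], 1.0, 0., 0., w_fp[58] );   // q(52) qbar(34) -> off-shell gluon current (58)
    VVV1_0( w_fp[24], w_fp[6], w_fp[58], COUPs[0], 1.0, &amp_fp[0] );  // diagram 130
    VVV1_0( w_fp[27], w_fp[5], w_fp[58], COUPs[0], 1.0, &amp_fp[0] );  // diagram 133 (below) reuses the cached current
    VVV1_0( w_fp[4], w_fp[29], w_fp[58], COUPs[0], 1.0, &amp_fp[0] );  // diagram 136 (below) likewise

The fermion-line contraction is amortized over three diagrams, at the price of the same kernel ordering constraint noted earlier.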
channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 131 OF 1240 *** + // Wavefunction(s) for diagram number 131 + FFV1_1( w_fp[34], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[59] ); + // Amplitude(s) for diagram number 131 + FFV1_0( w_fp[52], w_fp[59], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram132( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 132 OF 1240 *** + // Wavefunction(s) for diagram number 132 + // (none) + // Amplitude(s) for diagram number 132 + FFV1_0( w_fp[52], w_fp[57], w_fp[24], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram133( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 133 OF 1240 *** + // Wavefunction(s) for diagram number 133 + // (none) + // Amplitude(s) for diagram number 133 + VVV1_0( w_fp[27], w_fp[5], w_fp[58], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram134( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 134 OF 1240 *** + // Wavefunction(s) for diagram number 134 + FFV1_1( w_fp[34], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[60] ); + // Amplitude(s) for diagram number 134 + FFV1_0( w_fp[52], w_fp[60], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram135( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 135 OF 1240 *** + // Wavefunction(s) for diagram number 135 + // (none) + // Amplitude(s) for diagram number 135 + FFV1_0( w_fp[52], w_fp[55], w_fp[27], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram136( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 136 OF 1240 *** + // Wavefunction(s) for diagram number 136 + // (none) + // Amplitude(s) for diagram number 136 + VVV1_0( w_fp[4], w_fp[29], w_fp[58], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
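[Editor's note] The two comment lines repeated at the top of every diagramXXX kernel describe a contract that is easy to miss in the noise: the SDE arguments are always part of the signature, but must only be dereferenced in multichannel builds. A minimal sketch of that sanity check, assuming this is what diagram_boilerplate.h (not shown in this diff) does with the three pointers; the real header presumably also sets up the w_fp, amp_fp/amp_sv and COUPs views used by the kernel bodies:

  #include <cassert>
  // Sketch only: uniform interface, with a nullptr assertion replacing the
  // multichannel bookkeeping when MGONGPU_SUPPORTS_MULTICHANNEL is undefined.
  inline void sdeSanityCheck( const unsigned int* channelIds, const double* numerators, const double* denominators )
  {
  #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
    assert( channelIds == nullptr );   // SDE disabled at build time:
    assert( numerators == nullptr );   // none of these three pointers
    assert( denominators == nullptr ); // may carry a real buffer
  #else
    (void)channelIds; (void)numerators; (void)denominators; // used by generated code
  #endif
  }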
__global__ void + diagram137( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 137 OF 1240 *** + // Wavefunction(s) for diagram number 137 + // (none) + // Amplitude(s) for diagram number 137 + FFV1_0( w_fp[52], w_fp[9], w_fp[29], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram138( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 138 OF 1240 *** + // Wavefunction(s) for diagram number 138 + FFV1_1( w_fp[34], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[58] ); + // Amplitude(s) for diagram number 138 + FFV1_0( w_fp[52], w_fp[58], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram139( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 139 OF 1240 *** + // Wavefunction(s) for diagram number 139 + // (none) + // Amplitude(s) for diagram number 139 + FFV1_0( w_fp[52], w_fp[34], w_fp[30], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + FFV1_0( w_fp[52], w_fp[34], w_fp[31], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + FFV1_0( w_fp[52], w_fp[34], w_fp[32], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram140( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 140 OF 1240 *** + // Wavefunction(s) for diagram number 140 + VVV1P0_1( w_fp[1], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[61] ); + FFV1P0_3( w_fp[3], w_fp[34], COUPs[1], 1.0, 0., 0., w_fp[62] ); + VVV1P0_1( w_fp[61], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[63] ); + // Amplitude(s) for diagram number 140 + VVV1_0( w_fp[62], w_fp[63], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
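[Editor's note] Every amplitude call above is followed by the same placeholder comment about numerators_sv and denominators_sv. For readers without a multichannel build at hand, a scalar sketch of the single-diagram-enhancement update referenced as #473; the function name, the diagramId argument and the exact guards are assumptions, and the generated code operates on vectors of events rather than one scalar amplitude:

  #include <complex>
  // Sketch only: accumulate |amp|^2 per diagram for SDE channel selection.
  inline void updateMultichannel( unsigned int channelId, unsigned int diagramId,
                                  std::complex<double> amp,
                                  double& numerator, double& denominator )
  {
    if( channelId == 0 ) return;                    // 0 disables SDE for this event
    const double amp2 = std::norm( amp );           // |amp|^2 of this diagram
    if( channelId == diagramId ) numerator += amp2; // only the selected channel
    denominator += amp2;                            // all diagrams contribute
  }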
__global__ void + diagram141( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 141 OF 1240 *** + // Wavefunction(s) for diagram number 141 + VVV1P0_1( w_fp[61], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[64] ); + // Amplitude(s) for diagram number 141 + VVV1_0( w_fp[62], w_fp[64], w_fp[5], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram142( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 142 OF 1240 *** + // Wavefunction(s) for diagram number 142 + // (none) + // Amplitude(s) for diagram number 142 + VVVV1_0( w_fp[61], w_fp[5], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0]; + VVVV3_0( w_fp[61], w_fp[5], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVVV4_0( w_fp[61], w_fp[5], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram143( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 143 OF 1240 *** + // Wavefunction(s) for diagram number 143 + FFV1_2( w_fp[3], w_fp[61], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[65] ); + // Amplitude(s) for diagram number 143 + FFV1_0( w_fp[65], w_fp[55], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
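[Editor's note] Every jamp update goes through J_ACCESS::kernelAccessIcol, which hides the memory layout of the ncolor*2 jamps buffer behind a reference that can be read and written in place. A simplified scalar model of such an accessor; the real J_ACCESS class works on raw fptype storage and derives the event index from the grid/SIMD position, so the explicit ievt/nevt arguments here are an illustration only:

  #include <complex>
  // Sketch only: one complex color amplitude per (icol, ievt) pair,
  // stored color-major so events of one color flow are contiguous.
  struct JampAccessSketch
  {
    static std::complex<double>& kernelAccessIcol( std::complex<double>* jamps,
                                                   int icol, int ievt, int nevt )
    {
      return jamps[icol * nevt + ievt]; // returned by reference: "+=" works in place
    }
  };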
__global__ void + diagram144( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 144 OF 1240 *** + // Wavefunction(s) for diagram number 144 + // (none) + // Amplitude(s) for diagram number 144 + FFV1_0( w_fp[3], w_fp[55], w_fp[64], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram145( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 145 OF 1240 *** + // Wavefunction(s) for diagram number 145 + // (none) + // Amplitude(s) for diagram number 145 + FFV1_0( w_fp[65], w_fp[57], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram146( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 146 OF 1240 *** + // Wavefunction(s) for diagram number 146 + // (none) + // Amplitude(s) for diagram number 146 + FFV1_0( w_fp[3], w_fp[57], w_fp[63], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram147( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 147 OF 1240 *** + // Wavefunction(s) for diagram number 147 + FFV1_1( w_fp[34], w_fp[61], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[66] ); + // Amplitude(s) for diagram number 147 + FFV1_0( w_fp[38], w_fp[66], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram148( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 148 OF 1240 *** + // Wavefunction(s) for diagram number 148 + FFV1P0_3( w_fp[38], w_fp[34], COUPs[1], 1.0, 0., 0., w_fp[67] ); + // Amplitude(s) for diagram number 148 + VVV1_0( w_fp[61], w_fp[6], w_fp[67], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram149( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 149 OF 1240 *** + // Wavefunction(s) for diagram number 149 + // (none) + // Amplitude(s) for diagram number 149 + FFV1_0( w_fp[38], w_fp[57], w_fp[61], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
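[Editor's note] The #ifdef MGONGPUCPP_GPUIMPL split in every signature reflects two different data layouts: on GPU each thread receives the full couplings[nevt*ndcoup*2] array and locates its own event, while in the C++ path the caller hands over ready-made per-"event page" COUPs pointers. A hypothetical sketch of the GPU-side extraction, assuming a simple coupling-major real/imag-plane layout (the real code uses AOSOA memory access classes instead):

  using fptype = double; // assumption: double-precision build
  // Sketch only: pointer to the real part of dependent coupling icoup for event ievt,
  // under an assumed couplings[icoup][2][nevt] layout.
  __device__ const fptype* coupForEvent( const fptype* couplings, int nevt, int icoup, int ievt )
  {
    return &couplings[icoup * 2 * nevt + ievt];
  }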
__global__ void + diagram150( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 150 OF 1240 *** + // Wavefunction(s) for diagram number 150 + // (none) + // Amplitude(s) for diagram number 150 + FFV1_0( w_fp[41], w_fp[66], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram151( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 151 OF 1240 *** + // Wavefunction(s) for diagram number 151 + FFV1P0_3( w_fp[41], w_fp[34], COUPs[1], 1.0, 0., 0., w_fp[68] ); + // Amplitude(s) for diagram number 151 + VVV1_0( w_fp[61], w_fp[5], w_fp[68], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram152( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 152 OF 1240 *** + // Wavefunction(s) for diagram number 152 + // (none) + // Amplitude(s) for diagram number 152 + FFV1_0( w_fp[41], w_fp[55], w_fp[61], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram153( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 153 OF 1240 *** + // Wavefunction(s) for diagram number 153 + // (none) + // Amplitude(s) for diagram number 153 + FFV1_0( w_fp[3], w_fp[66], w_fp[29], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram154( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 154 OF 1240 *** + // Wavefunction(s) for diagram number 154 + // (none) + // Amplitude(s) for diagram number 154 + VVV1_0( w_fp[61], w_fp[29], w_fp[62], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram155( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 155 OF 1240 *** + // Wavefunction(s) for diagram number 155 + // (none) + // Amplitude(s) for diagram number 155 + FFV1_0( w_fp[3], w_fp[58], w_fp[61], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
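[Editor's note] The HELAS-style calls in these kernels follow a fixed naming scheme: FFV1/VVV1/VVVV1 name the Lorentz structure of a fermion-fermion-vector, triple-vector or quartic-vector vertex; a _0 suffix contracts fully known wavefunctions into an amplitude, while _1, _2 or P0_3 solve the same vertex for one off-shell leg and write a new internal wavefunction (here into a w_fp slot). Paraphrased shapes of the two kinds of helpers; the real aloha-generated prototypes in the plugin take vectorized fptype pointers plus masses and widths, so this is an illustration of the calling pattern only:

  #include <complex>
  using cx = std::complex<double>; // assumption: scalar double build
  // amplitude variant: all three legs known, writes the vertex amplitude
  void FFV1_0( const cx F1[], const cx F2[], const cx V3[], cx COUP, double sign, cx* vertex );
  // off-shell variant: solves the same FFV vertex for leg 1 (mass M1, width W1)
  void FFV1_1( const cx F2[], const cx V3[], cx COUP, double sign, double M1, double W1, cx F1out[] );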
__global__ void + diagram156( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 156 OF 1240 *** + // Wavefunction(s) for diagram number 156 + VVV1P0_1( w_fp[1], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[66] ); + VVV1P0_1( w_fp[66], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[69] ); + // Amplitude(s) for diagram number 156 + VVV1_0( w_fp[62], w_fp[69], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram157( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 157 OF 1240 *** + // Wavefunction(s) for diagram number 157 + VVV1P0_1( w_fp[66], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[70] ); + // Amplitude(s) for diagram number 157 + VVV1_0( w_fp[62], w_fp[70], w_fp[4], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram158( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 158 OF 1240 *** + // Wavefunction(s) for diagram number 158 + // (none) + // Amplitude(s) for diagram number 158 + VVVV1_0( w_fp[66], w_fp[4], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVVV3_0( w_fp[66], w_fp[4], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVVV4_0( w_fp[66], w_fp[4], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram159( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 159 OF 1240 *** + // Wavefunction(s) for diagram number 159 + FFV1_2( w_fp[3], w_fp[66], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[71] ); + // Amplitude(s) for diagram number 159 + FFV1_0( w_fp[71], w_fp[9], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram160( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 160 OF 1240 *** + // Wavefunction(s) for diagram number 160 + // (none) + // Amplitude(s) for diagram number 160 + FFV1_0( w_fp[3], w_fp[9], w_fp[70], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
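[Editor's note] Diagram 158 above calls VVVV1_0, VVVV3_0 and VVVV4_0 with identical arguments: these are the three independent Lorentz structures of the quartic gluon vertex, each feeding a different sign pattern of color flows. All three write into the same &amp_fp[0] scratch buffer, which the jamp updates then read back as amp_sv[0]. A sketch of that pairing, assuming the usual _fp/_sv naming convention in this code base (raw fptype storage versus a complex, possibly SIMD-vectorized, view over the same bytes):

  #include <complex>
  using fptype = double;                 // assumption: scalar double build
  using cxtype_sv = std::complex<fptype>; // SIMD builds would vectorize this type
  fptype amp_fp[2];                       // raw storage written by the *_0 calls
  cxtype_sv* amp_sv = reinterpret_cast<cxtype_sv*>( amp_fp ); // complex view read into jamps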
__global__ void + diagram161( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 161 OF 1240 *** + // Wavefunction(s) for diagram number 161 + // (none) + // Amplitude(s) for diagram number 161 + FFV1_0( w_fp[71], w_fp[57], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram162( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 162 OF 1240 *** + // Wavefunction(s) for diagram number 162 + // (none) + // Amplitude(s) for diagram number 162 + FFV1_0( w_fp[3], w_fp[57], w_fp[69], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram163( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 163 OF 1240 *** + // Wavefunction(s) for diagram number 163 + FFV1_1( w_fp[34], w_fp[66], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[72] ); + // Amplitude(s) for diagram number 163 + FFV1_0( w_fp[46], w_fp[72], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram164( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 164 OF 1240 *** + // Wavefunction(s) for diagram number 164 + FFV1P0_3( w_fp[46], w_fp[34], COUPs[1], 1.0, 0., 0., w_fp[73] ); + // Amplitude(s) for diagram number 164 + VVV1_0( w_fp[66], w_fp[6], w_fp[73], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram165( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 165 OF 1240 *** + // Wavefunction(s) for diagram number 165 + // (none) + // Amplitude(s) for diagram number 165 + FFV1_0( w_fp[46], w_fp[57], w_fp[66], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram166( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 166 OF 1240 *** + // Wavefunction(s) for diagram number 166 + // (none) + // Amplitude(s) for diagram number 166 + FFV1_0( w_fp[41], w_fp[72], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
__global__ void + diagram167( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 167 OF 1240 *** + // Wavefunction(s) for diagram number 167 + // (none) + // Amplitude(s) for diagram number 167 + VVV1_0( w_fp[66], w_fp[4], w_fp[68], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +
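[Editor's note] With one __global__ kernel per Feynman diagram, a driver would launch these back to back over the same event buffers, so jamps accumulates the color amplitudes across all diagrams of the process. A hypothetical calling sequence, not part of this diff (the wrapper name, launch configuration and the exact buffer handling are assumptions):

  #ifdef MGONGPUCPP_GPUIMPL
  // Sketch only: sequential launches on the default stream; each kernel adds
  // its diagram's contribution to the shared jamps (and SDE) buffers.
  void launchSomeDiagrams( fptype* wfs, fptype* jamps, const unsigned int* channelIds,
                           const fptype* couplings, fptype* numerators, fptype* denominators,
                           int gpublocks, int gputhreads )
  {
    diagram168<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
    diagram169<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
    // ... one launch per diagram, up to diagram1240
  }
  #endif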
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram168( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 168 OF 1240 ***
+    // Wavefunction(s) for diagram number 168
+    // (none)
+    // Amplitude(s) for diagram number 168
+    FFV1_0( w_fp[41], w_fp[9], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram169( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 169 OF 1240 ***
+    // Wavefunction(s) for diagram number 169
+    // (none)
+    // Amplitude(s) for diagram number 169
+    FFV1_0( w_fp[3], w_fp[72], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram170( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 170 OF 1240 ***
+    // Wavefunction(s) for diagram number 170
+    // (none)
+    // Amplitude(s) for diagram number 170
+    VVV1_0( w_fp[66], w_fp[27], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram171( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 171 OF 1240 ***
+    // Wavefunction(s) for diagram number 171
+    // (none)
+    // Amplitude(s) for diagram number 171
+    FFV1_0( w_fp[3], w_fp[60], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram172( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 172 OF 1240 ***
+    // Wavefunction(s) for diagram number 172
+    VVV1P0_1( w_fp[1], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[72] );
+    VVV1P0_1( w_fp[72], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[74] );
+    // Amplitude(s) for diagram number 172
+    VVV1_0( w_fp[62], w_fp[74], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram173( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 173 OF 1240 ***
+    // Wavefunction(s) for diagram number 173
+    VVV1P0_1( w_fp[72], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[75] );
+    // Amplitude(s) for diagram number 173
+    VVV1_0( w_fp[62], w_fp[75], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram174( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 174 OF 1240 ***
+    // Wavefunction(s) for diagram number 174
+    // (none)
+    // Amplitude(s) for diagram number 174
+    VVVV1_0( w_fp[72], w_fp[4], w_fp[5], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVVV3_0( w_fp[72], w_fp[4], w_fp[5], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVVV4_0( w_fp[72], w_fp[4], w_fp[5], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram175( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 175 OF 1240 ***
+    // Wavefunction(s) for diagram number 175
+    FFV1_2( w_fp[3], w_fp[72], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[76] );
+    // Amplitude(s) for diagram number 175
+    FFV1_0( w_fp[76], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram176( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 176 OF 1240 ***
+    // Wavefunction(s) for diagram number 176
+    // (none)
+    // Amplitude(s) for diagram number 176
+    FFV1_0( w_fp[3], w_fp[9], w_fp[75], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram177( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 177 OF 1240 ***
+    // Wavefunction(s) for diagram number 177
+    // (none)
+    // Amplitude(s) for diagram number 177
+    FFV1_0( w_fp[76], w_fp[55], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram178( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 178 OF 1240 ***
+    // Wavefunction(s) for diagram number 178
+    // (none)
+    // Amplitude(s) for diagram number 178
+    FFV1_0( w_fp[3], w_fp[55], w_fp[74], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+  }
+
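+  // Note: "diagram_boilerplate.h" is included at the top of every diagramXXX kernel. A minimal
+  // sketch of what such a header plausibly provides, using hypothetical names consistent with the
+  // surrounding code (this is not the verbatim header contents):
+  //   cxtype_sv amp_sv[1] = {};                             // per-diagram amplitude accumulator
+  //   fptype* amp_fp = reinterpret_cast<fptype*>( amp_sv ); // raw view passed to FFV/VVV as &amp_fp[0]
+  // #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+  //   assert( channelIds == nullptr && numerators == nullptr && denominators == nullptr );
+  // #endif
+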
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram179( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 179 OF 1240 ***
+    // Wavefunction(s) for diagram number 179
+    FFV1_1( w_fp[34], w_fp[72], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[77] );
+    // Amplitude(s) for diagram number 179
+    FFV1_0( w_fp[46], w_fp[77], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram180( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 180 OF 1240 ***
+    // Wavefunction(s) for diagram number 180
+    // (none)
+    // Amplitude(s) for diagram number 180
+    VVV1_0( w_fp[72], w_fp[5], w_fp[73], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram181( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 181 OF 1240 ***
+    // Wavefunction(s) for diagram number 181
+    // (none)
+    // Amplitude(s) for diagram number 181
+    FFV1_0( w_fp[46], w_fp[55], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram182( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 182 OF 1240 ***
+    // Wavefunction(s) for diagram number 182
+    // (none)
+    // Amplitude(s) for diagram number 182
+    FFV1_0( w_fp[38], w_fp[77], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram183( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 183 OF 1240 ***
+    // Wavefunction(s) for diagram number 183
+    // (none)
+    // Amplitude(s) for diagram number 183
+    VVV1_0( w_fp[72], w_fp[4], w_fp[67], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram184( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 184 OF 1240 ***
+    // Wavefunction(s) for diagram number 184
+    // (none)
+    // Amplitude(s) for diagram number 184
+    FFV1_0( w_fp[38], w_fp[9], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram185( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 185 OF 1240 ***
+    // Wavefunction(s) for diagram number 185
+    // (none)
+    // Amplitude(s) for diagram number 185
+    FFV1_0( w_fp[3], w_fp[77], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram186( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 186 OF 1240 ***
+    // Wavefunction(s) for diagram number 186
+    // (none)
+    // Amplitude(s) for diagram number 186
+    VVV1_0( w_fp[72], w_fp[24], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram187( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 187 OF 1240 ***
+    // Wavefunction(s) for diagram number 187
+    // (none)
+    // Amplitude(s) for diagram number 187
+    FFV1_0( w_fp[3], w_fp[59], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram188( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 188 OF 1240 ***
+    // Wavefunction(s) for diagram number 188
+    FFV1_1( w_fp[34], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[77] );
+    // Amplitude(s) for diagram number 188
+    FFV1_0( w_fp[7], w_fp[77], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram189( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 189 OF 1240 ***
+    // Wavefunction(s) for diagram number 189
+    // (none)
+    // Amplitude(s) for diagram number 189
+    FFV1_0( w_fp[53], w_fp[77], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram190( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 190 OF 1240 ***
+    // Wavefunction(s) for diagram number 190
+    FFV1_2( w_fp[46], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[78] );
+    // Amplitude(s) for diagram number 190
+    FFV1_0( w_fp[78], w_fp[55], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram191( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 191 OF 1240 ***
+    // Wavefunction(s) for diagram number 191
+    // (none)
+    // Amplitude(s) for diagram number 191
+    FFV1_0( w_fp[53], w_fp[55], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram192( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 192 OF 1240 ***
+    // Wavefunction(s) for diagram number 192
+    // (none)
+    // Amplitude(s) for diagram number 192
+    FFV1_0( w_fp[78], w_fp[57], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram193( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 193 OF 1240 ***
+    // Wavefunction(s) for diagram number 193
+    // (none)
+    // Amplitude(s) for diagram number 193
+    FFV1_0( w_fp[7], w_fp[57], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram194( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 194 OF 1240 ***
+    // Wavefunction(s) for diagram number 194
+    // (none)
+    // Amplitude(s) for diagram number 194
+    FFV1_0( w_fp[46], w_fp[77], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram195( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 195 OF 1240 ***
+    // Wavefunction(s) for diagram number 195
+    // (none)
+    // Amplitude(s) for diagram number 195
+    VVV1_0( w_fp[1], w_fp[29], w_fp[73], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram196( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 196 OF 1240 ***
+    // Wavefunction(s) for diagram number 196
+    // (none)
+    // Amplitude(s) for diagram number 196
+    FFV1_0( w_fp[46], w_fp[58], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram197( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 197 OF 1240 ***
+    // Wavefunction(s) for diagram number 197
+    // (none)
+    // Amplitude(s) for diagram number 197
+    FFV1_0( w_fp[25], w_fp[77], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram198( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 198 OF 1240 ***
+    // Wavefunction(s) for diagram number 198
+    // (none)
+    // Amplitude(s) for diagram number 198
+    FFV1_0( w_fp[48], w_fp[77], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram199( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 199 OF 1240 ***
+    // Wavefunction(s) for diagram number 199
+    FFV1_2( w_fp[38], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[58] );
+    // Amplitude(s) for diagram number 199
+    FFV1_0( w_fp[58], w_fp[9], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram200( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 200 OF 1240 ***
+    // Wavefunction(s) for diagram number 200
+    // (none)
+    // Amplitude(s) for diagram number 200
+    FFV1_0( w_fp[48], w_fp[9], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram201( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 201 OF 1240 ***
+    // Wavefunction(s) for diagram number 201
+    // (none)
+    // Amplitude(s) for diagram number 201
+    FFV1_0( w_fp[58], w_fp[57], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram202( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 202 OF 1240 ***
+    // Wavefunction(s) for diagram number 202
+    // (none)
+    // Amplitude(s) for diagram number 202
+    FFV1_0( w_fp[25], w_fp[57], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram203( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 203 OF 1240 ***
+    // Wavefunction(s) for diagram number 203
+    // (none)
+    // Amplitude(s) for diagram number 203
+    FFV1_0( w_fp[38], w_fp[77], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram204( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 204 OF 1240 ***
+    // Wavefunction(s) for diagram number 204
+    // (none)
+    // Amplitude(s) for diagram number 204
+    VVV1_0( w_fp[1], w_fp[27], w_fp[67], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram205( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 205 OF 1240 ***
+    // Wavefunction(s) for diagram number 205
+    // (none)
+    // Amplitude(s) for diagram number 205
+    FFV1_0( w_fp[38], w_fp[60], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram206( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 206 OF 1240 ***
+    // Wavefunction(s) for diagram number 206
+    // (none)
+    // Amplitude(s) for diagram number 206
+    FFV1_0( w_fp[28], w_fp[77], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram207( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 207 OF 1240 ***
+    // Wavefunction(s) for diagram number 207
+    // (none)
+    // Amplitude(s) for diagram number 207
+    FFV1_0( w_fp[40], w_fp[77], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram208( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const
fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 208 OF 1240 *** + // Wavefunction(s) for diagram number 208 + FFV1_2( w_fp[41], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[60] ); + // Amplitude(s) for diagram number 208 + FFV1_0( w_fp[60], w_fp[9], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram209( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 209 OF 1240 *** + // Wavefunction(s) for diagram number 209 + // (none) + // Amplitude(s) for diagram number 209 + FFV1_0( w_fp[40], w_fp[9], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram210( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all 
nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 210 OF 1240 *** + // Wavefunction(s) for diagram number 210 + // (none) + // Amplitude(s) for diagram number 210 + FFV1_0( w_fp[60], w_fp[55], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram211( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 211 OF 1240 *** + // Wavefunction(s) for diagram number 211 + // (none) + // Amplitude(s) for diagram number 211 + FFV1_0( w_fp[28], w_fp[55], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram212( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 212 OF 1240 *** + // Wavefunction(s) for diagram number 212 + // (none) + // Amplitude(s) for diagram number 212 + FFV1_0( w_fp[41], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + 
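[Editor's note] Every diagramXXX kernel in this hunk opens with the same preamble and then includes diagram_boilerplate.h, which (per the repeated comment) provides the shared per-event setup and, when MGONGPU_SUPPORTS_MULTICHANNEL is not defined, checks that the channelIds/numerators/denominators pointers are nullptr. That header is not part of this hunk; the following is only a hedged sketch of what the sanity check could look like, reusing the kernel parameter names above (everything beyond those names is an assumption, not the plugin's actual code):

  // Sketch only: hypothetical excerpt of diagram_boilerplate.h (not in this patch)
  #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
  assert( channelIds == nullptr );   // SDE multichannel disabled: no per-event channel ids
  assert( numerators == nullptr );   // no multichannel numerators to update
  assert( denominators == nullptr ); // no multichannel denominators to update
  #endif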
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram213( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 213 OF 1240 ***
+    // Wavefunction(s) for diagram number 213
+    // (none)
+    // Amplitude(s) for diagram number 213
+    VVV1_0( w_fp[1], w_fp[24], w_fp[68], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram214( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 214 OF 1240 ***
+    // Wavefunction(s) for diagram number 214
+    // (none)
+    // Amplitude(s) for diagram number 214
+    FFV1_0( w_fp[41], w_fp[59], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram215( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 215 OF 1240 ***
+    // Wavefunction(s) for diagram number 215
+    // (none)
+    // Amplitude(s) for diagram number 215
+    FFV1_0( w_fp[26], w_fp[77], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram216( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 216 OF 1240 ***
+    // Wavefunction(s) for diagram number 216
+    // (none)
+    // Amplitude(s) for diagram number 216
+    FFV1_0( w_fp[3], w_fp[77], w_fp[42], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram217( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 217 OF 1240 ***
+    // Wavefunction(s) for diagram number 217
+    VVV1P0_1( w_fp[1], w_fp[24], COUPs[0], 1.0, 0., 0., w_fp[59] );
+    // Amplitude(s) for diagram number 217
+    VVV1_0( w_fp[62], w_fp[59], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram218( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 218 OF 1240 ***
+    // Wavefunction(s) for diagram number 218
+    // (none)
+    // Amplitude(s) for diagram number 218
+    VVV1_0( w_fp[62], w_fp[1], w_fp[42], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram219( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 219 OF 1240 ***
+    // Wavefunction(s) for diagram number 219
+    // (none)
+    // Amplitude(s) for diagram number 219
+    VVVV1_0( w_fp[1], w_fp[24], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVVV3_0( w_fp[1], w_fp[24], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVVV4_0( w_fp[1], w_fp[24], w_fp[6], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
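[Editor's note] diagram219 is a four-gluon-vertex diagram: the three calls VVVV1_0, VVVV3_0 and VVVV4_0 evaluate the three color structures of the same vertex, and each result is scattered into a different subset of the color amplitudes with coefficients of plus or minus 1 or i (cxtype( 0, 1 )). The J_ACCESS::kernelAccessIcol accessor hides the memory layout of jamps. As a hedged sketch only, assuming an SoA layout jamps[icol][re/im][ievt] with one GPU thread per event (the plugin's real accessor may differ in both name and layout), it could look like:

  // Sketch only: hypothetical SoA accessor, assuming nevt = gridDim.x * blockDim.x
  __device__ inline cxtype_ref kernelAccessIcol( fptype* jamps, const int icol )
  {
    const int nevt = gridDim.x * blockDim.x;                 // events in this grid
    const int ievt = blockDim.x * blockIdx.x + threadIdx.x;  // one event per thread
    return cxtype_ref( jamps[( icol * 2 + 0 ) * nevt + ievt],   // real part
                       jamps[( icol * 2 + 1 ) * nevt + ievt] ); // imaginary part
  }

Returning a complex reference view is what allows the generated "+=" and "-=" accumulations above to write both components in place.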
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram220( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 220 OF 1240 ***
+    // Wavefunction(s) for diagram number 220
+    // (none)
+    // Amplitude(s) for diagram number 220
+    FFV1_0( w_fp[3], w_fp[57], w_fp[59], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram221( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 221 OF 1240 ***
+    // Wavefunction(s) for diagram number 221
+    // (none)
+    // Amplitude(s) for diagram number 221
+    FFV1_0( w_fp[26], w_fp[57], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram222( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 222 OF 1240 ***
+    // Wavefunction(s) for diagram number 222
+    // (none)
+    // Amplitude(s) for diagram number 222
+    FFV1_0( w_fp[14], w_fp[77], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram223( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 223 OF 1240 ***
+    // Wavefunction(s) for diagram number 223
+    // (none)
+    // Amplitude(s) for diagram number 223
+    FFV1_0( w_fp[3], w_fp[77], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram224( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 224 OF 1240 ***
+    // Wavefunction(s) for diagram number 224
+    VVV1P0_1( w_fp[1], w_fp[27], COUPs[0], 1.0, 0., 0., w_fp[68] );
+    // Amplitude(s) for diagram number 224
+    VVV1_0( w_fp[62], w_fp[68], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram225( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 225 OF 1240 ***
+    // Wavefunction(s) for diagram number 225
+    // (none)
+    // Amplitude(s) for diagram number 225
+    VVV1_0( w_fp[62], w_fp[1], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram226( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 226 OF 1240 ***
+    // Wavefunction(s) for diagram number 226
+    // (none)
+    // Amplitude(s) for diagram number 226
+    VVVV1_0( w_fp[1], w_fp[27], w_fp[5], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVVV3_0( w_fp[1], w_fp[27], w_fp[5], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVVV4_0( w_fp[1], w_fp[27], w_fp[5], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram227( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 227 OF 1240 ***
+    // Wavefunction(s) for diagram number 227
+    // (none)
+    // Amplitude(s) for diagram number 227
+    FFV1_0( w_fp[3], w_fp[55], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram228( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 228 OF 1240 ***
+    // Wavefunction(s) for diagram number 228
+    // (none)
+    // Amplitude(s) for diagram number 228
+    FFV1_0( w_fp[14], w_fp[55], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram229( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 229 OF 1240 ***
+    // Wavefunction(s) for diagram number 229
+    // (none)
+    // Amplitude(s) for diagram number 229
+    FFV1_0( w_fp[12], w_fp[77], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram230( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 230 OF 1240 ***
+    // Wavefunction(s) for diagram number 230
+    // (none)
+    // Amplitude(s) for diagram number 230
+    FFV1_0( w_fp[3], w_fp[77], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram231( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 231 OF 1240 ***
+    // Wavefunction(s) for diagram number 231
+    VVV1P0_1( w_fp[1], w_fp[29], COUPs[0], 1.0, 0., 0., w_fp[67] );
+    // Amplitude(s) for diagram number 231
+    VVV1_0( w_fp[62], w_fp[67], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram232( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 232 OF 1240 ***
+    // Wavefunction(s) for diagram number 232
+    // (none)
+    // Amplitude(s) for diagram number 232
+    VVV1_0( w_fp[62], w_fp[1], w_fp[19], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram233( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 233 OF 1240 ***
+    // Wavefunction(s) for diagram number 233
+    // (none)
+    // Amplitude(s) for diagram number 233
+    VVVV1_0( w_fp[1], w_fp[4], w_fp[29], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVVV3_0( w_fp[1], w_fp[4], w_fp[29], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVVV4_0( w_fp[1], w_fp[4], w_fp[29], w_fp[62], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram234( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX (including channelIds, numerators and denominators) is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 234 OF 1240 ***
+    // Wavefunction(s) for diagram number 234
+    // (none)
+    // Amplitude(s) for diagram number 234
+    FFV1_0( w_fp[3], w_fp[9], w_fp[67], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram235( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX (including channelIds, numerators and denominators) is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 235 OF 1240 ***
+    // Wavefunction(s) for diagram number 235
+    // (none)
+    // Amplitude(s) for diagram number 235
+    FFV1_0( w_fp[12], w_fp[9], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram236( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX (including channelIds, numerators and denominators) is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 236 OF 1240 ***
+    // Wavefunction(s) for diagram number 236
+    VVVV1P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[73] );
+    VVVV3P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[79] );
+    VVVV4P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[80] );
+    // Amplitude(s) for diagram number 236
+    VVV1_0( w_fp[73], w_fp[6], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVV1_0( w_fp[79], w_fp[6], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVV1_0( w_fp[80], w_fp[6], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
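Diagrams 234-236 show the payoff of the uniform interface: each kernel differs only in its body, so a driver can step through them uniformly, and diagram 236's VVVV*P0_1 calls write internal wavefunctions (w_fp[73], w_fp[79], w_fp[80]) that diagrams 237 and 238 below reuse, which pins down the execution order. A hypothetical standalone sketch of such a dispatch table, using the C++ (#else) branch of the signature with dummy diagram bodies (diagA, diagB and computeJamps are illustrative names, not plugin code):

  using fptype = double;
  // Uniform signature, matching the #else (C++) branch above.
  using DiagramFn = void ( * )( fptype* wfs, fptype* jamps, const unsigned int* channelIds,
                                const fptype** COUPs, fptype* numerators, fptype* denominators );

  static void diagA( fptype*, fptype* jamps, const unsigned int*, const fptype**, fptype*, fptype* ) { jamps[0] += 1.; }
  static void diagB( fptype*, fptype* jamps, const unsigned int*, const fptype**, fptype*, fptype* ) { jamps[1] -= 1.; }

  static const DiagramFn diagrams[] = { diagA, diagB }; // would list diagram1 .. diagram1240

  void computeJamps( fptype* wfs, fptype* jamps, const fptype** COUPs )
  {
    // run the diagrams in generation order, so shared wavefunction slots
    // are always written before they are read
    for( DiagramFn fn : diagrams )
      fn( wfs, jamps, nullptr, COUPs, nullptr, nullptr ); // nullptr: multichannel disabled
  }

  int main()
  {
    fptype jamps[2] = {};
    computeJamps( nullptr, jamps, nullptr );
    return 0;
  }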
//-------------------------------------------------------------------------- + + __global__ void + diagram237( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 237 OF 1240 *** + // Wavefunction(s) for diagram number 237 + // (none) + // Amplitude(s) for diagram number 237 + FFV1_0( w_fp[3], w_fp[57], w_fp[73], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[57], w_fp[79], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[57], w_fp[80], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram238( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three 
pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 238 OF 1240 *** + // Wavefunction(s) for diagram number 238 + // (none) + // Amplitude(s) for diagram number 238 + FFV1_0( w_fp[41], w_fp[34], w_fp[73], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + FFV1_0( w_fp[41], w_fp[34], w_fp[79], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + FFV1_0( w_fp[41], w_fp[34], w_fp[80], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram239( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 239 OF 1240 *** + // Wavefunction(s) for diagram number 239 + VVVV1P0_1( w_fp[1], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[57] ); + VVVV3P0_1( w_fp[1], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[81] ); + VVVV4P0_1( w_fp[1], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[82] ); + // Amplitude(s) for diagram number 239 + VVV1_0( w_fp[57], w_fp[5], w_fp[62], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[81], w_fp[5], w_fp[62], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[82], w_fp[5], w_fp[62], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram240( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 240 OF 1240 *** + // Wavefunction(s) for diagram number 240 + // (none) + // Amplitude(s) for diagram number 240 + FFV1_0( w_fp[3], w_fp[55], w_fp[57], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[55], w_fp[81], COUPs[1], 1.0, &_fp[0] ); +#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[55], w_fp[82], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram241( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 241 OF 1240 *** + // Wavefunction(s) for diagram number 241 + // (none) + // Amplitude(s) for diagram number 241 + FFV1_0( w_fp[38], w_fp[34], w_fp[57], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + FFV1_0( w_fp[38], w_fp[34], w_fp[81], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + FFV1_0( w_fp[38], w_fp[34], w_fp[82], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram242( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* 
jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 242 OF 1240 *** + // Wavefunction(s) for diagram number 242 + VVVV1P0_1( w_fp[1], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[55] ); + VVVV3P0_1( w_fp[1], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[83] ); + VVVV4P0_1( w_fp[1], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[84] ); + // Amplitude(s) for diagram number 242 + VVV1_0( w_fp[55], w_fp[4], w_fp[62], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[83], w_fp[4], w_fp[62], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[84], w_fp[4], w_fp[62], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram243( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 243 OF 1240 *** + // Wavefunction(s) for diagram number 243 + // (none) + // Amplitude(s) for diagram number 243 + FFV1_0( w_fp[3], w_fp[9], w_fp[55], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[9], w_fp[83], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[9], w_fp[84], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram244( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef 
MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 244 OF 1240 *** + // Wavefunction(s) for diagram number 244 + // (none) + // Amplitude(s) for diagram number 244 + FFV1_0( w_fp[46], w_fp[34], w_fp[55], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + FFV1_0( w_fp[46], w_fp[34], w_fp[83], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + FFV1_0( w_fp[46], w_fp[34], w_fp[84], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram245( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 245 OF 1240 *** + // Wavefunction(s) for diagram number 245 + // (none) + // Amplitude(s) for diagram number 245 + FFV1_0( w_fp[3], w_fp[77], w_fp[30], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[77], w_fp[31], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[77], w_fp[32], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram246( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 246 OF 1240 *** + // Wavefunction(s) for diagram number 246 + // (none) + // Amplitude(s) for diagram number 246 + VVV1_0( w_fp[1], w_fp[30], w_fp[62], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[1], w_fp[31], w_fp[62], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[1], w_fp[32], w_fp[62], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base 
generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram247( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 247 OF 1240 *** + // Wavefunction(s) for diagram number 247 + FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[62] ); + FFV1_1( w_fp[2], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[77] ); + FFV1_2( w_fp[62], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[34] ); + FFV1_1( w_fp[77], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); + // Amplitude(s) for diagram number 247 + FFV1_0( w_fp[34], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram248( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 248 OF 1240 
*** + // Wavefunction(s) for diagram number 248 + FFV1_1( w_fp[77], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[85] ); + // Amplitude(s) for diagram number 248 + FFV1_0( w_fp[34], w_fp[85], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram249( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 249 OF 1240 *** + // Wavefunction(s) for diagram number 249 + FFV1_2( w_fp[62], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[86] ); + FFV1_1( w_fp[77], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[87] ); + // Amplitude(s) for diagram number 249 + FFV1_0( w_fp[86], w_fp[87], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram250( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 250 OF 1240 *** + // Wavefunction(s) for diagram number 250 + // (none) + // Amplitude(s) for diagram number 250 + FFV1_0( w_fp[86], w_fp[85], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + 
} + + //-------------------------------------------------------------------------- + + __global__ void + diagram251( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 251 OF 1240 *** + // Wavefunction(s) for diagram number 251 + FFV1_2( w_fp[62], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[88] ); + // Amplitude(s) for diagram number 251 + FFV1_0( w_fp[88], w_fp[87], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram252( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 252 OF 1240 *** + // Wavefunction(s) for diagram number 252 + // (none) + // Amplitude(s) for diagram number 252 + FFV1_0( w_fp[88], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram253( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** 
COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 253 OF 1240 *** + // Wavefunction(s) for diagram number 253 + FFV1P0_3( w_fp[62], w_fp[77], COUPs[1], 1.0, 0., 0., w_fp[89] ); + // Amplitude(s) for diagram number 253 + VVV1_0( w_fp[24], w_fp[6], w_fp[89], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram254( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 254 OF 1240 *** + // Wavefunction(s) for diagram number 254 + FFV1_2( w_fp[62], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[90] ); + // Amplitude(s) for diagram number 254 + FFV1_0( w_fp[90], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram255( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* 
denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 255 OF 1240 *** + // Wavefunction(s) for diagram number 255 + // (none) + // Amplitude(s) for diagram number 255 + FFV1_0( w_fp[88], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram256( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 256 OF 1240 *** + // Wavefunction(s) for diagram number 256 + // (none) + // Amplitude(s) for diagram number 256 + VVV1_0( w_fp[27], w_fp[5], w_fp[89], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram257( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers 
all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 257 OF 1240 *** + // Wavefunction(s) for diagram number 257 + FFV1_2( w_fp[62], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[91] ); + // Amplitude(s) for diagram number 257 + FFV1_0( w_fp[91], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram258( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 258 OF 1240 *** + // Wavefunction(s) for diagram number 258 + // (none) + // Amplitude(s) for diagram number 258 + FFV1_0( w_fp[86], w_fp[77], w_fp[27], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram259( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 259 OF 1240 *** + // Wavefunction(s) for diagram number 259 + // (none) + // Amplitude(s) for diagram number 259 + VVV1_0( w_fp[4], w_fp[29], w_fp[89], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated 
with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram260( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 260 OF 1240 *** + // Wavefunction(s) for diagram number 260 + // (none) + // Amplitude(s) for diagram number 260 + FFV1_0( w_fp[34], w_fp[77], w_fp[29], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram261( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 261 OF 1240 *** + // Wavefunction(s) for diagram number 261 + FFV1_2( w_fp[62], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[89] ); + // Amplitude(s) for diagram number 261 + FFV1_0( w_fp[89], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + 
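Diagrams 247-262 above each touch only one or two color flows, always through J_ACCESS::kernelAccessIcol. Every signature declares jamps as jamps[ncolor*2*nevtORneppV], i.e. two fptypes (real, imaginary) per color flow per event or per SIMD event page. A minimal scalar model of the accessor, under the assumptions neppV == 1 and contiguous (re, im) pairs in color order (the plugin's real memory access classes are templated over the layout, so this is an illustration, not their implementation):

  #include <complex>
  using fptype = double;
  using cxtype = std::complex<fptype>;

  struct J_ACCESS_scalar
  {
    // color icol occupies two consecutive fptypes (re, im) when neppV == 1;
    // relies on std::complex<T>'s array-layout compatibility with T[2]
    static cxtype& kernelAccessIcol( fptype* jamps, int icol )
    {
      return *reinterpret_cast<cxtype*>( jamps + 2 * icol );
    }
  };

  int main()
  {
    fptype jamps[2 * 120] = {}; // jamp indices up to 119 appear below, consistent with ncolor = 120
    const cxtype amp( 1., -2. );
    J_ACCESS_scalar::kernelAccessIcol( jamps, 41 ) -= amp; // same form as diagram 247 above
    return 0;
  }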
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram262( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 262 OF 1240 ***
+    // Wavefunction(s) for diagram number 262
+    // (none)
+    // Amplitude(s) for diagram number 262
+    FFV1_0( w_fp[62], w_fp[77], w_fp[30], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    FFV1_0( w_fp[62], w_fp[77], w_fp[31], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    FFV1_0( w_fp[62], w_fp[77], w_fp[32], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram263( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 263 OF 1240 ***
+    // Wavefunction(s) for diagram number 263
+    FFV1P0_3( w_fp[62], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[92] );
+    // Amplitude(s) for diagram number 263
+    VVV1_0( w_fp[92], w_fp[63], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram264( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 264 OF 1240 ***
+    // Wavefunction(s) for diagram number 264
+    // (none)
+    // Amplitude(s) for diagram number 264
+    VVV1_0( w_fp[92], w_fp[64], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram265( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 265 OF 1240 ***
+    // Wavefunction(s) for diagram number 265
+    // (none)
+    // Amplitude(s) for diagram number 265
+    VVVV1_0( w_fp[61], w_fp[5], w_fp[6], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVVV3_0( w_fp[61], w_fp[5], w_fp[6], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVVV4_0( w_fp[61], w_fp[5], w_fp[6], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram266( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 266 OF 1240 ***
+    // Wavefunction(s) for diagram number 266
+    FFV1_1( w_fp[2], w_fp[61], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[93] );
+    // Amplitude(s) for diagram number 266
+    FFV1_0( w_fp[86], w_fp[93], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram267( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 267 OF 1240 ***
+    // Wavefunction(s) for diagram number 267
+    // (none)
+    // Amplitude(s) for diagram number 267
+    FFV1_0( w_fp[86], w_fp[2], w_fp[64], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram268( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 268 OF 1240 ***
+    // Wavefunction(s) for diagram number 268
+    // (none)
+    // Amplitude(s) for diagram number 268
+    FFV1_0( w_fp[88], w_fp[93], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram269( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 269 OF 1240 ***
+    // Wavefunction(s) for diagram number 269
+    // (none)
+    // Amplitude(s) for diagram number 269
+    FFV1_0( w_fp[88], w_fp[2], w_fp[63], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram270( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 270 OF 1240 ***
+    // Wavefunction(s) for diagram number 270
+    FFV1_2( w_fp[62], w_fp[61], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[94] );
+    // Amplitude(s) for diagram number 270
+    FFV1_0( w_fp[94], w_fp[39], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram271( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 271 OF 1240 ***
+    // Wavefunction(s) for diagram number 271
+    FFV1P0_3( w_fp[62], w_fp[39], COUPs[1], 1.0, 0., 0., w_fp[95] );
+    // Amplitude(s) for diagram number 271
+    VVV1_0( w_fp[61], w_fp[6], w_fp[95], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram272( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 272 OF 1240 ***
+    // Wavefunction(s) for diagram number 272
+    // (none)
+    // Amplitude(s) for diagram number 272
+    FFV1_0( w_fp[88], w_fp[39], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram273( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 273 OF 1240 ***
+    // Wavefunction(s) for diagram number 273
+    // (none)
+    // Amplitude(s) for diagram number 273
+    FFV1_0( w_fp[94], w_fp[47], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
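Each amplitude computed above is accumulated into a handful of color flows with a weight of +1, -1, +i or -i. The snippet below only illustrates that arithmetic with std::complex; it assumes a plain contiguous jamps layout, whereas the real J_ACCESS::kernelAccessIcol hides the plugin's event-parallel memory layout:

    #include <complex>
    #include <vector>
    using cxtype = std::complex<double>; // stand-in for the plugin's cxtype
    int main()
    {
      std::vector<cxtype> jamps( 120 );  // one complex value per color flow (assumed layout)
      const cxtype amp( 0.5, -0.25 );    // stand-in for amp_sv[0]
      jamps[117] += amp;                 // weight +1
      jamps[119] -= amp;                 // weight -1
      jamps[33] += cxtype( 0, 1 ) * amp; // weight +i
      jamps[35] -= cxtype( 0, 1 ) * amp; // weight -i
      return 0;
    }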
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram274( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 274 OF 1240 ***
+    // Wavefunction(s) for diagram number 274
+    FFV1P0_3( w_fp[62], w_fp[47], COUPs[1], 1.0, 0., 0., w_fp[96] );
+    // Amplitude(s) for diagram number 274
+    VVV1_0( w_fp[61], w_fp[5], w_fp[96], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram275( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 275 OF 1240 ***
+    // Wavefunction(s) for diagram number 275
+    // (none)
+    // Amplitude(s) for diagram number 275
+    FFV1_0( w_fp[86], w_fp[47], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram276( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 276 OF 1240 ***
+    // Wavefunction(s) for diagram number 276
+    // (none)
+    // Amplitude(s) for diagram number 276
+    FFV1_0( w_fp[94], w_fp[2], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram277( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 277 OF 1240 ***
+    // Wavefunction(s) for diagram number 277
+    // (none)
+    // Amplitude(s) for diagram number 277
+    VVV1_0( w_fp[61], w_fp[29], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram278( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 278 OF 1240 ***
+    // Wavefunction(s) for diagram number 278
+    // (none)
+    // Amplitude(s) for diagram number 278
+    FFV1_0( w_fp[89], w_fp[2], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram279( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 279 OF 1240 ***
+    // Wavefunction(s) for diagram number 279
+    // (none)
+    // Amplitude(s) for diagram number 279
+    VVV1_0( w_fp[92], w_fp[69], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram280( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 280 OF 1240 ***
+    // Wavefunction(s) for diagram number 280
+    // (none)
+    // Amplitude(s) for diagram number 280
+    VVV1_0( w_fp[92], w_fp[70], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram281( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 281 OF 1240 ***
+    // Wavefunction(s) for diagram number 281
+    // (none)
+    // Amplitude(s) for diagram number 281
+    VVVV1_0( w_fp[66], w_fp[4], w_fp[6], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVVV3_0( w_fp[66], w_fp[4], w_fp[6], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVVV4_0( w_fp[66], w_fp[4], w_fp[6], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram282( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 282 OF 1240 ***
+    // Wavefunction(s) for diagram number 282
+    FFV1_1( w_fp[2], w_fp[66], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[94] );
+    // Amplitude(s) for diagram number 282
+    FFV1_0( w_fp[34], w_fp[94], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram283( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 283 OF 1240 ***
+    // Wavefunction(s) for diagram number 283
+    // (none)
+    // Amplitude(s) for diagram number 283
+    FFV1_0( w_fp[34], w_fp[2], w_fp[70], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram284( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 284 OF 1240 ***
+    // Wavefunction(s) for diagram number 284
+    // (none)
+    // Amplitude(s) for diagram number 284
+    FFV1_0( w_fp[88], w_fp[94], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
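Because every diagramNNN kernel shares the same signature, a caller can drive the full set from a single table instead of emitting 1240 hand-written calls. A minimal host-side sketch, with all names assumed for illustration (the actual driver is not part of this hunk):

    using fptype = double; // stand-in for the plugin's fptype
    typedef void ( *DiagramFn )( fptype* wfs, fptype* jamps, const unsigned int* channelIds,
                                 const fptype* couplings, fptype* numerators, fptype* denominators );
    void runDiagrams( const DiagramFn* table, int ndiagrams,
                      fptype* wfs, fptype* jamps, const fptype* couplings )
    {
      // Without multichannel support, channelIds/numerators/denominators stay nullptr,
      // matching the sanity check mentioned in each diagram's header comment.
      for( int i = 0; i < ndiagrams; ++i )
        table[i]( wfs, jamps, nullptr, couplings, nullptr, nullptr );
    }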
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram285( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 285 OF 1240 ***
+    // Wavefunction(s) for diagram number 285
+    // (none)
+    // Amplitude(s) for diagram number 285
+    FFV1_0( w_fp[88], w_fp[2], w_fp[69], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram286( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 286 OF 1240 ***
+    // Wavefunction(s) for diagram number 286
+    FFV1_2( w_fp[62], w_fp[66], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[97] );
+    // Amplitude(s) for diagram number 286
+    FFV1_0( w_fp[97], w_fp[33], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram287( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 287 OF 1240 ***
+    // Wavefunction(s) for diagram number 287
+    FFV1P0_3( w_fp[62], w_fp[33], COUPs[1], 1.0, 0., 0., w_fp[98] );
+    // Amplitude(s) for diagram number 287
+    VVV1_0( w_fp[66], w_fp[6], w_fp[98], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram288( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 288 OF 1240 ***
+    // Wavefunction(s) for diagram number 288
+    // (none)
+    // Amplitude(s) for diagram number 288
+    FFV1_0( w_fp[88], w_fp[33], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram289( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 289 OF 1240 ***
+    // Wavefunction(s) for diagram number 289
+    // (none)
+    // Amplitude(s) for diagram number 289
+    FFV1_0( w_fp[97], w_fp[47], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram290( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 290 OF 1240 ***
+    // Wavefunction(s) for diagram number 290
+    // (none)
+    // Amplitude(s) for diagram number 290
+    VVV1_0( w_fp[66], w_fp[4], w_fp[96], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram291( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 291 OF 1240 ***
+    // Wavefunction(s) for diagram number 291
+    // (none)
+    // Amplitude(s) for diagram number 291
+    FFV1_0( w_fp[34], w_fp[47], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram292( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 292 OF 1240 ***
+    // Wavefunction(s) for diagram number 292
+    // (none)
+    // Amplitude(s) for diagram number 292
+    FFV1_0( w_fp[97], w_fp[2], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram293( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 293 OF 1240 ***
+    // Wavefunction(s) for diagram number 293
+    // (none)
+    // Amplitude(s) for diagram number 293
+    VVV1_0( w_fp[66], w_fp[27], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram294( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 294 OF 1240 ***
+    // Wavefunction(s) for diagram number 294
+    // (none)
+    // Amplitude(s) for diagram number 294
+    FFV1_0( w_fp[91], w_fp[2], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram295( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 295 OF 1240 ***
+    // Wavefunction(s) for diagram number 295
+    // (none)
+    // Amplitude(s) for diagram number 295
+    VVV1_0( w_fp[92], w_fp[74], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram296( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 296 OF 1240 ***
+    // Wavefunction(s) for diagram number 296
+    // (none)
+    // Amplitude(s) for diagram number 296
+    VVV1_0( w_fp[92], w_fp[75], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram297( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 297 OF 1240 ***
+    // Wavefunction(s) for diagram number 297
+    // (none)
+    // Amplitude(s) for diagram number 297
+    VVVV1_0( w_fp[72], w_fp[4], w_fp[5], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVVV3_0( w_fp[72], w_fp[4], w_fp[5], w_fp[92], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef
MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0]; + VVVV4_0( w_fp[72], w_fp[4], w_fp[5], w_fp[92], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram298( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 298 OF 1240 *** + // Wavefunction(s) for diagram number 298 + FFV1_1( w_fp[2], w_fp[72], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[97] ); + // Amplitude(s) for diagram number 298 + FFV1_0( w_fp[34], w_fp[97], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram299( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to 
disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 299 OF 1240 *** + // Wavefunction(s) for diagram number 299 + // (none) + // Amplitude(s) for diagram number 299 + FFV1_0( w_fp[34], w_fp[2], w_fp[75], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram300( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 300 OF 1240 *** + // Wavefunction(s) for diagram number 300 + // (none) + // Amplitude(s) for diagram number 300 + FFV1_0( w_fp[86], w_fp[97], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram301( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add 
helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 301 OF 1240 *** + // Wavefunction(s) for diagram number 301 + // (none) + // Amplitude(s) for diagram number 301 + FFV1_0( w_fp[86], w_fp[2], w_fp[74], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram302( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 302 OF 1240 *** + // Wavefunction(s) for diagram number 302 + FFV1_2( w_fp[62], w_fp[72], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] ); + // Amplitude(s) for diagram number 302 + FFV1_0( w_fp[99], w_fp[33], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram303( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef 
MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 303 OF 1240 *** + // Wavefunction(s) for diagram number 303 + // (none) + // Amplitude(s) for diagram number 303 + VVV1_0( w_fp[72], w_fp[5], w_fp[98], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram304( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 304 OF 1240 *** + // Wavefunction(s) for diagram number 304 + // (none) + // Amplitude(s) for diagram number 304 + FFV1_0( w_fp[86], w_fp[33], w_fp[72], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram305( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 305 OF 1240 *** + // Wavefunction(s) for diagram number 305 + // (none) + // Amplitude(s) for diagram number 305 + FFV1_0( 
w_fp[99], w_fp[39], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram306( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 306 OF 1240 *** + // Wavefunction(s) for diagram number 306 + // (none) + // Amplitude(s) for diagram number 306 + VVV1_0( w_fp[72], w_fp[4], w_fp[95], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram307( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 307 OF 1240 *** + // Wavefunction(s) for diagram number 307 + // (none) + // Amplitude(s) for diagram number 307 + FFV1_0( w_fp[34], w_fp[39], w_fp[72], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram308( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 308 OF 1240 *** + // Wavefunction(s) for diagram number 308 + // (none) + // Amplitude(s) for diagram number 308 + FFV1_0( w_fp[99], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram309( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 309 OF 1240 *** + // Wavefunction(s) for diagram number 309 + // (none) + // Amplitude(s) for diagram number 309 + VVV1_0( w_fp[72], w_fp[24], w_fp[92], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 
1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram310( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 310 OF 1240 *** + // Wavefunction(s) for diagram number 310 + // (none) + // Amplitude(s) for diagram number 310 + FFV1_0( w_fp[90], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram311( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 311 OF 1240 *** + // Wavefunction(s) for diagram number 311 + FFV1_2( w_fp[62], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] ); + // Amplitude(s) for diagram number 311 + FFV1_0( w_fp[99], w_fp[35], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram312( fptype* wfs, // input/output 
wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 312 OF 1240 *** + // Wavefunction(s) for diagram number 312 + // (none) + // Amplitude(s) for diagram number 312 + FFV1_0( w_fp[99], w_fp[36], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram313( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 313 OF 1240 *** + // Wavefunction(s) for diagram number 313 + FFV1_1( w_fp[33], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[100] ); + // Amplitude(s) for diagram number 313 + FFV1_0( w_fp[86], w_fp[100], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram314( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel 
numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 314 OF 1240 *** + // Wavefunction(s) for diagram number 314 + // (none) + // Amplitude(s) for diagram number 314 + FFV1_0( w_fp[86], w_fp[36], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram315( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 315 OF 1240 *** + // Wavefunction(s) for diagram number 315 + // (none) + // Amplitude(s) for diagram number 315 + FFV1_0( w_fp[88], w_fp[100], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram316( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 316 OF 1240 *** + // Wavefunction(s) for diagram number 316 + // (none) + // Amplitude(s) for diagram number 316 + FFV1_0( 
w_fp[88], w_fp[35], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram317( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 317 OF 1240 *** + // Wavefunction(s) for diagram number 317 + // (none) + // Amplitude(s) for diagram number 317 + FFV1_0( w_fp[99], w_fp[33], w_fp[29], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram318( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 318 OF 1240 *** + // Wavefunction(s) for diagram number 318 + // (none) + // Amplitude(s) for diagram number 318 + VVV1_0( w_fp[1], w_fp[29], w_fp[98], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + } + + 
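// A minimal illustrative sketch of what the two recurring comments above describe (an assumption
// based on the pre-existing CPPProcess.cc pattern, not the actual generated code in this patch):
// "diagram_boilerplate.h" is expanded at the top of every diagramNNN kernel, and the multichannel
// update of #473 adds the squared amplitude of the event's chosen channel to the numerators and
// that of every diagram to the denominators. Here NNN, channelId, cxabs2, numerators_sv and
// denominators_sv are assumed names.
//
//   #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
//   // Uniform kernel signature without multichannel support: the three unused
//   // pointers carry no data in this build and must all be nullptr
//   assert( channelIds == nullptr );
//   assert( numerators == nullptr );
//   assert( denominators == nullptr );
//   #endif
//
//   #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
//   // Single-diagram-enhancement bookkeeping (#473), after each amplitude call
//   if( channelId == NNN ) numerators_sv += cxabs2( amp_sv[0] ); // this diagram is the chosen channel
//   if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); // every diagram feeds the denominator
//   #endif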
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram319( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 319 OF 1240 ***
+    // Wavefunction(s) for diagram number 319
+    // (none)
+    // Amplitude(s) for diagram number 319
+    FFV1_0( w_fp[89], w_fp[33], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram320( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 320 OF 1240 ***
+    // Wavefunction(s) for diagram number 320
+    // (none)
+    // Amplitude(s) for diagram number 320
+    FFV1_0( w_fp[99], w_fp[43], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram321( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 321 OF 1240 ***
+    // Wavefunction(s) for diagram number 321
+    // (none)
+    // Amplitude(s) for diagram number 321
+    FFV1_0( w_fp[99], w_fp[44], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram322( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 322 OF 1240 ***
+    // Wavefunction(s) for diagram number 322
+    FFV1_1( w_fp[39], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[89] );
+    // Amplitude(s) for diagram number 322
+    FFV1_0( w_fp[34], w_fp[89], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram323( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 323 OF 1240 ***
+    // Wavefunction(s) for diagram number 323
+    // (none)
+    // Amplitude(s) for diagram number 323
+    FFV1_0( w_fp[34], w_fp[44], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram324( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 324 OF 1240 ***
+    // Wavefunction(s) for diagram number 324
+    // (none)
+    // Amplitude(s) for diagram number 324
+    FFV1_0( w_fp[88], w_fp[89], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram325( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 325 OF 1240 ***
+    // Wavefunction(s) for diagram number 325
+    // (none)
+    // Amplitude(s) for diagram number 325
+    FFV1_0( w_fp[88], w_fp[43], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram326( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 326 OF 1240 ***
+    // Wavefunction(s) for diagram number 326
+    // (none)
+    // Amplitude(s) for diagram number 326
+    FFV1_0( w_fp[99], w_fp[39], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram327( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 327 OF 1240 ***
+    // Wavefunction(s) for diagram number 327
+    // (none)
+    // Amplitude(s) for diagram number 327
+    VVV1_0( w_fp[1], w_fp[27], w_fp[95], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram328( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 328 OF 1240 ***
+    // Wavefunction(s) for diagram number 328
+    // (none)
+    // Amplitude(s) for diagram number 328
+    FFV1_0( w_fp[91], w_fp[39], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram329( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 329 OF 1240 ***
+    // Wavefunction(s) for diagram number 329
+    // (none)
+    // Amplitude(s) for diagram number 329
+    FFV1_0( w_fp[99], w_fp[49], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram330( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 330 OF 1240 ***
+    // Wavefunction(s) for diagram number 330
+    // (none)
+    // Amplitude(s) for diagram number 330
+    FFV1_0( w_fp[99], w_fp[50], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram331( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 331 OF 1240 ***
+    // Wavefunction(s) for diagram number 331
+    FFV1_1( w_fp[47], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[91] );
+    // Amplitude(s) for diagram number 331
+    FFV1_0( w_fp[34], w_fp[91], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram332( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 332 OF 1240 ***
+    // Wavefunction(s) for diagram number 332
+    // (none)
+    // Amplitude(s) for diagram number 332
+    FFV1_0( w_fp[34], w_fp[50], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram333( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 333 OF 1240 ***
+    // Wavefunction(s) for diagram number 333
+    // (none)
+    // Amplitude(s) for diagram number 333
+    FFV1_0( w_fp[86], w_fp[91], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram334( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 334 OF 1240 ***
+    // Wavefunction(s) for diagram number 334
+    // (none)
+    // Amplitude(s) for diagram number 334
+    FFV1_0( w_fp[86], w_fp[49], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram335( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 335 OF 1240 ***
+    // Wavefunction(s) for diagram number 335
+    // (none)
+    // Amplitude(s) for diagram number 335
+    FFV1_0( w_fp[99], w_fp[47], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram336( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 336 OF 1240 ***
+    // Wavefunction(s) for diagram number 336
+    // (none)
+    // Amplitude(s) for diagram number 336
+    VVV1_0( w_fp[1], w_fp[24], w_fp[96], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram337( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 337 OF 1240 *** + // Wavefunction(s) for diagram number 337 + // (none) + // Amplitude(s) for diagram number 337 + FFV1_0( w_fp[90], w_fp[47], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram338( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 338 OF 1240 *** + // Wavefunction(s) for diagram number 338 + // (none) + // Amplitude(s) for diagram number 338 + FFV1_0( w_fp[99], w_fp[17], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram339( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 339 OF 1240 *** + // Wavefunction(s) for diagram number 339 + // (none) + // 
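Every diagramNNN kernel in this file shares the identical signature shown above, which is what the recurring "uniform interface" comment refers to: callers never need to know which diagram they are invoking. As a purely illustrative sketch of what that uniformity buys on the C++ path, one could collect the generated kernels in a dispatch table; the names diagram_fn, s_diagrams and runDiagrams below are hypothetical and do not appear in this patch:

  // Sketch only (assumed, not part of this diff): a dispatch table over the
  // uniform diagramNNN signature, C++ (non-GPU) branch of the #ifdef above.
  typedef void ( *diagram_fn )( fptype* wfs, fptype* jamps, const unsigned int* channelIds, const fptype** COUPs, fptype* numerators, fptype* denominators );
  static const diagram_fn s_diagrams[] = { diagram336, diagram337, diagram338 }; // ...one entry per generated kernel
  inline void runDiagrams( fptype* wfs, fptype* jamps, const unsigned int* channelIds, const fptype** COUPs, fptype* numerators, fptype* denominators )
  {
    for( diagram_fn d : s_diagrams ) d( wfs, jamps, channelIds, COUPs, numerators, denominators ); // same arguments for every diagram
  }

On the GPU branch the same uniformity would allow launching each diagramNNN kernel with identical launch parameters.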
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram339( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 339 OF 1240 ***
+    // Wavefunction(s) for diagram number 339
+    // (none)
+    // Amplitude(s) for diagram number 339
+    FFV1_0( w_fp[99], w_fp[2], w_fp[42], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram340( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 340 OF 1240 ***
+    // Wavefunction(s) for diagram number 340
+    // (none)
+    // Amplitude(s) for diagram number 340
+    VVV1_0( w_fp[92], w_fp[59], w_fp[6], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram341( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 341 OF 1240 ***
+    // Wavefunction(s) for diagram number 341
+    // (none)
+    // Amplitude(s) for diagram number 341
+    VVV1_0( w_fp[92], w_fp[1], w_fp[42], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram342( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 342 OF 1240 ***
+    // Wavefunction(s) for diagram number 342
+    // (none)
+    // Amplitude(s) for diagram number 342
+    VVVV1_0( w_fp[1], w_fp[24], w_fp[6], w_fp[92], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVVV3_0( w_fp[1], w_fp[24], w_fp[6], w_fp[92], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVVV4_0( w_fp[1], w_fp[24], w_fp[6], w_fp[92], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram343( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 343 OF 1240 ***
+    // Wavefunction(s) for diagram number 343
+    // (none)
+    // Amplitude(s) for diagram number 343
+    FFV1_0( w_fp[88], w_fp[2], w_fp[59], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram344( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 344 OF 1240 ***
+    // Wavefunction(s) for diagram number 344
+    // (none)
+    // Amplitude(s) for diagram number 344
+    FFV1_0( w_fp[88], w_fp[17], w_fp[1], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram345( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 345 OF 1240 ***
+    // Wavefunction(s) for diagram number 345
+    // (none)
+    // Amplitude(s) for diagram number 345
+    FFV1_0( w_fp[99], w_fp[15], w_fp[5], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
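Each kernel body above begins with `#include "diagram_boilerplate.h"`, so that header must supply the locals the generated statements rely on (the amp_sv buffer and the _fp alias passed as &_fp[0]) plus the nullptr sanity check described in the recurring comment. The header itself is not part of this excerpt; the following is only a sketch of what its contract could look like, with everything beyond the names visible above assumed:

  // Hypothetical sketch of diagram_boilerplate.h - the real header is not shown in this diff.
  // It is textually included inside every diagramNNN body, so it can refer to the kernel parameters.
  #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
    // Uniform interface without multichannel support: the three pointers must all be nullptr (assumed check).
    assert( channelIds == nullptr && numerators == nullptr && denominators == nullptr );
  #endif
    cxtype_sv amp_sv[1] = {};                          // local amplitude buffer written by the FFV/VVV/VVVV helpers
    fptype* _fp = reinterpret_cast<fptype*>( amp_sv ); // reinterpreted view used as the &_fp[0] argument above

Because the include sits inside the function body, such a header needs no include guard and deliberately has no self-contained scope of its own.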
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram346( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 346 OF 1240 ***
+    // Wavefunction(s) for diagram number 346
+    // (none)
+    // Amplitude(s) for diagram number 346
+    FFV1_0( w_fp[99], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram347( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 347 OF 1240 ***
+    // Wavefunction(s) for diagram number 347
+    // (none)
+    // Amplitude(s) for diagram number 347
+    VVV1_0( w_fp[92], w_fp[68], w_fp[5], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram348( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 348 OF 1240 ***
+    // Wavefunction(s) for diagram number 348
+    // (none)
+    // Amplitude(s) for diagram number 348
+    VVV1_0( w_fp[92], w_fp[1], w_fp[16], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram349( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 349 OF 1240 ***
+    // Wavefunction(s) for diagram number 349
+    // (none)
+    // Amplitude(s) for diagram number 349
+    VVVV1_0( w_fp[1], w_fp[27], w_fp[5], w_fp[92], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVVV3_0( w_fp[1], w_fp[27], w_fp[5], w_fp[92], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVVV4_0( w_fp[1], w_fp[27], w_fp[5], w_fp[92], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram350( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 350 OF 1240 ***
+    // Wavefunction(s) for diagram number 350
+    // (none)
+    // Amplitude(s) for diagram number 350
+    FFV1_0( w_fp[86], w_fp[2], w_fp[68], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram351( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 351 OF 1240 ***
+    // Wavefunction(s) for diagram number 351
+    // (none)
+    // Amplitude(s) for diagram number 351
+    FFV1_0( w_fp[86], w_fp[15], w_fp[1], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram352( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 352 OF 1240 ***
+    // Wavefunction(s) for diagram number 352
+    // (none)
+    // Amplitude(s) for diagram number 352
+    FFV1_0( w_fp[99], w_fp[18], w_fp[4], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram353( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 353 OF 1240 ***
+    // Wavefunction(s) for diagram number 353
+    // (none)
+    // Amplitude(s) for diagram number 353
+    FFV1_0( w_fp[99], w_fp[2], w_fp[19], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram354( fptype* wfs,                     // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                   // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds,  // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,         // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,            // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,              // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 354 OF 1240 ***
+    // Wavefunction(s) for diagram number 354
+    // (none)
+    // Amplitude(s) for diagram number 354
+    VVV1_0( w_fp[92], w_fp[67], w_fp[4], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
DIAGRAM 355 OF 1240 *** + // Wavefunction(s) for diagram number 355 + // (none) + // Amplitude(s) for diagram number 355 + VVV1_0( w_fp[92], w_fp[1], w_fp[19], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram356( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 356 OF 1240 *** + // Wavefunction(s) for diagram number 356 + // (none) + // Amplitude(s) for diagram number 356 + VVVV1_0( w_fp[1], w_fp[4], w_fp[29], w_fp[92], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0]; + VVVV3_0( w_fp[1], w_fp[4], w_fp[29], w_fp[92], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVVV4_0( w_fp[1], w_fp[4], w_fp[29], w_fp[92], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram357( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 357 OF 1240 *** + // Wavefunction(s) for diagram number 357 + // (none) + // Amplitude(s) for diagram number 357 + FFV1_0( w_fp[34], w_fp[2], w_fp[67], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram358( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, 
numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 358 OF 1240 *** + // Wavefunction(s) for diagram number 358 + // (none) + // Amplitude(s) for diagram number 358 + FFV1_0( w_fp[34], w_fp[18], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram359( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 359 OF 1240 *** + // Wavefunction(s) for diagram number 359 + // (none) + // Amplitude(s) for diagram number 359 + VVV1_0( w_fp[73], w_fp[6], w_fp[92], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[79], w_fp[6], w_fp[92], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0]; + 
VVV1_0( w_fp[80], w_fp[6], w_fp[92], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram360( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 360 OF 1240 *** + // Wavefunction(s) for diagram number 360 + // (none) + // Amplitude(s) for diagram number 360 + FFV1_0( w_fp[88], w_fp[2], w_fp[73], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0]; + FFV1_0( w_fp[88], w_fp[2], w_fp[79], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + FFV1_0( w_fp[88], w_fp[2], w_fp[80], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram361( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, 
// output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 361 OF 1240 *** + // Wavefunction(s) for diagram number 361 + // (none) + // Amplitude(s) for diagram number 361 + FFV1_0( w_fp[62], w_fp[47], w_fp[73], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + FFV1_0( w_fp[62], w_fp[47], w_fp[79], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + FFV1_0( w_fp[62], w_fp[47], w_fp[80], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram362( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 362 OF 1240 *** + // Wavefunction(s) for diagram number 362 + // (none) + // Amplitude(s) for 
+    // Amplitude(s) for diagram number 362
+    VVV1_0( w_fp[57], w_fp[5], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVV1_0( w_fp[81], w_fp[5], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVV1_0( w_fp[82], w_fp[5], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram363( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 363 OF 1240 ***
+    // Wavefunction(s) for diagram number 363
+    // (none)
+    // Amplitude(s) for diagram number 363
+    FFV1_0( w_fp[86], w_fp[2], w_fp[57], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+    FFV1_0( w_fp[86], w_fp[2], w_fp[81], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    FFV1_0( w_fp[86], w_fp[2], w_fp[82], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram364( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 364 OF 1240 ***
+    // Wavefunction(s) for diagram number 364
+    // (none)
+    // Amplitude(s) for diagram number 364
+    FFV1_0( w_fp[62], w_fp[39], w_fp[57], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    FFV1_0( w_fp[62], w_fp[39], w_fp[81], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    FFV1_0( w_fp[62], w_fp[39], w_fp[82], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram365( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 365 OF 1240 ***
+    // Wavefunction(s) for diagram number 365
+    // (none)
+    // Amplitude(s) for diagram number 365
+    VVV1_0( w_fp[55], w_fp[4], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVV1_0( w_fp[83], w_fp[4], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVV1_0( w_fp[84], w_fp[4], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram366( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 366 OF 1240 ***
+    // Wavefunction(s) for diagram number 366
+    // (none)
+    // Amplitude(s) for diagram number 366
+    FFV1_0( w_fp[34], w_fp[2], w_fp[55], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+    FFV1_0( w_fp[34], w_fp[2], w_fp[83], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+    FFV1_0( w_fp[34], w_fp[2], w_fp[84], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
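+  // Where the "(#473)" placeholder comments appear, code generated with MGONGPU_SUPPORTS_MULTICHANNEL
+  // defined is assumed to accumulate the squared amplitude into the single-diagram-enhancement
+  // counters; an illustrative sketch (names taken from the surrounding comments, not the verbatim
+  // generated code):
+  //   if( channelIds != nullptr )
+  //   {
+  //     if( channelId == 367 ) numerators_sv += cxabs2( amp_sv[0] ); // only this diagram's channel
+  //     denominators_sv += cxabs2( amp_sv[0] );                      // every contributing channel
+  //   }
+
+  //--------------------------------------------------------------------------
+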
+  __global__ void
+  diagram367( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 367 OF 1240 ***
+    // Wavefunction(s) for diagram number 367
+    // (none)
+    // Amplitude(s) for diagram number 367
+    FFV1_0( w_fp[62], w_fp[33], w_fp[55], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    FFV1_0( w_fp[62], w_fp[33], w_fp[83], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    FFV1_0( w_fp[62], w_fp[33], w_fp[84], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram368( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 368 OF 1240 ***
+    // Wavefunction(s) for diagram number 368
+    // (none)
+    // Amplitude(s) for diagram number 368
+    FFV1_0( w_fp[99], w_fp[2], w_fp[30], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    FFV1_0( w_fp[99], w_fp[2], w_fp[31], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    FFV1_0( w_fp[99], w_fp[2], w_fp[32], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram369( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 369 OF 1240 ***
+    // Wavefunction(s) for diagram number 369
+    // (none)
+    // Amplitude(s) for diagram number 369
+    VVV1_0( w_fp[1], w_fp[30], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVV1_0( w_fp[1], w_fp[31], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVV1_0( w_fp[1], w_fp[32], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram370( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 370 OF 1240 ***
+    // Wavefunction(s) for diagram number 370
+    VVV1P0_1( w_fp[0], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[92] );
+    FFV1_2( w_fp[3], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] );
+    // Amplitude(s) for diagram number 370
+    FFV1_0( w_fp[99], w_fp[9], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram371( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 371 OF 1240 ***
+    // Wavefunction(s) for diagram number 371
+    // (none)
+    // Amplitude(s) for diagram number 371
+    FFV1_0( w_fp[99], w_fp[85], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram372( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 372 OF 1240 ***
+    // Wavefunction(s) for diagram number 372
+    VVV1P0_1( w_fp[92], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[62] );
+    FFV1P0_3( w_fp[3], w_fp[77], COUPs[1], 1.0, 0., 0., w_fp[34] );
+    // Amplitude(s) for diagram number 372
+    VVV1_0( w_fp[62], w_fp[34], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram373( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 373 OF 1240 ***
+    // Wavefunction(s) for diagram number 373
+    // (none)
+    // Amplitude(s) for diagram number 373
+    FFV1_0( w_fp[3], w_fp[85], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram374( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 374 OF 1240 ***
+    // Wavefunction(s) for diagram number 374
+    VVV1P0_1( w_fp[92], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[86] );
+    // Amplitude(s) for diagram number 374
+    VVV1_0( w_fp[86], w_fp[34], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
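+  // The jamps buffer holds ncolor complex color amplitudes per event, stored as two fptype each
+  // (hence jamps[ncolor*2*nevtORneppV]). J_ACCESS::kernelAccessIcol( jamps, icol ) is assumed to
+  // return a writable complex (vector) reference to color flow icol for the current event page,
+  // so that statements like
+  //   J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];                   // add the amplitude
+  //   J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0];  // subtract i times the amplitude
+  // accumulate each diagram into the relevant color flows with the phase dictated by the color algebra.
+
+  //--------------------------------------------------------------------------
+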
+  __global__ void
+  diagram375( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 375 OF 1240 ***
+    // Wavefunction(s) for diagram number 375
+    // (none)
+    // Amplitude(s) for diagram number 375
+    FFV1_0( w_fp[3], w_fp[9], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram376( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 376 OF 1240 ***
+    // Wavefunction(s) for diagram number 376
+    VVVV1P0_1( w_fp[92], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[88] );
+    VVVV3P0_1( w_fp[92], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[90] );
+    VVVV4P0_1( w_fp[92], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[96] );
+    // Amplitude(s) for diagram number 376
+    FFV1_0( w_fp[3], w_fp[77], w_fp[88], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[77], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[77], w_fp[96], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram377( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 377 OF 1240 ***
+    // Wavefunction(s) for diagram number 377
+    FFV1_1( w_fp[77], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[95] );
+    // Amplitude(s) for diagram number 377
+    FFV1_0( w_fp[38], w_fp[95], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram378( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 378 OF 1240 ***
+    // Wavefunction(s) for diagram number 378
+    FFV1_2( w_fp[38], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[98] );
+    // Amplitude(s) for diagram number 378
+    FFV1_0( w_fp[98], w_fp[77], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram379( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 379 OF 1240 ***
+    // Wavefunction(s) for diagram number 379
+    // (none)
+    // Amplitude(s) for diagram number 379
+    FFV1_0( w_fp[38], w_fp[77], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram380( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 380 OF 1240 ***
+    // Wavefunction(s) for diagram number 380
+    // (none)
+    // Amplitude(s) for diagram number 380
+    FFV1_0( w_fp[41], w_fp[95], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram381( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 381 OF 1240 ***
+    // Wavefunction(s) for diagram number 381
+    FFV1_2( w_fp[41], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[101] );
+    // Amplitude(s) for diagram number 381
+    FFV1_0( w_fp[101], w_fp[77], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram382( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 382 OF 1240 ***
+    // Wavefunction(s) for diagram number 382
+    // (none)
+    // Amplitude(s) for diagram number 382
+    FFV1_0( w_fp[41], w_fp[77], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram383( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 383 OF 1240 ***
+    // Wavefunction(s) for diagram number 383
+    // (none)
+    // Amplitude(s) for diagram number 383
+    FFV1_0( w_fp[3], w_fp[95], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram384( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 384 OF 1240 ***
+    // Wavefunction(s) for diagram number 384
+    // (none)
+    // Amplitude(s) for diagram number 384
+    FFV1_0( w_fp[99], w_fp[77], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram385( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 385 OF 1240 ***
+    // Wavefunction(s) for diagram number 385
+    VVV1P0_1( w_fp[92], w_fp[29], COUPs[0], 1.0, 0., 0., w_fp[95] );
+    // Amplitude(s) for diagram number 385
+    FFV1_0( w_fp[3], w_fp[77], w_fp[95], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
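+  // The per-diagram kernels are presumably chained by a calculate_wavefunctions-style driver (an
+  // assumption based on the uniform interface, not the verbatim generated code) that, for each
+  // helicity, resets jamps, calls diagram001 to diagram1240 in turn with the same argument list,
+  // and finally folds jamps into the matrix element through the color matrix, schematically:
+  //   reset jamps to zero
+  //   diagram386( wfs, jamps, channelIds, COUPs, numerators, denominators );
+  //   ... one call per Feynman diagram ...
+  //   ME += sum_{i,j} conj( jamp[i] ) * colorMatrix[i][j] * jamp[j]
+
+  //--------------------------------------------------------------------------
+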
+  __global__ void
+  diagram386( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 386 OF 1240 ***
+    // Wavefunction(s) for diagram number 386
+    FFV1_1( w_fp[2], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[102] );
+    // Amplitude(s) for diagram number 386
+    FFV1_0( w_fp[22], w_fp[102], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram387( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 387 OF 1240 ***
+    // Wavefunction(s) for diagram number 387
+    // (none)
+    // Amplitude(s) for diagram number 387
+    FFV1_0( w_fp[21], w_fp[102], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram388( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 388 OF 1240 ***
+    // Wavefunction(s) for diagram number 388
+    FFV1P0_3( w_fp[52], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[103] );
+    // Amplitude(s) for diagram number 388
+    VVV1_0( w_fp[62], w_fp[103], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram389( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 389 OF 1240 ***
+    // Wavefunction(s) for diagram number 389
+    // (none)
+    // Amplitude(s) for diagram number 389
+    FFV1_0( w_fp[21], w_fp[2], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram390( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used even when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 390 OF 1240 ***
Wavefunction(s) for diagram number 390 + // (none) + // Amplitude(s) for diagram number 390 + VVV1_0( w_fp[86], w_fp[103], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram391( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 391 OF 1240 *** + // Wavefunction(s) for diagram number 391 + // (none) + // Amplitude(s) for diagram number 391 + FFV1_0( w_fp[22], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram392( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts 
that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 392 OF 1240 *** + // Wavefunction(s) for diagram number 392 + // (none) + // Amplitude(s) for diagram number 392 + FFV1_0( w_fp[52], w_fp[2], w_fp[88], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[52], w_fp[2], w_fp[90], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[52], w_fp[2], w_fp[96], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram393( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef 
MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 393 OF 1240 *** + // Wavefunction(s) for diagram number 393 + FFV1_2( w_fp[52], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[104] ); + // Amplitude(s) for diagram number 393 + FFV1_0( w_fp[104], w_fp[39], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram394( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 394 OF 1240 *** + // Wavefunction(s) for diagram number 394 + FFV1_1( w_fp[39], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[105] ); + // Amplitude(s) for diagram number 394 + FFV1_0( w_fp[52], w_fp[105], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram395( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 395 OF 1240 *** + // Wavefunction(s) for diagram number 395 + // (none) + 
// Amplitude(s) for diagram number 395 + FFV1_0( w_fp[52], w_fp[39], w_fp[86], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram396( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 396 OF 1240 *** + // Wavefunction(s) for diagram number 396 + // (none) + // Amplitude(s) for diagram number 396 + FFV1_0( w_fp[104], w_fp[47], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram397( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 397 OF 1240 *** + // Wavefunction(s) for diagram number 397 + FFV1_1( w_fp[47], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[106] ); + // Amplitude(s) for diagram number 397 + FFV1_0( w_fp[52], w_fp[106], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv 
(#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram398( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 398 OF 1240 *** + // Wavefunction(s) for diagram number 398 + // (none) + // Amplitude(s) for diagram number 398 + FFV1_0( w_fp[52], w_fp[47], w_fp[62], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram399( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 399 OF 1240 *** + // Wavefunction(s) for diagram number 399 + // (none) + // Amplitude(s) for diagram number 399 + FFV1_0( w_fp[104], w_fp[2], w_fp[29], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + } + + 
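Aside on the recurring boilerplate comment: the uniform diagramXXX interface keeps channelIds, numerators and denominators in the signature even when the code base is generated without multichannel support. The sketch below is a hypothetical illustration only, not the actual contents of "diagram_boilerplate.h" (the header itself is not part of this diff); it shows the kind of nullptr sanity check that the repeated comment describes, and it assumes assert from <cassert> is available in the including translation unit.

  // Hypothetical sketch (assumed, not taken from diagram_boilerplate.h):
  // without multichannel support the three multichannel arguments of the
  // uniform diagramXXX interface carry no data, so the boilerplate can
  // sanity-check that all three pointers are nullptr.
  #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
    assert( channelIds == nullptr );   // no per-event channel information
    assert( numerators == nullptr );   // no multichannel numerators to update
    assert( denominators == nullptr ); // no multichannel denominators to update
  #endif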
//-------------------------------------------------------------------------- + + __global__ void + diagram400( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 400 OF 1240 *** + // Wavefunction(s) for diagram number 400 + // (none) + // Amplitude(s) for diagram number 400 + FFV1_0( w_fp[52], w_fp[102], w_fp[29], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram401( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 401 OF 1240 *** + // Wavefunction(s) for diagram number 401 + // (none) + // Amplitude(s) for diagram number 401 + FFV1_0( w_fp[52], w_fp[2], w_fp[95], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram402( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 402 OF 1240 *** + // Wavefunction(s) for diagram number 402 + // (none) + // Amplitude(s) for diagram number 402 + FFV1_0( w_fp[71], w_fp[102], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram403( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 403 OF 1240 *** + // Wavefunction(s) for diagram number 403 + // (none) + // Amplitude(s) for diagram number 403 + FFV1_0( w_fp[3], w_fp[102], w_fp[70], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 
0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram404( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 404 OF 1240 *** + // Wavefunction(s) for diagram number 404 + // (none) + // Amplitude(s) for diagram number 404 + FFV1_0( w_fp[99], w_fp[94], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram405( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 405 OF 1240 *** + // Wavefunction(s) for diagram number 405 + // (none) + // Amplitude(s) for diagram number 405 + FFV1_0( w_fp[99], w_fp[2], w_fp[70], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram406( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 406 OF 1240 *** + // Wavefunction(s) for diagram number 406 + // (none) + // Amplitude(s) for diagram number 406 + FFV1_0( w_fp[3], w_fp[94], w_fp[86], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram407( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 407 OF 1240 *** + // Wavefunction(s) for diagram number 407 + // 
(none) + // Amplitude(s) for diagram number 407 + FFV1_0( w_fp[71], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram408( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 408 OF 1240 *** + // Wavefunction(s) for diagram number 408 + // (none) + // Amplitude(s) for diagram number 408 + VVVV1_0( w_fp[92], w_fp[66], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + VVVV3_0( w_fp[92], w_fp[66], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + VVVV4_0( w_fp[92], w_fp[66], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram409( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 409 OF 1240 *** + // Wavefunction(s) for diagram number 409 + VVV1P0_1( w_fp[92], w_fp[66], COUPs[0], 1.0, 0., 0., w_fp[104] ); + // Amplitude(s) for diagram number 409 + VVV1_0( w_fp[8], w_fp[6], w_fp[104], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) 
-= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram410( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 410 OF 1240 *** + // Wavefunction(s) for diagram number 410 + VVV1P0_1( w_fp[92], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[107] ); + // Amplitude(s) for diagram number 410 + VVV1_0( w_fp[66], w_fp[6], w_fp[107], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram411( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* 
couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 411 OF 1240 *** + // Wavefunction(s) for diagram number 411 + // (none) + // Amplitude(s) for diagram number 411 + VVV1_0( w_fp[66], w_fp[8], w_fp[86], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram412( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 412 OF 1240 *** + // Wavefunction(s) for diagram number 412 + // (none) + // Amplitude(s) for diagram number 412 + FFV1_0( w_fp[3], w_fp[47], w_fp[104], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) 
* amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram413( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 413 OF 1240 *** + // Wavefunction(s) for diagram number 413 + // (none) + // Amplitude(s) for diagram number 413 + FFV1_0( w_fp[3], w_fp[106], w_fp[66], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram414( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 414 OF 1240 *** + // Wavefunction(s) for diagram number 414 + // (none) + // Amplitude(s) for diagram number 414 + FFV1_0( w_fp[99], w_fp[47], w_fp[66], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram415( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 415 OF 1240 *** + // Wavefunction(s) for diagram number 415 + // (none) + // Amplitude(s) for diagram number 415 + FFV1_0( w_fp[41], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram416( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 416 OF 1240 *** + // Wavefunction(s) for diagram number 416 + // (none) + // Amplitude(s) for diagram number 416 + FFV1_0( w_fp[41], w_fp[102], w_fp[66], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel 
support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram417( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 417 OF 1240 ***
+ // Wavefunction(s) for diagram number 417
+ // (none)
+ // Amplitude(s) for diagram number 417
+ FFV1_0( w_fp[101], w_fp[2], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram418( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 418 OF 1240 ***
+ // Wavefunction(s) for diagram number 418
+ // (none)
+ // Amplitude(s) for diagram number 418
+ FFV1_0( w_fp[76], w_fp[102], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram419( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 419 OF 1240 ***
+ // Wavefunction(s) for diagram number 419
+ // (none)
+ // Amplitude(s) for diagram number 419
+ FFV1_0( w_fp[3], w_fp[102], w_fp[75], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram420( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 420 OF 1240 ***
+ // Wavefunction(s) for diagram number 420
+ // (none)
+ // Amplitude(s) for diagram number 420
+ FFV1_0( w_fp[99], w_fp[97], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram421( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 421 OF 1240 ***
+ // Wavefunction(s) for diagram number 421
+ // (none)
+ // Amplitude(s) for diagram number 421
+ FFV1_0( w_fp[99], w_fp[2], w_fp[75], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram422( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 422 OF 1240 ***
+ // Wavefunction(s) for diagram number 422
+ // (none)
+ // Amplitude(s) for diagram number 422
+ FFV1_0( w_fp[3], w_fp[97], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram423( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 423 OF 1240 ***
+ // Wavefunction(s) for diagram number 423
+ // (none)
+ // Amplitude(s) for diagram number 423
+ FFV1_0( w_fp[76], w_fp[2], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
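(Note: diagram_boilerplate.h, included at the top of every diagramXXX kernel in this hunk, is not itself part of the diff. The sketch below shows only what the comments above imply it expands to; the amp_sv/amp_fp aliasing and the exact assert form are assumptions, not the generated file.)

  // Hypothetical sketch of diagram_boilerplate.h, inferred from the comments above
  cxtype_sv amp_sv[1] = {};                             // per-diagram amplitude, read back as amp_sv[0]
  fptype* amp_fp = reinterpret_cast<fptype*>( amp_sv ); // fptype view passed to the HELAS calls as &amp_fp[0]
  #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
  // the uniform interface keeps the three multichannel arguments even without
  // multichannel support, but they must then be unused: sanity check
  assert( channelIds == nullptr );
  assert( numerators == nullptr );
  assert( denominators == nullptr );
  #endif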
+ __global__ void
+ diagram424( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 424 OF 1240 ***
+ // Wavefunction(s) for diagram number 424
+ // (none)
+ // Amplitude(s) for diagram number 424
+ VVVV1_0( w_fp[92], w_fp[72], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+ VVVV3_0( w_fp[92], w_fp[72], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+ VVVV4_0( w_fp[92], w_fp[72], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram425( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 425 OF 1240 ***
+ // Wavefunction(s) for diagram number 425
+ VVV1P0_1( w_fp[92], w_fp[72], COUPs[0], 1.0, 0., 0., w_fp[104] );
+ // Amplitude(s) for diagram number 425
+ VVV1_0( w_fp[8], w_fp[5], w_fp[104], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram426( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 426 OF 1240 ***
+ // Wavefunction(s) for diagram number 426
+ // (none)
+ // Amplitude(s) for diagram number 426
+ VVV1_0( w_fp[72], w_fp[5], w_fp[107], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram427( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 427 OF 1240 ***
+ // Wavefunction(s) for diagram number 427
+ // (none)
+ // Amplitude(s) for diagram number 427
+ VVV1_0( w_fp[72], w_fp[8], w_fp[62], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram428( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 428 OF 1240 ***
+ // Wavefunction(s) for diagram number 428
+ // (none)
+ // Amplitude(s) for diagram number 428
+ FFV1_0( w_fp[3], w_fp[39], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
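(Note: each amplitude is folded into the color basis through J_ACCESS::kernelAccessIcol, with a relative sign and, for some diagrams, a factor cxtype( 0, 1 ), i.e. +i or -i. The accessor is defined elsewhere; the sketch below only illustrates the indexing implied by the jamps[ncolor*2*nevtORneppV] layout documented in the signatures, assuming one GPU thread per event and a cxtype_ref reference wrapper. It is an assumption, not the actual J_ACCESS implementation.)

  // Hypothetical sketch of the jamps accessor (GPU case only)
  __device__ inline cxtype_ref
  kernelAccessIcol( fptype* jamps, const int icol )
  {
    const int nevt = blockDim.x * gridDim.x;              // one thread per event (assumption)
    const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
    fptype& real = jamps[( 2 * icol ) * nevt + ievt];     // real part of jamp[icol] for this event
    fptype& imag = jamps[( 2 * icol + 1 ) * nevt + ievt]; // imaginary part for this event
    return cxtype_ref( real, imag );                      // '+=' then updates both parts in place
  }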
+ __global__ void
+ diagram429( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 429 OF 1240 ***
+ // Wavefunction(s) for diagram number 429
+ // (none)
+ // Amplitude(s) for diagram number 429
+ FFV1_0( w_fp[3], w_fp[105], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram430( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 430 OF 1240 ***
+ // Wavefunction(s) for diagram number 430
+ // (none)
+ // Amplitude(s) for diagram number 430
+ FFV1_0( w_fp[99], w_fp[39], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram431( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 431 OF 1240 ***
+ // Wavefunction(s) for diagram number 431
+ // (none)
+ // Amplitude(s) for diagram number 431
+ FFV1_0( w_fp[38], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram432( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 432 OF 1240 ***
+ // Wavefunction(s) for diagram number 432
+ // (none)
+ // Amplitude(s) for diagram number 432
+ FFV1_0( w_fp[38], w_fp[102], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram433( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 433 OF 1240 ***
+ // Wavefunction(s) for diagram number 433
+ // (none)
+ // Amplitude(s) for diagram number 433
+ FFV1_0( w_fp[98], w_fp[2], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram434( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 434 OF 1240 ***
+ // Wavefunction(s) for diagram number 434
+ VVV1P0_1( w_fp[92], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[104] );
+ // Amplitude(s) for diagram number 434
+ VVV1_0( w_fp[104], w_fp[10], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram435( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 435 OF 1240 ***
+ // Wavefunction(s) for diagram number 435
+ // (none)
+ // Amplitude(s) for diagram number 435
+ VVV1_0( w_fp[104], w_fp[11], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram436( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 436 OF 1240 ***
+ // Wavefunction(s) for diagram number 436
+ // (none)
+ // Amplitude(s) for diagram number 436
+ VVVV1_0( w_fp[8], w_fp[5], w_fp[6], w_fp[104], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+ VVVV3_0( w_fp[8], w_fp[5], w_fp[6], w_fp[104], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+ VVVV4_0( w_fp[8], w_fp[5], w_fp[6], w_fp[104], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram437( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 437 OF 1240 ***
+ // Wavefunction(s) for diagram number 437
+ VVV1P0_1( w_fp[1], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[108] );
+ // Amplitude(s) for diagram number 437
+ VVV1_0( w_fp[62], w_fp[108], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram438( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 438 OF 1240 ***
+ // Wavefunction(s) for diagram number 438
+ // (none)
+ // Amplitude(s) for diagram number 438
+ VVV1_0( w_fp[62], w_fp[1], w_fp[11], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
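(Note: diagrams with a quartic gluon vertex, such as 424, 436 and 439, call VVVV1_0, VVVV3_0 and VVVV4_0 in sequence: the vertex has three independent color/Lorentz structures, each of which overwrites amp_sv[0] before feeding its own set of jamps updates. The VVVV1P0_1/VVVV3P0_1/VVVV4P0_1 triplets play the same role for internal wavefunctions, e.g. w_fp[109]-w_fp[111] in diagram 443 below. The uniform signature is what lets a caller drive all 1240 diagram kernels identically; a minimal sketch of the GPU calling side, where all buffer and launch-configuration names are assumptions:)

  // Hypothetical sketch of sequential per-diagram kernel launches (names assumed)
  diagram439<<<gpublocks, gputhreads>>>( devWfs, devJamps, devChannelIds, devCouplings, devNumerators, devDenominators );
  diagram440<<<gpublocks, gputhreads>>>( devWfs, devJamps, devChannelIds, devCouplings, devNumerators, devDenominators );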
//-------------------------------------------------------------------------- + + __global__ void + diagram439( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 439 OF 1240 *** + // Wavefunction(s) for diagram number 439 + // (none) + // Amplitude(s) for diagram number 439 + VVVV1_0( w_fp[1], w_fp[8], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + VVVV3_0( w_fp[1], w_fp[8], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + VVVV4_0( w_fp[1], w_fp[8], w_fp[6], w_fp[62], COUPs[2], 1.0, &_fp[0] ); +#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram440( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 440 OF 1240 *** + // Wavefunction(s) for diagram number 440 + // (none) + // Amplitude(s) for diagram number 440 + VVV1_0( w_fp[86], w_fp[108], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram441( fptype* wfs, // input/output 
wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 441 OF 1240 *** + // Wavefunction(s) for diagram number 441 + // (none) + // Amplitude(s) for diagram number 441 + VVV1_0( w_fp[86], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram442( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 442 OF 1240 *** + // Wavefunction(s) for diagram number 442 + // (none) + // Amplitude(s) for diagram number 442 + VVVV1_0( w_fp[1], w_fp[8], w_fp[5], w_fp[86], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with 
multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+    VVVV3_0( w_fp[1], w_fp[8], w_fp[5], w_fp[86], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+    VVVV4_0( w_fp[1], w_fp[8], w_fp[5], w_fp[86], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram443( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 443 OF 1240 ***
+    // Wavefunction(s) for diagram number 443
+    VVVV1P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[109] );
+    VVVV3P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[110] );
+    VVVV4P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[111] );
+    // Amplitude(s) for diagram number 443
+    VVV1_0( w_fp[8], w_fp[6], w_fp[109], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[6], w_fp[110], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[6], w_fp[111], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
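// [Sketch, not part of the patch] Every diagramXXX kernel above opens with the same
// "diagram_boilerplate.h" prologue, whose contents are not shown in this diff. A minimal
// sketch of what such a prologue could look like is given below; the amp_sv accumulator,
// the _fp alias and the w_fp views are assumptions inferred from how the kernels use
// those names, not the actual file.
//
//   // diagram_boilerplate.h (hypothetical sketch)
//   #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
//   // Without multichannel support the uniform interface is kept, but the three
//   // SDE pointers must not be used: assert that they are all nullptr.
//   assert( channelIds == nullptr );
//   assert( numerators == nullptr );
//   assert( denominators == nullptr );
//   #endif
//   cxtype_sv amp_sv[1] = {};                          // amplitude of the current vertex call
//   fptype* _fp = reinterpret_cast<fptype*>( amp_sv ); // real/imag view passed to the HELAS calls
//   cxtype_sv* w_fp[nwf];                              // per-kernel views into the wfs buffer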
+  __global__ void
+  diagram444( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 444 OF 1240 ***
+    // Wavefunction(s) for diagram number 444
+    VVVV1P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[112] );
+    VVVV3P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[113] );
+    VVVV4P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[114] );
+    // Amplitude(s) for diagram number 444
+    VVV1_0( w_fp[8], w_fp[5], w_fp[112], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[5], w_fp[113], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[5], w_fp[114], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram445( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 445 OF 1240 ***
+    // Wavefunction(s) for diagram number 445
+    // (none)
+    // Amplitude(s) for diagram number 445
+    VVV1_0( w_fp[1], w_fp[8], w_fp[88], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+    VVV1_0( w_fp[1], w_fp[8], w_fp[90], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+    VVV1_0( w_fp[1], w_fp[8], w_fp[96], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
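// [Sketch, not part of the patch] The J_ACCESS::kernelAccessIcol calls above implement a
// signed accumulation of the diagram amplitude into one colour flow. Given the documented
// buffer shape jamps[ncolor*2*nevtORneppV], a scalar C++ sketch of the access pattern could
// look as follows; the helper name and the exact Re/Im plane ordering are assumptions.
//
//   #include <complex>
//   using fptype = double;
//   // One complex jamp per colour flow per event, stored as separate Re/Im planes:
//   // index = ( icol * 2 + reim ) * nevt + ievt
//   inline void accumulateJamp( fptype* jamps, int icol, int ievt, int nevt,
//                               std::complex<fptype> amp, fptype sign )
//   {
//     jamps[( icol * 2 + 0 ) * nevt + ievt] += sign * amp.real();
//     jamps[( icol * 2 + 1 ) * nevt + ievt] += sign * amp.imag();
//   }
//   // e.g. "kernelAccessIcol( jamps, 11 ) += amp_sv[0]" would correspond to
//   // accumulateJamp( jamps, 11, ievt, nevt, amp, +1. ) for each event ievt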
+  __global__ void
+  diagram446( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 446 OF 1240 ***
+    // Wavefunction(s) for diagram number 446
+    // (none)
+    // Amplitude(s) for diagram number 446
+    VVVV1_0( w_fp[92], w_fp[1], w_fp[8], w_fp[29], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+    VVVV3_0( w_fp[92], w_fp[1], w_fp[8], w_fp[29], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+    VVVV4_0( w_fp[92], w_fp[1], w_fp[8], w_fp[29], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram447( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 447 OF 1240 ***
+    // Wavefunction(s) for diagram number 447
+    // (none)
+    // Amplitude(s) for diagram number 447
+    VVV1_0( w_fp[8], w_fp[29], w_fp[104], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram448( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 448 OF 1240 ***
+    // Wavefunction(s) for diagram number 448
+    // (none)
+    // Amplitude(s) for diagram number 448
+    VVV1_0( w_fp[1], w_fp[29], w_fp[107], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
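// [Sketch, not part of the patch] The repeated placeholder comment "(#473)" stands for the
// numerator/denominator update that the multichannel-enabled generator emits after each
// amplitude call. In MG5aMC single-diagram enhancement the numerator typically accumulates
// |amp|^2 only for the diagram that drives the selected channel, while the denominator
// accumulates it for every contributing diagram; a sketch of that pattern (the channel
// number, the cxabs2 helper and the _sv names are assumptions) is:
//
//   #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
//   if( channelId == 448 ) numerators_sv += cxabs2( amp_sv[0] ); // hypothetical: this diagram's channel
//   if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); // any active channel (0 disables SDE)
//   #endif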
+  __global__ void
+  diagram449( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 449 OF 1240 ***
+    // Wavefunction(s) for diagram number 449
+    // (none)
+    // Amplitude(s) for diagram number 449
+    VVV1_0( w_fp[1], w_fp[8], w_fp[95], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram450( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 450 OF 1240 ***
+    // Wavefunction(s) for diagram number 450
+    // (none)
+    // Amplitude(s) for diagram number 450
+    VVV1_0( w_fp[104], w_fp[45], w_fp[6], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
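// [Sketch, not part of the patch] In diagram450 above the colour-flow update carries an
// extra cxtype( 0, 1 ) factor, i.e. the amplitude is multiplied by the imaginary unit
// before the signed add: for a = x + iy, i*a = -y + ix, so the real and imaginary parts
// swap with one sign flip. A tiny standalone check of that identity:
//
//   #include <cassert>
//   #include <complex>
//   int main()
//   {
//     const std::complex<double> amp( 3., 4. );
//     assert( std::complex<double>( 0., 1. ) * amp == std::complex<double>( -4., 3. ) );
//     return 0;
//   }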
+  __global__ void
+  diagram451( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 451 OF 1240 ***
+    // Wavefunction(s) for diagram number 451
+    // (none)
+    // Amplitude(s) for diagram number 451
+    FFV1_0( w_fp[3], w_fp[44], w_fp[104], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram452( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 452 OF 1240 ***
+    // Wavefunction(s) for diagram number 452
+    // (none)
+    // Amplitude(s) for diagram number 452
+    FFV1_0( w_fp[99], w_fp[89], w_fp[6], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram453( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 453 OF 1240 ***
+    // Wavefunction(s) for diagram number 453
+    // (none)
+    // Amplitude(s) for diagram number 453
+    FFV1_0( w_fp[99], w_fp[44], w_fp[1], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram454( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 454 OF 1240 ***
+    // Wavefunction(s) for diagram number 454
+    // (none)
+    // Amplitude(s) for diagram number 454
+    FFV1_0( w_fp[3], w_fp[89], w_fp[86], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram455( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 455 OF 1240 ***
+    // Wavefunction(s) for diagram number 455
+    // (none)
+    // Amplitude(s) for diagram number 455
+    VVV1_0( w_fp[86], w_fp[1], w_fp[45], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram456( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 456 OF 1240 ***
+    // Wavefunction(s) for diagram number 456
+    // (none)
+    // Amplitude(s) for diagram number 456
+    FFV1_0( w_fp[3], w_fp[39], w_fp[112], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[39], w_fp[113], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[39], w_fp[114], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram457( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 457 OF 1240 ***
+    // Wavefunction(s) for diagram number 457
+    // (none)
+    // Amplitude(s) for diagram number 457
+    FFV1_0( w_fp[41], w_fp[39], w_fp[104], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram458( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 458 OF 1240 ***
+    // Wavefunction(s) for diagram number 458
+    // (none)
+    // Amplitude(s) for diagram number 458
+    FFV1_0( w_fp[41], w_fp[105], w_fp[1], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram459( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 459 OF 1240 ***
+    // Wavefunction(s) for diagram number 459
+    // (none)
+    // Amplitude(s) for diagram number 459
+    FFV1_0( w_fp[101], w_fp[39], w_fp[1], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram460( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 460 OF 1240 ***
+    // Wavefunction(s) for diagram number 460
+    // (none)
+    // Amplitude(s) for diagram number 460
+    VVV1_0( w_fp[104], w_fp[51], w_fp[5], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram461( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 461 OF 1240 ***
+    // Wavefunction(s) for diagram number 461
+    // (none)
+    // Amplitude(s) for diagram number 461
+    FFV1_0( w_fp[3], w_fp[50], w_fp[104], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram462( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 462 OF 1240 ***
+    // Wavefunction(s) for diagram number 462
+    // (none)
+    // Amplitude(s) for diagram number 462
+    FFV1_0( w_fp[99], w_fp[91], w_fp[5], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
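// [Sketch, not part of the patch] Because every diagramXXX kernel only adds its own signed
// contributions into the shared jamps buffer, a driver can launch them back to back and
// reduce the colour flows once at the end. A hypothetical CUDA call sequence (the grid
// sizes, buffer names and the final reduction step are assumptions, not code in this diff):
//
//   diagram461<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
//   diagram462<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
//   // ... one launch per diagram, 1 to 1240 ...
//   // then one kernel contracts jamps with the colour matrix to obtain |M|^2 per event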
+  __global__ void
+  diagram463( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 463 OF 1240 ***
+    // Wavefunction(s) for diagram number 463
+    // (none)
+    // Amplitude(s) for diagram number 463
+    FFV1_0( w_fp[99], w_fp[50], w_fp[1], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram464( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 464 OF 1240 ***
+    // Wavefunction(s) for diagram number 464
+    // (none)
+    // Amplitude(s) for diagram number 464
+    FFV1_0( w_fp[3], w_fp[91], w_fp[62], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram465( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 465 OF 1240 ***
+    // Wavefunction(s) for diagram number 465
+    // (none)
+    // Amplitude(s) for diagram number 465
+    VVV1_0( w_fp[62], w_fp[1], w_fp[51], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram466( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 466 OF 1240 ***
+    // Wavefunction(s) for diagram number 466
+    // (none)
+    // Amplitude(s) for diagram number 466
+    FFV1_0( w_fp[3], w_fp[47], w_fp[109], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[47], w_fp[110], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[47], w_fp[111], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram467( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 467 OF 1240 ***
+    // Wavefunction(s) for diagram number 467
+    // (none)
+    // Amplitude(s) for diagram number 467
+    FFV1_0( w_fp[38], w_fp[47], w_fp[104], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram468( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 468 OF 1240 ***
+    // Wavefunction(s) for diagram number 468
+    // (none)
+    // Amplitude(s) for diagram number 468
+    FFV1_0( w_fp[38], w_fp[106], w_fp[1], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram469( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 469 OF 1240 ***
+    // Wavefunction(s) for diagram number 469
+    // (none)
+    // Amplitude(s) for diagram number 469
+    FFV1_0( w_fp[98], w_fp[47], w_fp[1], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram470( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 470 OF 1240 ***
+    // Wavefunction(s) for diagram number 470
+    // (none)
+    // Amplitude(s) for diagram number 470
+    VVV1_0( w_fp[104], w_fp[23], w_fp[6], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram471( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also in the #ifndef MGONGPU_SUPPORTS_MULTICHANNEL case
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 471 OF 1240 ***
+    // Wavefunction(s) for diagram number 471
+    // (none)
+    // Amplitude(s) for diagram number 471
+    FFV1_0( w_fp[48], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram472( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0
to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 472 OF 1240 *** + // Wavefunction(s) for diagram number 472 + // (none) + // Amplitude(s) for diagram number 472 + FFV1_0( w_fp[58], w_fp[102], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram473( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 473 OF 1240 *** + // Wavefunction(s) for diagram number 473 + // (none) + // Amplitude(s) for diagram number 473 + FFV1_0( w_fp[48], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram474( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel 
denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 474 OF 1240 *** + // Wavefunction(s) for diagram number 474 + // (none) + // Amplitude(s) for diagram number 474 + FFV1_0( w_fp[58], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram475( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 475 OF 1240 *** + // Wavefunction(s) for diagram number 475 + // (none) + // Amplitude(s) for diagram number 475 + VVV1_0( w_fp[86], w_fp[1], w_fp[23], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram476( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // 
input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 476 OF 1240 *** + // Wavefunction(s) for diagram number 476 + // (none) + // Amplitude(s) for diagram number 476 + FFV1_0( w_fp[38], w_fp[2], w_fp[112], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[38], w_fp[2], w_fp[113], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[38], w_fp[2], w_fp[114], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram477( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and 
independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 477 OF 1240 *** + // Wavefunction(s) for diagram number 477 + // (none) + // Amplitude(s) for diagram number 477 + VVV1_0( w_fp[104], w_fp[20], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram478( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 478 OF 1240 *** + // Wavefunction(s) for diagram number 478 + // (none) + // Amplitude(s) for diagram number 478 + FFV1_0( w_fp[40], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram479( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* 
couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 479 OF 1240 *** + // Wavefunction(s) for diagram number 479 + // (none) + // Amplitude(s) for diagram number 479 + FFV1_0( w_fp[60], w_fp[102], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram480( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 480 OF 1240 *** + // Wavefunction(s) for diagram number 480 + // (none) + // Amplitude(s) for diagram number 480 + FFV1_0( w_fp[40], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram481( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform 
interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 481 OF 1240 *** + // Wavefunction(s) for diagram number 481 + // (none) + // Amplitude(s) for diagram number 481 + FFV1_0( w_fp[60], w_fp[2], w_fp[62], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram482( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 482 OF 1240 *** + // Wavefunction(s) for diagram number 482 + // (none) + // Amplitude(s) for diagram number 482 + VVV1_0( w_fp[62], w_fp[1], w_fp[20], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram483( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + 
fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 483 OF 1240 *** + // Wavefunction(s) for diagram number 483 + // (none) + // Amplitude(s) for diagram number 483 + FFV1_0( w_fp[41], w_fp[2], w_fp[109], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[41], w_fp[2], w_fp[110], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[41], w_fp[2], w_fp[111], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram484( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // 
input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 484 OF 1240 *** + // Wavefunction(s) for diagram number 484 + // (none) + // Amplitude(s) for diagram number 484 + FFV1_0( w_fp[3], w_fp[18], w_fp[104], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram485( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 485 OF 1240 *** + // Wavefunction(s) for diagram number 485 + // (none) + // Amplitude(s) for diagram number 485 + FFV1_0( w_fp[12], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram486( fptype* wfs, // 
input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 486 OF 1240 *** + // Wavefunction(s) for diagram number 486 + // (none) + // Amplitude(s) for diagram number 486 + FFV1_0( w_fp[3], w_fp[102], w_fp[67], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram487( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 487 OF 1240 *** + // Wavefunction(s) for diagram number 487 + // (none) + // Amplitude(s) for diagram number 487 + FFV1_0( w_fp[12], w_fp[102], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + } + + 
//-------------------------------------------------------------------------- + + __global__ void + diagram488( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 488 OF 1240 *** + // Wavefunction(s) for diagram number 488 + // (none) + // Amplitude(s) for diagram number 488 + FFV1_0( w_fp[99], w_fp[2], w_fp[67], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram489( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 489 OF 1240 *** + // Wavefunction(s) for diagram number 489 + // (none) + // Amplitude(s) for diagram number 489 + FFV1_0( w_fp[99], w_fp[18], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram490( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 490 OF 1240 *** + // Wavefunction(s) for diagram number 490 + // (none) + // Amplitude(s) for diagram number 490 + FFV1_0( w_fp[3], w_fp[102], w_fp[55], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[102], w_fp[83], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[102], w_fp[84], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram491( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 491 OF 1240 *** + // Wavefunction(s) for diagram number 491 + // (none) + // Amplitude(s) for diagram number 491 + FFV1_0( w_fp[99], w_fp[2], w_fp[55], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[99], w_fp[2], w_fp[83], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[99], w_fp[2], w_fp[84], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0]; 
+ J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram492( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 492 OF 1240 *** + // Wavefunction(s) for diagram number 492 + // (none) + // Amplitude(s) for diagram number 492 + VVV1_0( w_fp[92], w_fp[55], w_fp[8], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + VVV1_0( w_fp[92], w_fp[83], w_fp[8], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) 
-= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + VVV1_0( w_fp[92], w_fp[84], w_fp[8], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram493( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 493 OF 1240 *** + // Wavefunction(s) for diagram number 493 + VVV1P0_1( w_fp[0], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[92] ); + FFV1_2( w_fp[3], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] ); + // Amplitude(s) for diagram number 493 + FFV1_0( w_fp[99], w_fp[87], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram494( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and 
independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 494 OF 1240 *** + // Wavefunction(s) for diagram number 494 + // (none) + // Amplitude(s) for diagram number 494 + FFV1_0( w_fp[99], w_fp[85], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram495( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 495 OF 1240 *** + // Wavefunction(s) for diagram number 495 + VVV1P0_1( w_fp[92], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[102] ); + // Amplitude(s) for diagram number 495 + VVV1_0( w_fp[102], w_fp[34], w_fp[6], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram496( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: 
dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 496 OF 1240 ***
+    // Wavefunction(s) for diagram number 496
+    // (none)
+    // Amplitude(s) for diagram number 496
+    FFV1_0( w_fp[3], w_fp[85], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram497( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 497 OF 1240 ***
+    // Wavefunction(s) for diagram number 497
+    VVV1P0_1( w_fp[92], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[104] );
+    // Amplitude(s) for diagram number 497
+    VVV1_0( w_fp[104], w_fp[34], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram498( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 498 OF 1240 ***
+    // Wavefunction(s) for diagram number 498
+    // (none)
+    // Amplitude(s) for diagram number 498
+    FFV1_0( w_fp[3], w_fp[87], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram499( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 499 OF 1240 ***
+    // Wavefunction(s) for diagram number 499
+    VVVV1P0_1( w_fp[92], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[111] );
+    VVVV3P0_1( w_fp[92], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[110] );
+    VVVV4P0_1( w_fp[92], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[109] );
+    // Amplitude(s) for diagram number 499
+    FFV1_0( w_fp[3], w_fp[77], w_fp[111], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[77], w_fp[110], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[77], w_fp[109], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram500( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 500 OF 1240 ***
+    // Wavefunction(s) for diagram number 500
+    FFV1_1( w_fp[77], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[62] );
+    // Amplitude(s) for diagram number 500
+    FFV1_0( w_fp[46], w_fp[62], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram501( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 501 OF 1240 ***
+    // Wavefunction(s) for diagram number 501
+    FFV1_2( w_fp[46], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[114] );
+    // Amplitude(s) for diagram number 501
+    FFV1_0( w_fp[114], w_fp[77], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram502( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 502 OF 1240 ***
+    // Wavefunction(s) for diagram number 502
+    // (none)
+    // Amplitude(s) for diagram number 502
+    FFV1_0( w_fp[46], w_fp[77], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram503( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 503 OF 1240 ***
+    // Wavefunction(s) for diagram number 503
+    // (none)
+    // Amplitude(s) for diagram number 503
+    FFV1_0( w_fp[41], w_fp[62], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram504( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 504 OF 1240 ***
+    // Wavefunction(s) for diagram number 504
+    FFV1_2( w_fp[41], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[113] );
+    // Amplitude(s) for diagram number 504
+    FFV1_0( w_fp[113], w_fp[77], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram505( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 505 OF 1240 ***
+    // Wavefunction(s) for diagram number 505
+    // (none)
+    // Amplitude(s) for diagram number 505
+    FFV1_0( w_fp[41], w_fp[77], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram506( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 506 OF 1240 ***
+    // Wavefunction(s) for diagram number 506
+    // (none)
+    // Amplitude(s) for diagram number 506
+    FFV1_0( w_fp[3], w_fp[62], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
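+  // The per-diagram boilerplate header is not part of this hunk; a minimal sketch of the
+  // sanity check that the recurring comment above attributes to it (hypothetical code,
+  // reconstructed from that comment alone, not from the actual header) would be:
+  //   #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+  //   assert( channelIds == nullptr && numerators == nullptr && denominators == nullptr );
+  //   #endif
+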
+  __global__ void
+  diagram507( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 507 OF 1240 ***
+    // Wavefunction(s) for diagram number 507
+    // (none)
+    // Amplitude(s) for diagram number 507
+    FFV1_0( w_fp[99], w_fp[77], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram508( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 508 OF 1240 ***
+    // Wavefunction(s) for diagram number 508
+    VVV1P0_1( w_fp[92], w_fp[27], COUPs[0], 1.0, 0., 0., w_fp[62] );
+    // Amplitude(s) for diagram number 508
+    FFV1_0( w_fp[3], w_fp[77], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram509( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 509 OF 1240 ***
+    // Wavefunction(s) for diagram number 509
+    FFV1_1( w_fp[2], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[112] );
+    // Amplitude(s) for diagram number 509
+    FFV1_0( w_fp[56], w_fp[112], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram510( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 510 OF 1240 ***
+    // Wavefunction(s) for diagram number 510
+    // (none)
+    // Amplitude(s) for diagram number 510
+    FFV1_0( w_fp[21], w_fp[112], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram511( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 511 OF 1240 ***
+    // Wavefunction(s) for diagram number 511
+    // (none)
+    // Amplitude(s) for diagram number 511
+    VVV1_0( w_fp[102], w_fp[103], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram512( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 512 OF 1240 ***
+    // Wavefunction(s) for diagram number 512
+    // (none)
+    // Amplitude(s) for diagram number 512
+    FFV1_0( w_fp[21], w_fp[2], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram513( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 513 OF 1240 ***
+    // Wavefunction(s) for diagram number 513
+    // (none)
+    // Amplitude(s) for diagram number 513
+    VVV1_0( w_fp[104], w_fp[103], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram514( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 514 OF 1240 ***
+    // Wavefunction(s) for diagram number 514
+    // (none)
+    // Amplitude(s) for diagram number 514
+    FFV1_0( w_fp[56], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram515( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 515 OF 1240 ***
+    // Wavefunction(s) for diagram number 515
+    // (none)
+    // Amplitude(s) for diagram number 515
+    FFV1_0( w_fp[52], w_fp[2], w_fp[111], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[52], w_fp[2], w_fp[110], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[52], w_fp[2], w_fp[109], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
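+  // Note on the accumulation pattern: each helas call above writes one amplitude through
+  // &amp_fp[0], which the following lines read back as amp_sv[0] (the two names evidently
+  // alias one buffer, presumably set up in "diagram_boilerplate.h"); the amplitude is then
+  // added into a subset of the colour-flow jamps with coefficient +1, -1, +i or -i, the
+  // imaginary unit being spelled cxtype( 0, 1 ). Schematically (illustration only):
+  //   jamp[icol] += coeff * amp; // coeff in { +1, -1, +i, -i }
+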
interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 516 OF 1240 *** + // Wavefunction(s) for diagram number 516 + FFV1_2( w_fp[52], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[86] ); + // Amplitude(s) for diagram number 516 + FFV1_0( w_fp[86], w_fp[33], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram517( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 517 OF 1240 *** + // Wavefunction(s) for diagram number 517 + FFV1_1( w_fp[33], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[98] ); + // Amplitude(s) for diagram number 517 + FFV1_0( w_fp[52], w_fp[98], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram518( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" 
+ // *** DIAGRAM 518 OF 1240 *** + // Wavefunction(s) for diagram number 518 + // (none) + // Amplitude(s) for diagram number 518 + FFV1_0( w_fp[52], w_fp[33], w_fp[104], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram519( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 519 OF 1240 *** + // Wavefunction(s) for diagram number 519 + // (none) + // Amplitude(s) for diagram number 519 + FFV1_0( w_fp[86], w_fp[47], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram520( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 520 OF 1240 *** + // Wavefunction(s) for diagram number 520 + FFV1_1( w_fp[47], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[106] ); + // Amplitude(s) for diagram number 520 + FFV1_0( w_fp[52], w_fp[106], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the 
code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram521( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 521 OF 1240 *** + // Wavefunction(s) for diagram number 521 + // (none) + // Amplitude(s) for diagram number 521 + FFV1_0( w_fp[52], w_fp[47], w_fp[102], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram522( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 522 OF 1240 *** + // Wavefunction(s) for diagram number 522 + // (none) + // Amplitude(s) for diagram number 522 + FFV1_0( w_fp[86], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 
112 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram523( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 523 OF 1240 *** + // Wavefunction(s) for diagram number 523 + // (none) + // Amplitude(s) for diagram number 523 + FFV1_0( w_fp[52], w_fp[112], w_fp[27], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram524( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 524 OF 1240 *** + // Wavefunction(s) for diagram number 524 + // (none) + // Amplitude(s) for diagram number 524 + FFV1_0( w_fp[52], w_fp[2], w_fp[62], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram525( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 525 OF 1240 *** + // Wavefunction(s) for diagram number 525 + // (none) + // Amplitude(s) for diagram number 525 + FFV1_0( w_fp[65], w_fp[112], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram526( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 526 OF 1240 *** + // Wavefunction(s) for diagram number 526 + // (none) + // Amplitude(s) for diagram number 526 + FFV1_0( w_fp[3], w_fp[112], w_fp[64], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram527( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 527 OF 1240 *** + // Wavefunction(s) for diagram number 527 + // (none) + // Amplitude(s) for diagram number 527 + FFV1_0( w_fp[99], w_fp[93], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram528( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 528 OF 1240 *** + // Wavefunction(s) for diagram number 528 + // (none) + // Amplitude(s) for diagram number 528 + FFV1_0( w_fp[99], w_fp[2], w_fp[64], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram529( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 529 OF 1240 *** + // Wavefunction(s) for diagram number 529 + // (none) + // Amplitude(s) for diagram number 529 + FFV1_0( w_fp[3], w_fp[93], w_fp[104], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram530( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** 
+  __global__ void
+  diagram530( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 530 OF 1240 ***
+    // Wavefunction(s) for diagram number 530
+    // (none)
+    // Amplitude(s) for diagram number 530
+    FFV1_0( w_fp[65], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram531( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 531 OF 1240 ***
+    // Wavefunction(s) for diagram number 531
+    // (none)
+    // Amplitude(s) for diagram number 531
+    VVVV1_0( w_fp[92], w_fp[61], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+    VVVV3_0( w_fp[92], w_fp[61], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+    VVVV4_0( w_fp[92], w_fp[61], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
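// For orientation (standard QCD, not specific to this patch): the three successive
// VVVV1_0/VVVV3_0/VVVV4_0 calls on the same four wavefunctions in diagram531
// correspond to the three color structures of the four-gluon vertex, which ALOHA
// splits into separate routines so that each can carry its own pattern of jamps
// coefficients:
$$ V^{\mu\nu\rho\sigma}_{abcd} = -i g_s^2 \left[ f^{abe} f^{cde} \left( g^{\mu\rho} g^{\nu\sigma} - g^{\mu\sigma} g^{\nu\rho} \right) + f^{ace} f^{bde} \left( g^{\mu\nu} g^{\rho\sigma} - g^{\mu\sigma} g^{\nu\rho} \right) + f^{ade} f^{bce} \left( g^{\mu\nu} g^{\rho\sigma} - g^{\mu\rho} g^{\nu\sigma} \right) \right] $$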
+  __global__ void
+  diagram532( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 532 OF 1240 ***
+    // Wavefunction(s) for diagram number 532
+    VVV1P0_1( w_fp[92], w_fp[61], COUPs[0], 1.0, 0., 0., w_fp[86] );
+    // Amplitude(s) for diagram number 532
+    VVV1_0( w_fp[8], w_fp[6], w_fp[86], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram533( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 533 OF 1240 ***
+    // Wavefunction(s) for diagram number 533
+    VVV1P0_1( w_fp[92], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[101] );
+    // Amplitude(s) for diagram number 533
+    VVV1_0( w_fp[61], w_fp[6], w_fp[101], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
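// Note the dependency this creates between kernels: diagram532 and diagram533 first
// build internal propagator wavefunctions (w_fp[86], w_fp[101]) with VVV1P0_1 and
// only then contract them, while later kernels list "Wavefunction(s): (none)" and
// simply reuse those cached slots. A two-line illustration of the producer/consumer
// pairing, quoting the generated calls in diagram532 and diagram535 (the slot reuse
// is presumably why the kernels must run in diagram order):
VVV1P0_1( w_fp[92], w_fp[61], COUPs[0], 1.0, 0., 0., w_fp[86] );  // producer: diagram532 fills w_fp[86]
FFV1_0( w_fp[3], w_fp[47], w_fp[86], COUPs[1], 1.0, &amp_fp[0] ); // consumer: diagram535 reads w_fp[86]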
+  __global__ void
+  diagram534( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 534 OF 1240 ***
+    // Wavefunction(s) for diagram number 534
+    // (none)
+    // Amplitude(s) for diagram number 534
+    VVV1_0( w_fp[61], w_fp[8], w_fp[104], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram535( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 535 OF 1240 ***
+    // Wavefunction(s) for diagram number 535
+    // (none)
+    // Amplitude(s) for diagram number 535
+    FFV1_0( w_fp[3], w_fp[47], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram536( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 536 OF 1240 ***
+    // Wavefunction(s) for diagram number 536
+    // (none)
+    // Amplitude(s) for diagram number 536
+    FFV1_0( w_fp[3], w_fp[106], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram537( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 537 OF 1240 ***
+    // Wavefunction(s) for diagram number 537
+    // (none)
+    // Amplitude(s) for diagram number 537
+    FFV1_0( w_fp[99], w_fp[47], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram538( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 538 OF 1240 ***
+    // Wavefunction(s) for diagram number 538
+    // (none)
+    // Amplitude(s) for diagram number 538
+    FFV1_0( w_fp[41], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
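// diagram_boilerplate.h itself is not part of this hunk; going only by the two
// comments repeated at the top of every kernel, a plausible sketch of its
// non-multichannel branch (all contents assumed, the real header differs) is:
#ifndef MGONGPU_SUPPORTS_MULTICHANNEL
  // The uniform kernel interface still carries the three multichannel pointers,
  // but a build without multichannel support must never use them
  assert( channelIds == nullptr );
  assert( numerators == nullptr );
  assert( denominators == nullptr );
#endif
// ...presumably followed by the local aliases (w_fp, amp_fp, amp_sv, channelId)
// that every diagram body uses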
+  __global__ void
+  diagram539( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 539 OF 1240 ***
+    // Wavefunction(s) for diagram number 539
+    // (none)
+    // Amplitude(s) for diagram number 539
+    FFV1_0( w_fp[41], w_fp[112], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram540( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 540 OF 1240 ***
+    // Wavefunction(s) for diagram number 540
+    // (none)
+    // Amplitude(s) for diagram number 540
+    FFV1_0( w_fp[113], w_fp[2], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram541( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 541 OF 1240 ***
+    // Wavefunction(s) for diagram number 541
+    // (none)
+    // Amplitude(s) for diagram number 541
+    FFV1_0( w_fp[76], w_fp[112], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram542( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 542 OF 1240 ***
+    // Wavefunction(s) for diagram number 542
+    // (none)
+    // Amplitude(s) for diagram number 542
+    FFV1_0( w_fp[3], w_fp[112], w_fp[74], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram543( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 543 OF 1240 ***
+    // Wavefunction(s) for diagram number 543
+    // (none)
+    // Amplitude(s) for diagram number 543
+    FFV1_0( w_fp[99], w_fp[97], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram544( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 544 OF 1240 ***
+    // Wavefunction(s) for diagram number 544
+    // (none)
+    // Amplitude(s) for diagram number 544
+    FFV1_0( w_fp[99], w_fp[2], w_fp[74], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram545( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 545 OF 1240 ***
+    // Wavefunction(s) for diagram number 545
+    // (none)
+    // Amplitude(s) for diagram number 545
+    FFV1_0( w_fp[3], w_fp[97], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram546( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 546 OF 1240 ***
+    // Wavefunction(s) for diagram number 546
+    // (none)
+    // Amplitude(s) for diagram number 546
+    FFV1_0( w_fp[76], w_fp[2], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram547( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 547 OF 1240 ***
+    // Wavefunction(s) for diagram number 547
+    // (none)
+    // Amplitude(s) for diagram number 547
+    VVVV1_0( w_fp[92], w_fp[72], w_fp[8], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    VVVV3_0( w_fp[92], w_fp[72], w_fp[8], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    VVVV4_0( w_fp[92], w_fp[72], w_fp[8], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram548( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 548 OF 1240 ***
+    // Wavefunction(s) for diagram number 548
+    VVV1P0_1( w_fp[92], w_fp[72], COUPs[0], 1.0, 0., 0., w_fp[86] );
+    // Amplitude(s) for diagram number 548
+    VVV1_0( w_fp[8], w_fp[4], w_fp[86], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
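// Given the __global__ qualifier and the GPU branch of the signature, a minimal
// usage sketch for one of these kernels (buffer and grid names hypothetical; the
// real code base may go through launch wrappers rather than raw triple-chevron
// syntax):
// Hedged usage sketch, GPU build (MGONGPUCPP_GPUIMPL): one launch per diagram,
// in diagram order, with all kernels sharing the same device buffers
diagram548<<<gpublocks, gputhreads>>>( devWfs, devJamps, devChannelIds, devCouplings, devNumerators, devDenominators );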
+  __global__ void
+  diagram549( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 549 OF 1240 ***
+    // Wavefunction(s) for diagram number 549
+    // (none)
+    // Amplitude(s) for diagram number 549
+    VVV1_0( w_fp[72], w_fp[4], w_fp[101], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram550( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 550 OF 1240 ***
+    // Wavefunction(s) for diagram number 550
+    // (none)
+    // Amplitude(s) for diagram number 550
+    VVV1_0( w_fp[72], w_fp[8], w_fp[102], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram551( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 551 OF 1240 ***
+    // Wavefunction(s) for diagram number 551
+    // (none)
+    // Amplitude(s) for diagram number 551
+    FFV1_0( w_fp[3], w_fp[33], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram552( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 552 OF 1240 ***
+    // Wavefunction(s) for diagram number 552
+    // (none)
+    // Amplitude(s) for diagram number 552
+    FFV1_0( w_fp[3], w_fp[98], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram553( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 553 OF 1240 ***
+    // Wavefunction(s) for diagram number 553
+    // (none)
+    // Amplitude(s) for diagram number 553
+    FFV1_0( w_fp[99], w_fp[33], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram554( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 554 OF 1240 ***
+    // Wavefunction(s) for diagram number 554
+    // (none)
+    // Amplitude(s) for diagram number 554
+    FFV1_0( w_fp[46], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram555( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 555 OF 1240 ***
+    // Wavefunction(s) for diagram number 555
+    // (none)
+    // Amplitude(s) for diagram number 555
+    FFV1_0( w_fp[46], w_fp[112], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram556( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 556 OF 1240 ***
+    // Wavefunction(s) for diagram number 556
+    // (none)
+    // Amplitude(s) for diagram number 556
+    FFV1_0( w_fp[114], w_fp[2], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram557( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 557 OF 1240 ***
+    // Wavefunction(s) for diagram number 557
+    VVV1P0_1( w_fp[92], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[86] );
+    // Amplitude(s) for diagram number 557
+    VVV1_0( w_fp[86], w_fp[13], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 558 OF 1240 *** + // Wavefunction(s) for diagram number 558 + // (none) + // Amplitude(s) for diagram number 558 + VVV1_0( w_fp[86], w_fp[11], w_fp[4], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram559( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 559 OF 1240 *** + // Wavefunction(s) for diagram number 559 + // (none) + // Amplitude(s) for diagram number 559 + VVVV1_0( w_fp[8], w_fp[4], w_fp[6], w_fp[86], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + VVVV3_0( w_fp[8], w_fp[4], w_fp[6], w_fp[86], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + VVVV4_0( w_fp[8], w_fp[4], w_fp[6], w_fp[86], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram560( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check 
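+ // Note: judging from its uses in these kernels, "diagram_boilerplate.h" presumably derives the local w_fp, COUPs, amp_sv and amp_fp variables from the kernel arguments and performs the nullptr sanity check described above; see that header for the authoritative definitions.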
+#include "diagram_boilerplate.h" + // *** DIAGRAM 560 OF 1240 *** + // Wavefunction(s) for diagram number 560 + // (none) + // Amplitude(s) for diagram number 560 + VVV1_0( w_fp[102], w_fp[108], w_fp[6], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram561( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 561 OF 1240 *** + // Wavefunction(s) for diagram number 561 + // (none) + // Amplitude(s) for diagram number 561 + VVV1_0( w_fp[102], w_fp[1], w_fp[11], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) 
-= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram562( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 562 OF 1240 *** + // Wavefunction(s) for diagram number 562 + // (none) + // Amplitude(s) for diagram number 562 + VVVV1_0( w_fp[1], w_fp[8], w_fp[6], w_fp[102], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + VVVV3_0( w_fp[1], w_fp[8], w_fp[6], w_fp[102], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + VVVV4_0( 
w_fp[1], w_fp[8], w_fp[6], w_fp[102], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram563( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 563 OF 1240 *** + // Wavefunction(s) for diagram number 563 + // (none) + // Amplitude(s) for diagram number 563 + VVV1_0( w_fp[104], w_fp[108], w_fp[4], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + 
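// Note: every diagramXXX kernel in this file follows the same generated pattern: + // (1) compute any new internal wavefunctions into w_fp (most diagrams reuse existing ones), + // (2) evaluate a single helicity amplitude into amp_sv[0] via one ALOHA routine (FFV1_0, VVV1_0, VVVV1_0, ...), + // (3) accumulate +/-amp_sv[0], or +/-i*amp_sv[0] via cxtype( 0, 1 ), into the contributing color flows in jamps through J_ACCESS::kernelAccessIcol. + 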
__global__ void + diagram564( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 564 OF 1240 *** + // Wavefunction(s) for diagram number 564 + // (none) + // Amplitude(s) for diagram number 564 + VVV1_0( w_fp[104], w_fp[1], w_fp[13], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram565( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 565 OF 1240 *** + // Wavefunction(s) for diagram number 565 + // (none) + // Amplitude(s) for diagram number 565 + VVVV1_0( w_fp[1], w_fp[8], w_fp[4], w_fp[104], COUPs[2], 1.0, &_fp[0] ); +#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + VVVV3_0( w_fp[1], w_fp[8], w_fp[4], w_fp[104], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + VVVV4_0( w_fp[1], w_fp[8], w_fp[4], w_fp[104], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram566( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const 
unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 566 OF 1240 *** + // Wavefunction(s) for diagram number 566 + VVVV1P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[105] ); + VVVV3P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[95] ); + VVVV4P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[107] ); + // Amplitude(s) for diagram number 566 + VVV1_0( w_fp[8], w_fp[6], w_fp[105], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[6], w_fp[95], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[6], w_fp[107], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code 
base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram567( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 567 OF 1240 *** + // Wavefunction(s) for diagram number 567 + VVVV1P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[96] ); + VVVV3P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[90] ); + VVVV4P0_1( w_fp[92], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[88] ); + // Amplitude(s) for diagram number 567 + VVV1_0( w_fp[8], w_fp[4], w_fp[96], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0]; + 
VVV1_0( w_fp[8], w_fp[4], w_fp[90], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[4], w_fp[88], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram568( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 568 OF 1240 *** + // Wavefunction(s) for diagram number 568 + // (none) + // Amplitude(s) for diagram number 568 + VVV1_0( w_fp[1], w_fp[8], w_fp[111], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // 
Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + VVV1_0( w_fp[1], w_fp[8], w_fp[110], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + VVV1_0( w_fp[1], w_fp[8], w_fp[109], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram569( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for 
GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 569 OF 1240 *** + // Wavefunction(s) for diagram number 569 + // (none) + // Amplitude(s) for diagram number 569 + VVVV1_0( w_fp[92], w_fp[1], w_fp[8], w_fp[27], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + VVVV3_0( w_fp[92], w_fp[1], w_fp[8], w_fp[27], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + VVVV4_0( w_fp[92], w_fp[1], w_fp[8], w_fp[27], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram570( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 570 OF 1240 *** + // Wavefunction(s) for diagram number 570 + // (none) + // Amplitude(s) for diagram number 570 + VVV1_0( w_fp[8], w_fp[27], w_fp[86], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram571( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for 
all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 571 OF 1240 *** + // Wavefunction(s) for diagram number 571 + // (none) + // Amplitude(s) for diagram number 571 + VVV1_0( w_fp[1], w_fp[27], w_fp[101], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram572( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 572 OF 1240 *** + // Wavefunction(s) for diagram number 572 + // (none) + // Amplitude(s) for diagram number 572 + VVV1_0( w_fp[1], w_fp[8], w_fp[62], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + 
__global__ void + diagram573( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 573 OF 1240 *** + // Wavefunction(s) for diagram number 573 + // (none) + // Amplitude(s) for diagram number 573 + VVV1_0( w_fp[86], w_fp[37], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + 
__global__ void + diagram574( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 574 OF 1240 *** + // Wavefunction(s) for diagram number 574 + // (none) + // Amplitude(s) for diagram number 574 + FFV1_0( w_fp[3], w_fp[36], w_fp[86], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + 
__global__ void + diagram575( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 575 OF 1240 *** + // Wavefunction(s) for diagram number 575 + // (none) + // Amplitude(s) for diagram number 575 + FFV1_0( w_fp[99], w_fp[100], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + 
__global__ void + diagram576( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 576 OF 1240 *** + // Wavefunction(s) for diagram number 576 + // (none) + // Amplitude(s) for diagram number 576 + FFV1_0( 
w_fp[99], w_fp[36], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram577( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 577 OF 1240 *** + // Wavefunction(s) for diagram number 577 + // (none) + // Amplitude(s) for diagram number 577 + FFV1_0( w_fp[3], w_fp[100], w_fp[104], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram578( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 578 OF 1240 *** + // Wavefunction(s) for diagram number 578 + // (none) + // Amplitude(s) for diagram number 578 + VVV1_0( w_fp[104], w_fp[1], w_fp[37], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram579( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 579 OF 1240 ***
+    // Wavefunction(s) for diagram number 579
+    // (none)
+    // Amplitude(s) for diagram number 579
+    FFV1_0( w_fp[3], w_fp[33], w_fp[96], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[33], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[33], w_fp[88], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram580( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 580 OF 1240 ***
+    // Wavefunction(s) for diagram number 580
+    // (none)
+    // Amplitude(s) for diagram number 580
+    FFV1_0( w_fp[41], w_fp[33], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram581( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 581 OF 1240 ***
+    // Wavefunction(s) for diagram number 581
+    // (none)
+    // Amplitude(s) for diagram number 581
+    FFV1_0( w_fp[41], w_fp[98], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram582( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 582 OF 1240 ***
+    // Wavefunction(s) for diagram number 582
+    // (none)
+    // Amplitude(s) for diagram number 582
+    FFV1_0( w_fp[113], w_fp[33], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram583( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 583 OF 1240 ***
+    // Wavefunction(s) for diagram number 583
+    // (none)
+    // Amplitude(s) for diagram number 583
+    VVV1_0( w_fp[86], w_fp[51], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram584( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 584 OF 1240 ***
+    // Wavefunction(s) for diagram number 584
+    // (none)
+    // Amplitude(s) for diagram number 584
+    FFV1_0( w_fp[3], w_fp[49], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram585( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 585 OF 1240 ***
+    // Wavefunction(s) for diagram number 585
+    // (none)
+    // Amplitude(s) for diagram number 585
+    FFV1_0( w_fp[99], w_fp[91], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
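// Note (a sketch, not the generated code itself): the recurring "(#473)" comments mark where the
// multichannel build updates numerators_sv and denominators_sv. Assuming the usual single-diagram
// enhancement logic and a one-to-one channel/diagram mapping (both assumptions here), the elided
// block for e.g. diagram 585 would plausibly read:
//
//   #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
//   if( channelId == 585 ) numerators_sv += cxabs2( amp_sv[0] ); // only the selected channel feeds the numerator
//   if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); // all channels feed the denominator
//   #endif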
+  __global__ void
+  diagram586( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 586 OF 1240 ***
+    // Wavefunction(s) for diagram number 586
+    // (none)
+    // Amplitude(s) for diagram number 586
+    FFV1_0( w_fp[99], w_fp[49], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram587( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 587 OF 1240 ***
+    // Wavefunction(s) for diagram number 587
+    // (none)
+    // Amplitude(s) for diagram number 587
+    FFV1_0( w_fp[3], w_fp[91], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram588( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 588 OF 1240 ***
+    // Wavefunction(s) for diagram number 588
+    // (none)
+    // Amplitude(s) for diagram number 588
+    VVV1_0( w_fp[102], w_fp[1], w_fp[51], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram589( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 589 OF 1240 ***
+    // Wavefunction(s) for diagram number 589
+    // (none)
+    // Amplitude(s) for diagram number 589
+    FFV1_0( w_fp[3], w_fp[47], w_fp[105], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[47], w_fp[95], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[47], w_fp[107], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram590( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 590 OF 1240 ***
+    // Wavefunction(s) for diagram number 590
+    // (none)
+    // Amplitude(s) for diagram number 590
+    FFV1_0( w_fp[46], w_fp[47], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram591( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 591 OF 1240 ***
+    // Wavefunction(s) for diagram number 591
+    // (none)
+    // Amplitude(s) for diagram number 591
+    FFV1_0( w_fp[46], w_fp[106], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram592( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 592 OF 1240 ***
+    // Wavefunction(s) for diagram number 592
+    // (none)
+    // Amplitude(s) for diagram number 592
+    FFV1_0( w_fp[114], w_fp[47], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram593( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 593 OF 1240 ***
+    // Wavefunction(s) for diagram number 593
+    // (none)
+    // Amplitude(s) for diagram number 593
+    VVV1_0( w_fp[86], w_fp[54], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram594( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 594 OF 1240 ***
+    // Wavefunction(s) for diagram number 594
+    // (none)
+    // Amplitude(s) for diagram number 594
+    FFV1_0( w_fp[53], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram595( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 595 OF 1240 ***
+    // Wavefunction(s) for diagram number 595
+    // (none)
+    // Amplitude(s) for diagram number 595
+    FFV1_0( w_fp[78], w_fp[112], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram596( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 596 OF 1240 ***
+    // Wavefunction(s) for diagram number 596
+    // (none)
+    // Amplitude(s) for diagram number 596
+    FFV1_0( w_fp[53], w_fp[112], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram597( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 597 OF 1240 ***
+    // Wavefunction(s) for diagram number 597
+    // (none)
+    // Amplitude(s) for diagram number 597
+    FFV1_0( w_fp[78], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram598( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 598 OF 1240 ***
+    // Wavefunction(s) for diagram number 598
+    // (none)
+    // Amplitude(s) for diagram number 598
+    VVV1_0( w_fp[104], w_fp[1], w_fp[54], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram599( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 599 OF 1240 ***
+    // Wavefunction(s) for diagram number 599
+    // (none)
+    // Amplitude(s) for diagram number 599
+    FFV1_0( w_fp[46], w_fp[2], w_fp[96], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[46], w_fp[2], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[46], w_fp[2], w_fp[88], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram600( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 600 OF 1240 ***
+    // Wavefunction(s) for diagram number 600
+    // (none)
+    // Amplitude(s) for diagram number 600
+    VVV1_0( w_fp[86], w_fp[20], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram601( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 601 OF 1240 ***
+    // Wavefunction(s) for diagram number 601
+    // (none)
+    // Amplitude(s) for diagram number 601
+    FFV1_0( w_fp[28], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
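// Note (a self-contained illustration, with std::complex standing in for the plugin's own cxtype,
// which is an assumption): the jamp updates in these kernels accumulate each amplitude either
// directly (amp_sv[0]) or rotated by cxtype( 0, 1 ), i.e. multiplied by +i, with paired colour
// flows taking opposite signs.
//
//   #include <cassert>
//   #include <complex>
//   int main()
//   {
//     using cxtype = std::complex<double>;
//     const cxtype amp( 2., 3. );
//     const cxtype rot = cxtype( 0., 1. ) * amp; // +i * ( 2 + 3i ) = -3 + 2i
//     assert( rot == cxtype( -3., 2. ) );
//     assert( -rot == cxtype( 3., -2. ) );       // the paired colour flow gets the opposite sign
//     return 0;
//   }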
+  __global__ void
+  diagram602( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 602 OF 1240 ***
+    // Wavefunction(s) for diagram number 602
+    // (none)
+    // Amplitude(s) for diagram number 602
+    FFV1_0( w_fp[60], w_fp[112], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram603( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 603 OF 1240 ***
+    // Wavefunction(s) for diagram number 603
+    // (none)
+    // Amplitude(s) for diagram number 603
+    FFV1_0( w_fp[28], w_fp[112], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram604( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 604 OF 1240 ***
+    // Wavefunction(s) for diagram number 604
+    // (none)
+    // Amplitude(s) for diagram number 604
+    FFV1_0( w_fp[60], w_fp[2], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram605( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 605 OF 1240 ***
+    // Wavefunction(s) for diagram number 605
+    // (none)
+    // Amplitude(s) for diagram number 605
+    VVV1_0( w_fp[102], w_fp[1], w_fp[20], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram606( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 606 OF 1240 ***
+    // Wavefunction(s) for diagram number 606
+    // (none)
+    // Amplitude(s) for diagram number 606
+    FFV1_0( w_fp[41], w_fp[2], w_fp[105], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[2], w_fp[95], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[2], w_fp[107], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram607( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 607 OF 1240 ***
+    // Wavefunction(s) for diagram number 607
+    // (none)
+    // Amplitude(s) for diagram number 607
+    FFV1_0( w_fp[3], w_fp[15], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram608( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 608 OF 1240 ***
+    // Wavefunction(s) for diagram number 608
+    // (none)
+    // Amplitude(s) for diagram number 608
+    FFV1_0( w_fp[14], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram609( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 609 OF 1240 ***
+    // Wavefunction(s) for diagram number 609
+    // (none)
+    // Amplitude(s) for diagram number 609
+    FFV1_0( w_fp[3], w_fp[112], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram610( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 610 OF 1240 ***
+    // Wavefunction(s) for diagram number 610
+    // (none)
+    // Amplitude(s) for diagram number 610
+    FFV1_0( w_fp[14], w_fp[112], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram611( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 611 OF 1240 ***
+    // Wavefunction(s) for diagram number 611
+    // (none)
+    // Amplitude(s) for diagram number 611
+    FFV1_0( w_fp[99], w_fp[2], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram612( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 612 OF 1240 ***
+    // Wavefunction(s) for diagram number 612
+    // (none)
+    // Amplitude(s) for diagram number 612
+    FFV1_0( w_fp[99], w_fp[15], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
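Each amplitude call above is followed by the same placeholder block for builds with MGONGPU_SUPPORTS_MULTICHANNEL. The actual generated statements are elided from this diff; based on the surrounding comments and issue #473, the per-diagram update plausibly has the following shape (the names `channelId`, `numerators_sv`, `denominators_sv` and `cxabs2` are assumed to be provided by the boilerplate header, not confirmed here):

#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Sketch only: assumed shape of the multichannel update (#473). The selected
// channel accumulates |amp|^2 into the numerator, and every contributing
// diagram accumulates |amp|^2 into the denominator when SDE is enabled.
if( channelId == 612 ) numerators_sv += cxabs2( amp_sv[0] );
if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
#endif
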
+  __global__ void
+  diagram613( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 613 OF 1240 ***
+    // Wavefunction(s) for diagram number 613
+    // (none)
+    // Amplitude(s) for diagram number 613
+    FFV1_0( w_fp[3], w_fp[112], w_fp[57], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[112], w_fp[81], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[112], w_fp[82], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram614( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 614 OF 1240 ***
+    // Wavefunction(s) for diagram number 614
+    // (none)
+    // Amplitude(s) for diagram number 614
+    FFV1_0( w_fp[99], w_fp[2], w_fp[57], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[99], w_fp[2], w_fp[81], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[99], w_fp[2], w_fp[82], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram615( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 615 OF 1240 ***
+    // Wavefunction(s) for diagram number 615
+    // (none)
+    // Amplitude(s) for diagram number 615
+    VVV1_0( w_fp[92], w_fp[57], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    VVV1_0( w_fp[92], w_fp[81], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    VVV1_0( w_fp[92], w_fp[82], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
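Because every diagramNNN kernel shares one signature, the caller can treat them uniformly. A hedged sketch of what the calling side could look like on the C++ path (where __global__ is presumably defined away and the kernels are plain functions); the table and loop are illustration only, not part of this diff:

// Hypothetical driver sketch (C++ path, MGONGPUCPP_GPUIMPL not defined):
// iterate a uniform table of per-diagram kernels for one event page.
typedef void ( *DiagramKernel )( fptype*, fptype*, const unsigned int*, const fptype**, fptype*, fptype* );
constexpr DiagramKernel diagrams[] = { diagram613, diagram614, diagram615 }; // ... one entry per diagram
for( DiagramKernel diagram : diagrams )
  diagram( wfs, jamps, channelIds, COUPs, numerators, denominators );
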
+  __global__ void
+  diagram616( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 616 OF 1240 ***
+    // Wavefunction(s) for diagram number 616
+    VVV1P0_1( w_fp[0], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[92] );
+    FFV1_2( w_fp[3], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] );
+    // Amplitude(s) for diagram number 616
+    FFV1_0( w_fp[99], w_fp[87], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram617( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 617 OF 1240 ***
+    // Wavefunction(s) for diagram number 617
+    // (none)
+    // Amplitude(s) for diagram number 617
+    FFV1_0( w_fp[99], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram618( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 618 OF 1240 ***
+    // Wavefunction(s) for diagram number 618
+    VVV1P0_1( w_fp[92], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[112] );
+    // Amplitude(s) for diagram number 618
+    VVV1_0( w_fp[112], w_fp[34], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram619( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 619 OF 1240 ***
+    // Wavefunction(s) for diagram number 619
+    // (none)
+    // Amplitude(s) for diagram number 619
+    FFV1_0( w_fp[3], w_fp[9], w_fp[112], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram620( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 620 OF 1240 ***
+    // Wavefunction(s) for diagram number 620
+    VVV1P0_1( w_fp[92], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[86] );
+    // Amplitude(s) for diagram number 620
+    VVV1_0( w_fp[86], w_fp[34], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram621( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 621 OF 1240 ***
+    // Wavefunction(s) for diagram number 621
+    // (none)
+    // Amplitude(s) for diagram number 621
+    FFV1_0( w_fp[3], w_fp[87], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram622( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 622 OF 1240 ***
+    // Wavefunction(s) for diagram number 622
+    VVVV1P0_1( w_fp[92], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[107] );
+    VVVV3P0_1( w_fp[92], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[95] );
+    VVVV4P0_1( w_fp[92], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[105] );
+    // Amplitude(s) for diagram number 622
+    FFV1_0( w_fp[3], w_fp[77], w_fp[107], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[77], w_fp[95], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[77], w_fp[105], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram623( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 623 OF 1240 ***
+    // Wavefunction(s) for diagram number 623
+    FFV1_1( w_fp[77], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[102] );
+    // Amplitude(s) for diagram number 623
+    FFV1_0( w_fp[46], w_fp[102], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram624( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 624 OF 1240 ***
+    // Wavefunction(s) for diagram number 624
+    FFV1_2( w_fp[46], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[88] );
+    // Amplitude(s) for diagram number 624
+    FFV1_0( w_fp[88], w_fp[77], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram625( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 625 OF 1240 ***
+    // Wavefunction(s) for diagram number 625
+    // (none)
+    // Amplitude(s) for diagram number 625
+    FFV1_0( w_fp[46], w_fp[77], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
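All of these kernels only ever touch jamps through J_ACCESS::kernelAccessIcol, adding the freshly computed amplitude into a fixed set of color amplitudes with weight +1, -1, +i or -i. A scalar stand-in (one event, plain std::complex instead of the SIMD/GPU cxtype_sv) makes the pattern explicit; the accessor below is a deliberate simplification, not the real J_ACCESS:

#include <complex>
using cxtype = std::complex<double>;

// Scalar stand-in for J_ACCESS::kernelAccessIcol: here jamps simply holds
// ncolor complex color amplitudes for one event (real layouts interleave
// events and split real/imaginary parts).
inline cxtype& kernelAccessIcol( cxtype* jamps, int icol ) { return jamps[icol]; }

// Mirrors diagram 625 above: one amplitude enters four color amplitudes
// with real weight +1 or -1 (other diagrams use imaginary weights +-i).
void addDiagram625( cxtype* jamps, cxtype amp )
{
  kernelAccessIcol( jamps, 29 ) += amp;
  kernelAccessIcol( jamps, 37 ) -= amp;
  kernelAccessIcol( jamps, 40 ) += amp;
  kernelAccessIcol( jamps, 43 ) -= amp;
}
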
couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 626 OF 1240 *** + // Wavefunction(s) for diagram number 626 + // (none) + // Amplitude(s) for diagram number 626 + FFV1_0( w_fp[38], w_fp[102], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram627( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 627 OF 1240 *** + // Wavefunction(s) for diagram number 627 + FFV1_2( w_fp[38], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[90] ); + // Amplitude(s) for diagram number 627 + FFV1_0( w_fp[90], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram628( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add 
helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 628 OF 1240 *** + // Wavefunction(s) for diagram number 628 + // (none) + // Amplitude(s) for diagram number 628 + FFV1_0( w_fp[38], w_fp[77], w_fp[112], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram629( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 629 OF 1240 *** + // Wavefunction(s) for diagram number 629 + // (none) + // Amplitude(s) for diagram number 629 + FFV1_0( w_fp[3], w_fp[102], w_fp[24], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram630( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three 
pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 630 OF 1240 *** + // Wavefunction(s) for diagram number 630 + // (none) + // Amplitude(s) for diagram number 630 + FFV1_0( w_fp[99], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram631( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 631 OF 1240 *** + // Wavefunction(s) for diagram number 631 + VVV1P0_1( w_fp[92], w_fp[24], COUPs[0], 1.0, 0., 0., w_fp[102] ); + // Amplitude(s) for diagram number 631 + FFV1_0( w_fp[3], w_fp[77], w_fp[102], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram632( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for 
diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 632 OF 1240 *** + // Wavefunction(s) for diagram number 632 + FFV1_1( w_fp[2], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[96] ); + // Amplitude(s) for diagram number 632 + FFV1_0( w_fp[56], w_fp[96], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram633( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 633 OF 1240 *** + // Wavefunction(s) for diagram number 633 + // (none) + // Amplitude(s) for diagram number 633 + FFV1_0( w_fp[22], w_fp[96], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram634( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 634 OF 1240 *** + // Wavefunction(s) for diagram number 634 
+ // (none) + // Amplitude(s) for diagram number 634 + VVV1_0( w_fp[112], w_fp[103], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram635( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 635 OF 1240 *** + // Wavefunction(s) for diagram number 635 + // (none) + // Amplitude(s) for diagram number 635 + FFV1_0( w_fp[22], w_fp[2], w_fp[112], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram636( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a 
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 636 OF 1240 ***
+    // Wavefunction(s) for diagram number 636
+    // (none)
+    // Amplitude(s) for diagram number 636
+    VVV1_0( w_fp[86], w_fp[103], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram637( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 637 OF 1240 ***
+    // Wavefunction(s) for diagram number 637
+    // (none)
+    // Amplitude(s) for diagram number 637
+    FFV1_0( w_fp[56], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram638( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 638 OF 1240 ***
+    // Wavefunction(s) for diagram number 638
+    // (none)
+    // Amplitude(s) for diagram number 638
+    FFV1_0( w_fp[52], w_fp[2], w_fp[107], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[52], w_fp[2], w_fp[95], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[52], w_fp[2], w_fp[105], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram639( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 639 OF 1240 ***
+    // Wavefunction(s) for diagram number 639
+    FFV1_2( w_fp[52], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[104] );
+    // Amplitude(s) for diagram number 639
+    FFV1_0( w_fp[104], w_fp[33], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram640( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 640 OF 1240 ***
+    // Wavefunction(s) for diagram number 640
+    FFV1_1( w_fp[33], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[114] );
+    // Amplitude(s) for diagram number 640
+    FFV1_0( w_fp[52], w_fp[114], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram641( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 641 OF 1240 ***
+    // Wavefunction(s) for diagram number 641
+    // (none)
+    // Amplitude(s) for diagram number 641
+    FFV1_0( w_fp[52], w_fp[33], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram642( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 642 OF 1240 ***
+    // Wavefunction(s) for diagram number 642
+    // (none)
+    // Amplitude(s) for diagram number 642
+    FFV1_0( w_fp[104], w_fp[39], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram643( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 643 OF 1240 ***
+    // Wavefunction(s) for diagram number 643
+    FFV1_1( w_fp[39], w_fp[92], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[106] );
+    // Amplitude(s) for diagram number 643
+    FFV1_0( w_fp[52], w_fp[106], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram644( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 644 OF 1240 ***
+    // Wavefunction(s) for diagram number 644
+    // (none)
+    // Amplitude(s) for diagram number 644
+    FFV1_0( w_fp[52], w_fp[39], w_fp[112], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram645( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 645 OF 1240 ***
+    // Wavefunction(s) for diagram number 645
+    // (none)
+    // Amplitude(s) for diagram number 645
+    FFV1_0( w_fp[104], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram646( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 646 OF 1240 ***
+    // Wavefunction(s) for diagram number 646
+    // (none)
+    // Amplitude(s) for diagram number 646
+    FFV1_0( w_fp[52], w_fp[96], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram647( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 647 OF 1240 ***
+    // Wavefunction(s) for diagram number 647
+    // (none)
+    // Amplitude(s) for diagram number 647
+    FFV1_0( w_fp[52], w_fp[2], w_fp[102], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram648( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 648 OF 1240 ***
+    // Wavefunction(s) for diagram number 648
+    // (none)
+    // Amplitude(s) for diagram number 648
+    FFV1_0( w_fp[65], w_fp[96], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram649( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 649 OF 1240 ***
+    // Wavefunction(s) for diagram number 649
+    // (none)
+    // Amplitude(s) for diagram number 649
+    FFV1_0( w_fp[3], w_fp[96], w_fp[63], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram650( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 650 OF 1240 ***
+    // Wavefunction(s) for diagram number 650
+    // (none)
+    // Amplitude(s) for diagram number 650
+    FFV1_0( w_fp[99], w_fp[93], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram651( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 651 OF 1240 ***
+    // Wavefunction(s) for diagram number 651
+    // (none)
+    // Amplitude(s) for diagram number 651
+    FFV1_0( w_fp[99], w_fp[2], w_fp[63], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram652( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 652 OF 1240 ***
+    // Wavefunction(s) for diagram number 652
+    // (none)
+    // Amplitude(s) for diagram number 652
+    FFV1_0( w_fp[3], w_fp[93], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram653( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 653 OF 1240 ***
+    // Wavefunction(s) for diagram number 653
+    // (none)
+    // Amplitude(s) for diagram number 653
+    FFV1_0( w_fp[65], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram654( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 654 OF 1240 ***
+    // Wavefunction(s) for diagram number 654
+    // (none)
+    // Amplitude(s) for diagram number 654
+    VVVV1_0( w_fp[92], w_fp[61], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    VVVV3_0( w_fp[92], w_fp[61], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+    VVVV4_0( w_fp[92], w_fp[61], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram655( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 655 OF 1240 ***
+    // Wavefunction(s) for diagram number 655
+    VVV1P0_1( w_fp[92], w_fp[61], COUPs[0], 1.0, 0., 0., w_fp[104] );
+    // Amplitude(s) for diagram number 655
+    VVV1_0( w_fp[8], w_fp[5], w_fp[104], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram656( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 656 OF 1240 ***
+    // Wavefunction(s) for diagram number 656
+    VVV1P0_1( w_fp[92], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[113] );
+    // Amplitude(s) for diagram number 656
+    VVV1_0( w_fp[61], w_fp[5], w_fp[113], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram657( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 657 OF 1240 ***
+    // Wavefunction(s) for diagram number 657
+    // (none)
+    // Amplitude(s) for diagram number 657
+    VVV1_0( w_fp[61], w_fp[8], w_fp[86], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram658( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 658 OF 1240 ***
+    // Wavefunction(s) for diagram number 658
+    // (none)
+    // Amplitude(s) for diagram number 658
+    FFV1_0( w_fp[3], w_fp[39], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram659( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 659 OF 1240 ***
+    // Wavefunction(s) for diagram number 659
+    // (none)
+    // Amplitude(s) for diagram number 659
+    FFV1_0( w_fp[3], w_fp[106], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram660( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 660 OF 1240 ***
+    // Wavefunction(s) for diagram number 660
+    // (none)
+    // Amplitude(s) for diagram number 660
+    FFV1_0( w_fp[99], w_fp[39], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram661( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 661 OF 1240 ***
+    // Wavefunction(s) for diagram number 661
+    // (none)
+    // Amplitude(s) for diagram number 661
+    FFV1_0( w_fp[38], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram662( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 662 OF 1240 ***
+    // Wavefunction(s) for diagram number 662
+    // (none)
+    // Amplitude(s) for diagram number 662
+    FFV1_0( w_fp[38], w_fp[96], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram663( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 663 OF 1240 ***
+    // Wavefunction(s) for diagram number 663
+    // (none)
+    // Amplitude(s) for diagram number 663
+    FFV1_0( w_fp[90], w_fp[2], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram664( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 664 OF 1240 ***
+    // Wavefunction(s) for diagram number 664
+    // (none)
+    // Amplitude(s) for diagram number 664
+    FFV1_0( w_fp[71], w_fp[96], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram665( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 665 OF 1240 ***
+    // Wavefunction(s) for diagram number 665
+    // (none)
+    // Amplitude(s) for diagram number 665
+    FFV1_0( w_fp[3], w_fp[96], w_fp[69], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram666( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 666 OF 1240 ***
+    // Wavefunction(s) for diagram number 666
+    // (none)
+    // Amplitude(s) for diagram number 666
+    FFV1_0( w_fp[99], w_fp[94], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram667( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 667 OF 1240 *** + // Wavefunction(s) for diagram number 667 + // (none) + // Amplitude(s) for diagram number 667 + FFV1_0( w_fp[99], w_fp[2], w_fp[69], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram668( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 668 OF 1240 *** + // Wavefunction(s) for diagram number 668 + // (none) + // Amplitude(s) for diagram number 668 + FFV1_0( w_fp[3], w_fp[94], w_fp[112], COUPs[1], 1.0, &_fp[0] ); 
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram669( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 669 OF 1240 *** + // Wavefunction(s) for diagram number 669 + // (none) + // Amplitude(s) for diagram number 669 + FFV1_0( w_fp[71], w_fp[2], w_fp[112], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram670( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { 
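// Note: "diagram_boilerplate.h", included just below, is not shown in this diff; a minimal
// sketch of what the two boilerplate comments describe it doing (all names and details in
// this sketch are assumptions for illustration only, not the actual header):
//
//   #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
//   // The uniform interface is kept even without multichannel support, but the three
//   // extra pointers must not be used: assert that they are all nullptr as a sanity check
//   assert( channelIds == nullptr && numerators == nullptr && denominators == nullptr );
//   #endif
//   fptype* w_fp[nwf];                                          // views into the wfs buffer
//   fptype amp_fp[2 * neppV];                                   // local amplitude buffer...
//   cxtype_sv* amp_sv = reinterpret_cast<cxtype_sv*>( amp_fp ); // ...seen as vector complex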
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 670 OF 1240 *** + // Wavefunction(s) for diagram number 670 + // (none) + // Amplitude(s) for diagram number 670 + VVVV1_0( w_fp[92], w_fp[66], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + VVVV3_0( w_fp[92], w_fp[66], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + VVVV4_0( w_fp[92], w_fp[66], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 
80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram671( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 671 OF 1240 *** + // Wavefunction(s) for diagram number 671 + VVV1P0_1( w_fp[92], w_fp[66], COUPs[0], 1.0, 0., 0., w_fp[104] ); + // Amplitude(s) for diagram number 671 + VVV1_0( w_fp[8], w_fp[4], w_fp[104], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram672( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef 
MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 672 OF 1240 *** + // Wavefunction(s) for diagram number 672 + // (none) + // Amplitude(s) for diagram number 672 + VVV1_0( w_fp[66], w_fp[4], w_fp[113], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram673( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 673 OF 1240 *** + // Wavefunction(s) for diagram number 673 + // (none) + // Amplitude(s) for diagram number 673 + VVV1_0( w_fp[66], w_fp[8], w_fp[112], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + 
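// Note: each amplitude enters jamps in pairs of color flows with opposite signs; the
// +/-1 (or +/-i, written as cxtype( 0, 1 )) coefficients are the color-flow decomposition
// factors of this diagram's color structure, precomputed by the code generator.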
J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram674( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 674 OF 1240 *** + // Wavefunction(s) for diagram number 674 + // (none) + // Amplitude(s) for diagram number 674 + FFV1_0( w_fp[3], w_fp[33], w_fp[104], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram675( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 675 OF 1240 *** + // Wavefunction(s) for diagram number 675 + // (none) + // Amplitude(s) for diagram number 675 + FFV1_0( w_fp[3], w_fp[114], w_fp[66], COUPs[1], 1.0, &_fp[0] ); +#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram676( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 676 OF 1240 *** + // Wavefunction(s) for diagram number 676 + // (none) + // Amplitude(s) for diagram number 676 + FFV1_0( w_fp[99], w_fp[33], w_fp[66], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram677( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 677 OF 1240 *** + // Wavefunction(s) for diagram number 677 + // (none) + // Amplitude(s) for diagram number 677 + FFV1_0( w_fp[46], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0]; + 
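// Note: w_fp[104] used by this diagram is the internal VVV1P0_1 wavefunction computed once
// in diagram671 and shared with diagram674; it stays valid in the wfs buffer until it is
// overwritten (in diagram680 below), which is why these per-diagram kernels are meant to
// run in the generated order over the same wfs buffer.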
J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram678( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 678 OF 1240 *** + // Wavefunction(s) for diagram number 678 + // (none) + // Amplitude(s) for diagram number 678 + FFV1_0( w_fp[46], w_fp[96], w_fp[66], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram679( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 679 OF 1240 *** + // Wavefunction(s) for diagram number 679 + // (none) + // Amplitude(s) for diagram number 679 + FFV1_0( w_fp[88], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates 
numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram680( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 680 OF 1240 *** + // Wavefunction(s) for diagram number 680 + VVV1P0_1( w_fp[92], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[104] ); + // Amplitude(s) for diagram number 680 + VVV1_0( w_fp[104], w_fp[13], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram681( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used 
also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 681 OF 1240 *** + // Wavefunction(s) for diagram number 681 + // (none) + // Amplitude(s) for diagram number 681 + VVV1_0( w_fp[104], w_fp[10], w_fp[4], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram682( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 682 OF 1240 *** + // Wavefunction(s) for diagram number 682 + // (none) + // Amplitude(s) for diagram number 682 + VVVV1_0( w_fp[8], w_fp[4], w_fp[5], w_fp[104], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + 
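// Note: the recurring "#ifdef MGONGPU_SUPPORTS_MULTICHANNEL" blocks mark where multichannel
// builds update the single-diagram-enhancement statistics referenced as issue #473; a
// minimal sketch of the kind of update meant there (the exact generated statements are an
// assumption; cxabs2 is the squared-modulus helper used elsewhere in the cudacpp headers):
//   if( channelId == 682 ) numerators_sv += cxabs2( amp_sv[0] );
//   if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );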
J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + VVVV3_0( w_fp[8], w_fp[4], w_fp[5], w_fp[104], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + VVVV4_0( w_fp[8], w_fp[4], w_fp[5], w_fp[104], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram683( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include 
"diagram_boilerplate.h" + // *** DIAGRAM 683 OF 1240 *** + // Wavefunction(s) for diagram number 683 + // (none) + // Amplitude(s) for diagram number 683 + VVV1_0( w_fp[112], w_fp[108], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram684( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 684 OF 1240 *** + // Wavefunction(s) for diagram number 684 + // (none) + // Amplitude(s) for diagram number 684 + VVV1_0( w_fp[112], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram685( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 685 OF 1240 *** + // Wavefunction(s) for diagram number 685 + // (none) + // Amplitude(s) for diagram number 685 + VVVV1_0( w_fp[1], w_fp[8], w_fp[5], w_fp[112], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + VVVV3_0( w_fp[1], w_fp[8], w_fp[5], w_fp[112], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + VVVV4_0( w_fp[1], w_fp[8], 
w_fp[5], w_fp[112], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram686( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 686 OF 1240 *** + // Wavefunction(s) for diagram number 686 + // (none) + // Amplitude(s) for diagram number 686 + VVV1_0( w_fp[86], w_fp[108], w_fp[4], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram687( fptype* 
wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 687 OF 1240 *** + // Wavefunction(s) for diagram number 687 + // (none) + // Amplitude(s) for diagram number 687 + VVV1_0( w_fp[86], w_fp[1], w_fp[13], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram688( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 688 OF 1240 *** + // Wavefunction(s) for diagram number 688 + // (none) + // Amplitude(s) for diagram number 688 + VVVV1_0( w_fp[1], w_fp[8], w_fp[4], w_fp[86], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base 
generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + VVVV3_0( w_fp[1], w_fp[8], w_fp[4], w_fp[86], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + VVVV4_0( w_fp[1], w_fp[8], w_fp[4], w_fp[86], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram689( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or 
SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 689 OF 1240 *** + // Wavefunction(s) for diagram number 689 + VVVV1P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[98] ); + VVVV3P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[62] ); + VVVV4P0_1( w_fp[92], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[101] ); + // Amplitude(s) for diagram number 689 + VVV1_0( w_fp[8], w_fp[5], w_fp[98], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[5], w_fp[62], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[5], w_fp[101], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv 
(#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram690( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 690 OF 1240 *** + // Wavefunction(s) for diagram number 690 + VVVV1P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[109] ); + VVVV3P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[110] ); + VVVV4P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[111] ); + // Amplitude(s) for diagram number 690 + VVV1_0( w_fp[8], w_fp[4], w_fp[109], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[4], w_fp[110], COUPs[0], 1.0, &_fp[0] ); +#ifdef 
+  __global__ void
+  diagram690( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 690 OF 1240 ***
+    // Wavefunction(s) for diagram number 690
+    VVVV1P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[109] );
+    VVVV3P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[110] );
+    VVVV4P0_1( w_fp[92], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[111] );
+    // Amplitude(s) for diagram number 690
+    VVV1_0( w_fp[8], w_fp[4], w_fp[109], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[4], w_fp[110], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[4], w_fp[111], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram691( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 691 OF 1240 ***
+    // Wavefunction(s) for diagram number 691
+    // (none)
+    // Amplitude(s) for diagram number 691
+    VVV1_0( w_fp[1], w_fp[8], w_fp[107], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+    VVV1_0( w_fp[1], w_fp[8], w_fp[95], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+    VVV1_0( w_fp[1], w_fp[8], w_fp[105], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram692( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 692 OF 1240 ***
+    // Wavefunction(s) for diagram number 692
+    // (none)
+    // Amplitude(s) for diagram number 692
+    VVVV1_0( w_fp[92], w_fp[1], w_fp[8], w_fp[24], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    VVVV3_0( w_fp[92], w_fp[1], w_fp[8], w_fp[24], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+    VVVV4_0( w_fp[92], w_fp[1], w_fp[8], w_fp[24], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram693( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 693 OF 1240 ***
+    // Wavefunction(s) for diagram number 693
+    // (none)
+    // Amplitude(s) for diagram number 693
+    VVV1_0( w_fp[8], w_fp[24], w_fp[104], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram694( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 694 OF 1240 ***
+    // Wavefunction(s) for diagram number 694
+    // (none)
+    // Amplitude(s) for diagram number 694
+    VVV1_0( w_fp[1], w_fp[24], w_fp[113], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram695( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 695 OF 1240 ***
+    // Wavefunction(s) for diagram number 695
+    // (none)
+    // Amplitude(s) for diagram number 695
+    VVV1_0( w_fp[1], w_fp[8], w_fp[102], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
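The kernels up to this point accumulate amp_sv[0] into the colour-flow array with bare +=/-= signs; from diagram 696 onward many contributions instead enter multiplied by cxtype( 0, 1 ), i.e. by the imaginary unit that arises in the colour decomposition. A minimal stand-alone illustration of the two accumulation patterns, with std::complex as a stand-in for the plugin's cxtype:

  #include <complex>
  #include <iostream>
  int main()
  {
    using cxtype = std::complex<double>; // stand-in for the plugin's cxtype
    cxtype amp( 0.3, -1.2 );             // one computed Feynman amplitude
    cxtype jamp[2] = {};                 // two toy colour flows
    jamp[0] += amp;                      // plain +=/-= pattern (e.g. diagram 693)
    jamp[1] += cxtype( 0, 1 ) * amp;     // i*amp pattern (e.g. diagram 696): multiply by the imaginary unit
    std::cout << jamp[0] << " " << jamp[1] << "\n";
    return 0;
  }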
+  __global__ void
+  diagram696( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 696 OF 1240 ***
+    // Wavefunction(s) for diagram number 696
+    // (none)
+    // Amplitude(s) for diagram number 696
+    VVV1_0( w_fp[104], w_fp[37], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram697( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 697 OF 1240 ***
+    // Wavefunction(s) for diagram number 697
+    // (none)
+    // Amplitude(s) for diagram number 697
+    FFV1_0( w_fp[3], w_fp[35], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram698( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 698 OF 1240 ***
+    // Wavefunction(s) for diagram number 698
+    // (none)
+    // Amplitude(s) for diagram number 698
+    FFV1_0( w_fp[99], w_fp[100], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram699( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 699 OF 1240 ***
+    // Wavefunction(s) for diagram number 699
+    // (none)
+    // Amplitude(s) for diagram number 699
+    FFV1_0( w_fp[99], w_fp[35], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram700( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 700 OF 1240 ***
+    // Wavefunction(s) for diagram number 700
+    // (none)
+    // Amplitude(s) for diagram number 700
+    FFV1_0( w_fp[3], w_fp[100], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram701( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 701 OF 1240 ***
+    // Wavefunction(s) for diagram number 701
+    // (none)
+    // Amplitude(s) for diagram number 701
+    VVV1_0( w_fp[86], w_fp[1], w_fp[37], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram702( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 702 OF 1240 ***
+    // Wavefunction(s) for diagram number 702
+    // (none)
+    // Amplitude(s) for diagram number 702
+    FFV1_0( w_fp[3], w_fp[33], w_fp[109], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[33], w_fp[110], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[33], w_fp[111], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram703( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 703 OF 1240 ***
+    // Wavefunction(s) for diagram number 703
+    // (none)
+    // Amplitude(s) for diagram number 703
+    FFV1_0( w_fp[38], w_fp[33], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram704( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 704 OF 1240 ***
+    // Wavefunction(s) for diagram number 704
+    // (none)
+    // Amplitude(s) for diagram number 704
+    FFV1_0( w_fp[38], w_fp[114], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram705( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 705 OF 1240 ***
+    // Wavefunction(s) for diagram number 705
+    // (none)
+    // Amplitude(s) for diagram number 705
+    FFV1_0( w_fp[90], w_fp[33], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram706( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 706 OF 1240 ***
+    // Wavefunction(s) for diagram number 706
+    // (none)
+    // Amplitude(s) for diagram number 706
+    VVV1_0( w_fp[104], w_fp[45], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram707( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 707 OF 1240 ***
+    // Wavefunction(s) for diagram number 707
+    // (none)
+    // Amplitude(s) for diagram number 707
+    FFV1_0( w_fp[3], w_fp[43], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram708( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 708 OF 1240 ***
+    // Wavefunction(s) for diagram number 708
+    // (none)
+    // Amplitude(s) for diagram number 708
+    FFV1_0( w_fp[99], w_fp[89], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram709( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 709 OF 1240 ***
+    // Wavefunction(s) for diagram number 709
+    // (none)
+    // Amplitude(s) for diagram number 709
+    FFV1_0( w_fp[99], w_fp[43], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram710( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 710 OF 1240 ***
+    // Wavefunction(s) for diagram number 710
+    // (none)
+    // Amplitude(s) for diagram number 710
+    FFV1_0( w_fp[3], w_fp[89], w_fp[112], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
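The payoff of the identical signature shared by every diagramNNN kernel is that a driver can dispatch them uniformly, for example from a table of function pointers, while jamps accumulates across the calls. The stand-alone sketch below is a hypothetical reduction of that idea; the actual driver is not part of this hunk and all toy_* names are invented for illustration.

  #include <cstddef>
  typedef void ( *ToyDiagramFn )( double* wfs, double* jamps );
  // Stand-ins for diagram689, diagram690, ...: each reads wavefunctions and
  // accumulates its contribution into the shared colour-flow array.
  static void toy_diag1( double* wfs, double* jamps ) { jamps[0] += wfs[0]; }
  static void toy_diag2( double* wfs, double* jamps ) { jamps[1] -= wfs[1]; }
  int main()
  {
    double wfs[2] = { 1., 2. };
    double jamps[2] = {};
    const ToyDiagramFn diagrams[] = { toy_diag1, toy_diag2 };
    for( std::size_t i = 0; i < sizeof( diagrams ) / sizeof( diagrams[0] ); ++i )
      diagrams[i]( wfs, jamps ); // jamps accumulates across diagrams
    return 0;
  }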
+  __global__ void
+  diagram711( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 711 OF 1240 ***
+    // Wavefunction(s) for diagram number 711
+    // (none)
+    // Amplitude(s) for diagram number 711
+    VVV1_0( w_fp[112], w_fp[1], w_fp[45], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram712( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 712 OF 1240 ***
+    // Wavefunction(s) for diagram number 712
+    // (none)
+    // Amplitude(s) for diagram number 712
+    FFV1_0( w_fp[3], w_fp[39], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[39], w_fp[62], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[39], w_fp[101], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram713( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 713 OF 1240 ***
+    // Wavefunction(s) for diagram number 713
+    // (none)
+    // Amplitude(s) for diagram number 713
+    FFV1_0( w_fp[46], w_fp[39], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram714( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 714 OF 1240 ***
+    // Wavefunction(s) for diagram number 714
+    // (none)
+    // Amplitude(s) for diagram number 714
+    FFV1_0( w_fp[46], w_fp[106], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram715( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 715 OF 1240 ***
+    // Wavefunction(s) for diagram number 715
+    // (none)
+    // Amplitude(s) for diagram number 715
+    FFV1_0( w_fp[88], w_fp[39], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram716( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 716 OF 1240 ***
+    // Wavefunction(s) for diagram number 716
+    // (none)
+    // Amplitude(s) for diagram number 716
+    VVV1_0( w_fp[104], w_fp[54], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram717( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 717 OF 1240 ***
+    // Wavefunction(s) for diagram number 717
+    // (none)
+    // Amplitude(s) for diagram number 717
+    FFV1_0( w_fp[7], w_fp[2], w_fp[104], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram718( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 718 OF 1240 ***
+    // Wavefunction(s) for diagram number 718
+    // (none)
+    // Amplitude(s) for diagram number 718
+    FFV1_0( w_fp[78], w_fp[96], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram719( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 719 OF 1240 ***
+    // Wavefunction(s) for diagram number 719
+    // (none)
+    // Amplitude(s) for diagram number 719
+    FFV1_0( w_fp[7], w_fp[96], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram720( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add
helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 720 OF 1240 *** + // Wavefunction(s) for diagram number 720 + // (none) + // Amplitude(s) for diagram number 720 + FFV1_0( w_fp[78], w_fp[2], w_fp[86], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram721( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 721 OF 1240 *** + // Wavefunction(s) for diagram number 721 + // (none) + // Amplitude(s) for diagram number 721 + VVV1_0( w_fp[86], w_fp[1], w_fp[54], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram722( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel 
numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 722 OF 1240 *** + // Wavefunction(s) for diagram number 722 + // (none) + // Amplitude(s) for diagram number 722 + FFV1_0( w_fp[46], w_fp[2], w_fp[109], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[46], w_fp[2], w_fp[110], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[46], w_fp[2], w_fp[111], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram723( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for 
this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 723 OF 1240 *** + // Wavefunction(s) for diagram number 723 + // (none) + // Amplitude(s) for diagram number 723 + VVV1_0( w_fp[104], w_fp[23], w_fp[4], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram724( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 724 OF 1240 *** + // Wavefunction(s) for diagram number 724 + // (none) + // Amplitude(s) for diagram number 724 + FFV1_0( w_fp[25], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram725( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent 
couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 725 OF 1240 *** + // Wavefunction(s) for diagram number 725 + // (none) + // Amplitude(s) for diagram number 725 + FFV1_0( w_fp[58], w_fp[96], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram726( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 726 OF 1240 *** + // Wavefunction(s) for diagram number 726 + // (none) + // Amplitude(s) for diagram number 726 + FFV1_0( w_fp[25], w_fp[96], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram727( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX 
including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 727 OF 1240 *** + // Wavefunction(s) for diagram number 727 + // (none) + // Amplitude(s) for diagram number 727 + FFV1_0( w_fp[58], w_fp[2], w_fp[112], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram728( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 728 OF 1240 *** + // Wavefunction(s) for diagram number 728 + // (none) + // Amplitude(s) for diagram number 728 + VVV1_0( w_fp[112], w_fp[1], w_fp[23], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram729( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators 
) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 729 OF 1240 *** + // Wavefunction(s) for diagram number 729 + // (none) + // Amplitude(s) for diagram number 729 + FFV1_0( w_fp[38], w_fp[2], w_fp[98], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[38], w_fp[2], w_fp[62], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[38], w_fp[2], w_fp[101], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram730( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: 
multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 730 OF 1240 *** + // Wavefunction(s) for diagram number 730 + // (none) + // Amplitude(s) for diagram number 730 + FFV1_0( w_fp[3], w_fp[17], w_fp[104], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram731( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 731 OF 1240 *** + // Wavefunction(s) for diagram number 731 + // (none) + // Amplitude(s) for diagram number 731 + FFV1_0( w_fp[26], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram732( fptype* wfs, // input/output 
wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 732 OF 1240 *** + // Wavefunction(s) for diagram number 732 + // (none) + // Amplitude(s) for diagram number 732 + FFV1_0( w_fp[3], w_fp[96], w_fp[59], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram733( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 733 OF 1240 *** + // Wavefunction(s) for diagram number 733 + // (none) + // Amplitude(s) for diagram number 733 + FFV1_0( w_fp[26], w_fp[96], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + } + + 
//-------------------------------------------------------------------------- + + __global__ void + diagram734( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 734 OF 1240 *** + // Wavefunction(s) for diagram number 734 + // (none) + // Amplitude(s) for diagram number 734 + FFV1_0( w_fp[99], w_fp[2], w_fp[59], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram735( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 735 OF 1240 *** + // Wavefunction(s) for diagram number 735 + // (none) + // Amplitude(s) for diagram number 735 + FFV1_0( w_fp[99], w_fp[17], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; 
+ J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram736( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 736 OF 1240 *** + // Wavefunction(s) for diagram number 736 + // (none) + // Amplitude(s) for diagram number 736 + FFV1_0( w_fp[3], w_fp[96], w_fp[73], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[96], w_fp[79], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[96], w_fp[80], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram737( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 737 OF 1240 *** + // Wavefunction(s) for diagram number 737 + // (none) + // Amplitude(s) for diagram number 737 + FFV1_0( w_fp[99], w_fp[2], w_fp[73], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[99], w_fp[2], w_fp[79], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[99], w_fp[2], w_fp[80], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram738( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 738 OF 1240 *** + // Wavefunction(s) for diagram number 738 + // (none) + // Amplitude(s) for diagram number 738 + VVV1_0( w_fp[92], w_fp[73], w_fp[8], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + VVV1_0( w_fp[92], w_fp[79], w_fp[8], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + VVV1_0( w_fp[92], w_fp[80], w_fp[8], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram739( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 739 OF 1240 *** + // Wavefunction(s) for diagram number 739 + FFV1_1( w_fp[77], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[92] ); + // Amplitude(s) for diagram number 739 + FFV1_0( w_fp[7], w_fp[92], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram740( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) 
// input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 740 OF 1240 *** + // Wavefunction(s) for diagram number 740 + // (none) + // Amplitude(s) for diagram number 740 + FFV1_0( w_fp[53], w_fp[92], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram741( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 741 OF 1240 *** + // Wavefunction(s) for diagram number 741 + FFV1_2( w_fp[46], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[99] ); + // Amplitude(s) for diagram number 741 + FFV1_0( w_fp[99], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram742( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 742 OF 1240 *** + // Wavefunction(s) for diagram number 742 + // (none) + // Amplitude(s) for diagram number 742 + FFV1_0( w_fp[99], 
w_fp[85], w_fp[5], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram743( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 743 OF 1240 ***
+    // Wavefunction(s) for diagram number 743
+    // (none)
+    // Amplitude(s) for diagram number 743
+    FFV1_0( w_fp[53], w_fp[9], w_fp[0], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram744( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 744 OF 1240 ***
+    // Wavefunction(s) for diagram number 744
+    // (none)
+    // Amplitude(s) for diagram number 744
+    FFV1_0( w_fp[7], w_fp[85], w_fp[0], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram745( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 745 OF 1240 ***
+    // Wavefunction(s) for diagram number 745
+    // (none)
+    // Amplitude(s) for diagram number 745
+    FFV1_0( w_fp[46], w_fp[92], w_fp[29], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram746( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 746 OF 1240 ***
+    // Wavefunction(s) for diagram number 746
+    // (none)
+    // Amplitude(s) for diagram number 746
+    FFV1_0( w_fp[99], w_fp[77], w_fp[29], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram747( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 747 OF 1240 ***
+    // Wavefunction(s) for diagram number 747
+    VVV1P0_1( w_fp[0], w_fp[29], COUPs[0], 1.0, 0., 0., w_fp[96] );
+    // Amplitude(s) for diagram number 747
+    FFV1_0( w_fp[46], w_fp[77], w_fp[96], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram748( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 748 OF 1240 ***
+    // Wavefunction(s) for diagram number 748
+    // (none)
+    // Amplitude(s) for diagram number 748
+    FFV1_0( w_fp[25], w_fp[92], w_fp[6], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram749( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
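+    // As a concrete illustration of the sanity check described above, in builds without
+    // MGONGPU_SUPPORTS_MULTICHANNEL the shared header is expected to reduce to something like the
+    // following sketch (an assumption inferred from the comment above, not the verbatim header):
+    //   assert( channelIds == nullptr && numerators == nullptr && denominators == nullptr );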
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 749 OF 1240 ***
+    // Wavefunction(s) for diagram number 749
+    // (none)
+    // Amplitude(s) for diagram number 749
+    FFV1_0( w_fp[48], w_fp[92], w_fp[4], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram750( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 750 OF 1240 ***
+    // Wavefunction(s) for diagram number 750
+    FFV1_2( w_fp[38], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[104] );
+    // Amplitude(s) for diagram number 750
+    FFV1_0( w_fp[104], w_fp[87], w_fp[6], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram751( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 751 OF 1240 ***
+    // Wavefunction(s) for diagram number 751
+    // (none)
+    // Amplitude(s) for diagram number 751
+    FFV1_0( w_fp[104], w_fp[85], w_fp[4], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram752( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 752 OF 1240 ***
+    // Wavefunction(s) for diagram number 752
+    // (none)
+    // Amplitude(s) for diagram number 752
+    FFV1_0( w_fp[48], w_fp[87], w_fp[0], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram753( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 753 OF 1240 ***
+    // Wavefunction(s) for diagram number 753
+    // (none)
+    // Amplitude(s) for diagram number 753
+    FFV1_0( w_fp[25], w_fp[85], w_fp[0], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram754( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 754 OF 1240 ***
+    // Wavefunction(s) for diagram number 754
+    // (none)
+    // Amplitude(s) for diagram number 754
+    FFV1_0( w_fp[38], w_fp[92], w_fp[27], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram755( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 755 OF 1240 ***
+    // Wavefunction(s) for diagram number 755
+    // (none)
+    // Amplitude(s) for diagram number 755
+    FFV1_0( w_fp[104], w_fp[77], w_fp[27], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram756( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 756 OF 1240 ***
+    // Wavefunction(s) for diagram number 756
+    VVV1P0_1( w_fp[0], w_fp[27], COUPs[0], 1.0, 0., 0., w_fp[101] );
+    // Amplitude(s) for diagram number 756
+    FFV1_0( w_fp[38], w_fp[77], w_fp[101], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram757( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 757 OF 1240 ***
+    // Wavefunction(s) for diagram number 757
+    // (none)
+    // Amplitude(s) for diagram number 757
+    FFV1_0( w_fp[28], w_fp[92], w_fp[5], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram758( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 758 OF 1240 ***
+    // Wavefunction(s) for diagram number 758
+    // (none)
+    // Amplitude(s) for diagram number 758
+    FFV1_0( w_fp[40], w_fp[92], w_fp[4], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram759( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 759 OF 1240 ***
+    // Wavefunction(s) for diagram number 759
+    FFV1_2( w_fp[41], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[62] );
+    // Amplitude(s) for diagram number 759
+    FFV1_0( w_fp[62], w_fp[87], w_fp[5], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram760( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 760 OF 1240 ***
+    // Wavefunction(s) for diagram number 760
+    // (none)
+    // Amplitude(s) for diagram number 760
+    FFV1_0( w_fp[62], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram761( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 761 OF 1240 ***
+    // Wavefunction(s) for diagram number 761
+    // (none)
+    // Amplitude(s) for diagram number 761
+    FFV1_0( w_fp[40], w_fp[87], w_fp[0], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram762( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 762 OF 1240 ***
+    // Wavefunction(s) for diagram number 762
+    // (none)
+    // Amplitude(s) for diagram number 762
+    FFV1_0( w_fp[28], w_fp[9], w_fp[0], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram763( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 763 OF 1240 ***
+    // Wavefunction(s) for diagram number 763
+    // (none)
+    // Amplitude(s) for diagram number 763
+    FFV1_0( w_fp[41], w_fp[92], w_fp[24], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram764( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 764 OF 1240 ***
+    // Wavefunction(s) for diagram number 764
+    // (none)
+    // Amplitude(s) for diagram number 764
+    FFV1_0( w_fp[62], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram765( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 765 OF 1240 ***
+    // Wavefunction(s) for diagram number 765
+    VVV1P0_1( w_fp[0], w_fp[24], COUPs[0], 1.0, 0., 0., w_fp[98] );
+    // Amplitude(s) for diagram number 765
+    FFV1_0( w_fp[41], w_fp[77], w_fp[98], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram766( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 766 OF 1240 ***
+    // Wavefunction(s) for diagram number 766
+    // (none)
+    // Amplitude(s) for diagram number 766
+    FFV1_0( w_fp[26], w_fp[92], w_fp[6], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram767( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 767 OF 1240 ***
+    // Wavefunction(s) for diagram number 767
+    // (none)
+    // Amplitude(s) for diagram number 767
+    FFV1_0( w_fp[3], w_fp[92], w_fp[42], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram768( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 768 OF 1240 ***
+    // Wavefunction(s) for diagram number 768
+    // (none)
+    // Amplitude(s) for diagram number 768
+    VVV1_0( w_fp[98], w_fp[34], w_fp[6], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram769( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 769 OF 1240 ***
+    // Wavefunction(s) for diagram number 769
+    // (none)
+    // Amplitude(s) for diagram number 769
+    FFV1_0( w_fp[3], w_fp[85], w_fp[98], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram770( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 770 OF 1240 ***
+    // Wavefunction(s) for diagram number 770
+    // (none)
+    // Amplitude(s) for diagram number 770
+    VVV1_0( w_fp[0], w_fp[34], w_fp[42], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram771( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 771 OF 1240 ***
+    // Wavefunction(s) for diagram number 771
+    // (none)
+    // Amplitude(s) for diagram number 771
+    FFV1_0( w_fp[26], w_fp[85], w_fp[0], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram772( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 772 OF 1240 ***
+    // Wavefunction(s) for diagram number 772
+    VVVV1P0_1( w_fp[0], w_fp[24], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[85] );
+    VVVV3P0_1( w_fp[0], w_fp[24], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[112] );
+    VVVV4P0_1( w_fp[0], w_fp[24], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[111] );
+    // Amplitude(s) for diagram number 772
+    FFV1_0( w_fp[3], w_fp[77], w_fp[85], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[77], w_fp[112], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[77], w_fp[111], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
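+    // Note: the three VVVVnP0_1 calls above (n = 1, 3, 4) build internal wavefunctions for the
+    // three independent Lorentz structures of the four-gluon vertex; each FFV1_0 amplitude
+    // computed from them feeds a different set of colour flows, the cxtype( 0, 1 ) factors
+    // adding +/-i times the amplitude into the corresponding jamps.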
+ J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram773( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 773 OF 1240 *** + // Wavefunction(s) for diagram number 773 + // (none) + // Amplitude(s) for diagram number 773 + FFV1_0( w_fp[14], w_fp[92], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram774( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 774 OF 1240 *** + // Wavefunction(s) for diagram number 774 + // (none) + // Amplitude(s) for diagram number 774 + FFV1_0( w_fp[3], w_fp[92], w_fp[16], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates 
numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram775( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 775 OF 1240 *** + // Wavefunction(s) for diagram number 775 + // (none) + // Amplitude(s) for diagram number 775 + VVV1_0( w_fp[101], w_fp[34], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram776( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 776 OF 1240 *** + // Wavefunction(s) for diagram number 776 + // (none) + // Amplitude(s) for diagram number 776 + FFV1_0( w_fp[3], w_fp[9], w_fp[101], COUPs[1], 
1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram777( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 777 OF 1240 *** + // Wavefunction(s) for diagram number 777 + // (none) + // Amplitude(s) for diagram number 777 + VVV1_0( w_fp[0], w_fp[34], w_fp[16], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram778( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 778 OF 1240 *** + // Wavefunction(s) for 
diagram number 778 + // (none) + // Amplitude(s) for diagram number 778 + FFV1_0( w_fp[14], w_fp[9], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram779( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 779 OF 1240 *** + // Wavefunction(s) for diagram number 779 + VVVV1P0_1( w_fp[0], w_fp[27], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[9] ); + VVVV3P0_1( w_fp[0], w_fp[27], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[110] ); + VVVV4P0_1( w_fp[0], w_fp[27], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[109] ); + // Amplitude(s) for diagram number 779 + FFV1_0( w_fp[3], w_fp[77], w_fp[9], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[77], w_fp[110], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[77], w_fp[109], COUPs[1], 1.0, &_fp[0] ); +#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram780( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 780 OF 1240 *** + // Wavefunction(s) for diagram number 780 + // (none) + // Amplitude(s) for diagram number 780 + FFV1_0( w_fp[12], w_fp[92], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram781( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 781 OF 1240 *** + // Wavefunction(s) for diagram number 781 + // (none) + // Amplitude(s) for diagram number 781 + FFV1_0( w_fp[3], w_fp[92], 
+  __global__ void
+  diagram782( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 782 OF 1240 ***
+    // Wavefunction(s) for diagram number 782
+    // (none)
+    // Amplitude(s) for diagram number 782
+    VVV1_0( w_fp[96], w_fp[34], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram783( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 783 OF 1240 ***
+    // Wavefunction(s) for diagram number 783
+    // (none)
+    // Amplitude(s) for diagram number 783
+    FFV1_0( w_fp[3], w_fp[87], w_fp[96], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram784( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 784 OF 1240 ***
+    // Wavefunction(s) for diagram number 784
+    // (none)
+    // Amplitude(s) for diagram number 784
+    VVV1_0( w_fp[0], w_fp[34], w_fp[19], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram785( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 785 OF 1240 ***
+    // Wavefunction(s) for diagram number 785
+    // (none)
+    // Amplitude(s) for diagram number 785
+    FFV1_0( w_fp[12], w_fp[87], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram786( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 786 OF 1240 ***
+    // Wavefunction(s) for diagram number 786
+    VVVV1P0_1( w_fp[0], w_fp[4], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[87] );
+    VVVV3P0_1( w_fp[0], w_fp[4], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[34] );
+    VVVV4P0_1( w_fp[0], w_fp[4], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[86] );
+    // Amplitude(s) for diagram number 786
+    FFV1_0( w_fp[3], w_fp[77], w_fp[87], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[77], w_fp[34], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[77], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
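The "(#473)" comments mark the spots where a build with MGONGPU_SUPPORTS_MULTICHANNEL updates the single-diagram-enhancement (SDE) numerators and denominators; the statements themselves are elided in this hunk. Following the pattern used elsewhere in the cudacpp code base, they plausibly add |amp|^2 to the numerator only when the event's channel matches the current diagram, and to the denominator for every diagram. A hedged scalar sketch (the helper name and signature are hypothetical):

    using fptype = double;

    // channelId runs from 1 to #diagrams; 0 disables single-diagram enhancement (SDE)
    inline void updateMultichannel( unsigned int channelId, unsigned int idiagram, fptype amp2, fptype& numerator, fptype& denominator )
    {
      if( channelId == 0 ) return;                   // SDE disabled
      if( channelId == idiagram ) numerator += amp2; // this diagram is the selected channel
      denominator += amp2;                           // every diagram contributes to the denominator
    }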
+  __global__ void
+  diagram787( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 787 OF 1240 ***
+    // Wavefunction(s) for diagram number 787
+    // (none)
+    // Amplitude(s) for diagram number 787
+    FFV1_0( w_fp[3], w_fp[92], w_fp[30], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[92], w_fp[31], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[92], w_fp[32], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram788( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 788 OF 1240 ***
+    // Wavefunction(s) for diagram number 788
+    VVV1P0_1( w_fp[0], w_fp[30], COUPs[0], 1.0, 0., 0., w_fp[92] );
+    VVV1P0_1( w_fp[0], w_fp[31], COUPs[0], 1.0, 0., 0., w_fp[88] );
+    VVV1P0_1( w_fp[0], w_fp[32], COUPs[0], 1.0, 0., 0., w_fp[106] );
+    // Amplitude(s) for diagram number 788
+    FFV1_0( w_fp[3], w_fp[77], w_fp[92], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[77], w_fp[88], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[77], w_fp[106], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram789( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 789 OF 1240 ***
+    // Wavefunction(s) for diagram number 789
+    FFV1_2( w_fp[52], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[90] );
+    // Amplitude(s) for diagram number 789
+    FFV1_0( w_fp[90], w_fp[35], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram790( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 790 OF 1240 ***
+    // Wavefunction(s) for diagram number 790
+    // (none)
+    // Amplitude(s) for diagram number 790
+    FFV1_0( w_fp[90], w_fp[36], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram791( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 791 OF 1240 ***
+    // Wavefunction(s) for diagram number 791
+    FFV1_1( w_fp[33], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[114] );
+    // Amplitude(s) for diagram number 791
+    FFV1_0( w_fp[22], w_fp[114], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
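A note on the FFV1_2 and FFV1_1 calls above (diagrams 789 and 791): unlike the massless VVV/VVVV propagator helpers, they build an internal fermion wavefunction from an external fermion and a gluon, passing a propagator mass cIPD[0] and width cIPD[1]. For a 1240-diagram process of this kind (gg to ttbar plus three gluons) these are presumably the top-quark mass and width, but the cIPD definition lies outside this hunk, so that reading is an inference rather than something this patch shows.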
+  __global__ void
+  diagram792( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 792 OF 1240 ***
+    // Wavefunction(s) for diagram number 792
+    // (none)
+    // Amplitude(s) for diagram number 792
+    FFV1_0( w_fp[21], w_fp[114], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram793( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 793 OF 1240 ***
+    // Wavefunction(s) for diagram number 793
+    // (none)
+    // Amplitude(s) for diagram number 793
+    FFV1_0( w_fp[22], w_fp[36], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram794( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 794 OF 1240 ***
+    // Wavefunction(s) for diagram number 794
+    // (none)
+    // Amplitude(s) for diagram number 794
+    FFV1_0( w_fp[21], w_fp[35], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram795( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 795 OF 1240 ***
+    // Wavefunction(s) for diagram number 795
+    // (none)
+    // Amplitude(s) for diagram number 795
+    FFV1_0( w_fp[90], w_fp[33], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram796( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 796 OF 1240 ***
+    // Wavefunction(s) for diagram number 796
+    // (none)
+    // Amplitude(s) for diagram number 796
+    FFV1_0( w_fp[52], w_fp[114], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram797( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 797 OF 1240 ***
+    // Wavefunction(s) for diagram number 797
+    // (none)
+    // Amplitude(s) for diagram number 797
+    FFV1_0( w_fp[52], w_fp[33], w_fp[96], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram798( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 798 OF 1240 ***
+    // Wavefunction(s) for diagram number 798
+    // (none)
+    // Amplitude(s) for diagram number 798
+    FFV1_0( w_fp[90], w_fp[43], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram799( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 799 OF 1240 ***
+    // Wavefunction(s) for diagram number 799
+    // (none)
+    // Amplitude(s) for diagram number 799
+    FFV1_0( w_fp[90], w_fp[44], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram800( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 800 OF 1240 ***
+    // Wavefunction(s) for diagram number 800
+    FFV1_1( w_fp[39], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[102] );
+    // Amplitude(s) for diagram number 800
+    FFV1_0( w_fp[56], w_fp[102], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
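These kernels carry the __global__ qualifier yet, per the #ifdef MGONGPUCPP_GPUIMPL split in their signatures, the same source is also meant to compile as plain C++. The cudacpp plugin achieves this by defining __global__ away in non-GPU builds in its GPU abstraction header; the two-line sketch below only illustrates that mechanism and is not the actual header:

    // Illustration only (assumption about the abstraction header, not its real contents)
    #ifndef MGONGPUCPP_GPUIMPL
    #define __global__ // in C++ builds the CUDA kernel qualifier expands to nothing
    #endif

    __global__ void diagramExample() {} // a CUDA kernel on GPU, a plain function on CPU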
+  __global__ void
+  diagram801( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 801 OF 1240 ***
+    // Wavefunction(s) for diagram number 801
+    // (none)
+    // Amplitude(s) for diagram number 801
+    FFV1_0( w_fp[21], w_fp[102], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram802( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 802 OF 1240 ***
+    // Wavefunction(s) for diagram number 802
+    // (none)
+    // Amplitude(s) for diagram number 802
+    FFV1_0( w_fp[56], w_fp[44], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram803( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 803 OF 1240 ***
+    // Wavefunction(s) for diagram number 803
+    // (none)
+    // Amplitude(s) for diagram number 803
+    FFV1_0( w_fp[21], w_fp[43], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram804( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 804 OF 1240 ***
+    // Wavefunction(s) for diagram number 804
+    // (none)
+    // Amplitude(s) for diagram number 804
+    FFV1_0( w_fp[90], w_fp[39], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram805( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 805 OF 1240 ***
+    // Wavefunction(s) for diagram number 805
+    // (none)
+    // Amplitude(s) for diagram number 805
+    FFV1_0( w_fp[52], w_fp[102], w_fp[27], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram806( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 806 OF 1240 ***
+    // Wavefunction(s) for diagram number 806
+    // (none)
+    // Amplitude(s) for diagram number 806
+    FFV1_0( w_fp[52], w_fp[39], w_fp[101], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram807( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 807 OF 1240 ***
+    // Wavefunction(s) for diagram number 807
+    // (none)
+    // Amplitude(s) for diagram number 807
+    FFV1_0( w_fp[90], w_fp[49], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram808( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 808 OF 1240 ***
+    // Wavefunction(s) for diagram number 808
+    // (none)
+    // Amplitude(s) for diagram number 808
+    FFV1_0( w_fp[90], w_fp[50], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram809( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 809 OF 1240 ***
+    // Wavefunction(s) for diagram number 809
+    FFV1_1( w_fp[47], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[113] );
+    // Amplitude(s) for diagram number 809
+    FFV1_0( w_fp[56], w_fp[113], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram810( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 810 OF 1240 ***
+    // Wavefunction(s) for diagram number 810
+    // (none)
+    // Amplitude(s) for diagram number 810
+    FFV1_0( w_fp[22], w_fp[113], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram811( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 811 OF 1240 ***
+    // Wavefunction(s) for diagram number 811
+    // (none)
+    // Amplitude(s) for diagram number 811
+    FFV1_0( w_fp[56], w_fp[50], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram812( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 812 OF 1240 ***
+    // Wavefunction(s) for diagram number 812
+    // (none)
+    // Amplitude(s) for diagram number 812
+    FFV1_0( w_fp[22], w_fp[49], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram813( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 813 OF 1240 ***
+    // Wavefunction(s) for diagram number 813
+    // (none)
+    // Amplitude(s) for diagram number 813
+    FFV1_0( w_fp[90], w_fp[47], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram814( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 814 OF 1240 ***
+    // Wavefunction(s) for diagram number 814
+    // (none)
+    // Amplitude(s) for diagram number 814
+    FFV1_0( w_fp[52], w_fp[113], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram815( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 815 OF 1240 ***
+    // Wavefunction(s) for diagram number 815
+    // (none)
+    // Amplitude(s) for diagram number 815
+    FFV1_0( w_fp[52], w_fp[47], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
#ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 816 OF 1240 *** + // Wavefunction(s) for diagram number 816 + // (none) + // Amplitude(s) for diagram number 816 + FFV1_0( w_fp[90], w_fp[17], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram817( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 817 OF 1240 *** + // Wavefunction(s) for diagram number 817 + // (none) + // Amplitude(s) for diagram number 817 + FFV1_0( w_fp[90], w_fp[2], w_fp[42], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram818( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 818 OF 1240 *** + // Wavefunction(s) for diagram number 818 + // (none) + // Amplitude(s) for diagram number 818 + 
VVV1_0( w_fp[98], w_fp[103], w_fp[6], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram819( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 819 OF 1240 *** + // Wavefunction(s) for diagram number 819 + // (none) + // Amplitude(s) for diagram number 819 + FFV1_0( w_fp[21], w_fp[2], w_fp[98], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram820( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** 
DIAGRAM 820 OF 1240 *** + // Wavefunction(s) for diagram number 820 + // (none) + // Amplitude(s) for diagram number 820 + VVV1_0( w_fp[0], w_fp[103], w_fp[42], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram821( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 821 OF 1240 *** + // Wavefunction(s) for diagram number 821 + // (none) + // Amplitude(s) for diagram number 821 + FFV1_0( w_fp[21], w_fp[17], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram822( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check 
+#include "diagram_boilerplate.h" + // *** DIAGRAM 822 OF 1240 *** + // Wavefunction(s) for diagram number 822 + // (none) + // Amplitude(s) for diagram number 822 + FFV1_0( w_fp[52], w_fp[2], w_fp[85], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[52], w_fp[2], w_fp[112], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[52], w_fp[2], w_fp[111], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram823( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the 
boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 823 OF 1240 *** + // Wavefunction(s) for diagram number 823 + // (none) + // Amplitude(s) for diagram number 823 + FFV1_0( w_fp[90], w_fp[15], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram824( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 824 OF 1240 *** + // Wavefunction(s) for diagram number 824 + // (none) + // Amplitude(s) for diagram number 824 + FFV1_0( w_fp[90], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram825( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 825 OF 1240 *** + // Wavefunction(s) for diagram number 825 + // (none) + // Amplitude(s) for diagram number 825 + VVV1_0( w_fp[101], w_fp[103], w_fp[5], COUPs[0], 1.0, &_fp[0] ); 
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram826( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 826 OF 1240 *** + // Wavefunction(s) for diagram number 826 + // (none) + // Amplitude(s) for diagram number 826 + FFV1_0( w_fp[22], w_fp[2], w_fp[101], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram827( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 827 OF 1240 *** + // Wavefunction(s) for diagram number 
827 + // (none) + // Amplitude(s) for diagram number 827 + VVV1_0( w_fp[0], w_fp[103], w_fp[16], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram828( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 828 OF 1240 *** + // Wavefunction(s) for diagram number 828 + // (none) + // Amplitude(s) for diagram number 828 + FFV1_0( w_fp[22], w_fp[15], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram829( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 829 OF 1240 
*** + // Wavefunction(s) for diagram number 829 + // (none) + // Amplitude(s) for diagram number 829 + FFV1_0( w_fp[52], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[52], w_fp[2], w_fp[110], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[52], w_fp[2], w_fp[109], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram830( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a 
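[Editor's note] The recurring "(#473)" comment marks the spot where a build with MGONGPU_SUPPORTS_MULTICHANNEL defined also emits the single-diagram-enhancement (SDE) bookkeeping after each amplitude call. A sketch of the likely shape of that update, hedged: the exact generated statements are not part of this hunk; cxabs2 is assumed to be the plugin's |z|^2 helper and channelId the per-event scalar channel derived from channelIds:

    // Illustrative sketch only: SDE bookkeeping for e.g. diagram 829
    if( channelId == 829 ) numerators_sv += cxabs2( amp_sv[0] ); // numerator: only the selected channel's diagram
    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); // denominator: all diagrams, unless SDE is disabled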
+  __global__ void
+  diagram830( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 830 OF 1240 ***
+    // Wavefunction(s) for diagram number 830
+    // (none)
+    // Amplitude(s) for diagram number 830
+    FFV1_0( w_fp[90], w_fp[18], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram831( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 831 OF 1240 ***
+    // Wavefunction(s) for diagram number 831
+    // (none)
+    // Amplitude(s) for diagram number 831
+    FFV1_0( w_fp[90], w_fp[2], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram832( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 832 OF 1240 ***
+    // Wavefunction(s) for diagram number 832
+    // (none)
+    // Amplitude(s) for diagram number 832
+    VVV1_0( w_fp[96], w_fp[103], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram833( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 833 OF 1240 ***
+    // Wavefunction(s) for diagram number 833
+    // (none)
+    // Amplitude(s) for diagram number 833
+    FFV1_0( w_fp[56], w_fp[2], w_fp[96], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram834( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 834 OF 1240 ***
+    // Wavefunction(s) for diagram number 834
+    // (none)
+    // Amplitude(s) for diagram number 834
+    VVV1_0( w_fp[0], w_fp[103], w_fp[19], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram835( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 835 OF 1240 ***
+    // Wavefunction(s) for diagram number 835
+    // (none)
+    // Amplitude(s) for diagram number 835
+    FFV1_0( w_fp[56], w_fp[18], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram836( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 836 OF 1240 ***
+    // Wavefunction(s) for diagram number 836
+    // (none)
+    // Amplitude(s) for diagram number 836
+    FFV1_0( w_fp[52], w_fp[2], w_fp[87], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[52], w_fp[2], w_fp[34], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[52], w_fp[2], w_fp[86], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram837( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 837 OF 1240 ***
+    // Wavefunction(s) for diagram number 837
+    // (none)
+    // Amplitude(s) for diagram number 837
+    FFV1_0( w_fp[90], w_fp[2], w_fp[30], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+    FFV1_0( w_fp[90], w_fp[2], w_fp[31], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    FFV1_0( w_fp[90], w_fp[2], w_fp[32], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram838( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 838 OF 1240 ***
+    // Wavefunction(s) for diagram number 838
+    // (none)
+    // Amplitude(s) for diagram number 838
+    FFV1_0( w_fp[52], w_fp[2], w_fp[92], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[52], w_fp[2], w_fp[88], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[52], w_fp[2], w_fp[106], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram839( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 839 OF 1240 ***
+    // Wavefunction(s) for diagram number 839
+    VVV1P0_1( w_fp[0], w_fp[61], COUPs[0], 1.0, 0., 0., w_fp[90] );
+    // Amplitude(s) for diagram number 839
+    VVV1_0( w_fp[90], w_fp[10], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram840( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 840 OF 1240 ***
+    // Wavefunction(s) for diagram number 840
+    // (none)
+    // Amplitude(s) for diagram number 840
+    VVV1_0( w_fp[90], w_fp[11], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram841( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 841 OF 1240 ***
+    // Wavefunction(s) for diagram number 841
+    // (none)
+    // Amplitude(s) for diagram number 841
+    VVVV1_0( w_fp[8], w_fp[5], w_fp[6], w_fp[90], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    VVVV3_0( w_fp[8], w_fp[5], w_fp[6], w_fp[90], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+    VVVV4_0( w_fp[8], w_fp[5], w_fp[6], w_fp[90], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
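[Editor's note] Diagrams 841 and 844 call VVVV1_0, VVVV3_0 and VVVV4_0 on the same four wavefunctions: the four-gluon vertex contributes three independent color structures, each with its own coupling slot inside COUPs[2] and its own pattern of jamp updates. The jamps filled throughout these kernels are color-flow amplitudes; they only become a matrix element in a later color sum of the form ME += jamp_i^* cf_ij jamp_j. A schematic of that reduction (hedged: cf, denom, deltaME and cxzero_sv are names assumed for illustration and are not shown in this hunk):

    // Illustrative sketch only: contract color-flow amplitudes with the constant color matrix
    fptype_sv deltaME = { 0 };
    for( int icol = 0; icol < ncolor; icol++ )
    {
      cxtype_sv ztemp_sv = cxzero_sv();
      for( int jcol = 0; jcol < ncolor; jcol++ )
        ztemp_sv = ztemp_sv + cf[icol][jcol] * J_ACCESS::kernelAccessIcol( jamps, jcol );
      deltaME += cxreal( ztemp_sv * cxconj( J_ACCESS::kernelAccessIcol( jamps, icol ) ) ) / denom[icol];
    }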
J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram842( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 842 OF 1240 *** + // Wavefunction(s) for diagram number 842 + VVV1P0_1( w_fp[0], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[56] ); + // Amplitude(s) for diagram number 842 + VVV1_0( w_fp[56], w_fp[63], w_fp[6], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram843( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // 
+  __global__ void
+  diagram843( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 843 OF 1240 ***
+    // Wavefunction(s) for diagram number 843
+    // (none)
+    // Amplitude(s) for diagram number 843
+    VVV1_0( w_fp[56], w_fp[64], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram844( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 844 OF 1240 ***
+    // Wavefunction(s) for diagram number 844
+    // (none)
+    // Amplitude(s) for diagram number 844
+    VVVV1_0( w_fp[61], w_fp[5], w_fp[6], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    VVVV3_0( w_fp[61], w_fp[5], w_fp[6], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    VVVV4_0( w_fp[61], w_fp[5], w_fp[6], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram845( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 845 OF 1240 ***
+    // Wavefunction(s) for diagram number 845
+    // (none)
+    // Amplitude(s) for diagram number 845
+    VVV1_0( w_fp[0], w_fp[63], w_fp[11], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram846( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 846 OF 1240 ***
+    // Wavefunction(s) for diagram number 846
+    // (none)
+    // Amplitude(s) for diagram number 846
+    VVV1_0( w_fp[0], w_fp[64], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+  }
+
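+  // The J_ACCESS::kernelAccessIcol( jamps, icol ) updates implement the colour-flow
+  // decomposition: each amplitude feeds a subset of the ncolor=120 colour jamps with a
+  // coefficient of +1 or -1 (or +/-i, via the explicit cxtype( 0, 1 ) factors further
+  // below), schematically jamp[icol] += colourCoeff( diagram, icol ) * amp. The colour
+  // matrix is only applied later, when |M|^2 is assembled from the jamps.
+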
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram847( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 847 OF 1240 ***
+    // Wavefunction(s) for diagram number 847
+    VVVV1P0_1( w_fp[0], w_fp[61], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[103] );
+    VVVV3P0_1( w_fp[0], w_fp[61], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[22] );
+    VVVV4P0_1( w_fp[0], w_fp[61], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[21] );
+    // Amplitude(s) for diagram number 847
+    VVV1_0( w_fp[8], w_fp[6], w_fp[103], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[6], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[6], w_fp[21], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram848( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 848 OF 1240 ***
+    // Wavefunction(s) for diagram number 848
+    VVVV1P0_1( w_fp[0], w_fp[61], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[105] );
+    VVVV3P0_1( w_fp[0], w_fp[61], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[95] );
+    VVVV4P0_1( w_fp[0], w_fp[61], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[107] );
+    // Amplitude(s) for diagram number 848
+    VVV1_0( w_fp[8], w_fp[5], w_fp[105], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[5], w_fp[95], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[5], w_fp[107], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram849( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 849 OF 1240 ***
+    // Wavefunction(s) for diagram number 849
+    VVVV1P0_1( w_fp[0], w_fp[8], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[115] );
+    VVVV3P0_1( w_fp[0], w_fp[8], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[116] );
+    VVVV4P0_1( w_fp[0], w_fp[8], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[117] );
+    // Amplitude(s) for diagram number 849
+    VVV1_0( w_fp[61], w_fp[6], w_fp[115], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    VVV1_0( w_fp[61], w_fp[6], w_fp[116], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    VVV1_0( w_fp[61], w_fp[6], w_fp[117], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram850( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 850 OF 1240 ***
+    // Wavefunction(s) for diagram number 850
+    VVVV1P0_1( w_fp[0], w_fp[8], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[118] );
+    VVVV3P0_1( w_fp[0], w_fp[8], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[119] );
+    VVVV4P0_1( w_fp[0], w_fp[8], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[120] );
+    // Amplitude(s) for diagram number 850
+    VVV1_0( w_fp[61], w_fp[5], w_fp[118], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+    VVV1_0( w_fp[61], w_fp[5], w_fp[119], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0];
+    VVV1_0( w_fp[61], w_fp[5], w_fp[120], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram851( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 851 OF 1240 ***
+    // Wavefunction(s) for diagram number 851
+    // (none)
+    // Amplitude(s) for diagram number 851
+    VVVV1_0( w_fp[0], w_fp[61], w_fp[8], w_fp[29], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    VVVV3_0( w_fp[0], w_fp[61], w_fp[8], w_fp[29], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    VVVV4_0( w_fp[0], w_fp[61], w_fp[8], w_fp[29], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram852( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 852 OF 1240 ***
+    // Wavefunction(s) for diagram number 852
+    // (none)
+    // Amplitude(s) for diagram number 852
+    VVV1_0( w_fp[8], w_fp[29], w_fp[90], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram853( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 853 OF 1240 ***
+    // Wavefunction(s) for diagram number 853
+    // (none)
+    // Amplitude(s) for diagram number 853
+    VVV1_0( w_fp[61], w_fp[29], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+  }
+
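+  // Diagrams 841-853 above involve the four-gluon vertex: VVVV1_0, VVVV3_0 and VVVV4_0
+  // evaluate its three independent colour/Lorentz structures on the same four wavefunctions,
+  // while the VVVV1P0_1/VVVV3P0_1/VVVV4P0_1 variants compute the corresponding off-shell
+  // internal wavefunctions (e.g. w_fp[103], w_fp[22], w_fp[21] in diagram 847); each
+  // structure comes with its own sign pattern over the jamps.
+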
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram854( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 854 OF 1240 ***
+    // Wavefunction(s) for diagram number 854
+    // (none)
+    // Amplitude(s) for diagram number 854
+    VVV1_0( w_fp[61], w_fp[8], w_fp[96], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram855( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 855 OF 1240 ***
+    // Wavefunction(s) for diagram number 855
+    // (none)
+    // Amplitude(s) for diagram number 855
+    VVV1_0( w_fp[90], w_fp[45], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram856( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 856 OF 1240 ***
+    // Wavefunction(s) for diagram number 856
+    // (none)
+    // Amplitude(s) for diagram number 856
+    FFV1_0( w_fp[3], w_fp[44], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram857( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 857 OF 1240 ***
+    // Wavefunction(s) for diagram number 857
+    // (none)
+    // Amplitude(s) for diagram number 857
+    FFV1_0( w_fp[65], w_fp[102], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram858( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 858 OF 1240 ***
+    // Wavefunction(s) for diagram number 858
+    // (none)
+    // Amplitude(s) for diagram number 858
+    FFV1_0( w_fp[3], w_fp[102], w_fp[64], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram859( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 859 OF 1240 ***
+    // Wavefunction(s) for diagram number 859
+    // (none)
+    // Amplitude(s) for diagram number 859
+    FFV1_0( w_fp[65], w_fp[44], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram860( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 860 OF 1240 ***
+    // Wavefunction(s) for diagram number 860
+    // (none)
+    // Amplitude(s) for diagram number 860
+    VVV1_0( w_fp[0], w_fp[64], w_fp[45], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram861( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 861 OF 1240 ***
+    // Wavefunction(s) for diagram number 861
+    // (none)
+    // Amplitude(s) for diagram number 861
+    FFV1_0( w_fp[3], w_fp[39], w_fp[105], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[39], w_fp[95], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[39], w_fp[107], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram862( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 862 OF 1240 ***
+    // Wavefunction(s) for diagram number 862
+    // (none)
+    // Amplitude(s) for diagram number 862
+    FFV1_0( w_fp[41], w_fp[39], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram863( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 863 OF 1240 ***
+    // Wavefunction(s) for diagram number 863
+    // (none)
+    // Amplitude(s) for diagram number 863
+    FFV1_0( w_fp[41], w_fp[102], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram864( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 864 OF 1240 ***
+    // Wavefunction(s) for diagram number 864
+    // (none)
+    // Amplitude(s) for diagram number 864
+    FFV1_0( w_fp[62], w_fp[39], w_fp[61], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
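+  // In diagrams 855-864 above, several updates carry an explicit cxtype( 0, 1 ) factor:
+  // for those colour flows the colour coefficient is +/-i rather than +/-1, so amp_sv[0]
+  // is multiplied by the imaginary unit before being accumulated into the jamps.
+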
//-------------------------------------------------------------------------- + + __global__ void + diagram865( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 865 OF 1240 *** + // Wavefunction(s) for diagram number 865 + // (none) + // Amplitude(s) for diagram number 865 + VVV1_0( w_fp[90], w_fp[51], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram866( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 866 OF 1240 *** + // Wavefunction(s) for diagram number 866 + // (none) + // Amplitude(s) for diagram number 866 + FFV1_0( w_fp[3], w_fp[50], w_fp[90], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram867( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 867 OF 1240 *** + // Wavefunction(s) for diagram number 867 + // (none) + // Amplitude(s) for diagram number 867 + FFV1_0( w_fp[65], w_fp[113], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram868( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 868 OF 1240 *** + // Wavefunction(s) for diagram number 868 + // (none) + // Amplitude(s) for diagram number 868 + FFV1_0( w_fp[3], w_fp[113], w_fp[63], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram869( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + 
fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 869 OF 1240 *** + // Wavefunction(s) for diagram number 869 + // (none) + // Amplitude(s) for diagram number 869 + FFV1_0( w_fp[65], w_fp[50], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram870( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 870 OF 1240 *** + // Wavefunction(s) for diagram number 870 + // (none) + // Amplitude(s) for diagram number 870 + VVV1_0( w_fp[0], w_fp[63], w_fp[51], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram871( fptype* wfs, // 
input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 871 OF 1240 *** + // Wavefunction(s) for diagram number 871 + // (none) + // Amplitude(s) for diagram number 871 + FFV1_0( w_fp[3], w_fp[47], w_fp[103], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[47], w_fp[22], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[47], w_fp[21], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + 
//-------------------------------------------------------------------------- + + __global__ void + diagram872( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 872 OF 1240 *** + // Wavefunction(s) for diagram number 872 + // (none) + // Amplitude(s) for diagram number 872 + FFV1_0( w_fp[38], w_fp[47], w_fp[90], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram873( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 873 OF 1240 *** + // Wavefunction(s) for diagram number 873 + // (none) + // Amplitude(s) for diagram number 873 + FFV1_0( w_fp[38], w_fp[113], w_fp[61], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram874( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR 
channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 874 OF 1240 *** + // Wavefunction(s) for diagram number 874 + // (none) + // Amplitude(s) for diagram number 874 + FFV1_0( w_fp[104], w_fp[47], w_fp[61], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram875( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 875 OF 1240 *** + // Wavefunction(s) for diagram number 875 + // (none) + // Amplitude(s) for diagram number 875 + VVV1_0( w_fp[90], w_fp[23], w_fp[6], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram876( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, 
// input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 876 OF 1240 *** + // Wavefunction(s) for diagram number 876 + // (none) + // Amplitude(s) for diagram number 876 + FFV1_0( w_fp[48], w_fp[2], w_fp[90], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram877( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 877 OF 1240 *** + // Wavefunction(s) for diagram number 877 + // (none) + // Amplitude(s) for diagram number 877 + FFV1_0( w_fp[104], w_fp[93], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram878( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page 
+#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 878 OF 1240 *** + // Wavefunction(s) for diagram number 878 + // (none) + // Amplitude(s) for diagram number 878 + FFV1_0( w_fp[104], w_fp[2], w_fp[64], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram879( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 879 OF 1240 *** + // Wavefunction(s) for diagram number 879 + // (none) + // Amplitude(s) for diagram number 879 + FFV1_0( w_fp[48], w_fp[93], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram880( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also 
#ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 880 OF 1240 *** + // Wavefunction(s) for diagram number 880 + // (none) + // Amplitude(s) for diagram number 880 + VVV1_0( w_fp[0], w_fp[64], w_fp[23], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram881( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 881 OF 1240 *** + // Wavefunction(s) for diagram number 881 + // (none) + // Amplitude(s) for diagram number 881 + FFV1_0( w_fp[38], w_fp[2], w_fp[105], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[38], w_fp[2], w_fp[95], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 
0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[38], w_fp[2], w_fp[107], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram882( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 882 OF 1240 *** + // Wavefunction(s) for diagram number 882 + // (none) + // Amplitude(s) for diagram number 882 + VVV1_0( w_fp[90], w_fp[20], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram883( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to 
#diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 883 OF 1240 *** + // Wavefunction(s) for diagram number 883 + // (none) + // Amplitude(s) for diagram number 883 + FFV1_0( w_fp[40], w_fp[2], w_fp[90], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram884( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 884 OF 1240 *** + // Wavefunction(s) for diagram number 884 + // (none) + // Amplitude(s) for diagram number 884 + FFV1_0( w_fp[62], w_fp[93], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram885( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel 
numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 885 OF 1240 *** + // Wavefunction(s) for diagram number 885 + // (none) + // Amplitude(s) for diagram number 885 + FFV1_0( w_fp[62], w_fp[2], w_fp[63], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram886( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 886 OF 1240 *** + // Wavefunction(s) for diagram number 886 + // (none) + // Amplitude(s) for diagram number 886 + FFV1_0( w_fp[40], w_fp[93], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram887( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, 
the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 887 OF 1240 *** + // Wavefunction(s) for diagram number 887 + // (none) + // Amplitude(s) for diagram number 887 + VVV1_0( w_fp[0], w_fp[63], w_fp[20], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram888( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 888 OF 1240 *** + // Wavefunction(s) for diagram number 888 + // (none) + // Amplitude(s) for diagram number 888 + FFV1_0( w_fp[41], w_fp[2], w_fp[103], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[41], w_fp[2], w_fp[22], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += 
cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[41], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram889( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 889 OF 1240 *** + // Wavefunction(s) for diagram number 889 + // (none) + // Amplitude(s) for diagram number 889 + FFV1_0( w_fp[3], w_fp[18], w_fp[90], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram890( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const 
fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 890 OF 1240 *** + // Wavefunction(s) for diagram number 890 + // (none) + // Amplitude(s) for diagram number 890 + FFV1_0( w_fp[12], w_fp[2], w_fp[90], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram891( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 891 OF 1240 *** + // Wavefunction(s) for diagram number 891 + // (none) + // Amplitude(s) for diagram number 891 + FFV1_0( w_fp[3], w_fp[93], w_fp[96], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram892( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 892 OF 1240 *** + // Wavefunction(s) for diagram number 892 + // (none) + // Amplitude(s) for diagram number 892 + FFV1_0( w_fp[65], w_fp[2], w_fp[96], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram893( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 893 OF 1240 *** + // Wavefunction(s) for diagram number 893 + // (none) + // Amplitude(s) for diagram number 893 + FFV1_0( w_fp[12], w_fp[93], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( 
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram893( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 893 OF 1240 ***
+    // Wavefunction(s) for diagram number 893
+    // (none)
+    // Amplitude(s) for diagram number 893
+    FFV1_0( w_fp[12], w_fp[93], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram894( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 894 OF 1240 ***
+    // Wavefunction(s) for diagram number 894
+    // (none)
+    // Amplitude(s) for diagram number 894
+    FFV1_0( w_fp[65], w_fp[18], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram895( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 895 OF 1240 ***
+    // Wavefunction(s) for diagram number 895
+    VVV1P0_1( w_fp[0], w_fp[66], COUPs[0], 1.0, 0., 0., w_fp[65] );
+    // Amplitude(s) for diagram number 895
+    VVV1_0( w_fp[65], w_fp[13], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram896( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 896 OF 1240 ***
+    // Wavefunction(s) for diagram number 896
+    // (none)
+    // Amplitude(s) for diagram number 896
+    VVV1_0( w_fp[65], w_fp[11], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram897( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 897 OF 1240 ***
+    // Wavefunction(s) for diagram number 897
+    // (none)
+    // Amplitude(s) for diagram number 897
+    VVVV1_0( w_fp[8], w_fp[4], w_fp[6], w_fp[65], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+    VVVV3_0( w_fp[8], w_fp[4], w_fp[6], w_fp[65], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+    VVVV4_0( w_fp[8], w_fp[4], w_fp[6], w_fp[65], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+  }
+
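+  // Where the "#ifdef MGONGPU_SUPPORTS_MULTICHANNEL" comments appear in these kernels, the code
+  // generated with multichannel support is assumed to add, for the enclosing diagram number NNN,
+  // roughly the following single-diagram-enhancement update after each amplitude call (#473),
+  // with channelId derived from channelIds by the boilerplate:
+  //   if( channelIds != nullptr )
+  //   {
+  //     if( channelId == NNN ) numerators_sv += cxabs2( amp_sv[0] );
+  //     denominators_sv += cxabs2( amp_sv[0] );
+  //   }
+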
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram898( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 898 OF 1240 ***
+    // Wavefunction(s) for diagram number 898
+    // (none)
+    // Amplitude(s) for diagram number 898
+    VVV1_0( w_fp[56], w_fp[69], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram899( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 899 OF 1240 ***
+    // Wavefunction(s) for diagram number 899
+    // (none)
+    // Amplitude(s) for diagram number 899
+    VVV1_0( w_fp[56], w_fp[70], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram900( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 900 OF 1240 ***
+    // Wavefunction(s) for diagram number 900
+    // (none)
+    // Amplitude(s) for diagram number 900
+    VVVV1_0( w_fp[66], w_fp[4], w_fp[6], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    VVVV3_0( w_fp[66], w_fp[4], w_fp[6], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    VVVV4_0( w_fp[66], w_fp[4], w_fp[6], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram901( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 901 OF 1240 ***
+    // Wavefunction(s) for diagram number 901
+    // (none)
+    // Amplitude(s) for diagram number 901
+    VVV1_0( w_fp[0], w_fp[69], w_fp[11], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram902( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 902 OF 1240 ***
+    // Wavefunction(s) for diagram number 902
+    // (none)
+    // Amplitude(s) for diagram number 902
+    VVV1_0( w_fp[0], w_fp[70], w_fp[13], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram903( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 903 OF 1240 ***
+    // Wavefunction(s) for diagram number 903
+    VVVV1P0_1( w_fp[0], w_fp[66], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[93] );
+    VVVV3P0_1( w_fp[0], w_fp[66], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[90] );
+    VVVV4P0_1( w_fp[0], w_fp[66], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[21] );
+    // Amplitude(s) for diagram number 903
+    VVV1_0( w_fp[8], w_fp[6], w_fp[93], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[6], w_fp[90], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[6], w_fp[21], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+  }
+
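+  // Note on the VVVVnP0_1 calls above: VVVV1, VVVV3 and VVVV4 are the three colour structures of
+  // the four-gluon vertex, so three internal off-shell gluon wavefunctions are computed here; they
+  // appear to reuse scratch slots (e.g. w_fp[93], w_fp[90], w_fp[21]) that earlier diagrams no
+  // longer need.
+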
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram904( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 904 OF 1240 ***
+    // Wavefunction(s) for diagram number 904
+    VVVV1P0_1( w_fp[0], w_fp[66], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[22] );
+    VVVV3P0_1( w_fp[0], w_fp[66], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[103] );
+    VVVV4P0_1( w_fp[0], w_fp[66], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[63] );
+    // Amplitude(s) for diagram number 904
+    VVV1_0( w_fp[8], w_fp[4], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[4], w_fp[103], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[4], w_fp[63], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram905( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 905 OF 1240 ***
+    // Wavefunction(s) for diagram number 905
+    VVVV1P0_1( w_fp[0], w_fp[8], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[107] );
+    VVVV3P0_1( w_fp[0], w_fp[8], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[95] );
+    VVVV4P0_1( w_fp[0], w_fp[8], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[105] );
+    // Amplitude(s) for diagram number 905
+    VVV1_0( w_fp[66], w_fp[6], w_fp[107], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+    VVV1_0( w_fp[66], w_fp[6], w_fp[95], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    VVV1_0( w_fp[66], w_fp[6], w_fp[105], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram906( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 906 OF 1240 ***
+    // Wavefunction(s) for diagram number 906
+    // (none)
+    // Amplitude(s) for diagram number 906
+    VVV1_0( w_fp[66], w_fp[4], w_fp[118], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0];
+    VVV1_0( w_fp[66], w_fp[4], w_fp[119], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    VVV1_0( w_fp[66], w_fp[4], w_fp[120], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+  }
+
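+  // J_ACCESS::kernelAccessIcol( jamps, icol ) is assumed to return a reference to the colour
+  // amplitude jamp[icol] for the current event (or SIMD event page), roughly:
+  //   static __device__ cxtype_sv& kernelAccessIcol( fptype* jamps, const int icol ); // view into jamps[ncolor*2*nevtORneppV]
+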
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram907( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 907 OF 1240 ***
+    // Wavefunction(s) for diagram number 907
+    // (none)
+    // Amplitude(s) for diagram number 907
+    VVVV1_0( w_fp[0], w_fp[66], w_fp[8], w_fp[27], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    VVVV3_0( w_fp[0], w_fp[66], w_fp[8], w_fp[27], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    VVVV4_0( w_fp[0], w_fp[66], w_fp[8], w_fp[27], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram908( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 908 OF 1240 ***
+    // Wavefunction(s) for diagram number 908
+    // (none)
+    // Amplitude(s) for diagram number 908
+    VVV1_0( w_fp[8], w_fp[27], w_fp[65], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram909( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 909 OF 1240 ***
+    // Wavefunction(s) for diagram number 909
+    // (none)
+    // Amplitude(s) for diagram number 909
+    VVV1_0( w_fp[66], w_fp[27], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram910( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 910 OF 1240 ***
+    // Wavefunction(s) for diagram number 910
+    // (none)
+    // Amplitude(s) for diagram number 910
+    VVV1_0( w_fp[66], w_fp[8], w_fp[101], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram911( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 911 OF 1240 ***
+    // Wavefunction(s) for diagram number 911
+    // (none)
+    // Amplitude(s) for diagram number 911
+    VVV1_0( w_fp[65], w_fp[37], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
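+  // Note: cxtype( 0, 1 ) is the imaginary unit, i.e. the colour flows updated above and below
+  // receive the amplitude with a factor +i or -i rather than +1 or -1.
+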
911 + VVV1_0( w_fp[65], w_fp[37], w_fp[6], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram912( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 912 OF 1240 *** + // Wavefunction(s) for diagram number 912 + // (none) + // Amplitude(s) for diagram number 912 + FFV1_0( w_fp[3], w_fp[36], w_fp[65], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram913( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // 
*** DIAGRAM 913 OF 1240 *** + // Wavefunction(s) for diagram number 913 + // (none) + // Amplitude(s) for diagram number 913 + FFV1_0( w_fp[71], w_fp[114], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram914( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 914 OF 1240 *** + // Wavefunction(s) for diagram number 914 + // (none) + // Amplitude(s) for diagram number 914 + FFV1_0( w_fp[3], w_fp[114], w_fp[70], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram915( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 915 OF 1240 *** + // Wavefunction(s) for diagram number 915 + // (none) + // Amplitude(s) for diagram number 915 + FFV1_0( w_fp[71], w_fp[36], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and 
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram916( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 916 OF 1240 ***
+    // Wavefunction(s) for diagram number 916
+    // (none)
+    // Amplitude(s) for diagram number 916
+    VVV1_0( w_fp[0], w_fp[70], w_fp[37], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram917( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 917 OF 1240 ***
+    // Wavefunction(s) for diagram number 917
+    // (none)
+    // Amplitude(s) for diagram number 917
+    FFV1_0( w_fp[3], w_fp[33], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[33], w_fp[103], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[33], w_fp[63], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram918( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 918 OF 1240 ***
+    // Wavefunction(s) for diagram number 918
+    // (none)
+    // Amplitude(s) for diagram number 918
+    FFV1_0( w_fp[41], w_fp[33], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram919( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 919 OF 1240 ***
+    // Wavefunction(s) for diagram number 919
+    // (none)
+    // Amplitude(s) for diagram number 919
+    FFV1_0( w_fp[41], w_fp[114], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram920( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 920 OF 1240 ***
+    // Wavefunction(s) for diagram number 920
+    // (none)
+    // Amplitude(s) for diagram number 920
+    FFV1_0( w_fp[62], w_fp[33], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
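Each kernel folds the single diagram amplitude amp_sv[0] into a subset of the ncolor color flows in jamps, with weights that are always +1, -1, +i or -i (the cxtype( 0, 1 ) factor). A scalar sketch of the indexing implied by the jamps[ncolor*2*nevtORneppV] layout, assuming real and imaginary parts are stored as separate planes (the actual J_ACCESS::kernelAccessIcol defined elsewhere in this patch may lay the data out differently, e.g. with SIMD event pages):

    // Hypothetical scalar accessor for one event: icol in [0,ncolor), reim in {0,1}
    inline fptype& jampAccess( fptype* jamps, int icol, int reim, int ievt, int nevt )
    {
      return jamps[( icol * 2 + reim ) * nevt + ievt]; // jamps[ncolor*2*nevt]
    }
    // 'jamp += cxtype( 0, 1 ) * amp' then means: re += -amp.imag(); im += amp.real()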
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram921( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 921 OF 1240 ***
+    // Wavefunction(s) for diagram number 921
+    // (none)
+    // Amplitude(s) for diagram number 921
+    VVV1_0( w_fp[65], w_fp[51], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram922( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 922 OF 1240 ***
+    // Wavefunction(s) for diagram number 922
+    // (none)
+    // Amplitude(s) for diagram number 922
+    FFV1_0( w_fp[3], w_fp[49], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram923( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 923 OF 1240 ***
+    // Wavefunction(s) for diagram number 923
+    // (none)
+    // Amplitude(s) for diagram number 923
+    FFV1_0( w_fp[71], w_fp[113], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram924( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 924 OF 1240 ***
+    // Wavefunction(s) for diagram number 924
+    // (none)
+    // Amplitude(s) for diagram number 924
+    FFV1_0( w_fp[3], w_fp[113], w_fp[69], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram925( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 925 OF 1240 ***
+    // Wavefunction(s) for diagram number 925
+    // (none)
+    // Amplitude(s) for diagram number 925
+    FFV1_0( w_fp[71], w_fp[49], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram926( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 926 OF 1240 ***
+    // Wavefunction(s) for diagram number 926
+    // (none)
+    // Amplitude(s) for diagram number 926
+    VVV1_0( w_fp[0], w_fp[69], w_fp[51], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram927( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 927 OF 1240 ***
+    // Wavefunction(s) for diagram number 927
+    // (none)
+    // Amplitude(s) for diagram number 927
+    FFV1_0( w_fp[3], w_fp[47], w_fp[93], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[47], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[47], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
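The "(#473)" placeholder comments stand for the numerator/denominator update that multichannel-enabled generated code performs after each amplitude call. A sketch of its usual shape in cudacpp-generated code (the exact lines are not shown in this hunk, so treat the names and the use of cxabs2 as assumptions), e.g. for diagram 927:

    #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
    if( channelId == 927 ) numerators_sv += cxabs2( amp_sv[0] ); // this diagram is the selected channel
    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); // any diagram contributes when SDE is on
    #endif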
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram928( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 928 OF 1240 ***
+    // Wavefunction(s) for diagram number 928
+    // (none)
+    // Amplitude(s) for diagram number 928
+    FFV1_0( w_fp[46], w_fp[47], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram929( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 929 OF 1240 ***
+    // Wavefunction(s) for diagram number 929
+    // (none)
+    // Amplitude(s) for diagram number 929
+    FFV1_0( w_fp[46], w_fp[113], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram930( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 930 OF 1240 ***
+    // Wavefunction(s) for diagram number 930
+    // (none)
+    // Amplitude(s) for diagram number 930
+    FFV1_0( w_fp[99], w_fp[47], w_fp[66], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram931( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 931 OF 1240 ***
+    // Wavefunction(s) for diagram number 931
+    // (none)
+    // Amplitude(s) for diagram number 931
+    VVV1_0( w_fp[65], w_fp[54], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram932( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 932 OF 1240 ***
+    // Wavefunction(s) for diagram number 932
+    // (none)
+    // Amplitude(s) for diagram number 932
+    FFV1_0( w_fp[53], w_fp[2], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram933( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 933 OF 1240 ***
+    // Wavefunction(s) for diagram number 933
+    // (none)
+    // Amplitude(s) for diagram number 933
+    FFV1_0( w_fp[99], w_fp[94], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram934( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 934 OF 1240 ***
+    // Wavefunction(s) for diagram number 934
+    // (none)
+    // Amplitude(s) for diagram number 934
+    FFV1_0( w_fp[99], w_fp[2], w_fp[70], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram935( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 935 OF 1240 ***
+    // Wavefunction(s) for diagram number 935
+    // (none)
+    // Amplitude(s) for diagram number 935
+    FFV1_0( w_fp[53], w_fp[94], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram936( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 936 OF 1240 ***
+    // Wavefunction(s) for diagram number 936
+    // (none)
+    // Amplitude(s) for diagram number 936
+    VVV1_0( w_fp[0], w_fp[70], w_fp[54], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram937( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 937 OF 1240 ***
+    // Wavefunction(s) for diagram number 937
+    // (none)
+    // Amplitude(s) for diagram number 937
+    FFV1_0( w_fp[46], w_fp[2], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[46], w_fp[2], w_fp[103], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[46], w_fp[2], w_fp[63], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram938( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 938 OF 1240 ***
+    // Wavefunction(s) for diagram number 938
+    // (none)
+    // Amplitude(s) for diagram number 938
+    VVV1_0( w_fp[65], w_fp[20], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram939( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 939 OF 1240 ***
+    // Wavefunction(s) for diagram number 939
+    // (none)
+    // Amplitude(s) for diagram number 939
+    FFV1_0( w_fp[28], w_fp[2], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram940( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 940 OF 1240 ***
+    // Wavefunction(s) for diagram number 940
+    // (none)
+    // Amplitude(s) for diagram number 940
+    FFV1_0( w_fp[62], w_fp[94], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram941( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 941 OF 1240 ***
+    // Wavefunction(s) for diagram number 941
+    // (none)
+    // Amplitude(s) for diagram number 941
+    FFV1_0( w_fp[62], w_fp[2], w_fp[69], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram942( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 942 OF 1240 ***
+    // Wavefunction(s) for diagram number 942
+    // (none)
+    // Amplitude(s) for diagram number 942
+    FFV1_0( w_fp[28], w_fp[94], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram943( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 943 OF 1240 ***
+    // Wavefunction(s) for diagram number 943
+    // (none)
+    // Amplitude(s) for diagram number 943
+    VVV1_0( w_fp[0], w_fp[69], w_fp[20], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram944( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 944 OF 1240 ***
+    // Wavefunction(s) for diagram number 944
+    // (none)
+    // Amplitude(s) for diagram number 944
+    FFV1_0( w_fp[41], w_fp[2], w_fp[93], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[2], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram945( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 945 OF 1240 ***
+    // Wavefunction(s) for diagram number 945
+    // (none)
+    // Amplitude(s) for diagram number 945
+    FFV1_0( w_fp[3], w_fp[15], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram946( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram946( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 946 OF 1240 ***
+    // Wavefunction(s) for diagram number 946
+    // (none)
+    // Amplitude(s) for diagram number 946
+    FFV1_0( w_fp[14], w_fp[2], w_fp[65], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram947( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 947 OF 1240 ***
+    // Wavefunction(s) for diagram number 947
+    // (none)
+    // Amplitude(s) for diagram number 947
+    FFV1_0( w_fp[3], w_fp[94], w_fp[101], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram948( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 948 OF 1240 ***
+    // Wavefunction(s) for diagram number 948
+    // (none)
+    // Amplitude(s) for diagram number 948
+    FFV1_0( w_fp[71], w_fp[2], w_fp[101], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
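The "(#473)" comment repeated after every amplitude call marks where a build with MGONGPU_SUPPORTS_MULTICHANNEL defined updates the single-diagram-enhancement (SDE) numerator and denominator. The following is a hedged, self-contained sketch of such an update, using the channelId semantics stated in the kernel signatures (1 to #diagrams, 0 to disable SDE); the function name, cxabs2 analogue and the exact matching rule are assumptions, not the plugin's actual boilerplate.

```cpp
#include <complex>
// Hedged sketch of a multichannel (SDE) update, not the generated code itself.
inline double cxabs2Sketch( const std::complex<double>& c ) { return std::norm( c ); }
void multichannelUpdateSketch( unsigned int channelId,           // event's selected channel (0 = SDE off)
                               unsigned int idiagram,            // this kernel's diagram number
                               const std::complex<double>& amp,  // analogue of amp_sv[0]
                               double& numerator,                // analogue of numerators_sv
                               double& denominator )             // analogue of denominators_sv
{
  if( channelId == 0 ) return;                                   // SDE disabled for this event
  if( channelId == idiagram ) numerator += cxabs2Sketch( amp );  // only the matching diagram feeds the numerator
  denominator += cxabs2Sketch( amp );                            // every diagram feeds the denominator
}
```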
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram949( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 949 OF 1240 ***
+    // Wavefunction(s) for diagram number 949
+    // (none)
+    // Amplitude(s) for diagram number 949
+    FFV1_0( w_fp[14], w_fp[94], w_fp[0], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram950( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 950 OF 1240 ***
+    // Wavefunction(s) for diagram number 950
+    // (none)
+    // Amplitude(s) for diagram number 950
+    FFV1_0( w_fp[71], w_fp[15], w_fp[0], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram951( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 951 OF 1240 ***
+    // Wavefunction(s) for diagram number 951
+    VVV1P0_1( w_fp[0], w_fp[72], COUPs[0], 1.0, 0., 0., w_fp[71] );
+    // Amplitude(s) for diagram number 951
+    VVV1_0( w_fp[71], w_fp[13], w_fp[5], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram952( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 952 OF 1240 ***
+    // Wavefunction(s) for diagram number 952
+    // (none)
+    // Amplitude(s) for diagram number 952
+    VVV1_0( w_fp[71], w_fp[10], w_fp[4], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+  }
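Each amplitude call above feeds a fixed set of colour flows: amp_sv[0] is accumulated into selected jamp entries with a relative sign, sometimes with an extra factor of i (the cxtype( 0, 1 ) factor seen in diagrams 943-948), depending on the Lorentz and colour structure. The following is a self-contained, illustrative-only C++ analogue of that accumulation pattern; the index and sign values are examples, not taken from any specific diagram.

```cpp
#include <complex>
#include <vector>
// Illustrative-only sketch of the jamp accumulation pattern in the kernels above.
int main()
{
  using cxtype = std::complex<double>;
  std::vector<cxtype> jamps( 120, cxtype( 0, 0 ) ); // per-event colour flows (size = ncolor)
  const cxtype amp( 0.3, -0.7 );                    // stand-in for one computed amp_sv[0]
  const int icol[] = { 4, 18, 42, 49 };             // colour flows fed by this diagram (example values)
  const double sign[] = { -1, +1, +1, +1 };         // relative signs from the colour algebra
  for( int k = 0; k < 4; k++ )
    jamps[icol[k]] += sign[k] * amp;                // some updates add +/- amp directly...
  jamps[2] -= cxtype( 0, 1 ) * amp;                 // ...others carry an extra factor of i
  return 0;
}
```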
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram953( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 953 OF 1240 ***
+    // Wavefunction(s) for diagram number 953
+    // (none)
+    // Amplitude(s) for diagram number 953
+    VVVV1_0( w_fp[8], w_fp[4], w_fp[5], w_fp[71], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    VVVV3_0( w_fp[8], w_fp[4], w_fp[5], w_fp[71], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    VVVV4_0( w_fp[8], w_fp[4], w_fp[5], w_fp[71], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram954( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 954 OF 1240 ***
+    // Wavefunction(s) for diagram number 954
+    // (none)
+    // Amplitude(s) for diagram number 954
+    VVV1_0( w_fp[56], w_fp[74], w_fp[5], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+  }
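Diagrams 953, 956 and 963 each evaluate a quartic gluon vertex as three separate calls (VVVV1_0, VVVV3_0, VVVV4_0) that share the same four wavefunctions but feed different jamp sign patterns. This mirrors the textbook decomposition of the four-gluon vertex into three colour structures, quoted below for orientation only; the plugin's normalisation and sign conventions may differ.

```latex
% Standard decomposition of the quartic gluon vertex into three colour structures:
\begin{equation}
V^{abcd}_{\mu\nu\rho\sigma} = -i g_s^2 \left[
    f^{abe} f^{cde} \left( g_{\mu\rho} g_{\nu\sigma} - g_{\mu\sigma} g_{\nu\rho} \right)
  + f^{ace} f^{bde} \left( g_{\mu\nu} g_{\rho\sigma} - g_{\mu\sigma} g_{\nu\rho} \right)
  + f^{ade} f^{bce} \left( g_{\mu\nu} g_{\rho\sigma} - g_{\mu\rho} g_{\nu\sigma} \right)
\right]
\end{equation}
```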
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram955( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 955 OF 1240 ***
+    // Wavefunction(s) for diagram number 955
+    // (none)
+    // Amplitude(s) for diagram number 955
+    VVV1_0( w_fp[56], w_fp[75], w_fp[4], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram956( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 956 OF 1240 ***
+    // Wavefunction(s) for diagram number 956
+    // (none)
+    // Amplitude(s) for diagram number 956
+    VVVV1_0( w_fp[72], w_fp[4], w_fp[5], w_fp[56], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    VVVV3_0( w_fp[72], w_fp[4], w_fp[5], w_fp[56], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+    VVVV4_0( w_fp[72], w_fp[4], w_fp[5], w_fp[56], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram957( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 957 OF 1240 ***
+    // Wavefunction(s) for diagram number 957
+    // (none)
+    // Amplitude(s) for diagram number 957
+    VVV1_0( w_fp[0], w_fp[74], w_fp[10], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram958( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 958 OF 1240 ***
+    // Wavefunction(s) for diagram number 958
+    // (none)
+    // Amplitude(s) for diagram number 958
+    VVV1_0( w_fp[0], w_fp[75], w_fp[13], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram959( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 959 OF 1240 ***
+    // Wavefunction(s) for diagram number 959
+    VVVV1P0_1( w_fp[0], w_fp[72], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[94] );
+    VVVV3P0_1( w_fp[0], w_fp[72], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[65] );
+    VVVV4P0_1( w_fp[0], w_fp[72], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[21] );
+    // Amplitude(s) for diagram number 959
+    VVV1_0( w_fp[8], w_fp[5], w_fp[94], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[5], w_fp[65], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[5], w_fp[21], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram960( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 960 OF 1240 ***
+    // Wavefunction(s) for diagram number 960
+    VVVV1P0_1( w_fp[0], w_fp[72], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[90] );
+    VVVV3P0_1( w_fp[0], w_fp[72], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[93] );
+    VVVV4P0_1( w_fp[0], w_fp[72], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[69] );
+    // Amplitude(s) for diagram number 960
+    VVV1_0( w_fp[8], w_fp[4], w_fp[90], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[4], w_fp[93], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[4], w_fp[69], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+  }
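Diagrams 959 and 960 materialise one internal current per quartic colour structure, writing them into recycled w_fp slots (94/65/21 and 90/93/69 respectively) rather than freshly allocated storage; the same pool is reused across kernels with different content over time (w_fp[71], for example, holds a fermion current in diagrams 948 and 950, is overwritten with a VVV current by diagram 951, and is then consumed by 952, 953, 964, 967 and 968). A tiny standalone sketch of this slot bookkeeping, with hypothetical names, purely for illustration:

```cpp
#include <cstdio>
// Illustrative-only sketch (hypothetical names): mapping each quartic
// colour-structure call to the w_fp slot it fills in the kernels above.
int main()
{
  const struct { const char* call; int slot959; int slot960; } vvvv[] = {
    { "VVVV1P0_1", 94, 90 }, // first quartic colour structure
    { "VVVV3P0_1", 65, 93 }, // second quartic colour structure
    { "VVVV4P0_1", 21, 69 }, // third quartic colour structure
  };
  for( const auto& v : vvvv )
    std::printf( "%s -> w_fp[%d] (diagram 959), w_fp[%d] (diagram 960)\n",
                 v.call, v.slot959, v.slot960 );
  return 0;
}
```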
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram961( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 961 OF 1240 ***
+    // Wavefunction(s) for diagram number 961
+    // (none)
+    // Amplitude(s) for diagram number 961
+    VVV1_0( w_fp[72], w_fp[5], w_fp[107], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    VVV1_0( w_fp[72], w_fp[5], w_fp[95], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+    VVV1_0( w_fp[72], w_fp[5], w_fp[105], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram962( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 962 OF 1240 ***
+    // Wavefunction(s) for diagram number 962
+    // (none)
+    // Amplitude(s) for diagram number 962
+    VVV1_0( w_fp[72], w_fp[4], w_fp[115], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    VVV1_0( w_fp[72], w_fp[4], w_fp[116], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    VVV1_0( w_fp[72], w_fp[4], w_fp[117], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram963( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 963 OF 1240 ***
+    // Wavefunction(s) for diagram number 963
+    // (none)
+    // Amplitude(s) for diagram number 963
+    VVVV1_0( w_fp[0], w_fp[72], w_fp[8], w_fp[24], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    VVVV3_0( w_fp[0], w_fp[72], w_fp[8], w_fp[24], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+    VVVV4_0( w_fp[0], w_fp[72], w_fp[8], w_fp[24], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram964( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 964 OF 1240 ***
+    // Wavefunction(s) for diagram number 964
+    // (none)
+    // Amplitude(s) for diagram number 964
+    VVV1_0( w_fp[8], w_fp[24], w_fp[71], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram965( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 965 OF 1240 ***
+    // Wavefunction(s) for diagram number 965
+    // (none)
+    // Amplitude(s) for diagram number 965
+    VVV1_0( w_fp[72], w_fp[24], w_fp[56], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram966( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 966 OF 1240 ***
+    // Wavefunction(s) for diagram number 966
+    // (none)
+    // Amplitude(s) for diagram number 966
+    VVV1_0( w_fp[72], w_fp[8], w_fp[98], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram967( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 967 OF 1240 ***
+    // Wavefunction(s) for diagram number 967
+    // (none)
+    // Amplitude(s) for diagram number 967
+    VVV1_0( w_fp[71], w_fp[37], w_fp[5], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram968( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 968 OF 1240 ***
+    // Wavefunction(s) for diagram number 968
+    // (none)
+    // Amplitude(s) for diagram number 968
+    FFV1_0( w_fp[3], w_fp[35], w_fp[71], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram969( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 969 OF 1240 ***
+    // Wavefunction(s) for diagram number 969
+    // (none)
+    // Amplitude(s) for diagram number 969
+    FFV1_0( w_fp[76], w_fp[114], w_fp[5], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel
support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram970( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 970 OF 1240 *** + // Wavefunction(s) for diagram number 970 + // (none) + // Amplitude(s) for diagram number 970 + FFV1_0( w_fp[3], w_fp[114], w_fp[75], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram971( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 971 OF 1240 *** + // Wavefunction(s) for diagram number 971 + // (none) + // Amplitude(s) for diagram number 971 + FFV1_0( w_fp[76], w_fp[35], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + 
+  diagram972( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 972 OF 1240 ***
+    // Wavefunction(s) for diagram number 972
+    // (none)
+    // Amplitude(s) for diagram number 972
+    VVV1_0( w_fp[0], w_fp[75], w_fp[37], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram973( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 973 OF 1240 ***
+    // Wavefunction(s) for diagram number 973
+    // (none)
+    // Amplitude(s) for diagram number 973
+    FFV1_0( w_fp[3], w_fp[33], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[33], w_fp[93], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[33], w_fp[69], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram974( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 974 OF 1240 ***
+    // Wavefunction(s) for diagram number 974
+    // (none)
+    // Amplitude(s) for diagram number 974
+    FFV1_0( w_fp[38], w_fp[33], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram975( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 975 OF 1240 ***
+    // Wavefunction(s) for diagram number 975
+    // (none)
+    // Amplitude(s) for diagram number 975
+    FFV1_0( w_fp[38], w_fp[114], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram976( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 976 OF 1240 ***
+    // Wavefunction(s) for diagram number 976
+    // (none)
+    // Amplitude(s) for diagram number 976
+    FFV1_0( w_fp[104], w_fp[33], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram977( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 977 OF 1240 ***
+    // Wavefunction(s) for diagram number 977
+    // (none)
+    // Amplitude(s) for diagram number 977
+    VVV1_0( w_fp[71], w_fp[45], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram978( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 978 OF 1240 ***
+    // Wavefunction(s) for diagram number 978
+    // (none)
+    // Amplitude(s) for diagram number 978
+    FFV1_0( w_fp[3], w_fp[43], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram979( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 979 OF 1240 ***
+    // Wavefunction(s) for diagram number 979
+    // (none)
+    // Amplitude(s) for diagram number 979
+    FFV1_0( w_fp[76], w_fp[102], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram980( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 980 OF 1240 ***
+    // Wavefunction(s) for diagram number 980
+    // (none)
+    // Amplitude(s) for diagram number 980
+    FFV1_0( w_fp[3], w_fp[102], w_fp[74], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram981( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 981 OF 1240 ***
+    // Wavefunction(s) for diagram number 981
+    // (none)
+    // Amplitude(s) for diagram number 981
+    FFV1_0( w_fp[76], w_fp[43], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram982( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 982 OF 1240 ***
+    // Wavefunction(s) for diagram number 982
+    // (none)
+    // Amplitude(s) for diagram number 982
+    VVV1_0( w_fp[0], w_fp[74], w_fp[45], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram983( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 983 OF 1240 ***
+    // Wavefunction(s) for diagram number 983
+    // (none)
+    // Amplitude(s) for diagram number 983
+    FFV1_0( w_fp[3], w_fp[39], w_fp[94], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[39], w_fp[65], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[39], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram984( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 984 OF 1240 ***
+    // Wavefunction(s) for diagram number 984
+    // (none)
+    // Amplitude(s) for diagram number 984
+    FFV1_0( w_fp[46], w_fp[39], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram985( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 985 OF 1240 ***
+    // Wavefunction(s) for diagram number 985
+    // (none)
+    // Amplitude(s) for diagram number 985
+    FFV1_0( w_fp[46], w_fp[102], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram986( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 986 OF 1240 ***
+    // Wavefunction(s) for diagram number 986
+    // (none)
+    // Amplitude(s) for diagram number 986
+    FFV1_0( w_fp[99], w_fp[39], w_fp[72], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram987( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 987 OF 1240 ***
+    // Wavefunction(s) for diagram number 987
+    // (none)
+    // Amplitude(s) for diagram number 987
+    VVV1_0( w_fp[71], w_fp[54], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram988( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 988 OF 1240 ***
+    // Wavefunction(s) for diagram number 988
+    // (none)
+    // Amplitude(s) for diagram number 988
+    FFV1_0( w_fp[7], w_fp[2], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram989( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 989 OF 1240 ***
+    // Wavefunction(s) for diagram number 989
+    // (none)
+    // Amplitude(s) for diagram number 989
+    FFV1_0( w_fp[99], w_fp[97], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram990( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 990 OF 1240 ***
+    // Wavefunction(s) for diagram number 990
+    // (none)
+    // Amplitude(s) for diagram number 990
+    FFV1_0( w_fp[99], w_fp[2], w_fp[75], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram991( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 991 OF 1240 ***
+    // Wavefunction(s) for diagram number 991
+    // (none)
+    // Amplitude(s) for diagram number 991
+    FFV1_0( w_fp[7], w_fp[97], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram992( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 992 OF 1240 ***
+    // Wavefunction(s) for diagram number 992
+    // (none)
+    // Amplitude(s) for diagram number 992
+    VVV1_0( w_fp[0], w_fp[75], w_fp[54], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram993( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 993 OF 1240 ***
+    // Wavefunction(s) for diagram number 993
+    // (none)
+    // Amplitude(s) for diagram number 993
+    FFV1_0( w_fp[46], w_fp[2], w_fp[90], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[46], w_fp[2], w_fp[93], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0];
J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[46], w_fp[2], w_fp[69], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram994( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 994 OF 1240 *** + // Wavefunction(s) for diagram number 994 + // (none) + // Amplitude(s) for diagram number 994 + VVV1_0( w_fp[71], w_fp[23], w_fp[4], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram995( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel 
numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 995 OF 1240 *** + // Wavefunction(s) for diagram number 995 + // (none) + // Amplitude(s) for diagram number 995 + FFV1_0( w_fp[25], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram996( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 996 OF 1240 *** + // Wavefunction(s) for diagram number 996 + // (none) + // Amplitude(s) for diagram number 996 + FFV1_0( w_fp[104], w_fp[97], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram997( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, 
+ + __global__ void + diagram997( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 997 OF 1240 *** + // Wavefunction(s) for diagram number 997 + // (none) + // Amplitude(s) for diagram number 997 + FFV1_0( w_fp[104], w_fp[2], w_fp[74], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram998( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 998 OF 1240 *** + // Wavefunction(s) for diagram number 998 + // (none) + // Amplitude(s) for diagram number 998 + FFV1_0( w_fp[25], w_fp[97], w_fp[0], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram999( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 999 OF 1240 *** + // Wavefunction(s) for diagram number 999 + // (none) + // Amplitude(s) for diagram number 999 + VVV1_0( w_fp[0], w_fp[74], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1000( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1000 OF 1240 *** + // Wavefunction(s) for diagram number 1000 + // (none) + // Amplitude(s) for diagram number 1000 + FFV1_0( w_fp[38], w_fp[2], w_fp[94], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[38], w_fp[2], w_fp[65], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) *
amp_sv[0]; + FFV1_0( w_fp[38], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1001( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1001 OF 1240 *** + // Wavefunction(s) for diagram number 1001 + // (none) + // Amplitude(s) for diagram number 1001 + FFV1_0( w_fp[3], w_fp[17], w_fp[71], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1002( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators )
// input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1002 OF 1240 *** + // Wavefunction(s) for diagram number 1002 + // (none) + // Amplitude(s) for diagram number 1002 + FFV1_0( w_fp[26], w_fp[2], w_fp[71], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1003( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1003 OF 1240 *** + // Wavefunction(s) for diagram number 1003 + // (none) + // Amplitude(s) for diagram number 1003 + FFV1_0( w_fp[3], w_fp[97], w_fp[98], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1004( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output
jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1004 OF 1240 *** + // Wavefunction(s) for diagram number 1004 + // (none) + // Amplitude(s) for diagram number 1004 + FFV1_0( w_fp[76], w_fp[2], w_fp[98], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1005( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1005 OF 1240 *** + // Wavefunction(s) for diagram number 1005 + // (none) + // Amplitude(s) for diagram number 1005 + FFV1_0( w_fp[26], w_fp[97], w_fp[0], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + } + + //--------------------------------------------------------------------------
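Each "#473" comment marks the spot where, in a build with MGONGPU_SUPPORTS_MULTICHANNEL, the generated code also accumulates the single-diagram-enhancement (SDE) weights; the actual generated statements are elided as comments in this hunk. Given the convention stated in the signatures (channelId runs from 1 to #diagrams, 0 disables SDE), a plausible expansion for diagram1005 would be:

    // Sketch only (assumption): the elided multichannel block, e.g. inside diagram1005.
    #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
    if( channelId == 1005 ) numerators_sv += cxabs2( amp_sv[0] ); // this diagram is the sampled channel
    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );  // SDE enabled: every diagram contributes
    #endif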
+ + __global__ void + diagram1006( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1006 OF 1240 *** + // Wavefunction(s) for diagram number 1006 + // (none) + // Amplitude(s) for diagram number 1006 + FFV1_0( w_fp[76], w_fp[17], w_fp[0], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1007( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1007 OF 1240 *** + // Wavefunction(s) for diagram number 1007 + // (none) + // Amplitude(s) for diagram number 1007 + VVV1_0( w_fp[56], w_fp[59], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39
) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1008( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1008 OF 1240 *** + // Wavefunction(s) for diagram number 1008 + // (none) + // Amplitude(s) for diagram number 1008 + VVV1_0( w_fp[56], w_fp[1], w_fp[42], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1009( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds,
numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1009 OF 1240 *** + // Wavefunction(s) for diagram number 1009 + // (none) + // Amplitude(s) for diagram number 1009 + VVVV1_0( w_fp[1], w_fp[24], w_fp[6], w_fp[56], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + VVVV3_0( w_fp[1], w_fp[24], w_fp[6], w_fp[56], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + VVVV4_0( w_fp[1], w_fp[24], w_fp[6], w_fp[56], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -=
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1010( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1010 OF 1240 *** + // Wavefunction(s) for diagram number 1010 + // (none) + // Amplitude(s) for diagram number 1010 + VVV1_0( w_fp[98], w_fp[108], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1011( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a
sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1011 OF 1240 *** + // Wavefunction(s) for diagram number 1011 + // (none) + // Amplitude(s) for diagram number 1011 + VVV1_0( w_fp[98], w_fp[1], w_fp[11], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1012( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1012 OF 1240 *** + // Wavefunction(s) for diagram number 1012 + // (none) + // Amplitude(s) for diagram number 1012 + VVVV1_0( w_fp[1], w_fp[8], w_fp[6], w_fp[98], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; +
J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + VVVV3_0( w_fp[1], w_fp[8], w_fp[6], w_fp[98], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + VVVV4_0( w_fp[1], w_fp[8], w_fp[6], w_fp[98], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1013( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1013 OF 1240 *** + // Wavefunction(s) for diagram number 1013 + // (none) + //
Amplitude(s) for diagram number 1013 + VVV1_0( w_fp[0], w_fp[108], w_fp[42], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1014( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1014 OF 1240 *** + // Wavefunction(s) for diagram number 1014 + // (none) + // Amplitude(s) for diagram number 1014 + VVV1_0( w_fp[0], w_fp[59], w_fp[11], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + } + +
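Every jamp update above scatters one amplitude into several color flows with fixed signs, sometimes rotated by cxtype( 0, 1 ), i.e. multiplied by the imaginary unit. The J_ACCESS::kernelAccessIcol accessor itself is defined elsewhere in the plugin; a minimal sketch consistent with the jamps[ncolor*2*nevtORneppV] layout stated in the signatures, under an assumed [icol][re/im][event] ordering, is:

    // Sketch only (assumption): the real J_ACCESS is defined elsewhere in the plugin.
    struct J_ACCESS
    {
      // Return a writable complex (SIMD) view of color flow icol in the current event page.
      static __host__ __device__ inline cxtype_sv& kernelAccessIcol( fptype* jamps, const int icol )
      {
        return *reinterpret_cast<cxtype_sv*>( jamps + icol * 2 * neppV ); // 2 fptypes (re,im) per event
      }
    };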
//-------------------------------------------------------------------------- + + __global__ void + diagram1015( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1015 OF 1240 *** + // Wavefunction(s) for diagram number 1015 + VVVV1P0_1( w_fp[0], w_fp[1], w_fp[8], COUPs[2], 1.0, 0., 0., w_fp[11] ); + VVVV3P0_1( w_fp[0], w_fp[1], w_fp[8], COUPs[2], 1.0, 0., 0., w_fp[42] ); + VVVV4P0_1( w_fp[0], w_fp[1], w_fp[8], COUPs[2], 1.0, 0., 0., w_fp[76] ); + // Amplitude(s) for diagram number 1015 + VVV1_0( w_fp[24], w_fp[6], w_fp[11], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + VVV1_0( w_fp[24], w_fp[6], w_fp[42], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; +
J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + VVV1_0( w_fp[24], w_fp[6], w_fp[76], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1016( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1016 OF 1240 *** + // Wavefunction(s) for diagram number 1016 + VVVV1P0_1( w_fp[0], w_fp[1], w_fp[24], COUPs[2], 1.0, 0., 0., w_fp[97] ); + VVVV3P0_1( w_fp[0], w_fp[1], w_fp[24], COUPs[2], 1.0, 0., 0., w_fp[71] ); + VVVV4P0_1( w_fp[0], w_fp[1], w_fp[24], COUPs[2], 1.0, 0., 0., w_fp[21] ); + // Amplitude(s) for diagram number 1016 + VVV1_0( w_fp[8], w_fp[6], w_fp[97], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; +
J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[6], w_fp[71], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[6], w_fp[21], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1017( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + //
*** DIAGRAM 1017 OF 1240 *** + // Wavefunction(s) for diagram number 1017 + // (none) + // Amplitude(s) for diagram number 1017 + VVV1_0( w_fp[1], w_fp[24], w_fp[118], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + VVV1_0( w_fp[1], w_fp[24], w_fp[119], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + VVV1_0( w_fp[1], w_fp[24], w_fp[120], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + } + +
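A side effect of giving all 1240 diagramXXX kernels this identical signature is that the caller can drive them generically, with no per-diagram glue. The actual driver is outside this hunk; in the C++ build (where __global__ expands to nothing) the idea could be sketched, under those assumptions, as:

    // Sketch only (assumption): a generic driver over the uniform diagram kernels, C++ flavour.
    typedef void ( *diagram_t )( fptype* wfs, fptype* jamps, const unsigned int* channelIds, const fptype** COUPs, fptype* numerators, fptype* denominators );
    static const diagram_t diagrams[] = { diagram995, diagram996, diagram997 /* ..., diagram1240 */ };
    for( diagram_t d : diagrams )
      d( wfs, jamps, channelIds, COUPs, numerators, denominators ); // identical call for every diagram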
//-------------------------------------------------------------------------- + + __global__ void + diagram1018( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1018 OF 1240 *** + // Wavefunction(s) for diagram number 1018 + // (none) + // Amplitude(s) for diagram number 1018 + VVV1_0( w_fp[1], w_fp[8], w_fp[85], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0]; + VVV1_0( w_fp[1], w_fp[8], w_fp[112], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + VVV1_0( w_fp[1], w_fp[8], w_fp[111], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here
the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1019( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1019 OF 1240 *** + // Wavefunction(s) for diagram number 1019 + // (none) + // Amplitude(s) for diagram number 1019 + VVV1_0( w_fp[56], w_fp[68], w_fp[5], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1020( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output 
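Every diagramXXX kernel above and below starts by including "diagram_boilerplate.h", whose contents are not part of this hunk. A minimal sketch of what such a header could provide, with hypothetical code (only the names amp_sv, amp_fp, w_fp, channelIds, numerators and denominators are taken from the generated code itself):

    // Hypothetical sketch of a diagram_boilerplate.h fragment (included
    // inside each diagramXXX body, so function parameters are in scope):
    #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
    // The uniform interface still passes the three multichannel pointers,
    // so the boilerplate can sanity-check that they are all nullptr:
    assert( channelIds == nullptr );
    assert( numerators == nullptr );
    assert( denominators == nullptr );
    #endif
    cxtype_sv amp_sv[1];                                  // buffer for one (possibly SIMD-vector) complex amplitude
    fptype* amp_fp = reinterpret_cast<fptype*>( amp_sv ); // real-valued alias passed as &amp_fp[0] to the HELAS calls
    // ... plus the setup of the w_fp wavefunction views into the wfs buffer ...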
+  __global__ void
+  diagram1020( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1020 OF 1240 ***
+    // Wavefunction(s) for diagram number 1020
+    // (none)
+    // Amplitude(s) for diagram number 1020
+    VVV1_0( w_fp[56], w_fp[1], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1021( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1021 OF 1240 ***
+    // Wavefunction(s) for diagram number 1021
+    // (none)
+    // Amplitude(s) for diagram number 1021
+    VVVV1_0( w_fp[1], w_fp[27], w_fp[5], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    VVVV3_0( w_fp[1], w_fp[27], w_fp[5], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    VVVV4_0( w_fp[1], w_fp[27], w_fp[5], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
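Kernels like diagram1021 above call VVVV1_0, VVVV3_0 and VVVV4_0 on the same four wavefunctions: the four-gluon vertex contributes three independent Lorentz/color structures, and each call yields one amplitude that is then scattered into the color-flow buffer with coefficients of +1 or -1. A minimal sketch of that scatter pattern, using a hypothetical scalar jamp[] buffer instead of the generated J_ACCESS accessor:

    #include <complex>
    using cxtype = std::complex<double>;
    // Scatter one amplitude into n color flows with signs +1/-1, mirroring
    // the += / -= lines emitted by the code generator for each HELAS call:
    inline void scatterAmp( cxtype* jamp, const cxtype& amp, const int* icols, const int* signs, int n )
    {
      for( int i = 0; i < n; i++ )
        jamp[icols[i]] += ( signs[i] > 0 ? amp : -amp );
    }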
+  __global__ void
+  diagram1022( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1022 OF 1240 ***
+    // Wavefunction(s) for diagram number 1022
+    // (none)
+    // Amplitude(s) for diagram number 1022
+    VVV1_0( w_fp[101], w_fp[108], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1023( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1023 OF 1240 ***
+    // Wavefunction(s) for diagram number 1023
+    // (none)
+    // Amplitude(s) for diagram number 1023
+    VVV1_0( w_fp[101], w_fp[1], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
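The J_ACCESS::kernelAccessIcol calls map a color-flow index onto the entry of the jamps buffer owned by the current event. The actual J_ACCESS class is defined elsewhere in this patch; a hypothetical sketch of such an accessor, assuming jamps is laid out as real/imaginary planes per color flow (cxtype_ref being a reference-proxy type supporting += and -=, and nevt assumed visible in scope):

    // Hypothetical sketch only; the real accessor in this patch may differ:
    __device__ inline cxtype_ref kernelAccessIcol( fptype* jamps, const int icol )
    {
      const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one GPU thread per event
      fptype& real = jamps[( 2 * icol ) * nevt + ievt];       // real plane for color flow icol
      fptype& imag = jamps[( 2 * icol + 1 ) * nevt + ievt];   // imaginary plane for color flow icol
      return cxtype_ref( real, imag );                        // proxy so that "+= amp_sv[0]" works
    }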
+  __global__ void
+  diagram1024( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1024 OF 1240 ***
+    // Wavefunction(s) for diagram number 1024
+    // (none)
+    // Amplitude(s) for diagram number 1024
+    VVVV1_0( w_fp[1], w_fp[8], w_fp[5], w_fp[101], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+    VVVV3_0( w_fp[1], w_fp[8], w_fp[5], w_fp[101], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+    VVVV4_0( w_fp[1], w_fp[8], w_fp[5], w_fp[101], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1025( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1025 OF 1240 ***
+    // Wavefunction(s) for diagram number 1025
+    // (none)
+    // Amplitude(s) for diagram number 1025
+    VVV1_0( w_fp[0], w_fp[108], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1026( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1026 OF 1240 ***
+    // Wavefunction(s) for diagram number 1026
+    // (none)
+    // Amplitude(s) for diagram number 1026
+    VVV1_0( w_fp[0], w_fp[68], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
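The "#473" comments mark the spot where the multichannel build updates the single-diagram-enhancement accumulators after each amplitude call. A minimal sketch of such an update, with hypothetical helper names (channelId, idiagram, cxabs2, numerators_sv and denominators_sv are illustrative; only the numerators/denominators kernel arguments are from the generated code):

    #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
    if( channelId != 0 ) // SDE not disabled for this event
    {
      const fptype_sv amp2 = cxabs2( amp_sv[0] );        // |amp|^2 for this diagram and helicity
      if( channelId == idiagram ) numerators_sv += amp2; // only the selected channel feeds the numerator
      denominators_sv += amp2;                           // every contributing diagram feeds the denominator
    }
    #endif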
+  __global__ void
+  diagram1027( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1027 OF 1240 ***
+    // Wavefunction(s) for diagram number 1027
+    // (none)
+    // Amplitude(s) for diagram number 1027
+    VVV1_0( w_fp[27], w_fp[5], w_fp[11], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    VVV1_0( w_fp[27], w_fp[5], w_fp[42], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    VVV1_0( w_fp[27], w_fp[5], w_fp[76], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
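Most kernels in this hunk reuse wavefunctions computed earlier ("// (none)"), but diagram1028 just below first builds three internal off-shell gluon wavefunctions with the VVVVxP0_1 routines before contracting them with VVV1_0. The shape of those helpers, paraphrased from the call sites below (parameter names are guesses; the authoritative prototypes live in the generated HelAmps code):

    // Inferred, not authoritative: compute the off-shell vector wavefunction
    // V1 (mass M1, width W1) from three on-shell vector wavefunctions.
    __device__ void VVVV1P0_1( const fptype* V2,   // vector wavefunction 2
                               const fptype* V3,   // vector wavefunction 3
                               const fptype* V4,   // vector wavefunction 4
                               const fptype* COUP, // coupling (COUPs[2], the VVVV coupling, below)
                               const fptype factor, // overall factor (1.0 below)
                               const fptype M1,    // off-shell leg mass (0. for a gluon)
                               const fptype W1,    // off-shell leg width (0. below)
                               fptype* V1 );       // output wavefunction (w_fp[10]/[16]/[111] below)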
+  __global__ void
+  diagram1028( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1028 OF 1240 ***
+    // Wavefunction(s) for diagram number 1028
+    VVVV1P0_1( w_fp[0], w_fp[1], w_fp[27], COUPs[2], 1.0, 0., 0., w_fp[10] );
+    VVVV3P0_1( w_fp[0], w_fp[1], w_fp[27], COUPs[2], 1.0, 0., 0., w_fp[16] );
+    VVVV4P0_1( w_fp[0], w_fp[1], w_fp[27], COUPs[2], 1.0, 0., 0., w_fp[111] );
+    // Amplitude(s) for diagram number 1028
+    VVV1_0( w_fp[8], w_fp[5], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[5], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[5], w_fp[111], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1029( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1029 OF 1240 ***
+    // Wavefunction(s) for diagram number 1029
+    // (none)
+    // Amplitude(s) for diagram number 1029
+    VVV1_0( w_fp[1], w_fp[27], w_fp[115], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    VVV1_0( w_fp[1], w_fp[27], w_fp[116], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+    VVV1_0( w_fp[1], w_fp[27], w_fp[117], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
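Note the asymmetry in the coupling arguments: on GPU each kernel receives the flat couplings buffer for all events, while in C++ it receives per-event-page pointers COUPs directly. A hypothetical sketch of how the GPU branch of the boilerplate could derive COUPs from that buffer (ndcoup, nxcoup and nevt are taken from the parameter comments; the loop and layout are assumptions):

    #ifdef MGONGPUCPP_GPUIMPL
    const fptype* COUPs[nxcoup];
    for( int idcoup = 0; idcoup < ndcoup; idcoup++ )
      COUPs[idcoup] = &couplings[idcoup * nevt * 2]; // dependent couplings: one (re,im) pair per event
    // ... the independent couplings would then be appended from constant parameters ...
    #endif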
+  __global__ void
+  diagram1030( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1030 OF 1240 ***
+    // Wavefunction(s) for diagram number 1030
+    // (none)
+    // Amplitude(s) for diagram number 1030
+    VVV1_0( w_fp[1], w_fp[8], w_fp[9], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+    VVV1_0( w_fp[1], w_fp[8], w_fp[110], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+    VVV1_0( w_fp[1], w_fp[8], w_fp[109], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1031( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1031 OF 1240 ***
+    // Wavefunction(s) for diagram number 1031
+    // (none)
+    // Amplitude(s) for diagram number 1031
+    VVV1_0( w_fp[56], w_fp[67], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
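Every diagram in this process gets its own __global__ kernel with an identical signature, which makes them trivially table-driven on the host side. A minimal host-side sketch, assuming a hypothetical launchDiagram helper (one plausible motivation for one kernel per diagram is to keep per-kernel register pressure and compilation time bounded for a 1240-diagram process):

    // Hypothetical driver sketch, not the actual scheduling code of this patch:
    using DiagramKernel = void ( * )( fptype*, fptype*, const unsigned int*, const fptype*, fptype*, fptype* );
    static const DiagramKernel diagramKernels[] = { diagram1018, diagram1019, diagram1020 /* ..., all 1240 */ };
    for( DiagramKernel kernel : diagramKernels )
      launchDiagram( kernel, gpublocks, gputhreads, wfs, jamps, channelIds, couplings, numerators, denominators );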
+  __global__ void
+  diagram1032( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1032 OF 1240 ***
+    // Wavefunction(s) for diagram number 1032
+    // (none)
+    // Amplitude(s) for diagram number 1032
+    VVV1_0( w_fp[56], w_fp[1], w_fp[19], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1033( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1033 OF 1240 ***
+    // Wavefunction(s) for diagram number 1033
+    // (none)
+    // Amplitude(s) for diagram number 1033
+    VVVV1_0( w_fp[1], w_fp[4], w_fp[29], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    VVVV3_0( w_fp[1], w_fp[4], w_fp[29], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    VVVV4_0( w_fp[1], w_fp[4], w_fp[29], w_fp[56], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1034( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1034 OF 1240 ***
+    // Wavefunction(s) for diagram number 1034
+    // (none)
+    // Amplitude(s) for diagram number 1034
+    VVV1_0( w_fp[96], w_fp[108], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1035( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1035 OF 1240 ***
+    // Wavefunction(s) for diagram number 1035
+    // (none)
+    // Amplitude(s) for diagram number 1035
+    VVV1_0( w_fp[96], w_fp[1], w_fp[13], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
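The HELAS calls above all take &amp_fp[0] as their output argument while the color sums read amp_sv[0]: the two names are views of the same storage. A sketch of that aliasing with hypothetical minimal types (the plugin's own fptype and cxtype_sv are configurable and SIMD-aware):

    #include <complex>
    using fptype = double;                                // illustrative; the plugin also supports float
    using cxtype = std::complex<fptype>;
    cxtype amp_sv[1];                                     // the amplitude a HELAS call fills
    fptype* amp_fp = reinterpret_cast<fptype*>( amp_sv ); // same storage viewed as two reals
    // A call like VVV1_0( ..., &amp_fp[0] ) writes (re, im) into amp_fp[0..1],
    // i.e. into amp_sv[0], which the J_ACCESS lines then add into the color flows.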
wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1036 OF 1240 *** + // Wavefunction(s) for diagram number 1036 + // (none) + // Amplitude(s) for diagram number 1036 + VVVV1_0( w_fp[1], w_fp[8], w_fp[4], w_fp[96], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + VVVV3_0( w_fp[1], w_fp[8], w_fp[4], w_fp[96], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + VVVV4_0( w_fp[1], w_fp[8], w_fp[4], w_fp[96], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + 
J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1037( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1037 OF 1240 *** + // Wavefunction(s) for diagram number 1037 + // (none) + // Amplitude(s) for diagram number 1037 + VVV1_0( w_fp[0], w_fp[108], w_fp[19], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1038( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR 
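Note on the two comments repeated in every kernel above and below: the uniform diagramNNNN signature keeps channelIds, numerators and denominators even in builds without MGONGPU_SUPPORTS_MULTICHANNEL, and diagram_boilerplate.h (not part of this hunk) is described as asserting that the three unused pointers are nullptr in that case. A minimal illustrative sketch of that sanity check, under the assumption that this is all the header does with these three arguments (names here are hypothetical):

    #include <cassert>
    #include <cstddef>

    using fptype = double; // assumption: stand-in for the plugin's fptype alias

    // Hypothetical stand-alone version of the nullptr sanity check that the
    // comments above attribute to diagram_boilerplate.h when multichannel
    // support is compiled out.
    inline void assertNoMultichannelInputs( const unsigned int* channelIds,
                                            const fptype* numerators,
                                            const fptype* denominators )
    {
      assert( channelIds == nullptr );
      assert( numerators == nullptr );
      assert( denominators == nullptr );
    }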
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1038( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1038 OF 1240 ***
+    // Wavefunction(s) for diagram number 1038
+    // (none)
+    // Amplitude(s) for diagram number 1038
+    VVV1_0( w_fp[0], w_fp[67], w_fp[13], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1039( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1039 OF 1240 ***
+    // Wavefunction(s) for diagram number 1039
+    // (none)
+    // Amplitude(s) for diagram number 1039
+    VVV1_0( w_fp[4], w_fp[29], w_fp[11], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+    VVV1_0( w_fp[4], w_fp[29], w_fp[42], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+    VVV1_0( w_fp[4], w_fp[29], w_fp[76], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1040( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1040 OF 1240 ***
+    // Wavefunction(s) for diagram number 1040
+    VVVV1P0_1( w_fp[0], w_fp[1], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[76] );
+    VVVV3P0_1( w_fp[0], w_fp[1], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[42] );
+    VVVV4P0_1( w_fp[0], w_fp[1], w_fp[29], COUPs[2], 1.0, 0., 0., w_fp[11] );
+    // Amplitude(s) for diagram number 1040
+    VVV1_0( w_fp[8], w_fp[4], w_fp[76], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[4], w_fp[42], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[4], w_fp[11], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1041( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1041 OF 1240 ***
+    // Wavefunction(s) for diagram number 1041
+    // (none)
+    // Amplitude(s) for diagram number 1041
+    VVV1_0( w_fp[1], w_fp[29], w_fp[107], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+    VVV1_0( w_fp[1], w_fp[29], w_fp[95], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    VVV1_0( w_fp[1], w_fp[29], w_fp[105], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1042( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1042 OF 1240 ***
+    // Wavefunction(s) for diagram number 1042
+    // (none)
+    // Amplitude(s) for diagram number 1042
+    VVV1_0( w_fp[1], w_fp[8], w_fp[87], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+    VVV1_0( w_fp[1], w_fp[8], w_fp[34], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+    VVV1_0( w_fp[1], w_fp[8], w_fp[86], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+  }
+
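Each amplitude call above computes one scalar (or SIMD vector) amplitude amp_sv[0], which is then added to or subtracted from a subset of the colour-flow sums; J_ACCESS::kernelAccessIcol( jamps, icol ) acts as an lvalue accessor to the icol-th jamp of the current event (GPU thread, or SIMD event page in C++). A scalar sketch of this accumulation pattern, with a hypothetical one-event jamps layout (the real J_ACCESS handles the interleaved [ncolor*2*nevtORneppV] buffer):

    #include <complex>
    #include <vector>

    using fptype = double;
    using cxtype = std::complex<fptype>; // assumption: cxtype is a complex pair of fptype

    // Hypothetical scalar analogue of J_ACCESS::kernelAccessIcol for one event:
    // return a mutable reference to the icol-th colour-flow amplitude.
    inline cxtype& kernelAccessIcol( std::vector<cxtype>& jamps, int icol )
    {
      return jamps[icol];
    }

    int main()
    {
      std::vector<cxtype> jamps( 120 ); // colour-flow indices up to 119 appear in this hunk
      const cxtype amp{ 0.1, 0.2 };     // stand-in for amp_sv[0]
      kernelAccessIcol( jamps, 29 ) += amp; // same pattern as the generated lines above
      kernelAccessIcol( jamps, 33 ) -= amp;
      return 0;
    }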
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1043( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1043 OF 1240 ***
+    // Wavefunction(s) for diagram number 1043
+    // (none)
+    // Amplitude(s) for diagram number 1043
+    VVVV1_0( w_fp[0], w_fp[1], w_fp[8], w_fp[30], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    VVVV3_0( w_fp[0], w_fp[1], w_fp[8], w_fp[30], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    VVVV4_0( w_fp[0], w_fp[1], w_fp[8], w_fp[30], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+    VVVV1_0( w_fp[0], w_fp[1], w_fp[8], w_fp[31], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    VVVV3_0( w_fp[0], w_fp[1], w_fp[8], w_fp[31], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    VVVV4_0( w_fp[0], w_fp[1], w_fp[8], w_fp[31], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    VVVV1_0( w_fp[0], w_fp[1], w_fp[8], w_fp[32], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+    VVVV3_0( w_fp[0], w_fp[1], w_fp[8], w_fp[32], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+    VVVV4_0( w_fp[0], w_fp[1], w_fp[8], w_fp[32], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+  }
+
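Every amplitude above is followed by the same placeholder: in a build with MGONGPU_SUPPORTS_MULTICHANNEL, the generated code additionally updates numerators_sv and denominators_sv at that point (#473). That code is elided from this hunk; the following is only a sketch of the usual single-diagram-enhancement bookkeeping, under the assumption that the numerator collects |amp|^2 for the event's selected channel only, while the denominator sums |amp|^2 over all diagrams:

    #include <complex>

    using fptype = double;

    // Hypothetical per-event update behind the "(#473)" comments: channelId is
    // 1-based (0 disables SDE) and thisDiagram is the current diagram number.
    inline void updateMultichannel( unsigned int channelId,
                                    unsigned int thisDiagram,
                                    const std::complex<fptype>& amp,
                                    fptype& numerator,
                                    fptype& denominator )
    {
      if( channelId == 0 ) return;          // SDE disabled for this event
      const fptype amp2 = std::norm( amp ); // |amp|^2
      if( channelId == thisDiagram ) numerator += amp2;
      denominator += amp2;
    }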
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1044( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1044 OF 1240 ***
+    // Wavefunction(s) for diagram number 1044
+    // (none)
+    // Amplitude(s) for diagram number 1044
+    VVV1_0( w_fp[1], w_fp[30], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    VVV1_0( w_fp[1], w_fp[31], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    VVV1_0( w_fp[1], w_fp[32], w_fp[56], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1045( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1045 OF 1240 ***
+    // Wavefunction(s) for diagram number 1045
+    // (none)
+    // Amplitude(s) for diagram number 1045
+    VVV1_0( w_fp[1], w_fp[8], w_fp[92], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+    VVV1_0( w_fp[1], w_fp[8], w_fp[88], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    VVV1_0( w_fp[1], w_fp[8], w_fp[106], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1046( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1046 OF 1240 ***
+    // Wavefunction(s) for diagram number 1046
+    // (none)
+    // Amplitude(s) for diagram number 1046
+    FFV1_0( w_fp[58], w_fp[114], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0];
+  }
+
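The point of the uniform signature (and of the nullptr sanity check) is that every diagramNNNN kernel can be driven through a single function type. The driver itself is not part of this hunk, so the following loop is only a sketch, written against the C++ (non-GPU) branch of the #ifdef and with hypothetical names throughout:

    using fptype = double; // assumption, as in the sketches above

    // Hypothetical function type matching the uniform kernel signature of the
    // C++ branch (COUPs rather than the GPU couplings array).
    typedef void ( *DiagramKernel )( fptype* wfs,
                                     fptype* jamps,
                                     const unsigned int* channelIds,
                                     const fptype** COUPs,
                                     fptype* numerators,
                                     fptype* denominators );

    // Sketch of a driver loop: run every diagram kernel on the same buffers,
    // so that jamps accumulates the contributions of all diagrams.
    inline void computeAllDiagrams( const DiagramKernel* kernels, int ndiag,
                                    fptype* wfs, fptype* jamps,
                                    const unsigned int* channelIds, const fptype** COUPs,
                                    fptype* numerators, fptype* denominators )
    {
      for( int i = 0; i < ndiag; ++i )
        kernels[i]( wfs, jamps, channelIds, COUPs, numerators, denominators );
    }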
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1047( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1047 OF 1240 ***
+    // Wavefunction(s) for diagram number 1047
+    // (none)
+    // Amplitude(s) for diagram number 1047
+    FFV1_0( w_fp[48], w_fp[114], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1048( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1048 OF 1240 ***
+    // Wavefunction(s) for diagram number 1048
+    // (none)
+    // Amplitude(s) for diagram number 1048
+    FFV1_0( w_fp[104], w_fp[100], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1049( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1049 OF 1240 ***
+    // Wavefunction(s) for diagram number 1049
+    // (none)
+    // Amplitude(s) for diagram number 1049
+    FFV1_0( w_fp[104], w_fp[36], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1050( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1050 OF 1240 ***
+    // Wavefunction(s) for diagram number 1050
+    // (none)
+    // Amplitude(s) for diagram number 1050
+    FFV1_0( w_fp[48], w_fp[100], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1051( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1051 OF 1240 ***
+    // Wavefunction(s) for diagram number 1051
+    // (none)
+    // Amplitude(s) for diagram number 1051
+    FFV1_0( w_fp[58], w_fp[36], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1052( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1052 OF 1240 ***
+    // Wavefunction(s) for diagram number 1052
+    // (none)
+    // Amplitude(s) for diagram number 1052
+    FFV1_0( w_fp[60], w_fp[114], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1053( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1053 OF 1240 ***
+    // Wavefunction(s) for diagram number 1053
+    // (none)
+    // Amplitude(s) for diagram number 1053
+    FFV1_0( w_fp[40], w_fp[114], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1054( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1054 OF 1240 ***
+    // Wavefunction(s) for diagram number 1054
+    // (none)
+    // Amplitude(s) for diagram number 1054
+    FFV1_0( w_fp[62], w_fp[100], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1055( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1055 OF 1240 ***
+    // Wavefunction(s) for diagram number 1055
+    // (none)
+    // Amplitude(s) for diagram number 1055
+    FFV1_0( w_fp[62], w_fp[35], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1056( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1056 OF 1240 ***
+    // Wavefunction(s) for diagram number 1056
+    // (none)
+    // Amplitude(s) for diagram number 1056
+    FFV1_0( w_fp[40], w_fp[100], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1057( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1057 OF 1240 ***
+    // Wavefunction(s) for diagram number 1057
+    // (none)
+    // Amplitude(s) for diagram number 1057
+    FFV1_0( w_fp[60], w_fp[35], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1058( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1058 OF 1240 ***
+    // Wavefunction(s) for diagram number 1058
+    // (none)
+    // Amplitude(s) for diagram number 1058
+    FFV1_0( w_fp[3], w_fp[114], w_fp[67], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1059( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1059 OF 1240 ***
+    // Wavefunction(s) for diagram number 1059
+    // (none)
+    // Amplitude(s) for diagram number 1059
+    FFV1_0( w_fp[12], w_fp[114], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1060( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1060 OF 1240 ***
+    // Wavefunction(s) for diagram number 1060
+    // (none)
+    // Amplitude(s) for diagram number 1060
+    FFV1_0( w_fp[3], w_fp[100], w_fp[96], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1061(
fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1061 OF 1240 *** + // Wavefunction(s) for diagram number 1061 + // (none) + // Amplitude(s) for diagram number 1061 + VVV1_0( w_fp[96], w_fp[1], w_fp[37], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1062( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1062 OF 1240 *** + // Wavefunction(s) for diagram number 1062 + // (none) + // Amplitude(s) for diagram number 1062 + FFV1_0( w_fp[12], w_fp[100], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + 
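Each of the per-diagram kernels above textually includes the same diagram_boilerplate.h fragment, whose contents are not part of this hunk. As a reading aid, here is a minimal sketch of what such a fragment could contain, assuming hypothetical constants and accessors (nwf, nw6, neppV, nxcoup, ndcoup, CD_ACCESS) beyond the argument names visible in the kernels; only the nullptr sanity check is directly attested by the generated comments.

  // diagram_boilerplate.h (sketch only, not the actual file in this diff)
#ifndef MGONGPU_SUPPORTS_MULTICHANNEL
  // The uniform kernel interface is kept even without multichannel support,
  // but the three SDE-related pointers must then be null: check this explicitly
  assert( channelIds == nullptr );
  assert( numerators == nullptr );
  assert( denominators == nullptr );
#endif
  // Local amplitude buffer: amp_sv is the complex (vector) view, amp_fp the flat fptype view
  cxtype_sv amp_sv[1];
  fptype* amp_fp = reinterpret_cast<fptype*>( amp_sv );
  // Wavefunction slots: w_fp[iwf] points into the wfs argument (hypothetical layout)
  fptype* w_fp[nwf];
  for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = wfs + iwf * 2 * nw6 * neppV;
#ifdef MGONGPUCPP_GPUIMPL
  // On GPU, rebuild the per-event COUPs array from the global couplings buffer
  // (the accessor name is an assumption for this sketch)
  const fptype* COUPs[nxcoup];
  for( int idcoup = 0; idcoup < ndcoup; idcoup++ )
    COUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup );
#endif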
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1063( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1063 OF 1240 ***
+    // Wavefunction(s) for diagram number 1063
+    // (none)
+    // Amplitude(s) for diagram number 1063
+    VVV1_0( w_fp[0], w_fp[67], w_fp[37], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1064( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1064 OF 1240 ***
+    // Wavefunction(s) for diagram number 1064
+    // (none)
+    // Amplitude(s) for diagram number 1064
+    FFV1_0( w_fp[3], w_fp[33], w_fp[76], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[33], w_fp[42], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[33], w_fp[11], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1065( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1065 OF 1240 ***
+    // Wavefunction(s) for diagram number 1065
+    // (none)
+    // Amplitude(s) for diagram number 1065
+    FFV1_0( w_fp[78], w_fp[102], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1066( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1066 OF 1240 ***
+    // Wavefunction(s) for diagram number 1066
+    // (none)
+    // Amplitude(s) for diagram number 1066
+    FFV1_0( w_fp[53], w_fp[102], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1067( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1067 OF 1240 ***
+    // Wavefunction(s) for diagram number 1067
+    // (none)
+    // Amplitude(s) for diagram number 1067
+    FFV1_0( w_fp[99], w_fp[89], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1068( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1068 OF 1240 ***
+    // Wavefunction(s) for diagram number 1068
+    // (none)
+    // Amplitude(s) for diagram number 1068
+    FFV1_0( w_fp[99], w_fp[44], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1069( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1069 OF 1240 ***
+    // Wavefunction(s) for diagram number 1069
+    // (none)
+    // Amplitude(s) for diagram number 1069
+    FFV1_0( w_fp[53], w_fp[89], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1070( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1070 OF 1240 ***
+    // Wavefunction(s) for diagram number 1070
+    // (none)
+    // Amplitude(s) for diagram number 1070
+    FFV1_0( w_fp[78], w_fp[44], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1071( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1071 OF 1240 ***
+    // Wavefunction(s) for diagram number 1071
+    // (none)
+    // Amplitude(s) for diagram number 1071
+    FFV1_0( w_fp[60], w_fp[102], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1072( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1072 OF 1240 ***
+    // Wavefunction(s) for diagram number 1072
+    // (none)
+    // Amplitude(s) for diagram number 1072
+    FFV1_0( w_fp[28], w_fp[102], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1073( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1073 OF 1240 ***
+    // Wavefunction(s) for diagram number 1073
+    // (none)
+    // Amplitude(s) for diagram number 1073
+    FFV1_0( w_fp[62], w_fp[89], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1074( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1074 OF 1240 ***
+    // Wavefunction(s) for diagram number 1074
+    // (none)
+    // Amplitude(s) for diagram number 1074
+    FFV1_0( w_fp[62], w_fp[43], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1075( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1075 OF 1240 ***
+    // Wavefunction(s) for diagram number 1075
+    // (none)
+    // Amplitude(s) for diagram number 1075
+    FFV1_0( w_fp[28], w_fp[89], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1076( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1076 OF 1240 ***
+    // Wavefunction(s) for diagram number 1076
+    // (none)
+    // Amplitude(s) for diagram number 1076
+    FFV1_0( w_fp[60], w_fp[43], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1077( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1077 OF 1240 ***
+    // Wavefunction(s) for diagram number 1077
+    // (none)
+    // Amplitude(s) for diagram number 1077
+    FFV1_0( w_fp[3], w_fp[102], w_fp[68], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1078( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1078 OF 1240 ***
+    // Wavefunction(s) for diagram number 1078
+    // (none)
+    // Amplitude(s) for diagram number 1078
+    FFV1_0( w_fp[14], w_fp[102], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1079( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1079 OF 1240 ***
+    // Wavefunction(s) for diagram number 1079
+    // (none)
+    // Amplitude(s) for diagram number 1079
+    FFV1_0( w_fp[3], w_fp[89], w_fp[101], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0];
+  }
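The "(#473)" comment repeated inside every MGONGPU_SUPPORTS_MULTICHANNEL block stands in for the numerator/denominator update of single-diagram-enhancement (SDE) multichannel sampling. A sketch of the elided update is shown below, modelled on the pattern used elsewhere in the cudacpp code base; NUM_ACCESS, DEN_ACCESS, the scalar channelId read and the channel number (1079 is only a placeholder for the channel this diagram contributes to) are assumptions of this sketch.

#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
  // Sketch only: accumulate |amp|^2 into the SDE numerator when this diagram's
  // channel is the selected one, and into the denominator whenever SDE is enabled
  const unsigned int channelId = channelIds[0]; // SCALAR channelId for C++; per-event on GPU
  if( channelId == 1079 ) NUM_ACCESS::kernelAccess( numerators ) += cxabs2( amp_sv[0] );
  if( channelId != 0 ) DEN_ACCESS::kernelAccess( denominators ) += cxabs2( amp_sv[0] );
#endif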
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1080( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1080 OF 1240 ***
+    // Wavefunction(s) for diagram number 1080
+    // (none)
+    // Amplitude(s) for diagram number 1080
+    VVV1_0( w_fp[101], w_fp[1], w_fp[45], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1081( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1081 OF 1240 ***
+    // Wavefunction(s) for diagram number 1081
+    // (none)
+    // Amplitude(s) for diagram number 1081
+    FFV1_0( w_fp[14], w_fp[89], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1082( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1082 OF 1240 ***
+    // Wavefunction(s) for diagram number 1082
+    // (none)
+    // Amplitude(s) for diagram number 1082
+    VVV1_0( w_fp[0], w_fp[68], w_fp[45], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1083( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1083 OF 1240 ***
+    // Wavefunction(s) for diagram number 1083
+    // (none)
+    // Amplitude(s) for diagram number 1083
+    FFV1_0( w_fp[3], w_fp[39], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[39], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[39], w_fp[111], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1084( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1084 OF 1240 ***
+    // Wavefunction(s) for diagram number 1084
+    // (none)
+    // Amplitude(s) for diagram number 1084
+    FFV1_0( w_fp[78], w_fp[113], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1085( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1085 OF 1240 ***
+    // Wavefunction(s) for diagram number 1085
+    // (none)
+    // Amplitude(s) for diagram number 1085
+    FFV1_0( w_fp[7], w_fp[113], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1086( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1086 OF 1240 ***
+    // Wavefunction(s) for diagram number 1086
+    // (none)
+    // Amplitude(s) for diagram number 1086
+    FFV1_0( w_fp[99], w_fp[91], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1087( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1087 OF 1240 ***
+    // Wavefunction(s) for diagram number 1087
+    // (none)
+    // Amplitude(s) for diagram number 1087
+    FFV1_0( w_fp[99], w_fp[50], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1088( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1088 OF 1240 ***
+    // Wavefunction(s) for diagram number 1088
+    // (none)
+    // Amplitude(s) for diagram number 1088
+    FFV1_0( w_fp[7], w_fp[91], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1089( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1089 OF 1240 ***
+    // Wavefunction(s) for diagram number 1089
+    // (none)
+    // Amplitude(s) for diagram number 1089
+    FFV1_0( w_fp[78], w_fp[50], w_fp[0], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1090( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1090 OF 1240 ***
+    // Wavefunction(s) for diagram number 1090
+    // (none)
+    // Amplitude(s) for diagram number 1090
+    FFV1_0( w_fp[58], w_fp[113], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1091( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1091 OF 1240 ***
+    // Wavefunction(s) for diagram number 1091
+    // (none)
+    // Amplitude(s) for diagram number 1091
+    FFV1_0( w_fp[25], w_fp[113], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+  }
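With one __global__ kernel per diagram, a caller has to chain all 1240 kernels over the same wfs/jamps buffers. The sketch below shows one possible way two consecutive kernels from this hunk might be invoked; the launch configuration names (gpublocks, gputhreads) are assumptions, and on C++ builds __global__ is taken to expand to nothing so that the same functions become plain calls per event page.

#ifdef MGONGPUCPP_GPUIMPL
  // GPU build (sketch): each diagram is a separate kernel launch over all events
  diagram1090<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
  diagram1091<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
#else
  // C++ build (sketch): plain function calls for one event page
  diagram1090( wfs, jamps, channelIds, COUPs, numerators, denominators );
  diagram1091( wfs, jamps, channelIds, COUPs, numerators, denominators );
#endif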
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1092( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1092 OF 1240 ***
+    // Wavefunction(s) for diagram number 1092
+    // (none)
+    // Amplitude(s) for diagram number 1092
+    FFV1_0( w_fp[104], w_fp[91], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1093( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1093 OF 1240 ***
+    // Wavefunction(s) for diagram number 1093
+    // (none)
+    // Amplitude(s) for diagram number 1093
+    FFV1_0( w_fp[104], w_fp[49], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1094( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and
independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1094 OF 1240 *** + // Wavefunction(s) for diagram number 1094 + // (none) + // Amplitude(s) for diagram number 1094 + FFV1_0( w_fp[25], w_fp[91], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1095( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1095 OF 1240 *** + // Wavefunction(s) for diagram number 1095 + // (none) + // Amplitude(s) for diagram number 1095 + FFV1_0( w_fp[58], w_fp[49], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1096( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1096 OF 
1240 *** + // Wavefunction(s) for diagram number 1096 + // (none) + // Amplitude(s) for diagram number 1096 + FFV1_0( w_fp[3], w_fp[113], w_fp[59], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1097( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1097 OF 1240 *** + // Wavefunction(s) for diagram number 1097 + // (none) + // Amplitude(s) for diagram number 1097 + FFV1_0( w_fp[26], w_fp[113], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1098( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1098 OF 1240 *** + // Wavefunction(s) for diagram number 1098 + // (none) + // Amplitude(s) for diagram number 1098 + FFV1_0( w_fp[3], w_fp[91], w_fp[98], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and 
denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1099( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1099 OF 1240 *** + // Wavefunction(s) for diagram number 1099 + // (none) + // Amplitude(s) for diagram number 1099 + VVV1_0( w_fp[98], w_fp[1], w_fp[51], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1100( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1100 OF 1240 *** + // Wavefunction(s) for diagram number 1100 + // (none) + // Amplitude(s) for diagram number 1100 + FFV1_0( w_fp[26], w_fp[91], w_fp[0], COUPs[1], 
1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1101( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1101 OF 1240 *** + // Wavefunction(s) for diagram number 1101 + // (none) + // Amplitude(s) for diagram number 1101 + VVV1_0( w_fp[0], w_fp[59], w_fp[51], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1102( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1102 OF 1240 *** + // Wavefunction(s) for diagram number 1102 + // (none) + // Amplitude(s) for diagram 
number 1102 + FFV1_0( w_fp[3], w_fp[47], w_fp[97], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[47], w_fp[71], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[47], w_fp[21], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1103( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1103 OF 
1240 *** + // Wavefunction(s) for diagram number 1103 + // (none) + // Amplitude(s) for diagram number 1103 + FFV1_0( w_fp[99], w_fp[2], w_fp[67], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1104( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1104 OF 1240 *** + // Wavefunction(s) for diagram number 1104 + // (none) + // Amplitude(s) for diagram number 1104 + FFV1_0( w_fp[99], w_fp[18], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1105( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1105 OF 1240 *** + // Wavefunction(s) for diagram number 1105 + // (none) + // Amplitude(s) for diagram number 1105 + FFV1_0( w_fp[78], w_fp[2], w_fp[96], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and 
denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1106( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1106 OF 1240 *** + // Wavefunction(s) for diagram number 1106 + // (none) + // Amplitude(s) for diagram number 1106 + VVV1_0( w_fp[96], w_fp[1], w_fp[54], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1107( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1107 OF 1240 *** + // Wavefunction(s) for diagram number 1107 + // (none) + // Amplitude(s) for diagram number 1107 + FFV1_0( w_fp[78], w_fp[18], w_fp[0], COUPs[1], 1.0, 
&_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1108( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1108 OF 1240 *** + // Wavefunction(s) for diagram number 1108 + // (none) + // Amplitude(s) for diagram number 1108 + VVV1_0( w_fp[0], w_fp[67], w_fp[54], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1109( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1109 OF 1240 *** + // Wavefunction(s) for diagram number 1109 + // (none) + // Amplitude(s) for diagram number 1109 + 
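The three FFV1_0 calls below are the colour-flow amplitudes of diagram 1109; after each one, the "#473" placeholder comment stands for the numerators_sv/denominators_sv update that a build with MGONGPU_SUPPORTS_MULTICHANNEL generates. A self-contained sketch of that single-diagram-enhancement update follows (the helper name multichannelUpdate and the scalar signature are illustrative; the generated code is not shown in this patch and operates on vector types in place):

```cpp
#include <complex>
typedef double fptype;                // assumption: double-precision build
typedef std::complex<fptype> cxtype;

inline fptype cxabs2( const cxtype& c ) { return c.real() * c.real() + c.imag() * c.imag(); }

// Sketch of the per-amplitude update hidden behind the "#473" placeholder comments,
// following the usual MG5aMC single-diagram-enhancement pattern (an assumption)
void multichannelUpdate( unsigned int channelId,   // 1 to #diagrams, 0 disables SDE
                         unsigned int thisDiagram, // e.g. 1109 for the kernel below
                         const cxtype& amp,
                         fptype& numerator,
                         fptype& denominator )
{
  if( channelId == thisDiagram ) numerator += cxabs2( amp ); // only the selected channel
  if( channelId != 0 ) denominator += cxabs2( amp );         // every diagram contributes
}
```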
+    FFV1_0( w_fp[46], w_fp[2], w_fp[76], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[46], w_fp[2], w_fp[42], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[46], w_fp[2], w_fp[11], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1110( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1110 OF 1240 ***
+    // Wavefunction(s) for
diagram number 1110 + // (none) + // Amplitude(s) for diagram number 1110 + FFV1_0( w_fp[104], w_fp[2], w_fp[68], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1111( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1111 OF 1240 *** + // Wavefunction(s) for diagram number 1111 + // (none) + // Amplitude(s) for diagram number 1111 + FFV1_0( w_fp[104], w_fp[15], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1112( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1112 OF 1240 *** + // Wavefunction(s) for diagram number 1112 + // (none) + // Amplitude(s) for diagram number 1112 + FFV1_0( w_fp[58], w_fp[2], w_fp[101], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + 
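The J_ACCESS::kernelAccessIcol calls below (and throughout this file) accumulate each amplitude into the colour-ordered jamps buffer, with cxtype( 0, 1 ) factors supplying the ±i colour-flow phases. A minimal sketch of what such an accessor could look like, assuming a [ncolor][2][nevt] SoA layout and one GPU thread per event (layout, types and signature are all assumptions here, not the plugin's actual J_ACCESS implementation):

```cpp
#include <thrust/complex.h>
typedef double fptype;                  // assumption: double-precision build
typedef thrust::complex<fptype> cxtype; // stand-in for the plugin's cxtype

// Stand-in for a reference to a complex number whose real and imaginary parts
// live in two separate SoA planes of the jamps buffer
struct cxtype_ref
{
  fptype& r;
  fptype& i;
  __device__ cxtype_ref& operator+=( const cxtype& a ) { r += a.real(); i += a.imag(); return *this; }
  __device__ cxtype_ref& operator-=( const cxtype& a ) { r -= a.real(); i -= a.imag(); return *this; }
};

// Possible shape of an SoA accessor behind J_ACCESS::kernelAccessIcol
__device__ inline cxtype_ref kernelAccessIcol( fptype* jamps, const int icol )
{
  const int nevt = blockDim.x * gridDim.x;                // total events in this grid
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // this thread's event
  return cxtype_ref{ jamps[( 2 * icol ) * nevt + ievt],       // real plane for colour icol
                     jamps[( 2 * icol + 1 ) * nevt + ievt] }; // imaginary plane
}
```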
J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1113( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1113 OF 1240 *** + // Wavefunction(s) for diagram number 1113 + // (none) + // Amplitude(s) for diagram number 1113 + VVV1_0( w_fp[101], w_fp[1], w_fp[23], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1114( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1114 OF 1240 *** + // Wavefunction(s) for diagram number 1114 + // (none) + // Amplitude(s) for diagram number 1114 + FFV1_0( w_fp[58], w_fp[15], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1115( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1115 OF 1240 *** + // Wavefunction(s) for diagram number 1115 + // (none) + // Amplitude(s) for diagram number 1115 + VVV1_0( w_fp[0], w_fp[68], w_fp[23], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1116( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1116 OF 1240 *** + // Wavefunction(s) for diagram number 1116 + // (none) + // Amplitude(s) for diagram number 1116 + FFV1_0( w_fp[38], 
w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[38], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[38], w_fp[2], w_fp[111], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1117( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1117 OF 1240 *** + // Wavefunction(s) for diagram number 
1117 + // (none) + // Amplitude(s) for diagram number 1117 + FFV1_0( w_fp[62], w_fp[2], w_fp[59], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1118( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1118 OF 1240 *** + // Wavefunction(s) for diagram number 1118 + // (none) + // Amplitude(s) for diagram number 1118 + FFV1_0( w_fp[62], w_fp[17], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1119( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1119 OF 1240 *** + // Wavefunction(s) for diagram number 1119 + // (none) + // Amplitude(s) for diagram number 1119 + FFV1_0( w_fp[60], w_fp[2], w_fp[98], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( 
jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1120( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1120 OF 1240 *** + // Wavefunction(s) for diagram number 1120 + // (none) + // Amplitude(s) for diagram number 1120 + VVV1_0( w_fp[98], w_fp[1], w_fp[20], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1121( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1121 OF 1240 *** + // Wavefunction(s) for diagram number 1121 + // (none) + // Amplitude(s) for diagram number 1121 + FFV1_0( w_fp[60], w_fp[17], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code 
base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1122( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1122 OF 1240 *** + // Wavefunction(s) for diagram number 1122 + // (none) + // Amplitude(s) for diagram number 1122 + VVV1_0( w_fp[0], w_fp[59], w_fp[20], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1123( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1123 OF 1240 *** + // Wavefunction(s) for diagram number 1123 + // (none) + // Amplitude(s) for diagram number 1123 + FFV1_0( w_fp[41], w_fp[2], w_fp[97], COUPs[1], 1.0, &_fp[0] ); +#ifdef 
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1123( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1123 OF 1240 ***
+    // Wavefunction(s) for diagram number 1123
+    // (none)
+    // Amplitude(s) for diagram number 1123
+    FFV1_0( w_fp[41], w_fp[2], w_fp[97], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[2], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1124( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1124 OF 1240 ***
+    // Wavefunction(s) for diagram number 1124
+    VVVV1P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[21] );
+    VVVV3P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[71] );
+    VVVV4P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[97] );
+    // Amplitude(s) for diagram number 1124
+    VVVV1_0( w_fp[21], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    VVVV3_0( w_fp[21], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    VVVV4_0( w_fp[21], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+    VVVV1_0( w_fp[71], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+    VVVV3_0( w_fp[71], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+    VVVV4_0( w_fp[71], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    VVVV1_0( w_fp[97], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+    VVVV3_0( w_fp[97], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+    VVVV4_0( w_fp[97], w_fp[8], w_fp[5], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+  }
+
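Each amplitude call above is followed by an #ifdef MGONGPU_SUPPORTS_MULTICHANNEL placeholder comment pointing at issue #473; the actual numerator/denominator update is not spelled out in this diff. In the single-diagram-enhancement (SDE) scheme such an update conventionally adds |amp|^2 to the denominator for every contributing diagram and to the numerator only when the event's channelId selects the current diagram. A minimal sketch under those assumptions, with SIMD vector types elided (channelIdC, idiagram and the |z|^2 helper cxabs2 are illustrative names, not taken from this diff):

#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// Sketch only, not the generated update itself.
if( channelIds != nullptr ) // SDE multichannel enabled for this run?
{
  const fptype amp2 = cxabs2( amp_sv[0] ); // |amp|^2 for this diagram (assumed helper)
  denominators_sv += amp2;                 // every diagram feeds the denominator
  if( channelIdC == idiagram )             // does this event's channel pick this diagram?
    numerators_sv += amp2;                 // only the selected diagram feeds the numerator
}
#endif
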
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1125( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1125 OF 1240 ***
+    // Wavefunction(s) for diagram number 1125
+    VVV1P0_1( w_fp[21], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[59] );
+    VVV1P0_1( w_fp[71], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[20] );
+    VVV1P0_1( w_fp[97], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[60] );
+    // Amplitude(s) for diagram number 1125
+    VVV1_0( w_fp[8], w_fp[6], w_fp[59], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[6], w_fp[20], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[6], w_fp[60], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1126( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1126 OF 1240 ***
+    // Wavefunction(s) for diagram number 1126
+    VVV1P0_1( w_fp[21], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[17] );
+    VVV1P0_1( w_fp[71], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[98] );
+    VVV1P0_1( w_fp[97], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[111] );
+    // Amplitude(s) for diagram number 1126
+    VVV1_0( w_fp[8], w_fp[5], w_fp[17], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[5], w_fp[98], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[5], w_fp[111], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1127( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1127 OF 1240 ***
+    // Wavefunction(s) for diagram number 1127
+    // (none)
+    // Amplitude(s) for diagram number 1127
+    VVV1_0( w_fp[21], w_fp[8], w_fp[29], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    VVV1_0( w_fp[71], w_fp[8], w_fp[29], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+    VVV1_0( w_fp[97], w_fp[8], w_fp[29], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+  }
+
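All color-flow updates above go through J_ACCESS::kernelAccessIcol( jamps, icol ), with jamps documented as jamps[ncolor*2*nevtORneppV]. Below is a standalone sketch of such an accessor, under the assumption of an SoA layout in which each color index owns a page of neppV real parts followed by neppV imaginary parts; the plugin's real J_ACCESS class is not part of this diff and may differ.

#include <complex>
// Standalone sketch (NOT the plugin's J_ACCESS): assumed layout is
// jamps[icol][ri][ievt], with ievt the event within a SIMD page of neppV.
template<typename FP, int neppV>
struct JampAccessSketch
{
  // Accumulate a complex amplitude into the jamp of color icol for event ievt.
  static void addIcol( FP* jamps, int icol, int ievt, std::complex<FP> amp )
  {
    jamps[( icol * 2 + 0 ) * neppV + ievt] += amp.real(); // real part page
    jamps[( icol * 2 + 1 ) * neppV + ievt] += amp.imag(); // imaginary part page
  }
};
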
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1128( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1128 OF 1240 ***
+    // Wavefunction(s) for diagram number 1128
+    FFV1_2( w_fp[3], w_fp[21], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] );
+    FFV1_2( w_fp[3], w_fp[71], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[10] );
+    FFV1_2( w_fp[3], w_fp[97], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[68] );
+    // Amplitude(s) for diagram number 1128
+    FFV1_0( w_fp[16], w_fp[39], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0];
+    FFV1_0( w_fp[10], w_fp[39], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    FFV1_0( w_fp[68], w_fp[39], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1129( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1129 OF 1240 ***
+    // Wavefunction(s) for diagram number 1129
+    // (none)
+    // Amplitude(s) for diagram number 1129
+    FFV1_0( w_fp[3], w_fp[39], w_fp[17], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[39], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[39], w_fp[111], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1130( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1130 OF 1240 ***
+    // Wavefunction(s) for diagram number 1130
+    // (none)
+    // Amplitude(s) for diagram number 1130
+    FFV1_0( w_fp[41], w_fp[39], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[39], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[39], w_fp[97], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1131( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1131 OF 1240 ***
+    // Wavefunction(s) for diagram number 1131
+    // (none)
+    // Amplitude(s) for diagram number 1131
+    FFV1_0( w_fp[16], w_fp[47], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0];
+    FFV1_0( w_fp[10], w_fp[47], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+    FFV1_0( w_fp[68], w_fp[47], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0];
+  }
+
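With one kernel per diagram and a uniform signature, the 1240 helpers can be driven mechanically; how the plugin actually sequences them is not shown in this diff. A host-side sketch for the GPU build follows (gpublocks and gputhreads are illustrative launch parameters, and the back-to-back launch order is an assumption):

// Sketch only: launch the per-diagram kernels in sequence; all of them
// accumulate into the same jamps buffer via the shared wfs workspace.
diagram1131<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
diagram1132<<<gpublocks, gputhreads>>>( wfs, jamps, channelIds, couplings, numerators, denominators );
// ... one launch per diagram, up to diagram1240 ...
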
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1132( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1132 OF 1240 ***
+    // Wavefunction(s) for diagram number 1132
+    // (none)
+    // Amplitude(s) for diagram number 1132
+    FFV1_0( w_fp[3], w_fp[47], w_fp[59], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[47], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[47], w_fp[60], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1133( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1133 OF 1240 ***
+    // Wavefunction(s) for diagram number 1133
+    // (none)
+    // Amplitude(s) for diagram number 1133
+    FFV1_0( w_fp[38], w_fp[47], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+    FFV1_0( w_fp[38], w_fp[47], w_fp[71], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+    FFV1_0( w_fp[38], w_fp[47], w_fp[97], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1134( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1134 OF 1240 ***
+    // Wavefunction(s) for diagram number 1134
+    FFV1_1( w_fp[2], w_fp[21], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] );
+    FFV1_1( w_fp[2], w_fp[71], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] );
+    FFV1_1( w_fp[2], w_fp[97], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[71] );
+    // Amplitude(s) for diagram number 1134
+    FFV1_0( w_fp[38], w_fp[23], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0];
+    FFV1_0( w_fp[38], w_fp[21], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    FFV1_0( w_fp[38], w_fp[71], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1135( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1135 OF 1240 ***
+    // Wavefunction(s) for diagram number 1135
+    // (none)
+    // Amplitude(s) for diagram number 1135
+    FFV1_0( w_fp[38], w_fp[2], w_fp[17], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[38], w_fp[2], w_fp[98], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[38], w_fp[2], w_fp[111], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1136( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1136 OF 1240 ***
+    // Wavefunction(s) for diagram number 1136
+    // (none)
+    // Amplitude(s) for diagram number 1136
+    FFV1_0( w_fp[41], w_fp[23], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[21], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[71], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1137( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1137 OF 1240 ***
+    // Wavefunction(s) for diagram number 1137
+    // (none)
+    // Amplitude(s) for diagram number 1137
+    FFV1_0( w_fp[41], w_fp[2], w_fp[59], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[2], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 80 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[2], w_fp[60], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1138( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1138 OF 1240 ***
+    // Wavefunction(s) for diagram number 1138
+    // (none)
+    // Amplitude(s) for diagram number 1138
+    FFV1_0( w_fp[3], w_fp[23], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[21], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[71], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1139( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1139 OF 1240 ***
+    // Wavefunction(s) for diagram number 1139
+    // (none)
+    // Amplitude(s) for diagram number 1139
+    FFV1_0( w_fp[16], w_fp[2], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[10], w_fp[2], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 117 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[68], w_fp[2], w_fp[29], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1140( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1140 OF 1240 ***
+    // Wavefunction(s) for diagram number 1140
+    VVVV1P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[68] );
+    VVVV3P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[29] );
+    VVVV4P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[10] );
+    // Amplitude(s) for diagram number 1140
+    VVVV1_0( w_fp[68], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -=
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0]; + VVVV3_0( w_fp[68], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0]; + VVVV4_0( w_fp[68], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0]; + VVVV1_0( w_fp[29], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + VVVV3_0( w_fp[29], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + VVVV4_0( w_fp[29], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + VVVV1_0( w_fp[10], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) 
-= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + VVVV3_0( w_fp[10], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + VVVV4_0( w_fp[10], w_fp[8], w_fp[4], w_fp[6], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1141( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts 
that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1141 OF 1240 *** + // Wavefunction(s) for diagram number 1141 + VVV1P0_1( w_fp[68], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[16] ); + VVV1P0_1( w_fp[29], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[71] ); + VVV1P0_1( w_fp[10], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[21] ); + // Amplitude(s) for diagram number 1141 + VVV1_0( w_fp[8], w_fp[6], w_fp[16], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[6], w_fp[71], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[6], w_fp[21], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + 
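Throughout these kernels the colour algebra follows one fixed pattern: each diagram produces a single complex amplitude amp_sv[0], which is then added with a weight of +1, -1, +i or -i into the subset of colour-flow amplitudes ("jamps") that the diagram feeds; the indices used in this process run up to 119, i.e. 120 colour flows. A minimal scalar sketch of that accumulation, with std::complex<double> standing in for cxtype and a plain array standing in for the J_ACCESS kernel accessor (illustrative names only, not the actual cudacpp API):

#include <complex>
using cx = std::complex<double>; // scalar stand-in for cxtype
constexpr int ncolor = 120;      // jamp indices 0..119 appear in this process

// Hypothetical helper: fold one diagram amplitude into the four colour flows
// it feeds, mirroring e.g. the first FFV1_0 term of diagram 1152 further below.
void accumulateFourFlowTerm( cx jamp[ncolor], const cx& amp )
{
  jamp[2] += amp;
  jamp[12] -= amp;
  jamp[36] -= amp;
  jamp[78] += amp;
}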
J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram1142( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 1142 OF 1240 ***
+ // Wavefunction(s) for diagram number 1142
+ VVV1P0_1( w_fp[68], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[23] );
+ VVV1P0_1( w_fp[29], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[60] );
+ VVV1P0_1( w_fp[10], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[20] );
+ // Amplitude(s) for diagram number 1142
+ VVV1_0( w_fp[8], w_fp[4], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0];
+ VVV1_0( w_fp[8], w_fp[4], w_fp[60], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+ VVV1_0( w_fp[8], w_fp[4], w_fp[20], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram1143( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 1143 OF 1240 ***
+ // Wavefunction(s) for diagram number 1143
+ // (none)
+ // Amplitude(s) for diagram number 1143
+ VVV1_0( w_fp[68], w_fp[8], w_fp[27], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0];
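Every diagramXXXX kernel in this hunk repeats the same signature and the same two comment lines: the interface stays uniform whether or not MGONGPU_SUPPORTS_MULTICHANNEL is defined, and when it is not defined the included boilerplate is stated to assert that the channelIds, numerators and denominators pointers are all nullptr. The contents of diagram_boilerplate.h are not shown in this diff; under that stated behaviour, a plausible minimal sketch of the guard (not the actual header, which presumably also sets up amp_sv, w_fp and the COUPs/couplings view) would be:

#include <cassert>
// Sketch only: sanity check when multichannel (SDE) support is compiled out.
#ifndef MGONGPU_SUPPORTS_MULTICHANNEL
assert( channelIds == nullptr );   // SDE channel choice is compiled out
assert( numerators == nullptr );   // no multichannel numerators to update
assert( denominators == nullptr ); // no multichannel denominators to update
#endif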
+ J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0];
+ VVV1_0( w_fp[29], w_fp[8], w_fp[27], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+ VVV1_0( w_fp[10], w_fp[8], w_fp[27], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram1144( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 1144 OF 1240 ***
+ // Wavefunction(s) for diagram number 1144
+ FFV1_2( w_fp[3], w_fp[68], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[59] );
+ FFV1_2( w_fp[3], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[111] );
+ FFV1_2( w_fp[3], w_fp[10], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[98] );
+ // Amplitude(s) for diagram number 1144
+ FFV1_0( w_fp[59], w_fp[33], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0];
+ FFV1_0( w_fp[111], w_fp[33], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 69 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+ FFV1_0( w_fp[98], w_fp[33], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram1145( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 1145 OF 1240 ***
+ // Wavefunction(s) for diagram number 1145
+ // (none)
+ // Amplitude(s) for diagram number 1145
+ FFV1_0( w_fp[3], w_fp[33], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0];
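In the jamp updates just above and below, the weight cxtype( 0, 1 ) is the imaginary unit i, so these colour flows receive plus or minus i times the diagram amplitude instead of plus or minus the amplitude itself. The same arithmetic in plain std::complex terms (illustrative stand-ins only, not the cudacpp types):

#include <complex>
using cx = std::complex<double>; // scalar stand-in for cxtype

void addTimesMinusI( cx& jamp48, const cx& amp )
{
  const cx I( 0., 1. ); // cxtype( 0, 1 )
  jamp48 -= I * amp;    // as in "kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0]"
}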
J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[33], w_fp[60], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 50 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[33], w_fp[20], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1146( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1146 OF 1240 *** + // Wavefunction(s) for diagram number 1146 + // (none) + // Amplitude(s) for diagram number 1146 + FFV1_0( w_fp[41], w_fp[33], w_fp[68], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + FFV1_0( w_fp[41], w_fp[33], w_fp[29], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated 
with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + FFV1_0( w_fp[41], w_fp[33], w_fp[10], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1147( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1147 OF 1240 *** + // Wavefunction(s) for diagram number 1147 + // (none) + // Amplitude(s) for diagram number 1147 + FFV1_0( w_fp[59], w_fp[47], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += amp_sv[0]; + FFV1_0( w_fp[111], w_fp[47], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + FFV1_0( w_fp[98], w_fp[47], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1148( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] 
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1148 OF 1240 *** + // Wavefunction(s) for diagram number 1148 + // (none) + // Amplitude(s) for diagram number 1148 + FFV1_0( w_fp[3], w_fp[47], w_fp[16], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[47], w_fp[71], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 100 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[47], w_fp[21], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1149( fptype* wfs, // 
input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1149 OF 1240 *** + // Wavefunction(s) for diagram number 1149 + // (none) + // Amplitude(s) for diagram number 1149 + FFV1_0( w_fp[46], w_fp[47], w_fp[68], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + FFV1_0( w_fp[46], w_fp[47], w_fp[29], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + FFV1_0( w_fp[46], w_fp[47], w_fp[10], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1150( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1150 OF 1240 *** + // 
Wavefunction(s) for diagram number 1150 + FFV1_1( w_fp[2], w_fp[68], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[17] ); + FFV1_1( w_fp[2], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[68] ); + FFV1_1( w_fp[2], w_fp[10], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[29] ); + // Amplitude(s) for diagram number 1150 + FFV1_0( w_fp[46], w_fp[17], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0]; + FFV1_0( w_fp[46], w_fp[68], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + FFV1_0( w_fp[46], w_fp[29], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1151( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1151 OF 1240 *** + // Wavefunction(s) for diagram number 1151 + // (none) + // Amplitude(s) for diagram number 1151 + FFV1_0( w_fp[46], w_fp[2], w_fp[23], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= 
cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[46], w_fp[2], w_fp[60], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[46], w_fp[2], w_fp[20], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1152( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1152 OF 1240 *** + // Wavefunction(s) for diagram number 1152 + // (none) + // Amplitude(s) for diagram number 1152 + FFV1_0( w_fp[41], w_fp[17], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0]; + FFV1_0( w_fp[41], w_fp[68], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates 
numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + FFV1_0( w_fp[41], w_fp[29], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1153( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1153 OF 1240 *** + // Wavefunction(s) for diagram number 1153 + // (none) + // Amplitude(s) for diagram number 1153 + FFV1_0( w_fp[41], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[41], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( 
w_fp[41], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1154( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1154 OF 1240 *** + // Wavefunction(s) for diagram number 1154 + // (none) + // Amplitude(s) for diagram number 1154 + FFV1_0( w_fp[3], w_fp[17], w_fp[27], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[68], w_fp[27], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[29], w_fp[27], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1155( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1155 OF 1240 *** + // Wavefunction(s) for diagram number 1155 + // (none) + // Amplitude(s) for diagram number 1155 + FFV1_0( w_fp[59], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[111], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[98], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1156( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1156 OF 1240 *** + // Wavefunction(s) for diagram number 1156 + VVVV1P0_1( w_fp[0], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[98] ); + VVVV3P0_1( w_fp[0], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[27] ); + VVVV4P0_1( w_fp[0], w_fp[1], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[111] ); + // Amplitude(s) for diagram number 1156 + VVVV1_0( w_fp[98], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + VVVV3_0( w_fp[98], w_fp[8], 
w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0]; + VVVV4_0( w_fp[98], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + VVVV1_0( w_fp[27], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + VVVV3_0( w_fp[27], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and 
denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + VVVV4_0( w_fp[27], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + VVVV1_0( w_fp[111], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + VVVV3_0( w_fp[111], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + VVVV4_0( w_fp[111], w_fp[8], w_fp[4], w_fp[5], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1157( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1157 OF 1240 *** + // Wavefunction(s) for diagram number 1157 + VVV1P0_1( w_fp[98], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[59] ); + VVV1P0_1( w_fp[27], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[29] ); + VVV1P0_1( w_fp[111], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[68] ); + // Amplitude(s) for diagram number 1157 + VVV1_0( w_fp[8], w_fp[5], w_fp[59], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[5], w_fp[29], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[5], w_fp[68], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1158( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent 
couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1158 OF 1240 *** + // Wavefunction(s) for diagram number 1158 + VVV1P0_1( w_fp[98], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[17] ); + VVV1P0_1( w_fp[27], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[21] ); + VVV1P0_1( w_fp[111], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[71] ); + // Amplitude(s) for diagram number 1158 + VVV1_0( w_fp[8], w_fp[4], w_fp[17], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[4], w_fp[21], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[4], w_fp[71], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 
) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1159( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1159 OF 1240 *** + // Wavefunction(s) for diagram number 1159 + // (none) + // Amplitude(s) for diagram number 1159 + VVV1_0( w_fp[98], w_fp[8], w_fp[24], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + VVV1_0( w_fp[27], w_fp[8], w_fp[24], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + VVV1_0( w_fp[111], w_fp[8], w_fp[24], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1160( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1160 OF 1240 *** + // Wavefunction(s) for diagram number 1160 + FFV1_2( w_fp[3], w_fp[98], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); + FFV1_2( w_fp[3], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[20] ); + FFV1_2( w_fp[3], w_fp[111], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[60] ); + // Amplitude(s) for diagram number 1160 + FFV1_0( w_fp[16], w_fp[33], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 63 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += amp_sv[0]; + FFV1_0( w_fp[20], w_fp[33], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + FFV1_0( w_fp[60], w_fp[33], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1161( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1161 OF 1240 *** + // Wavefunction(s) for diagram number 1161 + // (none) + // Amplitude(s) for diagram number 1161 + FFV1_0( w_fp[3], w_fp[33], w_fp[17], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[33], w_fp[21], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 52 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += 
cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[33], w_fp[71], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1162( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1162 OF 1240 *** + // Wavefunction(s) for diagram number 1162 + // (none) + // Amplitude(s) for diagram number 1162 + FFV1_0( w_fp[38], w_fp[33], w_fp[98], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + FFV1_0( w_fp[38], w_fp[33], w_fp[27], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0]; + FFV1_0( w_fp[38], w_fp[33], w_fp[111], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1163( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1163 OF 1240 *** + // Wavefunction(s) for diagram number 1163 + // (none) + // Amplitude(s) for diagram number 1163 + FFV1_0( w_fp[16], w_fp[39], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += amp_sv[0]; + FFV1_0( w_fp[20], w_fp[39], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + FFV1_0( w_fp[60], w_fp[39], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1164( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that 
case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1164 OF 1240 *** + // Wavefunction(s) for diagram number 1164 + // (none) + // Amplitude(s) for diagram number 1164 + FFV1_0( w_fp[3], w_fp[39], w_fp[59], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[39], w_fp[29], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 76 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[39], w_fp[68], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1165( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and 
denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1165 OF 1240 *** + // Wavefunction(s) for diagram number 1165 + // (none) + // Amplitude(s) for diagram number 1165 + FFV1_0( w_fp[46], w_fp[39], w_fp[98], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + FFV1_0( w_fp[46], w_fp[39], w_fp[27], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0]; + FFV1_0( w_fp[46], w_fp[39], w_fp[111], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1166( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1166 OF 1240 *** + // Wavefunction(s) for diagram number 1166 + FFV1_1( w_fp[2], w_fp[98], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] ); + FFV1_1( w_fp[2], w_fp[27], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[98] ); + FFV1_1( w_fp[2], w_fp[111], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[27] ); + // Amplitude(s) for diagram number 1166 + FFV1_0( w_fp[46], w_fp[23], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0]; + 
FFV1_0( w_fp[46], w_fp[98], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + FFV1_0( w_fp[46], w_fp[27], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1167( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1167 OF 1240 *** + // Wavefunction(s) for diagram number 1167 + // (none) + // Amplitude(s) for diagram number 1167 + FFV1_0( w_fp[46], w_fp[2], w_fp[17], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[46], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= cxtype( 0, 1 ) * 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[46], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1168( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1168 OF 1240 *** + // Wavefunction(s) for diagram number 1168 + // (none) + // Amplitude(s) for diagram number 1168 + FFV1_0( w_fp[38], w_fp[23], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0]; + FFV1_0( w_fp[38], w_fp[98], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + FFV1_0( w_fp[38], w_fp[27], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + } + + 
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1169( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1169 OF 1240 ***
+    // Wavefunction(s) for diagram number 1169
+    // (none)
+    // Amplitude(s) for diagram number 1169
+    FFV1_0( w_fp[38], w_fp[2], w_fp[59], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[38], w_fp[2], w_fp[29], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 58 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[38], w_fp[2], w_fp[68], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 49 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 55 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1170( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1170 OF 1240 ***
+    // Wavefunction(s) for diagram number 1170
+    // (none)
+    // Amplitude(s) for diagram number 1170
+    FFV1_0( w_fp[3], w_fp[23], w_fp[24], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[98], w_fp[24], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[27], w_fp[24], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1171( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1171 OF 1240 ***
+    // Wavefunction(s) for diagram number 1171
+    // (none)
+    // Amplitude(s) for diagram number 1171
+    FFV1_0( w_fp[16], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[20], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 87 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[60], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 65 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1172( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1172 OF 1240 ***
+    // Wavefunction(s) for diagram number 1172
+    VVVV1P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[60] );
+    VVVV3P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[24] );
+    VVVV4P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[20] );
+    FFV1_2( w_fp[3], w_fp[60], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] );
+    FFV1_2( w_fp[3], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[27] );
+    FFV1_2( w_fp[3], w_fp[20], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[98] );
+    // Amplitude(s) for diagram number 1172
+    FFV1_0( w_fp[16], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    FFV1_0( w_fp[27], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+    FFV1_0( w_fp[98], w_fp[77], w_fp[6], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1173( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1173 OF 1240 ***
+    // Wavefunction(s) for diagram number 1173
+    VVV1P0_1( w_fp[60], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[23] );
+    VVV1P0_1( w_fp[24], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[68] );
+    VVV1P0_1( w_fp[20], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[29] );
+    // Amplitude(s) for diagram number 1173
+    FFV1_0( w_fp[3], w_fp[77], w_fp[23], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[77], w_fp[68], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[77], w_fp[29], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
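[Reviewer note, not part of the generated code: the comment repeated in each kernel describes the contract of the shared diagram_boilerplate.h header, whose body is not shown in this diff. A plausible sketch of the sanity check it describes, under the assumption that it only needs to verify the unused SDE-related arguments, would be:]

  // Sketch only (assumption): the real diagram_boilerplate.h is not shown here.
  // Without multichannel support the SDE-related arguments must not be used,
  // so the boilerplate can assert that the caller passed null pointers.
  #include <cassert>
  using fptype = double; // hypothetical stand-in for the plugin's floating-point type

  inline void sanityCheckNoMultichannel( const unsigned int* channelIds,
                                         const fptype* numerators,
                                         const fptype* denominators )
  {
    // #ifndef MGONGPU_SUPPORTS_MULTICHANNEL all three SDE pointers are unused:
    assert( channelIds == nullptr );   // no per-event channel selection
    assert( numerators == nullptr );   // no multichannel numerators
    assert( denominators == nullptr ); // no multichannel denominators
  }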
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1174( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1174 OF 1240 ***
+    // Wavefunction(s) for diagram number 1174
+    // (none)
+    // Amplitude(s) for diagram number 1174
+    FFV1_0( w_fp[41], w_fp[77], w_fp[60], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    FFV1_0( w_fp[41], w_fp[77], w_fp[20], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1175( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1175 OF 1240 ***
+    // Wavefunction(s) for diagram number 1175
+    FFV1_1( w_fp[2], w_fp[60], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[59] );
+    FFV1_1( w_fp[2], w_fp[24], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[71] );
+    FFV1_1( w_fp[2], w_fp[20], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] );
+    // Amplitude(s) for diagram number 1175
+    FFV1_0( w_fp[52], w_fp[59], w_fp[6], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+    FFV1_0( w_fp[52], w_fp[71], w_fp[6], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+    FFV1_0( w_fp[52], w_fp[21], w_fp[6], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1176( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1176 OF 1240 ***
+    // Wavefunction(s) for diagram number 1176
+    // (none)
+    // Amplitude(s) for diagram number 1176
+    FFV1_0( w_fp[52], w_fp[2], w_fp[23], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[52], w_fp[2], w_fp[68], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[52], w_fp[2], w_fp[29], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1177( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1177 OF 1240 ***
+    // Wavefunction(s) for diagram number 1177
+    // (none)
+    // Amplitude(s) for diagram number 1177
+    FFV1_0( w_fp[52], w_fp[47], w_fp[60], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+    FFV1_0( w_fp[52], w_fp[47], w_fp[24], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+    FFV1_0( w_fp[52], w_fp[47], w_fp[20], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1178( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1178 OF 1240 ***
+    // Wavefunction(s) for diagram number 1178
+    // (none)
+    // Amplitude(s) for diagram number 1178
+    FFV1_0( w_fp[3], w_fp[59], w_fp[72], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[71], w_fp[72], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[21], w_fp[72], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1179( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1179 OF 1240 ***
+    // Wavefunction(s) for diagram number 1179
+    // (none)
+    // Amplitude(s) for diagram number 1179
+    FFV1_0( w_fp[16], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[27], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[98], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1180( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1180 OF 1240 ***
+    // Wavefunction(s) for diagram number 1180
+    // (none)
+    // Amplitude(s) for diagram number 1180
+    VVV1_0( w_fp[60], w_fp[72], w_fp[8], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+    VVV1_0( w_fp[24], w_fp[72], w_fp[8], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    VVV1_0( w_fp[20], w_fp[72], w_fp[8], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1181( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1181 OF 1240 ***
+    // Wavefunction(s) for diagram number 1181
+    // (none)
+    // Amplitude(s) for diagram number 1181
+    VVVV1_0( w_fp[60], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+    VVVV3_0( w_fp[60], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+    VVVV4_0( w_fp[60], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+    VVVV1_0( w_fp[24], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+    VVVV3_0( w_fp[24], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    VVVV4_0( w_fp[24], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+    VVVV1_0( w_fp[20], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+    VVVV3_0( w_fp[20], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+    VVVV4_0( w_fp[20], w_fp[1], w_fp[8], w_fp[6], COUPs[2], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1182( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1182 OF 1240 ***
+    // Wavefunction(s) for diagram number 1182
+    VVV1P0_1( w_fp[60], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[72] );
+    VVV1P0_1( w_fp[24], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[60] );
+    VVV1P0_1( w_fp[20], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[24] );
+    // Amplitude(s) for diagram number 1182
+    VVV1_0( w_fp[8], w_fp[6], w_fp[72], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[6], w_fp[60], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+    VVV1_0( w_fp[8], w_fp[6], w_fp[24], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1183( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1183 OF 1240 ***
+    // Wavefunction(s) for diagram number 1183
+    // (none)
+    // Amplitude(s) for diagram number 1183
+    VVV1_0( w_fp[1], w_fp[8], w_fp[23], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += amp_sv[0];
+    VVV1_0( w_fp[1], w_fp[8], w_fp[68], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 26 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+    VVV1_0( w_fp[1], w_fp[8], w_fp[29], COUPs[0], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 24 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1184( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1184 OF 1240 ***
+    // Wavefunction(s) for diagram number 1184
+    // (none)
+    // Amplitude(s) for diagram number 1184
+    FFV1_0( w_fp[3], w_fp[47], w_fp[72], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[47], w_fp[60], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 112 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    FFV1_0( w_fp[3], w_fp[47], w_fp[24], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 118 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1185( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+               fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+               const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+               const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+               const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+               fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+               fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 1185 OF 1240 ***
+    // Wavefunction(s) for diagram number 1185
+    // (none)
+    // Amplitude(s) for diagram number 1185
+    FFV1_0( w_fp[16], w_fp[47], w_fp[1], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 102 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 107 ) += amp_sv[0];
+    FFV1_0( w_fp[27], w_fp[47], w_fp[1], COUPs[1], 1.0, &_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 103 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 104 )
+= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + FFV1_0( w_fp[98], w_fp[47], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 102 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1186( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1186 OF 1240 *** + // Wavefunction(s) for diagram number 1186 + // (none) + // Amplitude(s) for diagram number 1186 + FFV1_0( w_fp[41], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[41], w_fp[2], w_fp[60], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 26 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[41], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel 
support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 24 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1187( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1187 OF 1240 *** + // Wavefunction(s) for diagram number 1187 + // (none) + // Amplitude(s) for diagram number 1187 + FFV1_0( w_fp[41], w_fp[59], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) += amp_sv[0]; + FFV1_0( w_fp[41], w_fp[71], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 60 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + FFV1_0( w_fp[41], w_fp[21], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 84 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1188( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 
to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1188 OF 1240 *** + // Wavefunction(s) for diagram number 1188 + VVVV1P0_1( w_fp[0], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[21] ); + VVVV3P0_1( w_fp[0], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[71] ); + VVVV4P0_1( w_fp[0], w_fp[4], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[59] ); + FFV1_2( w_fp[3], w_fp[21], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[24] ); + FFV1_2( w_fp[3], w_fp[71], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[60] ); + FFV1_2( w_fp[3], w_fp[59], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[72] ); + // Amplitude(s) for diagram number 1188 + FFV1_0( w_fp[24], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + FFV1_0( w_fp[60], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + FFV1_0( w_fp[72], w_fp[77], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1189( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef 
MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1189 OF 1240 *** + // Wavefunction(s) for diagram number 1189 + VVV1P0_1( w_fp[21], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[98] ); + VVV1P0_1( w_fp[71], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[27] ); + VVV1P0_1( w_fp[59], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[16] ); + // Amplitude(s) for diagram number 1189 + FFV1_0( w_fp[3], w_fp[77], w_fp[98], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[77], w_fp[27], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 28 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[77], w_fp[16], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1190( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: 
multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1190 OF 1240 *** + // Wavefunction(s) for diagram number 1190 + // (none) + // Amplitude(s) for diagram number 1190 + FFV1_0( w_fp[38], w_fp[77], w_fp[21], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + FFV1_0( w_fp[38], w_fp[77], w_fp[71], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + FFV1_0( w_fp[38], w_fp[77], w_fp[59], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1191( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1191 OF 1240 *** + // Wavefunction(s) for diagram number 1191 + FFV1_1( w_fp[2], w_fp[21], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[29] ); + FFV1_1( w_fp[2], w_fp[71], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[68] ); + FFV1_1( w_fp[2], w_fp[59], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] ); + // Amplitude(s) for diagram number 1191 + FFV1_0( w_fp[52], w_fp[29], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) 
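// The "(#473)" placeholder comments in these kernels mark where code generated
// with multichannel support updates the single-diagram-enhancement (SDE)
// accumulators. A minimal commented sketch of what such an update plausibly
// looks like for this diagram (illustrative only: the NUM_ACCESS/DEN_ACCESS
// accessors and the cxabs2 helper are assumptions, not confirmed by this diff):
//
//   if( channelId == 1191 ) // only the chosen channel feeds the numerator
//     NUM_ACCESS::kernelAccess( numerators ) += cxabs2( amp_sv[0] );
//   if( channelId != 0 ) // channelId == 0 disables SDE (see signature above)
//     DEN_ACCESS::kernelAccess( denominators ) += cxabs2( amp_sv[0] );
//
// i.e. the numerator keeps only the chosen channel's |amp|^2 while the
// denominator sums |amp|^2 over all diagrams, yielding the per-channel weight.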
+#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + FFV1_0( w_fp[52], w_fp[68], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + FFV1_0( w_fp[52], w_fp[23], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1192( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1192 OF 1240 *** + // Wavefunction(s) for diagram number 1192 + // (none) + // Amplitude(s) for diagram number 1192 + FFV1_0( w_fp[52], w_fp[2], w_fp[98], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[52], w_fp[2], w_fp[27], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += 
cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[52], w_fp[2], w_fp[16], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1193( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1193 OF 1240 *** + // Wavefunction(s) for diagram number 1193 + // (none) + // Amplitude(s) for diagram number 1193 + FFV1_0( w_fp[52], w_fp[39], w_fp[21], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + FFV1_0( w_fp[52], w_fp[39], w_fp[71], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + FFV1_0( w_fp[52], w_fp[39], w_fp[59], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + 
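// Every diagramXXXX kernel in this file repeats the same signature and then
// pulls in "diagram_boilerplate.h", so each generated body reduces to HELAS
// calls plus jamp updates. A minimal sketch of what that shared include
// plausibly expands to (illustrative; the real header may differ, and the
// w_fp/_fp setup shown here is an assumption based on the names used below):
//
//   #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
//   // uniform interface without multichannel: the three SDE pointers must not
//   // be used, so assert that they are all nullptr as a sanity check
//   assert( channelIds == nullptr && numerators == nullptr && denominators == nullptr );
//   #endif
//   fptype_sv amp_sv[1] = {};                          // scratch amplitude for this diagram
//   fptype* _fp = reinterpret_cast<fptype*>( amp_sv ); // the buffer passed as &_fp[0]
//   fptype* w_fp[nwf];                                 // per-event views into the wfs buffer
//
// Splitting the former monolithic wavefunction/amplitude computation into one
// __global__ kernel per diagram keeps each compilation unit small.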
J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1194( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1194 OF 1240 *** + // Wavefunction(s) for diagram number 1194 + // (none) + // Amplitude(s) for diagram number 1194 + FFV1_0( w_fp[3], w_fp[29], w_fp[66], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[68], w_fp[66], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[23], w_fp[66], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1195( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1195 OF 1240 *** + // Wavefunction(s) for diagram number 1195 + // (none) + // Amplitude(s) for diagram number 1195 + FFV1_0( w_fp[24], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[60], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[72], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 36 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 
) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1196( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1196 OF 1240 *** + // Wavefunction(s) for diagram number 1196 + // (none) + // Amplitude(s) for diagram number 1196 + VVV1_0( w_fp[21], w_fp[66], w_fp[8], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + VVV1_0( w_fp[71], w_fp[66], w_fp[8], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) += amp_sv[0]; + 
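// The updates around this point accumulate diagram 1196's amplitude into the
// ncolor color-flow sums held in jamps[ncolor*2*nevtORneppV] (see the kernel
// signatures above). A commented sketch of the accessor pattern these lines
// rely on (illustrative; the exact J_ACCESS memory layout and the
// cxtype_sv_ref return type are assumptions, not confirmed by this diff):
//
//   // return a complex reference to color flow icol for the current event(s)
//   static __device__ inline cxtype_sv_ref kernelAccessIcol( fptype* jamps, const int icol );
//
//   J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0];                  // color coefficient +1
//   J_ACCESS::kernelAccessIcol( jamps, 36 ) -= cxtype( 0, 1 ) * amp_sv[0]; // color coefficient -i
//
// Only the sign and an optional factor of i differ between color flows, so the
// final color sum |M|^2 = sum_ij jamp_i^* cf_ij jamp_j reduces to reading back
// these accumulated jamps.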
J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + VVV1_0( w_fp[59], w_fp[66], w_fp[8], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1197( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1197 OF 1240 *** + // Wavefunction(s) for diagram number 1197 + // (none) + // Amplitude(s) for diagram number 1197 + VVVV1_0( w_fp[21], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 
) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0]; + VVVV3_0( w_fp[21], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + VVVV4_0( w_fp[21], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + VVVV1_0( w_fp[71], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += 
amp_sv[0]; + VVVV3_0( w_fp[71], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + VVVV4_0( w_fp[71], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + VVVV1_0( w_fp[59], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + VVVV3_0( w_fp[59], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel 
support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + VVVV4_0( w_fp[59], w_fp[1], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1198( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1198 OF 1240 *** + // Wavefunction(s) for diagram number 1198 + VVV1P0_1( w_fp[21], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[66] ); + VVV1P0_1( w_fp[71], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[21] ); + VVV1P0_1( w_fp[59], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[71] ); + // Amplitude(s) for diagram number 1198 + VVV1_0( w_fp[8], w_fp[5], w_fp[66], COUPs[0], 1.0, &_fp[0] 
); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[5], w_fp[21], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[5], w_fp[71], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1199( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* 
channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1199 OF 1240 *** + // Wavefunction(s) for diagram number 1199 + // (none) + // Amplitude(s) for diagram number 1199 + VVV1_0( w_fp[1], w_fp[8], w_fp[98], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) -= amp_sv[0]; + VVV1_0( w_fp[1], w_fp[8], w_fp[27], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + VVV1_0( w_fp[1], w_fp[8], w_fp[16], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 
31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 36 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 109 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1200( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1200 OF 1240 *** + // Wavefunction(s) for diagram number 1200 + // (none) + // Amplitude(s) for diagram number 1200 + FFV1_0( w_fp[3], w_fp[39], w_fp[66], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[39], w_fp[21], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 88 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 
0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[39], w_fp[71], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 85 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 94 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1201( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1201 OF 1240 *** + // Wavefunction(s) for diagram number 1201 + // (none) + // Amplitude(s) for diagram number 1201 + FFV1_0( w_fp[24], w_fp[39], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 78 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) += amp_sv[0]; + FFV1_0( w_fp[60], w_fp[39], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 79 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + FFV1_0( w_fp[72], w_fp[39], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 78 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1202( fptype* wfs, // input/output 
wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1202 OF 1240 *** + // Wavefunction(s) for diagram number 1202 + // (none) + // Amplitude(s) for diagram number 1202 + FFV1_0( w_fp[38], w_fp[2], w_fp[66], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[38], w_fp[2], w_fp[21], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 28 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[38], w_fp[2], w_fp[71], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 25 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 42 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + 
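The repeated boilerplate comments in the diagramXXX kernels above reference two pieces of machinery that this patch only points at: diagram_boilerplate.h is included but its contents are not shown, and every "#ifdef MGONGPU_SUPPORTS_MULTICHANNEL" block here is an empty placeholder because this file was generated without multichannel support. As a reading aid, here is a minimal sketch of the contract those comments describe; the scalar channelId, the cxabs2 squared-modulus helper and the exact assert form are illustrative assumptions, not contents of this patch.

  // Sketch of the sanity check that the comments attribute to diagram_boilerplate.h
  // (assumed form, the real header is not part of this diff): without multichannel
  // support, the three extra kernel arguments must all be nullptr.
  #include <cassert>
  #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
  assert( channelIds == nullptr );   // no SDE channel selection in this build
  assert( numerators == nullptr );   // no multichannel numerators to update
  assert( denominators == nullptr ); // no multichannel denominators to update
  #endif
  // Hypothetical shape of the per-amplitude update (#473) that a multichannel build
  // would generate at each placeholder, e.g. after an amplitude call in diagram1202:
  #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
  if( channelId == 1202 ) numerators_sv += cxabs2( amp_sv[0] ); // only this diagram's channel
  if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );  // all channels while SDE is enabled (channelId 0 disables it)
  #endif

Keeping one uniform signature for every diagramXXX kernel lets both builds chain the kernels identically; the only behavioural difference is whether these two accumulations are compiled in.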
//-------------------------------------------------------------------------- + + __global__ void + diagram1203( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1203 OF 1240 *** + // Wavefunction(s) for diagram number 1203 + // (none) + // Amplitude(s) for diagram number 1203 + FFV1_0( w_fp[38], w_fp[29], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) += amp_sv[0]; + FFV1_0( w_fp[38], w_fp[68], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 66 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + FFV1_0( w_fp[38], w_fp[23], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 108 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1204( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three 
pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1204 OF 1240 *** + // Wavefunction(s) for diagram number 1204 + VVVV1P0_1( w_fp[0], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[23] ); + VVVV3P0_1( w_fp[0], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[68] ); + VVVV4P0_1( w_fp[0], w_fp[5], w_fp[6], COUPs[2], 1.0, 0., 0., w_fp[29] ); + FFV1_2( w_fp[3], w_fp[23], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[71] ); + FFV1_2( w_fp[3], w_fp[68], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] ); + FFV1_2( w_fp[3], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[66] ); + // Amplitude(s) for diagram number 1204 + FFV1_0( w_fp[71], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + FFV1_0( w_fp[21], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + FFV1_0( w_fp[66], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1205( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1205 OF 1240 *** + // Wavefunction(s) for diagram number 1205 + VVV1P0_1( w_fp[23], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[72] ); + VVV1P0_1( w_fp[68], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[60] ); + VVV1P0_1( w_fp[29], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[24] ); + // Amplitude(s) for diagram number 1205 + FFV1_0( w_fp[3], w_fp[77], w_fp[72], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates 
numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[77], w_fp[60], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 29 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[77], w_fp[24], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1206( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1206 OF 1240 *** + // Wavefunction(s) for diagram number 1206 + // (none) + // Amplitude(s) for diagram number 1206 + FFV1_0( w_fp[46], w_fp[77], w_fp[23], COUPs[1], 1.0, &_fp[0] ); +#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + FFV1_0( w_fp[46], w_fp[77], w_fp[68], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + FFV1_0( w_fp[46], w_fp[77], w_fp[29], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1207( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1207 OF 1240 *** + // Wavefunction(s) for diagram number 1207 + FFV1_1( w_fp[2], w_fp[23], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[77] ); + FFV1_1( w_fp[2], w_fp[68], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); + FFV1_1( w_fp[2], w_fp[29], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[27] ); + // Amplitude(s) for diagram number 1207 + FFV1_0( w_fp[52], w_fp[77], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + FFV1_0( w_fp[52], w_fp[16], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 101 ) += amp_sv[0]; + FFV1_0( w_fp[52], w_fp[27], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1208( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1208 OF 1240 *** + // Wavefunction(s) for diagram number 1208 + // (none) + // Amplitude(s) for diagram number 1208 + FFV1_0( w_fp[52], w_fp[2], w_fp[72], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[52], w_fp[2], w_fp[60], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[52], w_fp[2], w_fp[24], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( 
jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1209( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1209 OF 1240 *** + // Wavefunction(s) for diagram number 1209 + // (none) + // Amplitude(s) for diagram number 1209 + FFV1_0( w_fp[52], w_fp[33], w_fp[23], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + FFV1_0( w_fp[52], w_fp[33], w_fp[68], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + FFV1_0( w_fp[52], w_fp[33], w_fp[29], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1210( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: 
dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1210 OF 1240 *** + // Wavefunction(s) for diagram number 1210 + // (none) + // Amplitude(s) for diagram number 1210 + FFV1_0( w_fp[3], w_fp[77], w_fp[61], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[16], w_fp[61], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[27], w_fp[61], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1211( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 
0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1211 OF 1240 *** + // Wavefunction(s) for diagram number 1211 + // (none) + // Amplitude(s) for diagram number 1211 + FFV1_0( w_fp[71], w_fp[2], w_fp[61], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[21], w_fp[2], w_fp[61], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 31 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[66], w_fp[2], w_fp[61], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 30 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1212( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // 
input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1212 OF 1240 *** + // Wavefunction(s) for diagram number 1212 + // (none) + // Amplitude(s) for diagram number 1212 + VVV1_0( w_fp[23], w_fp[61], w_fp[8], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + VVV1_0( w_fp[68], w_fp[61], w_fp[8], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + VVV1_0( w_fp[29], w_fp[61], w_fp[8], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1213( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1213 OF 1240 *** + // Wavefunction(s) for diagram number 1213 + // (none) + // Amplitude(s) for diagram number 1213 + VVVV1_0( w_fp[23], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + VVVV3_0( w_fp[23], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + VVVV4_0( w_fp[23], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + VVVV1_0( w_fp[68], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + VVVV3_0( w_fp[68], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + VVVV4_0( w_fp[68], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + VVVV1_0( w_fp[29], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + VVVV3_0( w_fp[29], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + VVVV4_0( w_fp[29], w_fp[1], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1214( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1214 OF 1240 *** + // Wavefunction(s) for diagram number 1214 + VVV1P0_1( w_fp[23], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[61] ); + VVV1P0_1( w_fp[68], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[23] ); + VVV1P0_1( w_fp[29], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[68] ); + // Amplitude(s) for diagram number 1214 + VVV1_0( w_fp[8], w_fp[4], w_fp[61], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[4], w_fp[23], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[4], w_fp[68], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1215( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and 
denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1215 OF 1240 *** + // Wavefunction(s) for diagram number 1215 + // (none) + // Amplitude(s) for diagram number 1215 + VVV1_0( w_fp[1], w_fp[8], w_fp[72], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) -= amp_sv[0]; + VVV1_0( w_fp[1], w_fp[8], w_fp[60], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 31 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 91 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + VVV1_0( w_fp[1], w_fp[8], w_fp[24], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 30 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 115 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1216( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1216 OF 1240 *** + // Wavefunction(s) for diagram number 1216 + // (none) + // Amplitude(s) for diagram number 1216 + FFV1_0( w_fp[3], w_fp[33], w_fp[61], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[33], w_fp[23], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 64 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[33], w_fp[68], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0]; + 
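In the FFV1_0 blocks of diagram 1216, cxtype( 0, 1 ) constructs the imaginary unit, so each statement adds or subtracts i times the amplitude rather than the amplitude itself. Scalar equivalent (illustration only; jamp and amp are shorthand for the accessed values):

    const cxtype cI( 0., 1. ); // the imaginary unit i
    jamp[51] -= cI * amp;      // same effect as J_ACCESS::kernelAccessIcol( jamps, 51 ) -= cxtype( 0, 1 ) * amp_sv[0]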
J_ACCESS::kernelAccessIcol( jamps, 61 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 67 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 70 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1217( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1217 OF 1240 *** + // Wavefunction(s) for diagram number 1217 + // (none) + // Amplitude(s) for diagram number 1217 + FFV1_0( w_fp[71], w_fp[33], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 54 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) += amp_sv[0]; + FFV1_0( w_fp[21], w_fp[33], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 55 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + FFV1_0( w_fp[66], w_fp[33], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 54 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1218( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity 
ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1218 OF 1240 *** + // Wavefunction(s) for diagram number 1218 + // (none) + // Amplitude(s) for diagram number 1218 + FFV1_0( w_fp[46], w_fp[2], w_fp[61], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[46], w_fp[2], w_fp[23], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 29 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[46], w_fp[2], w_fp[68], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 27 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 37 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 43 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1219( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* 
denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1219 OF 1240 *** + // Wavefunction(s) for diagram number 1219 + // (none) + // Amplitude(s) for diagram number 1219 + FFV1_0( w_fp[46], w_fp[77], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) += amp_sv[0]; + FFV1_0( w_fp[46], w_fp[16], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 90 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + FFV1_0( w_fp[46], w_fp[27], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 114 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1220( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1220 OF 1240 *** + // Wavefunction(s) for diagram number 1220 + // (none) + // Amplitude(s) for diagram number 1220 + VVVV1_0( w_fp[0], w_fp[73], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 
32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + VVVV3_0( w_fp[0], w_fp[73], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + VVVV4_0( w_fp[0], w_fp[73], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + VVVV1_0( w_fp[0], w_fp[79], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + VVVV3_0( w_fp[0], w_fp[79], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + VVVV4_0( w_fp[0], w_fp[79], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + VVVV1_0( w_fp[0], w_fp[80], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= 
amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + VVVV3_0( w_fp[0], w_fp[80], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + VVVV4_0( w_fp[0], w_fp[80], w_fp[8], w_fp[6], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1221( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all 
three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1221 OF 1240 *** + // Wavefunction(s) for diagram number 1221 + VVV1P0_1( w_fp[0], w_fp[73], COUPs[0], 1.0, 0., 0., w_fp[27] ); + VVV1P0_1( w_fp[0], w_fp[79], COUPs[0], 1.0, 0., 0., w_fp[1] ); + VVV1P0_1( w_fp[0], w_fp[80], COUPs[0], 1.0, 0., 0., w_fp[16] ); + // Amplitude(s) for diagram number 1221 + VVV1_0( w_fp[8], w_fp[6], w_fp[27], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[6], w_fp[1], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + VVV1_0( w_fp[8], w_fp[6], w_fp[16], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1222( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1222 OF 1240 *** + // Wavefunction(s) for diagram number 1222 + // (none) + // Amplitude(s) for diagram number 1222 + VVV1_0( w_fp[73], w_fp[6], w_fp[56], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += amp_sv[0]; + VVV1_0( w_fp[79], w_fp[6], w_fp[56], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 39 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + 
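As the recurring comment says, diagram_boilerplate.h keeps one uniform diagramNNNN interface even when multichannel support is compiled out, and in that build it checks that the three multichannel pointers are null. A minimal sketch of that sanity check, using assert from <cassert> (an assumption about the header's content, shown for orientation; the real include also sets up the local wavefunction and amplitude accessors):

    #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
      assert( channelIds == nullptr );   // no SDE channel selection without multichannel support
      assert( numerators == nullptr );   // multichannel numerators are not allocated
      assert( denominators == nullptr ); // multichannel denominators are not allocated
    #endif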
J_ACCESS::kernelAccessIcol( jamps, 113 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + VVV1_0( w_fp[80], w_fp[6], w_fp[56], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 33 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1223( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1223 OF 1240 *** + // Wavefunction(s) for diagram number 1223 + // (none) + // Amplitude(s) for diagram number 1223 + FFV1_0( w_fp[3], w_fp[47], w_fp[27], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[47], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 97 ) += cxtype( 0, 1 ) * amp_sv[0]; + 
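Each '#ifdef MGONGPU_SUPPORTS_MULTICHANNEL' placeholder above stands where a multichannel-enabled build updates the single-diagram-enhancement counters (#473). In cudacpp-generated multichannel code this update typically has the form below (sketched from the pattern used elsewhere in the plugin; the diagram number 1223 is used here only as an illustrative channel, and the exact guards should be taken as an assumption rather than a quote of this patch):

    #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
      if( channelId == 1223 ) numerators_sv += cxabs2( amp_sv[0] ); // enhance this diagram's channel
      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );  // channelId == 0 disables the SDE
    #endif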
J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 113 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[47], w_fp[16], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 96 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 119 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1224( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1224 OF 1240 *** + // Wavefunction(s) for diagram number 1224 + // (none) + // Amplitude(s) for diagram number 1224 + FFV1_0( w_fp[3], w_fp[113], w_fp[73], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 96 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[113], w_fp[79], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 97 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 99 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[113], w_fp[80], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the 
code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 96 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 98 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 100 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 101 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1225( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1225 OF 1240 *** + // Wavefunction(s) for diagram number 1225 + // (none) + // Amplitude(s) for diagram number 1225 + FFV1_0( w_fp[41], w_fp[2], w_fp[27], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[41], w_fp[2], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[41], w_fp[2], w_fp[16], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) 
-= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1226( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1226 OF 1240 *** + // Wavefunction(s) for diagram number 1226 + // (none) + // Amplitude(s) for diagram number 1226 + FFV1_0( w_fp[62], w_fp[2], w_fp[73], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 32 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) += amp_sv[0]; + FFV1_0( w_fp[62], w_fp[2], w_fp[79], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 38 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 62 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + FFV1_0( w_fp[62], w_fp[2], w_fp[80], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 32 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 56 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 80 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 86 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1227( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* 
numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1227 OF 1240 *** + // Wavefunction(s) for diagram number 1227 + // (none) + // Amplitude(s) for diagram number 1227 + VVVV1_0( w_fp[0], w_fp[57], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + VVVV3_0( w_fp[0], w_fp[57], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + VVVV4_0( w_fp[0], w_fp[57], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + 
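A reading aid for the ALOHA/HELAS-style calls in these kernels: routines ending in _0 (FFV1_0, VVV1_0, VVVV1_0/VVVV3_0/VVVV4_0) contract external wavefunctions into one amplitude written through the last pointer argument, while P0_1-style routines (e.g. VVV1P0_1) build an internal off-shell current, here with zero mass and width, into the w_fp slot given last. Schematically (indices a..d stand in for the concrete slots used above):

    VVV1P0_1( w_fp[a], w_fp[b], COUPs[0], 1.0, 0., 0., w_fp[c] );                 // off-shell current into w_fp[c] (mass=0., width=0.)
    VVVV1_0( w_fp[a], w_fp[b], w_fp[c], w_fp[d], COUPs[2], 1.0, &amp_fp[0] );     // amplitude into amp_fp[0], read back as amp_sv[0]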
J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + VVVV1_0( w_fp[0], w_fp[81], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0]; + VVVV3_0( w_fp[0], w_fp[81], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + VVVV4_0( w_fp[0], w_fp[81], w_fp[8], w_fp[5], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + VVVV1_0( w_fp[0], w_fp[82], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + VVVV3_0( w_fp[0], w_fp[82], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + VVVV4_0( w_fp[0], w_fp[82], w_fp[8], w_fp[5], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1228( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1228 OF 1240 *** + // Wavefunction(s) for diagram number 1228 + VVV1P0_1( w_fp[0], w_fp[57], COUPs[0], 1.0, 0., 0., w_fp[62] ); + VVV1P0_1( w_fp[0], w_fp[81], COUPs[0], 1.0, 0., 0., w_fp[80] ); + VVV1P0_1( w_fp[0], w_fp[82], COUPs[0], 1.0, 0., 0., w_fp[79] ); + // Amplitude(s) for diagram number 1228 + VVV1_0( w_fp[8], w_fp[5], w_fp[62], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[5], w_fp[80], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 
87 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[5], w_fp[79], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1229( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1229 OF 1240 *** + // Wavefunction(s) for diagram number 1229 + // (none) + // Amplitude(s) for diagram number 1229 + VVV1_0( w_fp[57], w_fp[5], w_fp[56], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 95 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) -= amp_sv[0]; + VVV1_0( w_fp[81], w_fp[5], w_fp[56], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 45 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + VVV1_0( w_fp[82], w_fp[5], w_fp[56], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 35 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 105 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 111 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1230( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1230 OF 1240 *** + // Wavefunction(s) for diagram number 1230 + // (none) + // Amplitude(s) for diagram number 1230 + FFV1_0( w_fp[3], 
w_fp[39], w_fp[62], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[39], w_fp[80], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 73 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 89 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[39], w_fp[79], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 72 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 81 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 87 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 95 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1231( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1231 OF 1240 *** + // Wavefunction(s) for diagram number 
1231 + // (none) + // Amplitude(s) for diagram number 1231 + FFV1_0( w_fp[3], w_fp[102], w_fp[57], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 72 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[102], w_fp[81], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 73 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 75 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[102], w_fp[82], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 72 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 74 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 76 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 77 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1232( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1232 OF 1240 *** + // Wavefunction(s) for diagram number 1232 + // (none) + // Amplitude(s) for diagram number 1232 + FFV1_0( w_fp[38], w_fp[2], w_fp[62], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[38], w_fp[2], w_fp[80], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated
with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[38], w_fp[2], w_fp[79], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1233( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1233 OF 1240 *** + // Wavefunction(s) for diagram number 1233 + // (none) + // Amplitude(s) for diagram number 1233 + FFV1_0( w_fp[104], w_fp[2], w_fp[57], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 34 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) += amp_sv[0]; + FFV1_0( w_fp[104], w_fp[2], w_fp[81], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 44 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 68 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( 
jamps, 104 ) += amp_sv[0]; + FFV1_0( w_fp[104], w_fp[2], w_fp[82], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 34 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 58 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 104 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 110 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1234( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1234 OF 1240 *** + // Wavefunction(s) for diagram number 1234 + // (none) + // Amplitude(s) for diagram number 1234 + VVVV1_0( w_fp[0], w_fp[55], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0]; + VVVV3_0( w_fp[0], w_fp[55], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + VVVV4_0( w_fp[0], w_fp[55], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + VVVV1_0( w_fp[0], w_fp[83], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0]; + VVVV3_0( w_fp[0], w_fp[83], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + VVVV4_0( w_fp[0], w_fp[83], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + VVVV1_0( w_fp[0], w_fp[84], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + VVVV3_0( w_fp[0], w_fp[84], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + VVVV4_0( w_fp[0], w_fp[84], w_fp[8], w_fp[4], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1235( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1235 OF 1240 *** + // Wavefunction(s) for diagram number 1235 + VVV1P0_1( w_fp[0], w_fp[55], COUPs[0], 1.0, 0., 0., w_fp[104] ); + VVV1P0_1( w_fp[0], w_fp[83], COUPs[0], 1.0, 0., 0., w_fp[82] ); + VVV1P0_1( w_fp[0], w_fp[84], COUPs[0], 1.0, 0., 0., w_fp[81] ); + // Amplitude(s) for diagram number 1235 + VVV1_0( w_fp[8], w_fp[4], w_fp[104], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[4], w_fp[82], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0]; + VVV1_0( w_fp[8], w_fp[4], w_fp[81], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1236( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1236 OF 1240 *** + // Wavefunction(s) for diagram 
number 1236 + // (none) + // Amplitude(s) for diagram number 1236 + VVV1_0( w_fp[55], w_fp[4], w_fp[56], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) -= amp_sv[0]; + VVV1_0( w_fp[83], w_fp[4], w_fp[56], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 47 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 93 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + VVV1_0( w_fp[84], w_fp[4], w_fp[56], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 41 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 83 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 107 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 117 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1237( fptype* wfs, // input/output 
wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1237 OF 1240 *** + // Wavefunction(s) for diagram number 1237 + // (none) + // Amplitude(s) for diagram number 1237 + FFV1_0( w_fp[3], w_fp[33], w_fp[104], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[33], w_fp[82], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 49 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 59 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 65 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[3], w_fp[33], w_fp[81], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 48 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 57 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 63 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 69 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 71 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + 
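[Editor's note] Every diagramXXXX kernel in this patch follows the same pattern: one or more HELAS-style calls (FFV1_0, VVV1_0, VVVV1_0, ...) compute a single complex amplitude, which is then added to or subtracted from a fixed subset of the color-ordered partial amplitudes in jamps, in some cases with an extra factor of i (cxtype( 0, 1 )). The sketch below models only that accumulation step in plain standalone C++; accumulateDiagram, flows and the std::complex types are hypothetical stand-ins for illustration, not the generated code's actual interface.

#include <complex>
#include <utility>
#include <vector>

// Hypothetical model of the per-diagram color-flow accumulation: each
// (icol, sign) pair mirrors one generated line such as
//   J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0];
void accumulateDiagram( std::vector<std::complex<double>>& jamps,      // ncolor partial amplitudes
                        const std::complex<double>& amp,               // one computed amplitude
                        const std::vector<std::pair<int, int>>& flows, // (icol, +1 or -1) color flows
                        bool timesI = false )                          // true for the cxtype( 0, 1 ) variants
{
  const std::complex<double> a = timesI ? std::complex<double>( 0., 1. ) * amp : amp;
  for( const auto& [icol, sign] : flows ) jamps[icol] += double( sign ) * a;
}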
//-------------------------------------------------------------------------- + + __global__ void + diagram1238( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1238 OF 1240 *** + // Wavefunction(s) for diagram number 1238 + // (none) + // Amplitude(s) for diagram number 1238 + FFV1_0( w_fp[3], w_fp[114], w_fp[55], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 48 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[114], w_fp[83], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 49 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 51 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[114], w_fp[84], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 48 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 50 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 52 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 53 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1239( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three 
pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1239 OF 1240 *** + // Wavefunction(s) for diagram number 1239 + // (none) + // Amplitude(s) for diagram number 1239 + FFV1_0( w_fp[46], w_fp[2], w_fp[104], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[46], w_fp[2], w_fp[82], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0]; + FFV1_0( w_fp[46], w_fp[2], w_fp[81], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram1240( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef 
MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 1240 OF 1240 *** + // Wavefunction(s) for diagram number 1240 + // (none) + // Amplitude(s) for diagram number 1240 + FFV1_0( w_fp[99], w_fp[2], w_fp[55], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 40 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) += amp_sv[0]; + FFV1_0( w_fp[99], w_fp[2], w_fp[83], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 46 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 92 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + FFV1_0( w_fp[99], w_fp[2], w_fp[84], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 40 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 82 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 106 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 116 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/color_sum.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/color_sum.h new file mode 100644 index 0000000000..71b5b1eaad --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/color_sum.h @@ -0,0 +1,105 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin.
+ +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype_ref( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + static __device__ inline const cxtype + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + }; +#else + class HostAccessJamp + { + public: + static inline cxtype_sv& + kernelAccessIcol( cxtype_sv* buffer, const int icol ) + { + return buffer[icol]; + } + static inline cxtype_sv& + kernelAccessIcol( fptype* buffer, const int icol ) + { + return reinterpret_cast<cxtype_sv*>( buffer )[icol]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + 
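As a reading aid for the three stridings discussed in DeviceAccessJamp above, here is a minimal standalone sketch of the index arithmetic (illustrative only: the struct and function names are hypothetical, while the ncolor/nevt/icol/ievt conventions follow the header):

// Offsets of the real and imaginary parts of jamp(icol,ievt) within the fptype buffer
struct JampOffsets
{
  // "old" striding: ncolor separate 2*nevt blocks, one per color (ievt last)
  static int oldRe( int icol, int ievt, int /*ncolor*/, int nevt ) { return icol * 2 * nevt + ievt; }
  static int oldIm( int icol, int ievt, int /*ncolor*/, int nevt ) { return icol * 2 * nevt + nevt + ievt; }
  // "new1" striding (now used everywhere): one ncolor*nevt plane for Re, then one for Im (ievt last)
  static int new1Re( int icol, int ievt, int ncolor, int nevt ) { return 0 * ncolor * nevt + icol * nevt + ievt; }
  static int new1Im( int icol, int ievt, int ncolor, int nevt ) { return 1 * ncolor * nevt + icol * nevt + ievt; }
};

The point of "new1" is that each color row stays contiguous across events, so the buffer can be handed to cuBLAS/hipBLAS directly as two ncolor x nevt matrices without any repacking.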
//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI300X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist??
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
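Note that, on the source side, the whole HASBLAS machinery above compiles down to a single macro: hasNoBlas adds -DMGONGPU_HAS_NO_BLAS to BLASCXXFLAGS, while hasBlas adds the -lcublas/-lhipblas link flags used in the recipes below. A hedged sketch of the kind of compile-time guard this enables (the helper below is hypothetical, not code from the plugin):

// Hypothetical helper showing how HASBLAS=hasNoBlas steers the GPU color sum at compile time
inline bool colorSumUsesBlas()
{
#ifndef MGONGPU_HAS_NO_BLAS
  return true; // hasBlas build: color_sum_gpu may hand the jamp matrices to cuBLAS/hipBLAS
#else
  return false; // hasNoBlas build: only the plain CUDA/HIP reduction kernels are used
#endif
}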
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/diagram_boilerplate.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/diagram_boilerplate.h new file mode 100644 index 0000000000..96a34fb1bf --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/diagram_boilerplate.h @@ -0,0 +1,103 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin. 
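The header introduced here is not an ordinary include: as the diagram1239/diagram1240 hunks earlier in this patch show, it is textually included at the top of every generated diagramXXX kernel, so that each kernel body reduces to its per-diagram amplitude calls and jamps updates. Schematically (diagramNNN and the wavefunction indices are placeholders, not generated code):

__global__ void
diagramNNN( fptype* wfs, fptype* jamps, const unsigned int* channelIds, /* couplings or COUPs, */ fptype* numerators, fptype* denominators )
{
#include "diagram_boilerplate.h" // sets up W_ACCESS/J_ACCESS etc., w_fp[], COUPs[], amp_sv/amp_fp
  FFV1_0( w_fp[0], w_fp[1], w_fp[2], COUPs[1], 1.0, &amp_fp[0] ); // per-diagram amplitude
  J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; // per-color accumulation
}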
+ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-local-typedefs" // for CI_ACCESS and CD_ACCESS + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + + //------------- + // GPU only + //------------- + + //using namespace mg5amcGpu; + using W_ACCESS = DeviceAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = DeviceAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( channelIds ); +#endif + + // Wavefunctions + // Buffer wfs for one helicity and nevt events is a DeviceBufferSimple with ( nwf * nevt * nw6 * nx2 ) fptypes + // The striding between the nwf wavefunction buffers is ( nevt * nw6 * nx2 ) fptypes + // Internally diagramXXX methods pass a w_fp[iwf] to ixx/FFV methods (as argument 'fptype wavefunctions[]') + // Internally ixx/FFV methods call 'cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions )' and then use fi[iw6] + // This means that the fi pointer must point to a [RIRIRIRIRIRI] contiguous buffer of size nw6*nx2=12 + // The striding between events is nw6*nx2=12 and this is what W_ACCESS::kernelAccess must respect + // (En passant, note that this means that events cannot be contiguous in the present code, memory is not coalesced) + const int nevt = gridDim.x * blockDim.x; + fptype* w_fp[nwf]; + for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = wfs + iwf * nevt * nw6 * mgOnGpu::nx2; + + // Couplings + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic push +#pragma nv_diag_suppress 186 // e.g. 
<> +#endif + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic pop +#endif + const fptype* COUPs[nxcoup]; + for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; + +#else + + //------------- + // C++ only + //------------- + + //using namespace mg5amcCpu; + using W_ACCESS = HostAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = HostAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current SIMD event page (C++) + unsigned int channelId = *channelIds; +#endif + + // Wavefunctions + // Reinterpret wfs as "cxtype_sv w_sv[nwf][nw6]" and build "fptype* w_fp[nwf]" where "w_fp[iwf] = (fptype*)( w_sv[iwf] )" + fptype (*w_fp)[nw6 * neppV * mgOnGpu::nx2] = (fptype (*)[nw6 * neppV * mgOnGpu::nx2])(wfs); + +#endif + + //------------- + // GPU or C++ + //------------- + + // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) + cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram + fptype* amp_fp; // proof of concept for using fptype* in the interface + amp_fp = reinterpret_cast( amp_sv ); + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) + fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); +#else + assert( channelIds == nullptr ); + assert( numerators == nullptr ); + assert( denominators == nullptr ); +#endif /* clang-format on */ + +#pragma GCC diagnostic pop diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc index 4eec5db13c..216a90a302 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h index 53dd560ed6..c30f753dcb 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h @@ -2,13 +2,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Sep 2010) for the MG5aMC backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -860,7 +860,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6] - template + template __device__ INLINE void VVV1_0( const fptype allV1[], const fptype allV2[], @@ -872,7 +872,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ INLINE void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -885,7 +885,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ INLINE void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -897,7 +897,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ INLINE void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -910,7 +910,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ INLINE void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -923,7 +923,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ INLINE void FFV1P0_3( const fptype allF1[], const fptype allF2[], @@ -936,7 +936,7 @@ namespace mg5amcCpu 
//-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV1_0( const fptype allV1[], const fptype allV2[], @@ -949,7 +949,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -963,7 +963,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV3_0( const fptype allV1[], const fptype allV2[], @@ -976,7 +976,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV3P0_1( const fptype allV2[], const fptype allV3[], @@ -990,7 +990,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV4_0( const fptype allV1[], const fptype allV2[], @@ -1003,7 +1003,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV4P0_1( const fptype allV2[], const fptype allV3[], @@ -1017,7 +1017,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6] - template + template __device__ void VVV1_0( const fptype allV1[], const fptype allV2[], @@ -1030,7 +1030,7 @@ namespace mg5amcCpu const cxtype_sv* V1 = W_ACCESS::kernelAccessConst( allV1 ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P1[4] = { +cxreal( V1[0] ), +cxreal( V1[1] ), +cximag( V1[1] ), +cximag( V1[0] ) }; @@ -1053,7 +1053,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -1066,7 +1066,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. 
); const fptype_sv P2[4] = { +cxreal( V2[0] ), +cxreal( V2[1] ), +cximag( V2[1] ), +cximag( V2[0] ) }; @@ -1091,7 +1091,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -1104,7 +1104,7 @@ namespace mg5amcCpu const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const cxtype_sv TMP9 = ( F1[2] * ( F2[4] * ( V3[2] + V3[5] ) + F2[5] * ( V3[3] + cI * V3[4] ) ) + ( F1[3] * ( F2[4] * ( V3[3] - cI * V3[4] ) + F2[5] * ( V3[2] - V3[5] ) ) + ( F1[4] * ( F2[2] * ( V3[2] - V3[5] ) - F2[3] * ( V3[3] + cI * V3[4] ) ) + F1[5] * ( F2[2] * ( -V3[3] + cI * V3[4] ) + F2[3] * ( V3[2] + V3[5] ) ) ) ) ); @@ -1116,7 +1116,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -1129,7 +1129,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F1 = W_ACCESS::kernelAccess( allF1 ); const cxtype cI = cxmake( 0., 1. ); F1[0] = +F2[0] + V3[0]; @@ -1148,7 +1148,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -1161,7 +1161,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F2 = W_ACCESS::kernelAccess( allF2 ); const cxtype cI = cxmake( 0., 1. ); F2[0] = +F1[0] + V3[0]; @@ -1180,7 +1180,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ void FFV1P0_3( const fptype allF1[], const fptype allF2[], @@ -1193,7 +1193,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V3 = W_ACCESS::kernelAccess( allV3 ); const cxtype cI = cxmake( 0., 1. 
); V3[0] = +F1[0] + F2[0]; @@ -1211,7 +1211,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ void VVVV1_0( const fptype allV1[], const fptype allV2[], @@ -1226,7 +1226,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const cxtype_sv TMP10 = ( V1[2] * V4[2] - V1[3] * V4[3] - V1[4] * V4[4] - V1[5] * V4[5] ); @@ -1241,7 +1241,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ void VVVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -1256,7 +1256,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); V1[0] = +V2[0] + V3[0] + V4[0]; @@ -1276,7 +1276,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ void VVVV3_0( const fptype allV1[], const fptype allV2[], @@ -1291,7 +1291,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const cxtype_sv TMP1 = ( V2[2] * V1[2] - V2[3] * V1[3] - V2[4] * V1[4] - V2[5] * V1[5] ); @@ -1306,7 +1306,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ void VVVV3P0_1( const fptype allV2[], const fptype allV3[], @@ -1321,7 +1321,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. 
); V1[0] = +V2[0] + V3[0] + V4[0]; @@ -1341,7 +1341,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ void VVVV4_0( const fptype allV1[], const fptype allV2[], @@ -1356,7 +1356,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const cxtype_sv TMP1 = ( V2[2] * V1[2] - V2[3] * V1[3] - V2[4] * V1[4] - V2[5] * V1[5] ); @@ -1371,7 +1371,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ void VVVV4P0_1( const fptype allV2[], const fptype allV3[], @@ -1386,7 +1386,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); V1[0] = +V2[0] + V3[0] + V4[0]; diff --git a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc index 47a3a011b8..fd5642f3e3 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h index 76066c7bb1..f4b086fc96 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. 
+// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -310,7 +310,7 @@ namespace mg5amcCpu #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #endif // Compute the output couplings (e.g. gc10 and gc11) from the input gs - template + template __device__ inline void G2COUP( const fptype gs[], fptype couplings[], @@ -320,12 +320,12 @@ namespace mg5amcCpu using namespace Parameters_sm_dependentCouplings; const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr ); - fptype* GC_10s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); - fptype* GC_11s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); - fptype* GC_12s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_12 ); - cxtype_sv_ref GC_10s_sv = C_ACCESS::kernelAccess( GC_10s ); - cxtype_sv_ref GC_11s_sv = C_ACCESS::kernelAccess( GC_11s ); - cxtype_sv_ref GC_12s_sv = C_ACCESS::kernelAccess( GC_12s ); + fptype* GC_10s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); + fptype* GC_11s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); + fptype* GC_12s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_12 ); + cxtype_sv_ref GC_10s_sv = CD_ACCESS::kernelAccess( GC_10s ); + cxtype_sv_ref GC_11s_sv = CD_ACCESS::kernelAccess( GC_11s ); + cxtype_sv_ref GC_12s_sv = CD_ACCESS::kernelAccess( GC_12s ); GC_10s_sv = couplings_sv.GC_10; GC_11s_sv = couplings_sv.GC_11; GC_12s_sv = couplings_sv.GC_12; diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h index d3c4ca5695..8a2804aa04 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. 
-DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose whether cuBLAS and hipBLAS are supported for computing the color sums +// For both CUDA and HIP, by default, assume that BLAS is available, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h index 92d74fd6db..e98e925f2a 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J.
Teig, A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -717,12 +717,24 @@ namespace mg5amcCpu : m_preal( &r ), m_pimag( &i ) {} // copy (create from) const refs cxtype_ref& operator=( const cxtype_ref& ) = delete; //__host__ __device__ cxtype_ref& operator=( cxtype_ref&& c ) {...} // REMOVED! Should copy refs or copy values? No longer needed in cxternary - __host__ __device__ cxtype_ref& operator=( const cxtype& c ) + __host__ __device__ cxtype_ref& operator=( const cxtype& c ) // copy (assign) const values { *m_preal = cxreal( c ); *m_pimag = cximag( c ); return *this; - } // copy (assign) non-const values + } + __host__ __device__ cxtype_ref& operator+=( const cxtype& c ) + { + *m_preal += cxreal( c ); + *m_pimag += cximag( c ); + return *this; + } + __host__ __device__ cxtype_ref& operator-=( const cxtype& c ) + { + *m_preal -= cxreal( c ); + *m_pimag -= cximag( c ); + return *this; + } __host__ __device__ operator cxtype() const { return cxmake( *m_preal, *m_pimag ); } private: fptype* const m_preal; // const pointer to non-const fptype R diff --git a/epochX/cudacpp/gg_ttggg.sa/test/cudacpp_test.mk b/epochX/cudacpp/gg_ttggg.sa/test/cudacpp_test.mk index f703a1ae7c..1f9f8bbc46 100644 --- a/epochX/cudacpp/gg_ttggg.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/gg_ttggg.sa/test/cudacpp_test.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) @@ -19,7 +19,7 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt index 1baee42e06..31ed3df613 100644 --- a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.3 2025-06-12 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -49,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
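Looking back at the mgOnGpuCxtypes.h hunk above: the new operator+= and operator-= let the generated jamps updates accumulate a complex value through the two separate real/imaginary fptype pointers held by a cxtype_ref in a single expression. A minimal usage sketch (standalone and illustrative; the variable names are not from the plugin):

fptype jampRe = 0., jampIm = 0.;     // e.g. one (icol,ievt) slot of the two "new1" jamp planes
const cxtype amp = cxmake( 1., 0. ); // some amplitude value
cxtype_ref jref( jampRe, jampIm );   // refers to the (Re,Im) pair without copying it
jref += cxtype( 0, 1 ) * amp;        // in-place 'jamp += i*amp', as in the diagram kernels
jref -= cxtype( 0, 1 ) * amp;        // in-place 'jamp -= i*amp'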
Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -57,7 +57,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006198406219482422  +DEBUG: model prefixing takes 0.005407094955444336  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -171,16 +171,16 @@ Total: 8 processes with 40 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gq_ttq --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4334]  +DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4166]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  INFO: initialize a new directory: CODEGEN_mad_gq_ttq INFO: remove old information in CODEGEN_mad_gq_ttq -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g u > t t~ u WEIGHTED<=3 @1 INFO: Processing color information for process: g u > t t~ u @1 @@ -200,9 +200,9 @@ FileWriter t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1665]  INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -211,62 +211,62 @@ FileWriter t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1552]  -Generated helas calls for 2 subprocesses (10 diagrams) in 0.030 s -Wrote files for 32 helas calls in 0.164 s +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1665]  +Generated helas calls for 2 subprocesses (10 diagrams) in 0.031 s +Wrote files for 32 helas calls in 0.167 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.148 s +ALOHA: aloha creates 2 routines in 0.141 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 4 routines in 0.132 s +ALOHA: aloha creates 4 routines in 0.127 s FFV1 FFV1 FFV1 FFV1 VVV1 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq; patch -p4 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file SubProcesses/makefile -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/SubProcesses/P1_gu_ttxu; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/SubProcesses/P1_gu_ttxu; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #1 succeeded at 74 (offset 3 lines). Hunk #2 succeeded at 254 (offset 27 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/SubProcesses/P1_gux_ttxux; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/SubProcesses/P1_gux_ttxux; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #1 succeeded at 74 (offset 3 lines). Hunk #2 succeeded at 254 (offset 27 lines). -DEBUG: p.returncode =  0 [output.py at line 263]  -Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq done. +DEBUG: p.returncode =  0 [output.py at line 268]  +Output to directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq done. Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/README +/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/README Run "open index.html" to see more information about this process. quit -real 0m2.210s -user 0m1.890s -sys 0m0.303s -Code generation completed in 2 seconds +real 0m2.227s +user 0m1.929s +sys 0m0.297s +Code generation completed in 3 seconds ************************************************************ * * * W E L C O M E to * @@ -279,7 +279,7 @@ Code generation completed in 2 seconds * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.3 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -287,9 +287,9 @@ Code generation completed in 2 seconds * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt @@ -309,7 +309,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.3 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -317,9 +317,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt diff --git a/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt index 68b4c46295..07d8d59d1b 100644 --- a/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat index 795e11afaf..9dfd669871 100644 --- a/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.6.0 2024-09-30 * +#* VERSION 3.6.3 2025-06-12 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. 
* #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/gq_ttq.mad/Cards/run_card.dat b/epochX/cudacpp/gq_ttq.mad/Cards/run_card.dat index 66a805e521..3db737130c 100644 --- a/epochX/cudacpp/gq_ttq.mad/Cards/run_card.dat +++ b/epochX/cudacpp/gq_ttq.mad/Cards/run_card.dat @@ -109,6 +109,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/gq_ttq.mad/Cards/run_card_default.dat b/epochX/cudacpp/gq_ttq.mad/Cards/run_card_default.dat index 8c0f1e2199..47c2051950 100644 --- a/epochX/cudacpp/gq_ttq.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/gq_ttq.mad/Cards/run_card_default.dat @@ -109,6 +109,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/gq_ttq.mad/MGMEVersion.txt b/epochX/cudacpp/gq_ttq.mad/MGMEVersion.txt index 084e244cea..1ac53bb4bd 100644 --- a/epochX/cudacpp/gq_ttq.mad/MGMEVersion.txt +++ b/epochX/cudacpp/gq_ttq.mad/MGMEVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.3 \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/Source/.make_opts b/epochX/cudacpp/gq_ttq.mad/Source/.make_opts index de3864242b..56ba259c56 100644 --- a/epochX/cudacpp/gq_ttq.mad/Source/.make_opts +++ b/epochX/cudacpp/gq_ttq.mad/Source/.make_opts @@ -102,6 +102,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf + alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -113,10 +114,11 @@ ifneq ($(lhapdf),) endif else alfas_functions=alfas_functions + alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif # Helper function to check MG5 version define CHECK_MG5AMC_VERSION python -c 'import re; from distutils.version import StrictVersion; print StrictVersion("$(MG5AMC_VERSION)") >= StrictVersion("$(1)") if re.match("^[\d\.]+$$","$(MG5AMC_VERSION)") else True;' -endef \ No newline at end of file +endef diff --git a/epochX/cudacpp/gq_ttq.mad/Source/alfas_functions.f b/epochX/cudacpp/gq_ttq.mad/Source/alfas_functions.f index bb69a6384e..84aeff369c 100644 --- a/epochX/cudacpp/gq_ttq.mad/Source/alfas_functions.f +++ b/epochX/cudacpp/gq_ttq.mad/Source/alfas_functions.f @@ -188,6 +188,10 @@ SUBROUTINE NEWTON1(T,A_IN,A_OUT,NLOOP,NF) A_OUT=A_IN/(1D0+A_IN*B0(NF)*T) IF (NLOOP .EQ. 1) RETURN + if (1D0+A_IN*B0(NF)*T.le.0d0)THEN + A_OUT = 9d98 + RETURN + ENDIF A_OUT=A_IN/(1D0+B0(NF)*A_IN*T+C1(NF)*A_IN*LOG(1D0+A_IN*B0(NF)*T)) IF (A_OUT .LT. 
0D0) AS=0.3D0 30 AS=A_OUT diff --git a/epochX/cudacpp/gq_ttq.mad/Source/cuts.inc b/epochX/cudacpp/gq_ttq.mad/Source/cuts.inc index 23d099e5f7..a8ccc7420d 100644 --- a/epochX/cudacpp/gq_ttq.mad/Source/cuts.inc +++ b/epochX/cudacpp/gq_ttq.mad/Source/cuts.inc @@ -37,7 +37,7 @@ C REAL*8 misset,missetmax,ptheavy REAL*8 ptllmin,ptllmax integer maxjetflavor - REAl*8 dsqrt_shat + REAl*8 dsqrt_shat,dsqrt_shatmax COMMON /to_min_max_cuts/ & PTJmax,PTBmax,PTAmax,PTLmax, @@ -60,7 +60,7 @@ C & ht2max,ht3max,ht4max, & htjmin,htjmax,ihtmin,ihtmax, & misset,missetmax,ptheavy, - & ptllmin,ptllmax,dsqrt_shat, + & ptllmin,ptllmax,dsqrt_shat,dsqrt_shatmax, & maxjetflavor C diff --git a/epochX/cudacpp/gq_ttq.mad/Source/make_opts b/epochX/cudacpp/gq_ttq.mad/Source/make_opts index e4b87ee6ad..f10336e42e 100644 --- a/epochX/cudacpp/gq_ttq.mad/Source/make_opts +++ b/epochX/cudacpp/gq_ttq.mad/Source/make_opts @@ -103,6 +103,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf +alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -114,6 +115,7 @@ endif endif else alfas_functions=alfas_functions +alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif diff --git a/epochX/cudacpp/gq_ttq.mad/Source/makefile b/epochX/cudacpp/gq_ttq.mad/Source/makefile index 291ca907ee..87a9e61723 100644 --- a/epochX/cudacpp/gq_ttq.mad/Source/makefile +++ b/epochX/cudacpp/gq_ttq.mad/Source/makefile @@ -37,10 +37,12 @@ all: $(LIBRARIES) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDI $(LIBDIR)libdsample.$(libext): $(DSAMPLE) $(call CREATELIB, $@, $^) $(LIBDIR)libgeneric.$(libext): $(GENERIC) + rm -f $@ 2>/dev/null $(call CREATELIB, $@, $^) + rm -f $(alfas_to_clean) 2>/dev/null $(LIBDIR)libdhelas.$(libext): DHELAS cd DHELAS; make; cd .. -$(LIBDIR)libpdf.$(libext): PDF make_opts +$(LIBDIR)libpdf.$(libext): PDF $(alfas_functions).o cd PDF; make; cd .. ifneq (,$(filter edff chff, $(pdlabel1) $(pdlabel2))) $(LIBDIR)libgammaUPC.$(libext): PDF/gammaUPC @@ -73,6 +75,7 @@ $(BINDIR)gensudgrid: $(GENSUDGRID) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUP # Dependencies dsample.o: DiscreteSampler.o dsample.f genps.inc StringCast.o vector.inc +pawgraph.o: vector.inc DiscreteSampler.o: StringCast.o invarients.o: invarients.f genps.inc gen_ximprove.o: gen_ximprove.f run_config.inc run_card.inc diff --git a/epochX/cudacpp/gq_ttq.mad/Source/run_card.inc b/epochX/cudacpp/gq_ttq.mad/Source/run_card.inc index 1a1bc782bd..8bd5f73840 100644 --- a/epochX/cudacpp/gq_ttq.mad/Source/run_card.inc +++ b/epochX/cudacpp/gq_ttq.mad/Source/run_card.inc @@ -88,6 +88,8 @@ DSQRT_SHAT = 0.000000000000000D+00 + DSQRT_SHATMAX = -1 + LIMHEL = 0.000000000000000D+00 PTJ = 2.000000000000000D+01 diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h index 87aa648dd2..a45024704a 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. 
Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -255,11 +255,15 @@ namespace mg5amcCpu throw std::logic_error( "Bridge constructor: FIXME! cannot choose gputhreads" ); // this should never happen! m_gpublocks = m_nevt / m_gputhreads; } +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters @@ -290,8 +294,10 @@ namespace mg5amcCpu throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -347,7 +353,9 @@ namespace mg5amcCpu if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> ) @@ -400,7 +408,9 @@ namespace mg5amcCpu } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
#ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) )
#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. 
#ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MGVersion.txt index 084e244cea..1ac53bb4bd 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.3 \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc index f463977c1a..a68ae314eb 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,28 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() + , m_pHelWfs() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_helBlasHandles() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +353,82 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); + // Create the "one-helicity" wavefunction buffer that will be used for helicity filtering + m_pHelWfs.reset( new DeviceBufferSimple( CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? 
+ std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { +#ifndef MGONGPU_HAS_NO_BLAS + if( m_helBlasHandles[ihel] ) gpuBlasDestroy( m_helBlasHandles[ihel] ); +#endif + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +445,61 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. 
Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity + // Attach a different stream to each cuBLAS/hipBLAS handle + if( m_blasColorSum ) + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + checkGpuBlas( gpuBlasCreate( &m_helBlasHandles[ighel] ) ); + checkGpuBlas( gpuBlasSetStream( m_helBlasHandles[ighel], m_helStreams[ighel] ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_helBlasHandles[ighel], CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelWfs.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +507,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* ghelBlasHandles = ( m_blasColorSum ? m_helBlasHandles : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* ghelBlasHandles = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +527,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? 
m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h index 7acff4b308..c901874333 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] - static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,24 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelJamps; + + // The super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelWfs; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +220,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers
+ std::unique_ptr<DeviceBufferSimple2> m_pHelBlasTmp; + + // The array of cuBLAS/hipBLAS handles (one for each good helicity) + gpuBlasHandle_t m_helBlasHandles[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessAmplitudes.h index 0d92f69c43..a49f041e05 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessAmplitudes_H #define MemoryAccessAmplitudes_H 1 @@ -10,10 +10,6 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_AMPLITUDES 1 - // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -23,120 +19,11 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // A class describing the internal layout of memory buffers for amplitudes - // This implementation uses an AOSOA[npagA][nx2][neppA] where nevt=npagA*neppA - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessAmplitudesBase //_AOSOAv1 - { - public: - - // Number of Events Per Page in the amplitude AOSOA memory buffer layout - static constexpr int neppA = 1; // AOS (just a test...) 
- - private: - - friend class MemoryAccessHelper; - friend class KernelAccessHelper; - friend class KernelAccessHelper; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) - { - const int ipagA = ievt / neppA; // #event "A-page" - const int ieppA = ievt % neppA; // #event in the current event A-page - constexpr int ix2 = 0; - return &( buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA] ); // AOSOA[ipagA][ix2][ieppA] - } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... args" to "const int ix2" and rename "Field" as "Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int ix2 ) - { - constexpr int ipagA = 0; - constexpr int ieppA = 0; - return buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA]; // AOSOA[ipagA][ix2][ieppA] - } - }; - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessAmplitudes : public MemoryAccessAmplitudesBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2 = MemoryAccessHelper::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2Const = - MemoryAccessHelper::template decodeRecordConst; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt, 
const int ix2 ) <===] - static constexpr auto ieventAccessIx2 = - MemoryAccessHelper::template ieventAccessField; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===] - static constexpr auto ieventAccessIx2Const = - MemoryAccessHelper::template ieventAccessFieldConst; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations + // A class providing trivial access to amplitude memory buffers template class KernelAccessAmplitudes { public: - -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2 = - KernelAccessHelper::template kernelAccessField; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2Const = - KernelAccessHelper::template kernelAccessFieldConst; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { @@ -148,8 +35,6 @@ namespace mg5amcCpu { return reinterpret_cast( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES }; //---------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessCouplings.h index 55504a2b90..caee99a7fd 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessCouplings.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessCouplings_H #define MemoryAccessCouplings_H 1 @@ -235,7 +235,7 @@ namespace mg5amcCpu /* fptype_sv& real = kernelAccessIx2( buffer, 0 ); fptype_sv& imag = kernelAccessIx2( buffer, 1 ); - printf( "C_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv_ref( real, imag ); */ return cxtype_sv_ref( kernelAccessIx2( buffer, 0 ), @@ -250,7 +250,7 @@ namespace mg5amcCpu /* const fptype_sv& real = kernelAccessIx2Const( buffer, 0 ); const fptype_sv& imag = kernelAccessIx2Const( buffer, 1 ); - printf( "C_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv( real, imag ); */ return cxtype_sv( kernelAccessIx2Const( buffer, 0 ), diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessWavefunctions.h index 9f4c620bc7..bbffc1fb36 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessWavefunctions_H #define MemoryAccessWavefunctions_H 1 @@ -10,9 +10,7 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 +#include "CPPProcess.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL @@ -23,147 +21,44 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // A class describing the internal layout of memory buffers for wavefunctions - // This implementation uses an AOSOA[npagW][nw6][nx2][neppW] where nevt=npagW*neppW - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessWavefunctionsBase //_AOSOAv1 +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessWavefunctions { public: - - // Number of Events Per Page in the wavefunction AOSOA memory buffer layout - static constexpr int neppW = 1; // AOS (just a test...) 
- - private: - - friend class MemoryAccessHelper; - friend class KernelAccessHelper; - friend class KernelAccessHelper; - - // The number of components of a (fermion or vector) wavefunction - static constexpr int nw6 = mgOnGpu::nw6; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) + static __host__ __device__ inline cxtype_sv* + kernelAccess( fptype* buffer ) { - const int ipagW = ievt / neppW; // #event "W-page" - const int ieppW = ievt % neppW; // #event in the current event W-page - constexpr int iw6 = 0; - constexpr int ix2 = 0; - return &( buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW] ); // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... 
args" to "const int iw6, const int ix2" and rename "Field" as "Iw6Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int iw6, - const int ix2 ) + static __host__ __device__ inline const cxtype_sv* + kernelAccessConst( const fptype* buffer ) { - constexpr int ipagW = 0; - constexpr int ieppW = 0; - return buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW]; // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } }; +#endif //---------------------------------------------------------------------------- - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessWavefunctions : public MemoryAccessWavefunctionsBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2 = MemoryAccessHelper::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2Const = - MemoryAccessHelper::template decodeRecordConst; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIw6Ix2( fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2 = - MemoryAccessHelper::template ieventAccessField; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIw6Ix2Const( const fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2Const = - MemoryAccessHelper::template ieventAccessFieldConst; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations - template - class KernelAccessWavefunctions + class HostAccessWavefunctions { public: - -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes 
(input) - // [Signature (non-const) ===> fptype& kernelAccessIw6Ix2( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2 = - KernelAccessHelper::template kernelAccessField; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIw6Ix2Const( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2Const = - KernelAccessHelper::template kernelAccessFieldConst; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { return reinterpret_cast( buffer ); } - static __host__ __device__ inline const cxtype_sv* kernelAccessConst( const fptype* buffer ) { return reinterpret_cast( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS }; //---------------------------------------------------------------------------- - typedef KernelAccessWavefunctions HostAccessWavefunctions; - typedef KernelAccessWavefunctions DeviceAccessWavefunctions; - - //---------------------------------------------------------------------------- - } // end namespace mg5amcGpu/mg5amcCpu #endif // MemoryAccessWavefunctions_H diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h index 65a101888d..c8db607db6 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase<T>( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase<T>( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template<typename T, size_t sizePerEvent> - class DeviceBuffer : public DeviceBufferBase<T>, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase<T>, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase<T>( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase<T>( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer<fptype, 1> DeviceBufferSimple; + typedef DeviceBuffer<fptype2, 1> DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase<fptype> BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer<fptype, sizePerEventNumerators> HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer<fptype, sizePerEventNumerators> PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer<fptype, sizePerEventNumerators> DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer<fptype, sizePerEventDenominators> HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer<fptype, sizePerEventDenominators>
PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer<fptype, sizePerEventDenominators> DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer<fptype, sizePerEventCouplings> HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer<fptype, sizePerEventCouplings> PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer<fptype, sizePerEventCouplings> DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for jamps + typedef DeviceBuffer<fptype, sizePerEventJamps> DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template<class Tdst, class Tsrc> void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc index 99573ab87a..1ee522dbfd 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,20 +101,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 4; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,57 +169,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the 
current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - 
using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -229,333 +282,143 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 8; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structures for wavefunctions/amplitudes in all events (no kernel splitting yet)!
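The removed comments above document the old design: wavefunctions and amplitudes had "trivial access" as thread-local arrays because there was no kernel splitting yet. With the per-diagram kernels introduced by this PR, CUDA wavefunctions move to a global-memory super-buffer, sliced per good helicity as wf[nwf*nw6*2*nevt] (see the ghelAllWfs offsets later in this diff). A standalone sketch of that buffer arithmetic, assuming an events-fastest SoA order (the exact component order inside a slice is an assumption made here for illustration):

#include <cstdio>

int main()
{
  // Dimensions used in this diff: nwf=8 wavefunctions of nw6=6 complex components
  // (nx2=2 floats each) per event, with one slice per good helicity in ghelAllWfs
  const int nevt = 16, nwf = 8, nw6 = 6, nx2 = 2, nGoodHel = 4;
  const int sliceSize = nwf * nw6 * nx2 * nevt; // wf[nwf*nw6*2*nevt], as in the calculate_jamps signature
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
    printf( "helicity slice %d starts at offset %d\n", ighel, ighel * sliceSize );
  // Hypothetical SoA index of component (iwf, iw6, ix2) of event ievt inside one slice:
  // events run fastest so that consecutive GPU threads touch consecutive addresses
  const int iwf = 5, iw6 = 3, ix2 = 1, ievt = 7;
  const int idx = ( ( iwf * nw6 + iw6 ) * nx2 + ix2 ) * nevt + ievt;
  printf( "component (%d,%d,%d) of event %d sits at index %d of %d\n", iwf, iw6, ix2, ievt, idx, sliceSize );
  return 0;
}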
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g.
<<warning #186-D: pointless comparison of unsigned integer with zero>> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 5 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx<M_ACCESS, W_ACCESS>( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - ixxxxx<M_ACCESS, W_ACCESS>( momenta, 0., cHel[ihel][1], +1, w_fp[1], 1 ); - - oxxxxx<M_ACCESS, W_ACCESS>( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx<M_ACCESS, W_ACCESS>( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - oxxxxx<M_ACCESS, W_ACCESS>( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); - - FFV1_2<W_ACCESS, CD_ACCESS>( w_fp[1], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[5] ); - FFV1P0_3<W_ACCESS, CD_ACCESS>( w_fp[3], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[6] ); - - // Amplitude(s) for diagram number 1 - FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[5], w_fp[4], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2(
amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); #endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[2] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 2 OF 5 *** - - // Wavefunction(s) for diagram number 2 - FFV1_1<W_ACCESS, CD_ACCESS>( w_fp[2], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[1], w_fp[5] ); - FFV1P0_3<W_ACCESS, CD_ACCESS>( w_fp[1], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[7] ); - - // Amplitude(s) for diagram number 2 - FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[3], w_fp[5], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[1] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 3 OF 5 *** - - // Wavefunction(s) for diagram number 3 - FFV1_2<W_ACCESS, CD_ACCESS>( w_fp[3], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[1], w_fp[5] ); - // Amplitude(s) for diagram number 3 - FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[5], w_fp[2], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - jamp_sv[1] += 1. / 6. * amp_sv[0]; - jamp_sv[3] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 4 OF 5 *** - - // Wavefunction(s) for diagram number 4 - FFV1_1<W_ACCESS, CD_ACCESS>( w_fp[4], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[5] ); - - // Amplitude(s) for diagram number 4 - FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[1], w_fp[5], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[2] += 1. / 6. * amp_sv[0]; - jamp_sv[3] -= 1. / 2.
* amp_sv[0]; - - // *** DIAGRAM 5 OF 5 *** - // Wavefunction(s) for diagram number 5 - // (none) - - // Amplitude(s) for diagram number 5 - VVV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[0], w_fp[7], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gu_ttxu()?) - - // The color denominators (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] - - // The color matrix (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 12, 4, 4, 0 }, - { 4, 12, 0, 4 }, - { 4, 0, 12, 4 }, - { 0, 4, 4, 12 } }; // 2-D array[4][4] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
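For reference, the color algebra removed above computes |M|^2 as the quadratic form sum_{i,j} jamp_i^* cf[i][j] jamp_j / denom[i]; since cf is real and symmetric and jamp = A + iB, this reduces to A.M.A + B.M.B, and the cf2 trick folds the factor 2 and the 1/denom normalization into a triangular matrix at compile time. A standalone numerical sketch (double precision, using the gu_ttxu cf/denom values above; the sample jamp values are made up) checking that the two formulations agree:

#include <cassert>
#include <cmath>
#include <complex>
#include <cstdio>

int main()
{
  constexpr int ncolor = 4;
  constexpr double denom[ncolor] = { 1, 1, 1, 1 };
  constexpr double cf[ncolor][ncolor] = {
    { 12, 4, 4, 0 }, { 4, 12, 0, 4 }, { 4, 0, 12, 4 }, { 0, 4, 4, 12 } };
  const std::complex<double> jamp[ncolor] = { { 1., 2. }, { -.5, .3 }, { .7, -.1 }, { 0., 1. } };
  // Full quadratic form (the "CUDA" variant in the removed code)
  double me1 = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztR = 0, ztI = 0;
    for( int j = 0; j < ncolor; j++ )
    {
      ztR += cf[i][j] * jamp[j].real();
      ztI += cf[i][j] * jamp[j].imag();
    }
    me1 += ( ztR * jamp[i].real() + ztI * jamp[i].imag() ) / denom[i];
  }
  // Triangular normalized form (the "C++" variant): diagonal once, off-diagonal twice
  double me2 = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztR = cf[i][i] / denom[i] * jamp[i].real();
    double ztI = cf[i][i] / denom[i] * jamp[i].imag();
    for( int j = i + 1; j < ncolor; j++ )
    {
      ztR += 2 * cf[i][j] / denom[i] * jamp[j].real();
      ztI += 2 * cf[i][j] / denom[i] * jamp[j].imag();
    }
    me2 += jamp[i].real() * ztR + jamp[i].imag() * ztI;
  }
  assert( std::abs( me1 - me2 ) < 1e-12 );
  printf( "me1=%f me2=%f\n", me1, me2 );
  return 0;
}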
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv *
cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 5 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif -#endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -610,7 +473,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -644,6 +511,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in
file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -685,6 +556,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -787,26 +662,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -814,25 +689,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +
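The add_and_select_hel fragment above draws one helicity per event by inverse-CDF sampling: the per-helicity |M|^2 values are first accumulated into a running sum (reusing the ghelAllMEs buffer in place), and the first good helicity whose cumulative fraction exceeds the random number allrndhel[ievt] is selected, in the Fortran convention [1,ncomb]. A host-side sketch of the same logic with made-up numbers (illustration only):

#include <cstdio>

int main()
{
  const int nGoodHel = 4;
  const int goodHel[nGoodHel] = { 0, 3, 5, 6 };  // hypothetical good helicity indices
  double mes[nGoodHel] = { 0.1, 0.4, 0.2, 0.3 }; // hypothetical |M|^2 per good helicity
  const double rndhel = 0.65;                    // random number in [0,1)
  double total = 0;
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
  {
    total += mes[ighel];
    mes[ighel] = total; // reuse the buffer to store the running sum, as in the kernel above
  }
  int selhel = -1;
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
  {
    if( rndhel < mes[ighel] / total )
    {
      selhel = goodHel[ighel] + 1; // Fortran convention [1,ncomb]
      break;
    }
  }
  printf( "selected ihelF=%d\n", selhel ); // cumulative fractions are 0.1, 0.5, 0.7, 1.0: rndhel=0.65 picks goodHel[2]+1 = 6
  return 0;
}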
__global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
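The update_jamp2s kernel above must use atomicAdd because, with one CUDA stream per good helicity, several kernel instances may accumulate into the same colAllJamp2s buffer concurrently. A self-contained CUDA sketch of that accumulate-across-streams pattern (the kernel name and sizes are hypothetical; atomicAdd on double requires compute capability 6.0 or later, e.g. build with -arch=sm_60):

#include <cstdio>
#include <cuda_runtime.h>

__global__ void accumulate( double* sums, const double* contrib, int n )
{
  const int i = blockDim.x * blockIdx.x + threadIdx.x;
  if( i < n ) atomicAdd( &sums[i], contrib[i] ); // safe even if stream executions overlap
}

int main()
{
  const int n = 256, nstreams = 4;
  double *sums, *contrib;
  cudaMallocManaged( &sums, n * sizeof( double ) );
  cudaMallocManaged( &contrib, n * sizeof( double ) );
  for( int i = 0; i < n; i++ ) { sums[i] = 0; contrib[i] = 1; }
  cudaStream_t streams[nstreams];
  for( int s = 0; s < nstreams; s++ ) cudaStreamCreate( &streams[s] );
  for( int s = 0; s < nstreams; s++ ) // one launch per stream, like one launch per helicity above
    accumulate<<<1, n, 0, streams[s]>>>( sums, contrib, n );
  cudaDeviceSynchronize();
  printf( "sums[0]=%f (expect %d)\n", sums[0], nstreams );
  for( int s = 0; s < nstreams; s++ ) cudaStreamDestroy( streams[s] );
  cudaFree( sums );
  cudaFree( contrib );
  return 0;
}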
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -970,20 +1049,14 @@ namespace mg5amcCpu // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -995,17 +1068,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) );
+ gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1031,93 +1107,63 @@ #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } - } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) Then, in multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream + for( int ighel = 0;
ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? 
ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1159,7 +1205,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1182,7 +1228,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1191,25 +1237,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1219,8 +1271,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1236,11 +1290,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1342,14 +1397,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h index b501a9772e..3956ab144a 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and 
contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -78,17 +79,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 32; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 5; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 4; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 8; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 5; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here)
   //static const int namplitudes = 5;
   //static const int ncomb = 32; // CPPProcess::ncomb
@@ -125,23 +126,26 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
-  __global__ void
+  void
   sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4]
                        const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
                        fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                       fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities
-                       fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
+                       fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities
+                       fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities
#endif
-                       bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation)
+                       fptype* allJamps, // output: jamp[ncolor*2*nevt]
+                       fptype* allWfs, // output: wf[nwf*nw6*2*nevt]
+                       bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation)
+                       const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
#else
-  __global__ void
+  void
   sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4]
                        const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
                        fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                       fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities
-                       fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
+                       fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities
+                       fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities
#endif
                        bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation)
                        const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
@@ -155,34 +159,46 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
-  __global__ void
+  void
   sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4]
             const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
             const fptype* allrndhel, // input: random numbers[nevt] for helicity selection
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
             const fptype* allrndcol, // input: random numbers[nevt] for color selection
+            const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+#endif
             fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
+            int* allselhel, // output: helicity selection[nevt]
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-            const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
-            fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities
-            fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
+            int* allselcol, // output: color selection[nevt]
+            fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
+            fptype* ghelAllNumerators, // tmp: allNumerators
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f index b0cc58e89c..2d49642e74 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f index 2b281a8200..a45203b57e 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,14 +140,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF D2=PDG2PDF(LPP(IB(2)),1, IB(2),XBK(IB(2)), QSCALE) U2=PDG2PDF(LPP(IB(2)),2, IB(2),XBK(IB(2)), QSCALE) @@ -234,7 +234,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -309,6 +309,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -393,18 +397,18 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) D2(IVEC)=PDG2PDF(LPP(IB(2)),1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) U2(IVEC)=PDG2PDF(LPP(IB(2)),2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) S2(IVEC)=PDG2PDF(LPP(IB(2)),3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) C2(IVEC)=PDG2PDF(LPP(IB(2)),4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -528,6 +532,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/color_sum.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/color_sum.cc new file mode 100644 index 0000000000..389d8f6535 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/color_sum.cc @@ -0,0 +1,385 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. 
Valassi (Sep 2025) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin.
+
+#include "color_sum.h"
+
+#include "mgOnGpuConfig.h"
+
+#include "MemoryAccessMatrixElements.h"
+
+#ifdef MGONGPUCPP_GPUIMPL
+namespace mg5amcGpu
+#else
+namespace mg5amcCpu
+#endif
+{
+  constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors
+
+  //--------------------------------------------------------------------------
+
+  // *** COLOR MATRIX BELOW ***
+
+  // The color denominators (initialize all array elements, with ncolor=4)
+  // [NB do keep 'static' for these constexpr arrays, see issue #283]
+  static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4]
+
+  // The color matrix (initialize all array elements, with ncolor=4)
+  // [NB do keep 'static' for these constexpr arrays, see issue #283]
+  static constexpr fptype2 colorMatrix[ncolor][ncolor] = {
+    { 12, 4, 4, 0 },
+    { 4, 12, 0, 4 },
+    { 4, 0, 12, 4 },
+    { 0, 4, 4, 12 } }; // 2-D array[4][4]
+
+#ifdef MGONGPUCPP_GPUIMPL
+  // The normalized color matrix (divide each column by denom)
+  template<typename T>
+  struct NormalizedColorMatrix
+  {
+    constexpr __host__ __device__ NormalizedColorMatrix()
+      : value()
+    {
+      for( int icol = 0; icol < ncolor; icol++ )
+        for( int jcol = 0; jcol < ncolor; jcol++ )
+          value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol];
+    }
+    T value[ncolor * ncolor];
+  };
+  // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas)
+  static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor];
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  void createNormalizedColorMatrix()
+  {
+    static bool first = true;
+    if( first )
+    {
+      first = false;
+      constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2;
+      gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) );
+    }
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifndef MGONGPUCPP_GPUIMPL
+  void
+  color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity
+                 const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity
+                 const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid)
+  {
+    // Pre-compute a constexpr triangular color matrix properly normalized #475
+    struct TriangularNormalizedColorMatrix
+    {
+      // See https://stackoverflow.com/a/34465458
+      __host__ __device__ constexpr TriangularNormalizedColorMatrix()
+        : value()
+      {
+        for( int icol = 0; icol < ncolor; icol++ )
+        {
+          // Diagonal terms
+          value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol];
+          // Off-diagonal terms
+          for( int jcol = icol + 1; jcol < ncolor; jcol++ )
+            value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol];
+        }
+      }
+      fptype2 value[ncolor][ncolor];
+    };
+    static constexpr auto cf2 = TriangularNormalizedColorMatrix();
+    // Use the property that M is a real matrix (see #475):
+    // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB
+    // In addition, on C++ use the property that M is symmetric (see #475),
+    // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time:
+    // we gain (not a factor 2...)
in speed here as we only loop over the upper-triangular part of the matrix.
+    // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
+    fptype_sv deltaMEs = { 0 };
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+    fptype_sv deltaMEs_next = { 0 };
+    // Mixed mode: merge two neppV vectors into one neppV2 vector
+    fptype2_sv jampR_sv[ncolor];
+    fptype2_sv jampI_sv[ncolor];
+    for( int icol = 0; icol < ncolor; icol++ )
+    {
+      jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) );
+      jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) );
+    }
+#else
+    const cxtype_sv* jamp_sv = allJamp_sv;
+#endif
+    // Loop over icol
+    for( int icol = 0; icol < ncolor; icol++ )
+    {
+      // Diagonal terms
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+      fptype2_sv& jampRi_sv = jampR_sv[icol];
+      fptype2_sv& jampIi_sv = jampI_sv[icol];
+#else
+      fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) );
+      fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) );
+#endif
+      fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv;
+      fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv;
+      // Loop over jcol
+      for( int jcol = icol + 1; jcol < ncolor; jcol++ )
+      {
+        // Off-diagonal terms
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+        fptype2_sv& jampRj_sv = jampR_sv[jcol];
+        fptype2_sv& jampIj_sv = jampI_sv[jcol];
+#else
+        fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) );
+        fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) );
+#endif
+        ztempR_sv += cf2.value[icol][jcol] * jampRj_sv;
+        ztempI_sv += cf2.value[icol][jcol] * jampIj_sv;
+      }
+      fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+      deltaMEs += fpvsplit0( deltaMEs2 );
+      deltaMEs_next += fpvsplit1( deltaMEs2 );
+#else
+      deltaMEs += deltaMEs2;
+#endif
+    }
+    // *** STORE THE RESULTS ***
+    using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events
+    fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 );
+    // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
+    fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
+    MEs_sv += deltaMEs; // fix #435
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+    fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV );
+    fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next );
+    MEs_sv_next += deltaMEs_next;
+#endif
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  __global__ void
+  color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity
+                    const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity
+  {
+    using J_ACCESS = DeviceAccessJamp;
+    fptype jampR[ncolor];
+    fptype jampI[ncolor];
+    for( int icol = 0; icol < ncolor; icol++ )
+    {
+      cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol );
+      jampR[icol] = jamp.real();
+      jampI[icol] = jamp.imag();
+    }
+    // Loop over icol
+    fptype deltaMEs = { 0 };
+    for( int icol = 0; icol < ncolor; icol++ )
+    {
+      fptype2 ztempR = { 0 };
+      fptype2 ztempI = { 0 };
+      // Loop over
jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // 
Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity
+    fptype2* allZtempBoth = allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer
+    fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer
+    fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer
+    // Convert jamps from double to float
+    gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps );
+    // Real and imaginary components
+    const fptype2* allJampsReal = allJampsFpt2;
+    const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt;
+#else
+    static_assert( std::is_same<fptype, fptype2>::value ); // sanity check
+    // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer
+    fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer
+    fptype2* allMEsFpt2 = allMEs;
+    // Real and imaginary components
+    const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical)
+    const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical)
+#endif
+    // Real and imaginary components
+    fptype2* allZtempReal = allZtempBoth;
+    fptype2* allZtempImag = allZtempBoth + ncolor * nevt;
+
+    // Note, new striding for cuBLAS from DeviceAccessJamp:
+    // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1"
+    // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1"
+
+    // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag
+    // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp
+    fptype2 alpha1 = 1;
+    fptype2 beta1 = 0;
+    const int ncolorM = ncolor;
+    const int nevtN = nevt;
+    const int ncolorK = ncolor;
+    checkGpuBlas( gpuBlasTgemm( *pBlasHandle,
+                                GPUBLAS_OP_N, // do not transpose ColMat
+                                GPUBLAS_OP_T, // transpose JampsV (new1)
+                                ncolorM, nevtN, ncolorK,
+                                &alpha1,
+                                devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK
+                                allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1)
+                                &beta1,
+                                allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN
+    checkGpuBlas( gpuBlasTgemm( *pBlasHandle,
+                                GPUBLAS_OP_N, // do not transpose ColMat
+                                GPUBLAS_OP_T, // transpose JampsV (new1)
+                                ncolorM, nevtN, ncolorK,
+                                &alpha1,
+                                devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK
+                                allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1)
+                                &beta1,
+                                allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN
+
+    // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt]
+    // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME
+    // Use cublasSgemmStridedBatched to perform these batched dot products in one call
+    fptype2 alpha2 = 1;
+    fptype2 beta2 = 1;
+    checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle,
+                                              GPUBLAS_OP_N, // do not transpose JampsV (new1)
+                                              GPUBLAS_OP_N, // do not transpose Tmp
+                                              1, 1, ncolor, // result is 1x1 (dot product)
+                                              &alpha2,
+                                              allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1)
+                                              allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column
+                                              &beta2,
+                                              allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e.
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/color_sum.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/diagram_boilerplate.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/diagrams.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/diagrams.h new file mode 100644 index 0000000000..941311a2bb --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/diagrams.h @@ -0,0 +1,174 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
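Note on the two-pass BLAS color sum in color_sum.cc above: since the color matrix M is real and symmetric, the quadratic form (A-iB)^T M (A+iB) reduces to A^T M A + B^T M B, so the real and imaginary parts of the jamps can be contracted independently. The first gpuBlasTgemm pass computes Ztemp = NormColMat * Jamps for all events at once; the strided-batched pass then takes one dot product Jamps . Ztemp per event and accumulates it into the MEs (beta=1). Below is a minimal host-side sketch of the same contraction, assuming the "new1" striding icol*nevt+ievt documented above; colorSumTwoStep is a hypothetical standalone name, not part of the plugin, and plain loops stand in for the two BLAS calls.

  #include <initializer_list>
  #include <vector>

  void colorSumTwoStep( double* MEs,              // output: MEs[nevt], incremented by the color sum for one helicity
                        const double* jampsRe,    // input: Re(jamp)[ncolor*nevt], "new1" striding icol*nevt+ievt
                        const double* jampsIm,    // input: Im(jamp)[ncolor*nevt], same striding
                        const double* normColMat, // input: colorMatrix[i][j]/colorDenom[i], row-major [ncolor*ncolor]
                        int ncolor, int nevt )
  {
    std::vector<double> ztemp( ncolor );
    for( int ievt = 0; ievt < nevt; ievt++ )
    {
      // M is real and symmetric: (A-iB)^T M (A+iB) = A^T M A + B^T M B, so handle Re and Im separately
      for( const double* jamps : { jampsRe, jampsIm } )
      {
        // Step 1 (the GEMM pass): ztemp = normColMat * jamp for this event
        for( int i = 0; i < ncolor; i++ )
        {
          ztemp[i] = 0;
          for( int j = 0; j < ncolor; j++ )
            ztemp[i] += normColMat[i * ncolor + j] * jamps[j * nevt + ievt];
        }
        // Step 2 (the batched dot-product pass): ME += jamp . ztemp
        for( int i = 0; i < ncolor; i++ )
          MEs[ievt] += jamps[i * nevt + ievt] * ztemp[i];
      }
    }
  }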
+
+/* clang-format off */
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+            const fptype* momenta, // input: momenta[npar*4*nevtORneppV]
+            const int ihel ) // input: helicity (0 to ncomb-1)
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+#ifdef MGONGPUCPP_GPUIMPL
+    using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events
+#else
+    using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events
+#endif
+    // *** DIAGRAM 1 OF 5 ***
+    // Wavefunction(s) for diagram number 1
+    vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 );
+    ixxxxx( momenta, 0., cHel[ihel][1], +1, w_fp[1], 1 );
+    oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 );
+    ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 );
+    oxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 );
+    FFV1_2( w_fp[1], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[5] );
+    FFV1P0_3( w_fp[3], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[6] );
+    // Amplitude(s) for diagram number 1
+    FFV1_0( w_fp[5], w_fp[4], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 6.
* amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 2 OF 5 ***
+    // Wavefunction(s) for diagram number 2
+    FFV1_1( w_fp[2], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[1], w_fp[5] );
+    FFV1P0_3( w_fp[1], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[7] );
+    // Amplitude(s) for diagram number 2
+    FFV1_0( w_fp[3], w_fp[5], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 3 OF 5 ***
+    // Wavefunction(s) for diagram number 3
+    FFV1_2( w_fp[3], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[1], w_fp[5] );
+    // Amplitude(s) for diagram number 3
+    FFV1_0( w_fp[5], w_fp[2], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2.
* amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 4 OF 5 ***
+    // Wavefunction(s) for diagram number 4
+    FFV1_1( w_fp[4], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[5] );
+    // Amplitude(s) for diagram number 4
+    FFV1_0( w_fp[1], w_fp[5], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 6. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 5 OF 5 ***
+    // Wavefunction(s) for diagram number 5
+    // (none)
+    // Amplitude(s) for diagram number 5
+    VVV1_0( w_fp[0], w_fp[7], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2.
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/driver.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/driver.f index c2eadb2c31..10332b6238 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/driver.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/matrix1.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/matrix1.f index 1efce64e40..645a4d6016 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/matrix1.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -341,7 +341,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -387,7 +387,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(0) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -430,31 +431,28 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 4) /1.200000000000000D+01 - $ ,4.000000000000000D+00,4.000000000000000D+00,0.000000000000000D - $ +00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 4) /12,8,8,0/ C 1 T(1,3,2) T(5,4) - DATA (CF(I, 2),I= 1, 4) /4.000000000000000D+00 - $ ,1.200000000000000D+01,0.000000000000000D+00,4.000000000000000D - $ +00/ + DATA (CF(I),I= 5, 7) /12,0,8/ C 1 T(1,3,4) T(5,2) - DATA (CF(I, 3),I= 1, 4) /4.000000000000000D+00 - $ ,0.000000000000000D+00,1.200000000000000D+01,4.000000000000000D - $ +00/ + DATA (CF(I),I= 8, 9) /12,8/ C 1 T(1,5,2) T(3,4) - DATA (CF(I, 4),I= 1, 4) /0.000000000000000D+00 - $ ,4.000000000000000D+00,4.000000000000000D+00,1.200000000000000D - $ +01/ + DATA (CF(I),I= 10, 10) /12/ C 1 T(1,5,4) T(3,2) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. 
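The next matrix1.f hunk rewrites the color sum to walk a packed upper triangle: CF now stores the rows of the upper triangle consecutively (NCOLOR*(NCOLOR+1)/2 = 10 integers for NCOLOR=4), with off-diagonal entries pre-doubled (8 = 2*4 above) so that summing only J >= I reproduces the full symmetric quadratic form, and a common integer DENOM is divided out once at the end. Below is a minimal C++ transcription of the new loop, assuming NAMPSO=1; matrix1ColorSum is a hypothetical standalone name, not part of the generated code.

  #include <complex>

  // Packed upper-triangular color sum: Re( JAMP^dagger CF JAMP ) / DENOM,
  // with CF's off-diagonal entries already doubled so the J >= I loop suffices.
  double matrix1ColorSum( const std::complex<double>* jamp, // input: jamp[ncolor], one color-ordered amplitude per color
                          const int* cfPacked,              // input: cfPacked[ncolor*(ncolor+1)/2], e.g. {12,8,8,0, 12,0,8, 12,8, 12}
                          int denom, int ncolor )
  {
    double result = 0;
    int cfIndex = 0; // CF_INDEX in the Fortran: a single running index over the packed entries
    for( int i = 0; i < ncolor; i++ )
    {
      std::complex<double> ztemp = 0; // ZTEMP: row i of the upper triangle contracted with jamp[i..ncolor-1]
      for( int j = i; j < ncolor; j++ )
        ztemp += double( cfPacked[cfIndex++] ) * jamp[j];
      result += ( ztemp * std::conj( jamp[i] ) ).real();
    }
    return result / denom; // MATRIX1 = MATRIX1/DENOM in the Fortran
  }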
@@ -507,10 +505,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -519,6 +519,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc index 6dc0abd17c..50d05d273c 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,20 +101,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 4; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,57 +169,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], 
running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page 
(for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -229,333 +282,143 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 8; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
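// [Illustrative sketch, not part of the patch] The comments above distinguish TRIVIAL ACCESS (a kernel owns a private buffer for one event or SIMD event page) from non-trivial access (a kernel receives a buffer for all events and must locate its own event record). A minimal sketch of the non-trivial AoSoA addressing, with hypothetical names (toyNeppV, toyIeventAccessRecord) standing in for neppV and the ieventAccessRecord accessors used in this file:

constexpr int toyNeppV = 4; // events per SIMD page (stand-in for neppV)

// Locate the record of event ievt in an AoSoA buffer with nfields scalar fields per event:
// the layout is buffer[ipagV][ifield][ieppV], so the page stride is nfields*toyNeppV
inline const double* toyIeventAccessRecord( const double* buffer, int nfields, int ievt )
{
  const int ipagV = ievt / toyNeppV; // SIMD page index
  const int ieppV = ievt % toyNeppV; // event index within the SIMD page
  return buffer + ipagV * nfields * toyNeppV + ieppV; // points at field 0 of event ievt
}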
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g.
<<warning #186-D: pointless comparison of unsigned integer with zero>> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 5 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx<M_ACCESS, W_ACCESS>( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - oxxxxx<M_ACCESS, W_ACCESS>( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx<M_ACCESS, W_ACCESS>( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx<M_ACCESS, W_ACCESS>( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - ixxxxx<M_ACCESS, W_ACCESS>( momenta, 0., cHel[ihel][4], -1, w_fp[4], 4 ); - - FFV1_2<W_ACCESS, CD_ACCESS>( w_fp[4], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[5] ); - FFV1P0_3<W_ACCESS, CD_ACCESS>( w_fp[3], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[6] ); - - // Amplitude(s) for diagram number 1 - FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2(
amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= 1. / 6. * amp_sv[0]; - jamp_sv[3] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 2 OF 5 *** - - // Wavefunction(s) for diagram number 2 - FFV1_1<W_ACCESS, CD_ACCESS>( w_fp[2], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[1], w_fp[5] ); - FFV1P0_3<W_ACCESS, CD_ACCESS>( w_fp[4], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[7] ); - - // Amplitude(s) for diagram number 2 - FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[3], w_fp[5], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - jamp_sv[2] -= 1. / 6. * amp_sv[0]; - jamp_sv[3] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 3 OF 5 *** - - // Wavefunction(s) for diagram number 3 - FFV1_2<W_ACCESS, CD_ACCESS>( w_fp[3], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[1], w_fp[5] ); - // Amplitude(s) for diagram number 3 - FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[5], w_fp[2], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - jamp_sv[0] += 1. / 2. * amp_sv[0]; - jamp_sv[2] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 4 OF 5 *** - - // Wavefunction(s) for diagram number 4 - FFV1_1<W_ACCESS, CD_ACCESS>( w_fp[1], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[5] ); - - // Amplitude(s) for diagram number 4 - FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[4], w_fp[5], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[0] += 1. / 2. * amp_sv[0]; - jamp_sv[1] -= 1. / 6.
* amp_sv[0]; - - // *** DIAGRAM 5 OF 5 *** - // Wavefunction(s) for diagram number 5 - // (none) - - // Amplitude(s) for diagram number 5 - VVV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[0], w_fp[7], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gux_ttxux()?) - - // The color denominators (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] - - // The color matrix (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 12, 4, 4, 0 }, - { 4, 12, 0, 4 }, - { 4, 0, 12, 4 }, - { 0, 4, 4, 12 } }; // 2-D array[4][4] -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
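// [Illustrative sketch, not part of the patch] The deleted comments above describe the color-sum optimisation of #475: for a real color matrix M and jamp = A + iB, the quadratic form (A-iB) M (A+iB) reduces to A M A + B M B, and since M is also symmetric the sum can be folded into a loop over the diagonal plus the upper triangle with a factor 2 (precomputed together with 1/denom in the constexpr cf2 above). A standalone sketch using the gux_ttxux color matrix quoted above (colorSumToy is a hypothetical name):

#include <complex>

constexpr int ncolorToy = 4;
constexpr double denomToy[ncolorToy] = { 1, 1, 1, 1 };
constexpr double cfToy[ncolorToy][ncolorToy] = {
  { 12, 4, 4, 0 },
  { 4, 12, 0, 4 },
  { 4, 0, 12, 4 },
  { 0, 4, 4, 12 } };

double colorSumToy( const std::complex<double> jamp[ncolorToy] )
{
  double me2 = 0;
  for( int icol = 0; icol < ncolorToy; icol++ )
  {
    // Diagonal term (normalized by denom), then off-diagonal terms folded with a factor 2 via cf[i][j] == cf[j][i]
    double ztempR = cfToy[icol][icol] / denomToy[icol] * jamp[icol].real();
    double ztempI = cfToy[icol][icol] / denomToy[icol] * jamp[icol].imag();
    for( int jcol = icol + 1; jcol < ncolorToy; jcol++ )
    {
      ztempR += 2 * cfToy[icol][jcol] / denomToy[icol] * jamp[jcol].real();
      ztempI += 2 * cfToy[icol][jcol] / denomToy[icol] * jamp[jcol].imag();
    }
    me2 += ztempR * jamp[icol].real() + ztempI * jamp[icol].imag();
  }
  return me2; // equals Re( jamp^dagger (cf/denom) jamp ) because cf is real and symmetric
}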
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef) + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv *
cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef) + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 5 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif -#endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** return; } @@ -610,7 +473,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -644,6 +511,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in
file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -685,6 +556,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -787,26 +662,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -814,25 +689,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) // Loop over only nevt events if nevt is < 16 (note that nevt is always >= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0)
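// [Illustrative sketch, not part of the patch] Both versions of sigmaKin_getGoodHel above implement the same filtering idea: probe only the first maxtry events, and flag a helicity combination as good if it yields a non-zero |M|^2 for at least one probed event, so that sigmaKin later loops only over the good helicities. A minimal sketch under that assumption (getGoodHelToy and me2ForHel are hypothetical names):

#include <algorithm>

constexpr int ncombToy = 32; // number of helicity combinations (ncomb for this process)

void getGoodHelToy( bool isGoodHel[ncombToy], int nevt, double ( *me2ForHel )( int ihel, int ievt ) )
{
  const int maxtry = std::min( 16, nevt ); // probe at most 16 events (avoid invalid access if nevt < 16)
  for( int ihel = 0; ihel < ncombToy; ihel++ )
  {
    isGoodHel[ihel] = false;
    for( int ievt = 0; ievt < maxtry; ievt++ )
      if( me2ForHel( ihel, ievt ) != 0 ) { isGoodHel[ihel] = true; break; }
  }
}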
__global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads #else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -970,20 +1049,14 @@ namespace mg5amcCpu // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -995,17 +1068,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) );
+ gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1031,93 +1107,63 @@ #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamps for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } - } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) Then, in multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream + for( int ighel = 0;
ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ?
ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1159,7 +1205,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1182,7 +1228,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1191,25 +1237,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1219,8 +1271,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1236,11 +1290,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1342,14 +1397,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h index d658e0394e..d3f0d16633 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team 
and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include <vector> @@ -78,17 +79,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 32; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 5; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 4; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 8; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 5; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?!
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 5; //static const int ncomb = 32; // CPPProcess::ncomb @@ -125,23 +126,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -155,34 +159,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f index e36675626f..a246f2aab0 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f index 61bb13c3e7..d50f96bb8d 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,14 +140,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) @@ -234,7 +234,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -309,6 +309,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -393,18 +397,18 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) UX2(IVEC)=PDG2PDF(LPP(IB(2)),-2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -528,6 +532,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/color_sum.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/color_sum.cc new file mode 100644 index 0000000000..389d8f6535 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/color_sum.cc @@ -0,0 +1,385 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. 
Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] + + // The color matrix (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 12, 4, 4, 0 }, + { 4, 12, 0, 4 }, + { 4, 0, 12, 4 }, + { 0, 4, 4, 12 } }; // 2-D array[4][4] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template<typename T> + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA + iAMB - iBMA + BMB = AMA + BMB (the two cross terms cancel because M is also symmetric) + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...)
in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over 
jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // 
Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e.
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/color_sum.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/diagram_boilerplate.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/diagrams.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/diagrams.h new file mode 100644 index 0000000000..efb8277d2c --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/diagrams.h @@ -0,0 +1,174 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
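For reference, the two gpuBlasTgemm calls and the two strided-batched calls above evaluate, per event, deltaME = sum_i sum_j Jre_i * Cn[i][j] * Jre_j + sum_i sum_j Jim_i * Cn[i][j] * Jim_j, i.e. the same "AMA + BMB" decomposition used by color_sum_cpu and color_sum_kernel; and, as the color_sum_gpu dispatcher above shows, the plain kernel is used instead whenever pBlasHandle is null (HASBLAS=hasNoBlas builds, or CUDACPP_RUNTIME_BLASCOLORSUM not set at runtime). The minimal host-side cross-check below reproduces the same arithmetic with plain loops over the "new1" layout; it is an illustrative sketch only (the function name and std::vector signatures are not from the plugin).

#include <vector>

// Reference color sum (sketch): for each event, add to allMEs[ievt]
//   sum_i sum_j Jre_i * Cn[i][j] * Jre_j + sum_i sum_j Jim_i * Cn[i][j] * Jim_j
// where Cn[i][j] = colorMatrix[i][j] / colorDenom[i] (real, row-major) and allJamps
// uses the "new1" layout allJamps[ix2 * ncolor * nevt + icol * nevt + ievt].
void colorSumReference( std::vector<double>& allMEs,           // [nevt], incremented in place
                        const std::vector<double>& allJamps,   // [2 * ncolor * nevt], "new1" layout
                        const std::vector<double>& normColMat, // [ncolor * ncolor], row-major Cn
                        const int ncolor,
                        const int nevt )
{
  for( int ievt = 0; ievt < nevt; ievt++ )
  {
    double deltaME = 0;
    for( int ix2 = 0; ix2 < 2; ix2++ ) // ix2=0: real parts (A), ix2=1: imaginary parts (B)
    {
      const double* j = allJamps.data() + ix2 * ncolor * nevt;
      for( int icol = 0; icol < ncolor; icol++ )
      {
        double ztemp = 0; // Step 1 above: one element of the GEMM Ztemp = Cn * J
        for( int jcol = 0; jcol < ncolor; jcol++ )
          ztemp += normColMat[icol * ncolor + jcol] * j[jcol * nevt + ievt];
        deltaME += ztemp * j[icol * nevt + ievt]; // Step 2 above: one batched dot product
      }
    }
    allMEs[ievt] += deltaME; // like color_sum, this ADDS to the running sum over helicities
  }
}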
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 5 *** + // Wavefunction(s) for diagram number 1 + vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); + oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + ixxxxx( momenta, 0., cHel[ihel][4], -1, w_fp[4], 4 ); + FFV1_2( w_fp[4], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[5] ); + FFV1P0_3( w_fp[3], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[6] ); + // Amplitude(s) for diagram number 1 + FFV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2.
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 5 *** + // Wavefunction(s) for diagram number 2 + FFV1_1( w_fp[2], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[1], w_fp[5] ); + FFV1P0_3( w_fp[4], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[7] ); + // Amplitude(s) for diagram number 2 + FFV1_0( w_fp[3], w_fp[5], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 5 *** + // Wavefunction(s) for diagram number 3 + FFV1_2( w_fp[3], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[1], w_fp[5] ); + // Amplitude(s) for diagram number 3 + FFV1_0( w_fp[5], w_fp[2], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 6.
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 5 *** + // Wavefunction(s) for diagram number 4 + FFV1_1( w_fp[1], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[5] ); + // Amplitude(s) for diagram number 4 + FFV1_0( w_fp[4], w_fp[5], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 5 *** + // Wavefunction(s) for diagram number 5 + // (none) + // Amplitude(s) for diagram number 5 + VVV1_0( w_fp[0], w_fp[7], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2.
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/driver.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/driver.f index c2eadb2c31..10332b6238 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/driver.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/matrix1.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/matrix1.f index c8fbbe9e22..cc34d12626 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/matrix1.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -341,7 +341,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -387,7 +387,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(0) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -430,31 +431,28 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 4) /1.200000000000000D+01 - $ ,4.000000000000000D+00,4.000000000000000D+00,0.000000000000000D - $ +00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 4) /12,8,8,0/ C 1 T(1,2,4) T(3,5) - DATA (CF(I, 2),I= 1, 4) /4.000000000000000D+00 - $ ,1.200000000000000D+01,0.000000000000000D+00,4.000000000000000D - $ +00/ + DATA (CF(I),I= 5, 7) /12,0,8/ C 1 T(1,2,5) T(3,4) - DATA (CF(I, 3),I= 1, 4) /4.000000000000000D+00 - $ ,0.000000000000000D+00,1.200000000000000D+01,4.000000000000000D - $ +00/ + DATA (CF(I),I= 8, 9) /12,8/ C 1 T(1,3,4) T(2,5) - DATA (CF(I, 4),I= 1, 4) /0.000000000000000D+00 - $ ,4.000000000000000D+00,4.000000000000000D+00,1.200000000000000D - $ +01/ + DATA (CF(I),I= 10, 10) /12/ C 1 T(1,3,5) T(2,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. 
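The matrix1.f hunk above replaces the dense REAL*8 color matrix by a packed integer upper triangle plus a common DENOM: off-diagonal entries are stored pre-doubled (e.g. 8 = 2*4), so the triangular loop in the hunk that follows visits each (I,J) pair only once and still reproduces the full symmetric sum. The standalone check below illustrates that equivalence; it is a sketch with arbitrary test values, and it drops the NAMPSO loop over amplitude orders of the generated code.

#include <cassert>
#include <cmath>
#include <complex>

int main()
{
  constexpr int ncolor = 4;
  // Dense symmetric form (old code)
  const double cfDense[ncolor][ncolor] = { { 12, 4, 4, 0 },
                                           { 4, 12, 0, 4 },
                                           { 4, 0, 12, 4 },
                                           { 0, 4, 4, 12 } };
  // Packed form (new code): upper triangle row by row, off-diagonals doubled,
  // common denominator factored out (DATA DENOM/1/ for this process)
  const int cfPacked[ncolor * ( ncolor + 1 ) / 2] = { 12, 8, 8, 0, 12, 0, 8, 12, 8, 12 };
  const int denom = 1;
  const std::complex<double> jamp[ncolor] = { { 1, 2 }, { -3, 1 }, { 0.5, -1 }, { 2, 0 } }; // arbitrary
  // Old code: full double loop over the dense matrix
  double oldSum = 0;
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ )
      oldSum += std::real( std::conj( jamp[i] ) * cfDense[i][j] * jamp[j] );
  // New code: triangular loop with a running CF_INDEX
  double newSum = 0;
  int cfIndex = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    std::complex<double> ztemp = 0;
    for( int j = i; j < ncolor; j++ ) ztemp += double( cfPacked[cfIndex++] ) * jamp[j];
    newSum += std::real( ztemp * std::conj( jamp[i] ) );
  }
  newSum /= denom; // MATRIX1 = MATRIX1/DENOM
  assert( std::abs( oldSum - newSum ) < 1e-9 ); // identical results
  return 0;
}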
@@ -507,10 +505,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -519,6 +519,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/addmothers.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/addmothers.f index 9a31ed201d..6f32477b9e 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/addmothers.f @@ -21,7 +21,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, integer icol ! color selected integer isym(nexternal,99), jsym - integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,nc,ic + integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,ic integer mo_color,da_color(2),itmp integer ito(-nexternal+3:nexternal),iseed,maxcolor,maxorg integer icolalt(2,-nexternal+2:2*nexternal-3) @@ -113,14 +113,15 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif lconfig = vec_igraph1(ivec) endif - + is_LC=.true. + maxcolor=0 c c Choose a color flow which is certain to work with the propagator c structure of the chosen diagram and use that as an alternative c if (icol.eq.0) then do i=1,nexternal - icolalt(1,i)=0 + icolalt(1,i)=0 icolalt(2,i)=0 enddo else diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cluster.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cluster.f index b8995283ed..907894ea89 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cluster.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cluster.f @@ -556,6 +556,8 @@ logical function cluster(p, ivec) jwin = 0 cluster=.false. clustered=.false. + iwin =0 + jwin =0 do i=0,3 pcmsp(i)=0 enddo @@ -665,8 +667,11 @@ logical function cluster(p, ivec) c initialize graph storage igraphs(0)=0 nleft=nexternal -c cluster - if (iwin.eq.0.or.jwin.eq.0) stop 21 + if(iwin.eq.0.or.jwin.eq.0)then + cluster=.false. + return + endif +c cluster do n=1,nexternal-2 c combine winner imocl(n)=imap(iwin,2)+imap(jwin,2) diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/color_sum.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/color_sum.h new file mode 100644 index 0000000000..71b5b1eaad --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/color_sum.h @@ -0,0 +1,105 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
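The new color_sum.h header below gathers what the rest of the code needs from the color-sum module: the device and host jamp accessors and the color_sum_cpu / color_sum_gpu entry points, with the matching color_sum_cpp.o and color_sum_$(GPUSUFFIX).o objects added to the link in cudacpp.mk further down. As a hypothetical orientation sketch only (the actual call sites are in the generated CPPProcess.cc, which this diff does not show, and the wrapper name is the editor's own), a per-helicity GPU caller inside the mg5amcGpu namespace would look roughly like this:

#include "color_sum.h"
#ifdef MGONGPUCPP_GPUIMPL
// Sketch of a call site per helicity (illustrative only; signatures follow color_sum.h)
inline void addColorSumForOneHelicity( fptype* allMEs,         // running |M|^2 sum over helicities
                                       const fptype* allJamps, // jamps for this helicity ("new1" layout)
                                       fptype2* allBlasTmp,    // scratch buffer (BLAS path only, else nullptr)
                                       gpuStream_t stream,
                                       gpuBlasHandle_t* pBlasHandle, // nullptr selects the plain kernel
                                       const int gpublocks,
                                       const int gputhreads )
{
  createNormalizedColorMatrix(); // one-time copy of the normalized color matrix to device memory
  color_sum_gpu( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads );
}
#endif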
+ +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype_ref( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + static __device__ inline const cxtype + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + }; +#else + class HostAccessJamp + { + public: + static inline cxtype_sv& + kernelAccessIcol( cxtype_sv* buffer, const int icol ) + { + return buffer[icol]; + } + static inline cxtype_sv& + kernelAccessIcol( fptype* buffer, const int icol ) + { + return reinterpret_cast<cxtype_sv*>( buffer )[icol]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + 
//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cuts.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cuts.f index 7898714201..bd50ab1357 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cuts.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cuts.f @@ -307,12 +307,18 @@ LOGICAL FUNCTION PASSCUTS(P, VECSIZE_USED) c c Limit S_hat c - if (dsqrt_shat.ne.0d0)then - if (nincoming.eq.2.and.sumdot(p(0,1),p(0,2),1d0) .lt. dsqrt_shat**2) then - passcuts=.false. - return - endif - endif + if(nincoming.eq.2) then + if (dsqrt_shat.ne.0d0.or.dsqrt_shatmax.ne.-1d0)then + xvar = sumdot(p(0,1),p(0,2),1d0) + if (xvar .lt. dsqrt_shat**2)then + passcuts=.false. 
+ return + else if (dsqrt_shatmax.ne.-1d0 .and. xvar .gt. dsqrt_shatmax**2)then + passcuts = .false. + return + endif + endif + endif C $B$ DESACTIVATE_CUT $E$ !This is a tag for MadWeight if(debug) write (*,*) '=============================' diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/diagram_boilerplate.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/diagram_boilerplate.h new file mode 100644 index 0000000000..96a34fb1bf --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/diagram_boilerplate.h @@ -0,0 +1,103 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin. + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-local-typedefs" // for CI_ACCESS and CD_ACCESS + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + + //------------- + // GPU only + //------------- + + //using namespace mg5amcGpu; + using W_ACCESS = DeviceAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = DeviceAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( channelIds ); +#endif + + // Wavefunctions + // Buffer wfs for one helicity and nevt events is a DeviceBufferSimple with ( nwf * nevt * nw6 * nx2 ) fptypes + // The striding between the nwf wavefunction buffers is ( nevt * nw6 * nx2 ) fptypes + // Internally diagramXXX methods pass a w_fp[iwf] to ixx/FFV methods (as argument 'fptype wavefunctions[]') + // Internally ixx/FFV methods call 'cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions )' and then use fi[iw6] + // This means that the fi pointer must point to a [RIRIRIRIRIRI] contiguous buffer of size nw6*nx2=12 + // The striding between events is nw6*nx2=12 and this is what W_ACCESS::kernelAccess must respect + // (En passant, note that this means that events cannot be contiguous in the present code, memory is not coalesced) + const int nevt = gridDim.x * blockDim.x; + fptype* w_fp[nwf]; + for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = wfs + iwf * nevt * nw6 * mgOnGpu::nx2; + + // Couplings + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic push +#pragma nv_diag_suppress 186 // e.g. 
<<warning #186-D: pointless comparison of unsigned integer with zero>> +#endif + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic pop +#endif + const fptype* COUPs[nxcoup]; + for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; + +#else + + //------------- + // C++ only + //------------- + + //using namespace mg5amcCpu; + using W_ACCESS = HostAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = HostAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current SIMD event page (C++) + unsigned int channelId = *channelIds; +#endif + + // Wavefunctions + // Reinterpret wfs as "cxtype_sv w_sv[nwf][nw6]" and build "fptype* w_fp[nwf]" where "w_fp[iwf] = (fptype*)( w_sv[iwf] )" + fptype (*w_fp)[nw6 * neppV * mgOnGpu::nx2] = (fptype (*)[nw6 * neppV * mgOnGpu::nx2])(wfs); + +#endif + + //------------- + // GPU or C++ + //------------- + + // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) + cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram + fptype* amp_fp; // proof of concept for using fptype* in the interface + amp_fp = reinterpret_cast<fptype*>( amp_sv ); + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) + fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); +#else + assert( channelIds == nullptr ); + assert( numerators == nullptr ); + assert( denominators == nullptr ); +#endif /* clang-format on */ + +#pragma GCC diagnostic pop diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/genps.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/genps.f index 1c32e93f5d..8503bdbec8 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/genps.f @@ -1373,6 +1373,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) double precision smin,smax,spole,swidth,s,jac double precision x logical pass + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' c c Local c @@ -1384,6 +1388,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1d0)then + smax = min(smax, dsqrt_shatmax**2) + endif + pass=.true. if (jac .eq. 0 .and. .not.
warned0) then print*,'Input jacobian 0 in genps' @@ -1628,7 +1636,10 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) DOUBLE PRECISION ETA,ETAMIN,ETAMAX logical warned data warned/.false./ - + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' C------------ C BEGIN CODE C------------ @@ -1645,7 +1656,11 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) C IF THERE IS NO S CHANNEL POLE USE BELOW: TAUMIN = 0d0 !SMIN/S !keep scale fix - TAUMAX = 1D0 + if (dsqrt_shatmax.ne.-1d0)then + TAUMAX=dsqrt_shatmax**2/S + else + TAUMAX = 1D0 + endif TAU = (TAUMAX-TAUMIN)*X(1)+TAUMIN SJACOBI= sjacobi*(TAUMAX-TAUMIN) @@ -1915,7 +1930,7 @@ double precision function get_channel_cut(p, config) if(sde_strat.eq.2)then t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) - get_channel_cut = get_channel_cut / ((t-Mass)*(t+Mass)+stot*1d-10)**2 + get_channel_cut = get_channel_cut / (t-Mass**2+stot*1d-10)**2 endif c write(*,*) i, "t, Mass, fact", t, Mass, ((t-Mass)*(t+Mass))**2,get_channel_cut t = t/stot @@ -1930,9 +1945,9 @@ double precision function get_channel_cut(p, config) t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) Width = prwidth(-i, config) - tmp = (t-Mass)*(t+Mass) + tmp = (t-Mass**2) tmp2 = Mass*Width - get_channel_cut = get_channel_cut* (tmp**2 - tmp2**2)/(tmp**2 + tmp2**2)**2 + get_channel_cut = get_channel_cut/(tmp**2 + tmp2**2) endif c write(*,*) i, "s, Mass, Width, fact", t, Mass, Width, (((t-Mass)*(t+Mass) )**2 + Width**2*Mass**2), get_channel_cut endif diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/myamp.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/myamp.f index 9e5f8d44dd..5360566ef4 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/myamp.f @@ -231,6 +231,7 @@ subroutine set_peaks double precision x1,x2,xk(nexternal) double precision dr,mtot,etot,xqfact double precision spmass + double precision stot ! the min of dsqrt_shatmax**2 and the physical stot integer i, iconfig, l1, l2, j, nt, nbw, iproc, k integer iden_part(-nexternal+1:nexternal) @@ -285,8 +286,8 @@ subroutine set_peaks integer lbw(0:nexternal) !Use of B.W. common /to_BW/ lbw - double precision stot,m1,m2 - common/to_stot/stot,m1,m2 + double precision real_stot,m1,m2 + common/to_stot/real_stot,m1,m2 include 'coupl.inc' ! 
needs VECSIZE_MEMMAX (defined in vector.inc) include 'cuts.inc' @@ -309,6 +310,12 @@ subroutine set_peaks c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1)then + stot = min(real_stot, dsqrt_shatmax**2) + else + stot = real_stot + endif + iconfig = this_config c needs to be initialise to avoid segfault do i = -nexternal,-1 diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/reweight.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/reweight.f index 0a0bafa7c1..189ba41449 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/reweight.f @@ -1186,7 +1186,7 @@ logical function setclscales(p, keepq2bck, ivec) if(nexternal.gt.3) pt2ijcl(nexternal-3)=q2fact(2) else if(.not.fixed_fac_scale1) q2fact(1)=scalefact**2*pt2ijcl(nexternal-2) - if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*q2fact(1) + if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*pt2ijcl(nexternal-2) endif elseif(jcentral(1).eq.0)then if(.not.fixed_fac_scale1) q2fact(1) = scalefact**2*pt2ijcl(jfirst(1)) diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/runTest.cc index 4eec5db13c..216a90a302 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/symmetry.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/symmetry.f index 309540a0a2..2afd9e9f75 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/symmetry.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/symmetry.f @@ -232,7 +232,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, c write(*,*) 'mapping',ic,mapconfig(i),icode if (icode .eq. 
0) then c Create format string based on number of digits - write(formstr,'(a,i1,a)') '(I',nconf,'$)' + write(formstr,'(a,i1,a)') '(I',nconf,',$)' write(*,formstr) mapconfig(i) c Write symmetry factors write(formstr2,'(a,i2,a)') '(2i',nsym,')' @@ -242,10 +242,10 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode if(nconf+ncode+1.lt.10) then write(formstr,'(a,i1,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' else write(formstr,'(a,i2,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' endif write(*,formstr) dconfig c Write symmetry factors @@ -260,7 +260,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode write(27,formstr2) dconfig,use_config(i) endif - write(*,'(a$)') ' ' + write(*,'(a,$)') ' ' 100 call bw_increment_array(iarray,imax,ibase,done) enddo else diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/unwgt.f b/epochX/cudacpp/gq_ttq.mad/SubProcesses/unwgt.f index f602511c94..d1247f1849 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/unwgt.f +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/unwgt.f @@ -497,6 +497,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer ip, np, ic, nc integer ida(2),ito(-nexternal+3:nexternal),ns,nres,ires,icloop integer iseed + double precision beam_mass double precision pboost(0:3) double precision beta, get_betaz double precision ebi(0:3), ebo(0:3) @@ -506,7 +507,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer idup(nexternal,maxproc,maxsproc) integer mothup(2,nexternal) integer icolup(2,nexternal,maxflow,maxsproc) - + double precision eta integer nsym integer ievent @@ -638,21 +639,20 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) if (nincoming.eq.2) then if (xbk(1) .gt. 0d0 .and. xbk(1) .le. 1d0 .and. $ xbk(2) .gt. 0d0 .and. xbk(2) .le. 1d0) then - if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0).and.xbk(2).ne.1d0) then - ! construct the beam momenta in each frame and compute the related (z)boost - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4).and.ebeam(1).gt.10d0*m1)then - local_mass = 0d0 - else - local_mass = m1 - endif + if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0)) then + if((abs(lpp(1)).gt.2.and.abs(lpp(1)).ne.9).or.xbk(1).eq.1d0)then + beam_mass = pmass(1) + else + beam_mass = m1 + endif ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(1) ebo(1) = 0 ebo(2) = 0 - ebo(3) = DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(1).eq.1d0) then pb(0,isym(1,jsym)) = ebo(0) @@ -668,20 +668,19 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo else - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4.and.ebeam(2).gt.10d0*m2))then - local_mass = 0d0 - else - local_mass = m2 - endif - ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam + if((abs(lpp(2)).gt.2.and.abs(lpp(2)).ne.9).or.xbk(2).eq.1d0)then + beam_mass = pmass(2) + else + beam_mass = m2 + endif ebi(0) = p(0,2)/xbk(2) ! 
this assumes that particle 2 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = -1d0*DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = -1d0*DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(2) ebo(1) = 0 ebo(2) = 0 - ebo(3) = -1d0*DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = -1d0*DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(2).eq.1d0) then pb(0,isym(2,jsym)) = ebo(0) @@ -701,6 +700,21 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) write(*,*) 'Warning bad x1 or x2 in write_leshouche', $ xbk(1),xbk(2) endif + do j=1,nexternal + call zboost_with_beta(p(0,j),beta,pb(0,isym(j,jsym))) + pb(4,isym(j,jsym))=pmass(j) + enddo + + ! check for numerical_accuracy + if (pb(0,1).gt.ebeam(1).or.pb(0,2).gt.ebeam(2))then + ! go back to old method --more accurate when boosting with xbk close to one-- + eta = sqrt(xbk(1)*ebeam(1)/(xbk(2)*ebeam(2))) + pboost(0)=p(0,1)*(eta + 1d0/eta) + pboost(3)=p(0,1)*(eta - 1d0/eta) + do j=1,nexternal + call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) + enddo + endif else do j=1,nexternal call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) @@ -709,6 +723,8 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo endif + + if (IMIRROR.eq.2.and.pmass(1).ne.pmass(2)) then c Note that in this context isym(1,jsym) should never be "2" since the mass differ pb(4,isym(1,jsym))=pmass(2) diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/Gridpack/gridrun b/epochX/cudacpp/gq_ttq.mad/bin/internal/Gridpack/gridrun index 8c8f7d3940..01d4ab53f5 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/Gridpack/gridrun +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/Gridpack/gridrun @@ -91,7 +91,7 @@ import internal.madevent_interface as cmd_interface try: - cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2]) + cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2], nprocs=args[3], maxevts=args[4]) except KeyboardInterrupt: print('Quit on KeyboardInterrupt') diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/Gridpack/run.sh b/epochX/cudacpp/gq_ttq.mad/bin/internal/Gridpack/run.sh index 20adf572c2..2d149f96be 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/Gridpack/run.sh +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/Gridpack/run.sh @@ -14,6 +14,18 @@ # USAGE : run [num_events] [iseed] ## ############################################################################# +function usage() { + local retcode="${1:-1}" # default return code is 1 + echo "Usage:" + echo " run.sh [options] [num events] [seed]" + echo " run.sh [options] [num events] [seed] [granularity]" + echo "Options:" + echo " -h, --help print this message and exit" + echo " -p, --parallel [num procs] number of processes to run in parallel" + echo " -m, --maxevts [num events] maximum number of unweighted events per job" + exit $retcode +} + if [[ -d ./madevent ]]; then DIR='./madevent' else @@ -32,23 +44,46 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib # For Mac OS X export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib +pos_args=() +nprocs=1 +maxevts=2500 -if [[ ($1 != "") && ("$2" != "") && ("$3" == "") ]]; then - num_events=$1 - seed=$2 - gran=1 -elif [[ ($1 != "") && ("$2" != "") && ("$3" != "") ]]; then - num_events=$1 - seed=$2 - gran=$3 -else - echo "Warning: input is not correct. 
script requires two arguments: NB_EVENT SEED" -fi +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage 0 ;; + -p|--parallel) + nprocs="$2" && shift && shift ;; + -m|--maxevts) + maxevts="$2" && shift && shift ;; + -*) + echo "Error: Unknown option $1" && usage ;; + *) + pos_args+=("$1") && shift ;; + esac +done + +case `echo "${pos_args[@]}" | wc -w | tr -d " "` in + "2") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=1 + ;; + "3") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=${pos_args[2]} + ;; + *) + echo "Error: number of arguments is not correct" + usage + ;; +esac -echo "Now generating $num_events events with random seed $seed and granularity $gran" +echo "Now generating $num_events events with random seed $seed and granularity $gran using $nprocs processes" ############ RUN THE PYTHON CODE ##################### -${DIR}/bin/gridrun $num_events $seed $gran +${DIR}/bin/gridrun $num_events $seed $gran $nprocs $maxevts ######################################################## ########### POSTPROCESSING ##################### diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py index 42d82818d0..2efb5954a6 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py @@ -353,7 +353,7 @@ def modify_init_cross(self, cross, allow_zero=False): assert "init" in self cross = dict(cross) - for key in cross.keys(): + for key in list(cross.keys()): if isinstance(key, str) and key.isdigit() and int(key) not in cross: cross[int(key)] = cross[key] @@ -1991,6 +1991,11 @@ def default_setup(self): self.add_param("PartonLevel:FSRinResonances", True, hidden=True, always_write_to_card=False, comment="Do not allow shower to run from decay product of unstable particle") self.add_param("ProcessLevel:resonanceDecays", True, hidden=True, always_write_to_card=False, comment="Do not allow unstable particle to decay.") + # Parameters only needed for main164 type of run (not pythia8/MG5 interface) + self.add_param("Main:HepMC", True, hidden=True, always_write_to_card=False, + comment="""Specify the type of output to be used by the main164 run. """) + self.add_param("HepMC:output", 'hepmc.gz', hidden=True, always_write_to_card=False, + comment="Specify the HepMC output file to be used by the main164 run.") # Add parameters controlling the subruns execution flow. # These parameters should not be part of PY8SubRun daughter. self.add_default_subruns('parameters') @@ -2087,8 +2092,10 @@ def MadGraphSet(self, name, value, **opts): force = False if name.lower() not in self or (force or name.lower() not in self.user_set): self.__setitem__(name, value, change_userdefine=False, **opts) - self.system_set.add(name.lower()) - + self.system_set.add(name.lower()) + else: + raise Exception("The parameter %s is already set to %s. You can not change it." 
% (name, self[name])) + def defaultSet(self, name, value, **opts): self.__setitem__(name, value, change_userdefine=False, **opts) @@ -2144,9 +2151,19 @@ def pythia8_formatting(value, formatv=None): else: return ','.join([PY8Card.pythia8_formatting(arg) for arg in value]) + #change of name convention between MG5 old interface and main164 from Pythia8 + interface_to_164 = {'HEPMCoutput:file': 'HepMC:output', + 'SysCalc:fullCutVariation': '!SysCalc:fullCutVariation (not supported with 164)', + 'SysCalc:qCutList': '!SysCalc:qCutList (not supported with 164)', + 'SysCalc:qWeed': '!SysCalc:qWeed (not supported with 164)', + 'SysCalc:tmsList': '!SysCalc:tmsList (not supported with 164)', + 'HEPMCoutput:scaling' : '!HEPMCoutput :scaling (not supported with 164)', + 'LHEFInputs:nSubruns' : 'Main:numberOfSubruns'} + def write(self, output_file, template, read_subrun=False, - print_only_visible=False, direct_pythia_input=False, add_missing=True): + print_only_visible=False, direct_pythia_input=False, add_missing=True, + use_mg5amc_py8_interface=False): """ Write the card to output_file using a specific template. > 'print_only_visible' specifies whether or not the hidden parameters should be written out if they are in the hidden_params_to_always_write @@ -2155,7 +2172,12 @@ def write(self, output_file, template, read_subrun=False, in the self.visible_params_to_always_write list and are not user_set or system_set are commented. > If 'add_missing' is False then parameters that should be written_out but are absent - from the template will not be written out.""" + from the template will not be written out. + > use_mg5amc_py8_interface is a flag to indicate that the MG5aMC-PY8 interface is used or not + if not used some parameters need to be translated from the old convention to the new one + """ + + self.use_mg5amc_py8_interface = use_mg5amc_py8_interface # First list the visible parameters visible_param = [p for p in self if p.lower() not in self.hidden_param @@ -2297,7 +2319,16 @@ def group_params(params): else: # Just copy parameters which don't need to be specified if param.lower() not in self.params_to_never_write: - output.write(line) + + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param.strip()] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + output.write('%s=%s\n'%(param_entry,new_value)) + else: + output.write(line) else: output.write('! The following parameter was forced to be commented out by MG5aMC.\n') output.write('! 
%s'%line) @@ -2313,6 +2344,7 @@ def group_params(params): if ((not direct_pythia_input) or (param.lower() in self.visible_params_to_always_write) or (param.lower() in self.user_set) or + (param.lower() in self.hidden_params_to_always_write) or (param.lower() in self.system_set)): template = '%s=%s' else: @@ -2321,6 +2353,19 @@ def group_params(params): # then they shouldn't be passed to Pythia template = '!%s=%s' + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + if 'Main:InternalAnalysis'.lower() in self.user_set and \ + self['Main:InternalAnalysis'].lower() == 'on': + output.write('InternalAnalysis:output = ./djrs.dat\n') + + #elif param in self.interface_to_164.values() and not direct_pythia_input: + # misc.sprint(use_mg5amc_py8_interface, direct_pythia_input,param) + # raise Exception('The parameter %s is not supported in the MG5aMC-PY8 interface. Please use the new interface.'%param_entry output.write(template%(param_entry, value_entry.replace(value,new_value))) @@ -2365,6 +2410,8 @@ def group_params(params): comment = '\n'.join('! %s'%c for c in self.comments[param.lower()].split('\n')) output.write(comment+'\n') + if not use_mg5amc_py8_interface and param in self.interface_to_164: + continue output.write('%s=%s\n'%(param,PY8Card.pythia8_formatting(self[param]))) # Don't close the file if we were reading a subrun, but simply write @@ -3318,7 +3365,6 @@ def retro_compatible_custom_fct(lines, mode=None): for i,line in enumerate(lines[:]): if search and re.search(include_pat, line): name = re.findall(include_pat, line)[0] - misc.sprint('DETECTED INCLUDE', name) if 'vector.inc' in name: search = False if 'run.inc' in name: @@ -3326,7 +3372,6 @@ def retro_compatible_custom_fct(lines, mode=None): search = False sol.append(line) if re.search(function_pat, line): - misc.sprint("DETECTED FCT") search = True return sol @@ -4050,8 +4095,8 @@ def post_set_fixed_fac_scale(card, value, change_userdefine, raiseerror, **opt): if 'fixed_fac_scale2' in card.user_set: card.user_set.remove('fixed_fac_scale2') - # #card['pdlabel1'] = value - # #card['pdlabel2'] = value + dict.__setitem__(card, 'fixed_fac_scale1', card['fixed_fac_scale']) + dict.__setitem__(card, 'fixed_fac_scale2', card['fixed_fac_scale']) @staticmethod def post_set(card, value, change_userdefine, raiseerror, name='unknown', **opt): @@ -4201,6 +4246,7 @@ def default_setup(self): self.add_param("bwcutoff", 15.0) self.add_param("cut_decays", False, cut='d') self.add_param('dsqrt_shat',0., cut=True) + self.add_param('dsqrt_shatmax', -1, cut=True) self.add_param("nhel", 0, include=False) self.add_param("limhel", 1e-8, hidden=True, comment="threshold to determine if an helicity contributes when not MC over helicity.") #pt cut @@ -4451,11 +4497,11 @@ def check_validity(self): time.sleep(5) if self['drjj'] != 0: if 'drjj' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjj\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjj\' to 0') self['drjj'] = 0 if self['drjl'] != 0: if 'drjl' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjl\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjl\' to 0') self['drjl'] = 0 if not self['auto_ptj_mjj']: if self['mmjj'] > self['xqcut']: @@ -4753,7 +4799,6 @@ def create_default_for_process(self, 
proc_characteristic, history, proc_def): self['fixed_fac_scale1'] = True self['nhel'] = 1 for i in beam_id_split[1]: - exit if abs(i) == 11: self['lpp1'] = -math.copysign(3,i) self['lpp2'] = math.copysign(3,i) @@ -5577,6 +5622,9 @@ def default_setup(self): #technical self.add_param('folding', [1,1,1], include=False) + + #bias + self.add_param('flavour_bias',[5,1], hidden=True, comment="Example: '5,100' means that the probability to generate an event with a bottom (or anti-bottom) quark is increased by a factor 100, but the weight of those events is reduced by a factor 100. Requires that the 'event_norm' is set to 'bias'.") #merging self.add_param('ickkw', 0, allowed=[-1,0,3,4], comment=" - 0: No merging\n - 3: FxFx Merging : http://amcatnlo.cern.ch/FxFx_merging.htm\n - 4: UNLOPS merging (No interface within MG5aMC)\n - -1: NNLL+NLO jet-veto computation. See arxiv:1412.8408 [hep-ph]") @@ -5790,6 +5838,17 @@ def check_validity(self): if self['mcatnlo_delta'] and not self['parton_shower'].lower() == 'pythia8': raise InvalidRunCard("MC@NLO-DELTA only possible with matching to Pythia8") + # check that the flavour_bias is consistent + if len(self['flavour_bias']) != 2: + raise InvalidRunCard("'flavour_bias' should contain exactly two numbers: the abs(PDG) of the flavour to enhance, and the enhancement multiplication factor.") + for i in self['flavour_bias']: + if i < 0: + raise InvalidRunCard("flavour and multiplication factor should be positive in the flavour_bias parameter") + if self['flavour_bias'][1] != 1 and self['event_norm'] != 'bias': + logger.warning('Non-trivial flavour enhancement factor: setting event normalisation to "bias"') + self['event_norm']='bias' + + # check that ebeam is bigger than the proton mass. for i in [1,2]: # do not for proton mass if not proton PDF (or when scan initialization) diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/check_param_card.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/check_param_card.py index bc785b5de6..a34705f6bc 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/check_param_card.py @@ -1092,11 +1092,11 @@ def write_summary(self, path, order=None, lastline=False, nbcol=20): to_print = self.cross[-1:] for info in to_print: name = info['run_name'] - bench = info['bench'] + bench = [float(x) for x in info['bench']] data = [] for k in keys: if k in info: - data.append(info[k]) + data.append(float(info[k])) else: data.append(0.) 
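# ----------------------------------------------------------------------------------------------------
# Illustration (editor's sketch, not part of the patch): the 'flavour_bias' run_card parameter
# introduced in the banner.py hunks above takes [abs(PDG), factor]; events containing that flavour
# are generated 'factor' times more often while their weight is divided by the same factor, so any
# weighted sum (and hence the cross section) is unchanged. This is why a non-trivial factor forces
# 'event_norm = bias'. Minimal standalone bookkeeping with hypothetical names (this is not the
# MadEvent implementation):
def apply_flavour_bias(events, pdg=5, factor=100.0):
    """events: iterable of (pdg_codes, weight) pairs; returns the biased sample."""
    out = []
    for pdgs, weight in events:
        if any(abs(p) == pdg for p in pdgs):
            # oversample the requested flavour by 'factor', compensate in the weight
            out.extend([(pdgs, weight / factor)] * int(factor))
        else:
            out.append((pdgs, weight))
    return out

# invariant: sum(w for _, w in apply_flavour_bias(evts)) == sum(w for _, w in evts) for any 'evts'
# ----------------------------------------------------------------------------------------------------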
ff.write(formatting % tuple([name] + bench + data)) diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/cluster.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/cluster.py index 95ef45b5f3..66069293d2 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/cluster.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/cluster.py @@ -602,6 +602,7 @@ def __init__(self, *args, **opt): self.submitted = six.moves.queue.Queue() # one entry by job submitted self.stoprequest = threading.Event() #flag to ensure everything to close self.demons = [] + self.gpus_list = [] self.nb_done =0 if 'nb_core' in opt: self.nb_core = opt['nb_core'] @@ -623,23 +624,46 @@ def __init__(self, *args, **opt): self.done_pid_queue = six.moves.queue.Queue() self.fail_msg = None + mg5_gpu_env_str = 'MG5_GPU_VISIBLE_DEVICES' + gpu_variables = [['NVIDIA_VISIBLE_DEVICES', 'CUDA_VISIBLE_DEVICES'], + ['ROCR_VISIBLE_DEVICES', 'HIP_VISIBLE_DEVICES'],] + if mg5_gpu_env_str in os.environ: + new_var = os.environ[mg5_gpu_env_str].split(',') + if len(new_var) == 2: + gpu_variables.insert(0, new_var) + else: + logger.error('Invalid format for %s=%s, it should be a comma-separated list of two elements' % (mg5_gpu_env_str, os.environ[mg5_gpu_env_str])) + for get_var,set_var in gpu_variables: + if get_var in os.environ: + self.gpus_list = os.environ.get(get_var).split(',') + self.gpu_set_var = set_var + self.gpus_count = len(self.gpus_list) + logger.info('Found %s GPUs: %s' % (self.gpus_count, self.gpus_list)) def start_demon(self): import threading - t = threading.Thread(target=self.worker) + env2 = None + if len(self.gpus_list): + env2 = os.environ.copy() + this_gpu_idx = len(self.demons) % self.gpus_count + env2[self.gpu_set_var] = self.gpus_list[this_gpu_idx] + t = threading.Thread(target=self.worker, kwargs={'env2': env2}) + else: + t = threading.Thread(target=self.worker) t.daemon = True t.start() self.demons.append(t) - def worker(self): + def worker(self, env2=None): import six.moves.queue import six.moves._thread while not self.stoprequest.isSet(): try: args = self.queue.get(timeout=10) tag, exe, arg, opt = args + opt['env'] = env2 try: # check for executable case if isinstance(exe,str): diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/common_run_interface.py index 9ff7390cf5..69291df0d4 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/common_run_interface.py @@ -750,8 +750,8 @@ def __init__(self, me_dir, options, *args, **opts): else: self.ninitial = self.proc_characteristics['ninitial'] - def make_make_all_html_results(self, folder_names = [], jobs=[]): - return sum_html.make_all_html_results(self, folder_names, jobs) + def make_make_all_html_results(self, folder_names = [], jobs=[], get_attr=None): + return sum_html.make_all_html_results(self, folder_names, jobs, get_attr) def write_RunWeb(self, me_dir): @@ -1463,11 +1463,15 @@ def create_plot(self, mode='parton', event_path=None, output=None, tag=None): self.run_name, '%s_pts.dat' % tag) for observable_name, data_path in [('djr',djr_path), ('pt',pt_path)]: - if not self.generate_Pythia8_HwU_plots( + try: + if not self.generate_Pythia8_HwU_plots( PY8_plots_root_path, merging_scale_name, observable_name,data_path): - return False - + return False + except Exception as error: + if os.path.exists(data_path): + logger.info('plot information present in %s' % data_path) + return True if mode == 'Pythia8': plot_files = 
glob.glob(pjoin(PY8_plots_root_path,'*.gnuplot')) if not misc.which('gnuplot'): @@ -1964,12 +1968,16 @@ def do_systematics(self, line): self.cluster.wait(os.path.dirname(output), update_status, update_first=update_status) except Exception: self.cluster.remove() + for i in range(nb_submit): + os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) old_run_mode = self.options['run_mode'] self.options['run_mode'] =0 + out =False try: out = self.do_systematics(line) finally: self.options['run_mode'] = old_run_mode + return out #collect the data all_cross = [] for i in range(nb_submit): @@ -1995,18 +2003,21 @@ def do_systematics(self, line): self.run_card['event_norm'] in ['unity']: all_cross= [cross/nb_event for cross in all_cross] - sys_obj = systematics.call_systematics([input, None] + opts, - log=lambda x: logger.info(str(x)), - result=result_file, - running=False - ) + + sys_obj = systematics.call_systematics([input, None] + opts, + log=lambda x: logger.info(str(x)), + result=result_file, + running=False + ) + sys_obj.print_cross_sections(all_cross, nb_event, result_file) - + #concatenate the output file subprocess.call(['cat']+\ ['./tmp_%s_%s' % (i, os.path.basename(output)) for i in range(nb_submit)], stdout=open(output,'w'), cwd=os.path.dirname(output)) + for i in range(nb_submit): os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) # os.remove('%s/log_sys_%s.txt' % (os.path.dirname(output),i)) @@ -3831,7 +3842,7 @@ def store_scan_result(self): """return the information that need to be kept for the scan summary. Auto-width are automatically added.""" - return {'cross': self.results.current['cross']} + return {'cross': self.results.current['cross'], 'error': self.results.current['error']} def add_error_log_in_html(self, errortype=None): @@ -5135,10 +5146,10 @@ def init_run(self, cards): self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), - 'lhc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), - 'lcc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), @@ -6740,7 +6751,15 @@ def postcmd(self, stop, line): return ending_question - + def help_update(self): + logger.info(""" syntax: update dependent: Change the mass/width of particles which are not free parameter for the model. + update missing: add to the current param_card missing blocks/parameters. + update to_slha1: pass SLHA2 card to SLHA1 convention. (beta) + update to_slha2: pass SLHA1 card to SLHA2 convention. 
(beta) + update to_full [run_card] + update XXX [where XXX correspond to a hidden block of the run_card]: + supported block are %s + """, ', '.join(self.update_block)) def do_update(self, line, timer=0): @@ -6756,6 +6775,8 @@ def do_update(self, line, timer=0): logger.warning('miss an argument (dependent or missing). Please retry') return + args[0] = args[0].lower() + if args[0] == 'dependent': if not self.mother_interface: logger.warning('Failed to update dependent parameter. This might create trouble for external program (like MadSpin/shower/...)') @@ -6805,10 +6826,11 @@ def do_update(self, line, timer=0): self.modified_card.add('run') # delay writting of the run_card logger.info('add optional block %s to the run_card', args[0]) else: - self.help_update() + self.do_help('update') logger.warning('unvalid options for update command. Please retry') + def update_to_full(self, line): """ trigger via update to_full LINE""" diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/extended_cmd.py index 789976beee..c321fd88e5 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/extended_cmd.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/extended_cmd.py @@ -1317,6 +1317,8 @@ def nice_error_handling(self, error, line): debug_file = open(self.debug_output, 'a') traceback.print_exc(file=debug_file) + if __debug__: + traceback.print_exc() if hasattr(error, 'filename'): debug_file.write("Related File: %s\n" % error.filename) # Create a nice error output @@ -1928,7 +1930,8 @@ def do_display(self, line, output=sys.stdout): for i, name in enumerate(split): try: __import__('.'.join(split[:i+1])) - exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1]))) + tmp = {} + exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1])), globals(),tmp) except ImportError: try: var = eval(args[1]) @@ -1939,7 +1942,7 @@ def do_display(self, line, output=sys.stdout): outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) else: - var = eval(args[1]) + var = eval(args[1], globals(), tmp) outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/file_writers.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/file_writers.py index 526756129f..74ba0d195c 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/file_writers.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/file_writers.py @@ -140,10 +140,6 @@ def preprocess_template(self, input_lines, context={}): else: raise self.FileWriterError("%s not string" % repr(input_lines)) - # Setup the contextual environment - for contextual_variable, value in context.items(): - exec('%s=%s'%(str(contextual_variable),repr(value))) - res = [] # The variable below tracks the conditional statements structure if_stack = [] @@ -166,7 +162,7 @@ def preprocess_template(self, input_lines, context={}): # Treat an if statement elif preproc_command.group('command')=='if': try: - if_stack.append(eval(preproc_command.group('body'))==True) + if_stack.append(eval(preproc_command.group('body'), globals(), context)==True) except Exception as e: raise self.FilePreProcessingError('Could not evaluate'+\ "python expression '%s' given the context %s provided."%\ diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/files.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/files.py index 551b71ddb6..3061b007e7 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/files.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/files.py @@ -147,9 +147,14 @@ def cp(path1, path2, 
log=True, error=False): path2 = format_path(path2) try: shutil.copy(path1, path2) + except shutil.Error as why: + logger.debug('no cp since identical: %s', why) + return except IOError as why: import madgraph.various.misc as misc try: + if 'same file' in str(why): + return if os.path.exists(path2): path2 = os.path.join(path2, os.path.split(path1)[1]) misc.copytree(path1, path2) @@ -157,12 +162,10 @@ def cp(path1, path2, log=True, error=False): if error: raise if log: - logger.warning(why) + logger.warning("fail to cp %s %s: %s", path1, path2, why) else: - misc.sprint("fail to cp", why) - except shutil.Error: - # idetical file - pass + misc.sprint("fail to cp", path1, path2, why) + def rm(path, log=True): """removes path, that can be a single element or a list""" diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_cardhtml-pl b/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_cardhtml-pl index 1810c6c082..6e0e06533d 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_cardhtml-pl +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_cardhtml-pl @@ -137,7 +137,7 @@ until($listpos>$#incard){ print PAGE " Model: $model \n"; print PAGE " \n \n
\n"; print PAGE " \n"; - print PAGE "\"\" \n"; + print PAGE "\"\" \n"; print PAGE "
\n"; print PAGE " \n \n \n"; print PAGE " \n"; diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_crossxhtml.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_crossxhtml.py index 681bf9d09b..3114a4350c 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_crossxhtml.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_crossxhtml.py @@ -133,7 +133,7 @@ class AllResults(dict): web = False - _run_entries = ['cross', 'error','nb_event_pythia','run_mode','run_statistics', + _run_entries = ['cross', 'error','axsec','nb_event_pythia','run_mode','run_statistics', 'nb_event','cross_pythia','error_pythia', 'nb_event_pythia8','cross_pythia8','error_pythia8', 'shower_dir'] diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_jpeg-pl b/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_jpeg-pl index 87d03da394..31b7e9fe55 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_jpeg-pl +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_jpeg-pl @@ -1,16 +1,16 @@ #!/usr/bin/perl -w #--------------------------------------------------------------------- -# Run GS to create jpeg files defined as $gs +# Run GS to create PNG files defined as $gs #--------------------------------------------------------------------- -system("/bin/bash -c \"rm -f matrix*.jpg\" "); +system("/bin/bash -c \"rm -f matrix*.png\" "); $imatrix = ""; if (! -e "matrix.ps") {$imatrix = 1;} -$max_jpg = 2; -if ($imatrix eq "") {$max_jpg = 5;} -# add 1 to max_jpg, to get max_jpg pages -$max_jpg += 1; +$max_png = 2; +if ($imatrix eq "") {$max_png = 5;} +# add 1 to max_png, to get max_png pages +$max_png += 1; open(PAGE,"> diagrams.html") || die "Error creating diagrams.html"; print PAGE "\ \n"; print PAGE "\ \n"; @@ -21,22 +21,22 @@ while ( -e "matrix$imatrix.ps"){ open(IN, "< matrix$imatrix.ps") || die "No file matrix$imatrix.ps"; open(OUT, "> matrix-1.ps") || die "Could not open file matrix-1.ps"; while () { - if ($_ =~ m/^%%Page: $max_jpg $max_jpg/) {last;} + if ($_ =~ m/^%%Page: $max_png $max_png/) {last;} else {print OUT $_, "\n";} } close(OUT); close(IN); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=matrix$imatrix\%00d.jpg \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-r150 \-sOutputFile\=matrix$imatrix\%00d.png \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; system "rm -f matrix-1.ps"; -# Determine how many jpg files we have +# Determine how many png files we have $pages=1; - while(-e "matrix$imatrix$pages.jpg"){ + while(-e "matrix$imatrix$pages.png"){ $pages++; }#end of while #reduce it by one - if ($pages > $max_jpg){ + if ($pages > $max_png){ $pages -= 1; } # Find name of process @@ -45,24 +45,24 @@ while ( -e "matrix$imatrix.ps"){ if ($proc =~ /Process: (.+?)(\s\w+=\d+)*$/) { $proc = $1; } print PAGE "

To save bandwidth not all diagrams were converted to jpeg."; + if (-e "matrix$imatrix$max_png.png" ) { + print PAGE "

To save bandwidth not all diagrams were converted to PNG."; print PAGE "

To view all diagrams click on "; print PAGE "\ postscript. \<\/A\> \ \n"; # # Delete files which aren't included in diagrams.html # - system ("/bin/bash -c \"rm -f matrix$max_jpg.jpg\" "); + system ("/bin/bash -c \"rm -f matrix$max_png.png\" "); } # -# Now create jpeg file for card +# Now create PNG file for card # - if (! -e "../../HTML/card.jpg") { + if (! -e "../../HTML/card.png") { system ("/bin/bash -c \"head -352 matrix$imatrix.ps >& junk.ps\" "); open(JUNK,">> junk.ps") || die "Error opening junk.ps"; @@ -72,7 +72,7 @@ while ( -e "matrix$imatrix.ps"){ system ("/bin/bash -c \"cat matrix$imatrix.ps | sed 1,352d >> junk.ps\" "); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=card.jpg \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.jpg ../../HTML/card.jpg > /dev/null\" "; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-sOutputFile\=card.png \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.png ../../HTML/card.png > /dev/null\" "; } if ($imatrix eq "") {$imatrix = 0;} $imatrix = $imatrix + 1; @@ -82,3 +82,4 @@ print PAGE "\n"; print PAGE "\<\/BODY\> \n"; print PAGE "\<\/HTML\> \n"; close(PAGE); + diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_ximprove.py index 415ecc9de0..d5d7fc8faf 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/gen_ximprove.py @@ -30,6 +30,7 @@ import stat import sys import six +import time from six.moves import range from six.moves import zip @@ -304,6 +305,7 @@ def get_helicity(self, to_submit=True, clean=True): logger.debug('(%s) nb_hel: %s zero amp: %s bad_amps_hel: %s/%s', split_file[-1], len(good_hels),len(bad_amps),len(bad_amps_perhel), len(good_hels)*nb_amp ) if len(good_hels) == 1: files.cp(matrix_file, matrix_file.replace('orig','optim')) + files.cp(matrix_file.replace('.f','.o'), matrix_file.replace('orig','optim').replace('.f','.o')) continue # avoid optimization if onlye one helicity gauge = self.cmd.proc_characteristics['gauge'] @@ -1059,6 +1061,7 @@ def __init__(self, cmd, opt=None): # parameter for the gridpack run self.nreq = 2000 self.iseed = 4321 + self.maxevts = 2500 # placeholder for information self.results = 0 #updated in launch/update_html @@ -1200,6 +1203,10 @@ def reset_multijob(self): def write_multijob(self, Channel, nb_split): """ """ if nb_split <=1: + try: + os.remove(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat')) + except OSError: + pass return f = open(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat'), 'w') f.write('%i\n' % nb_split) @@ -1828,17 +1835,17 @@ class gen_ximprove_gridpack(gen_ximprove_v4): max_request_event = 1e12 # split jobs if a channel if it needs more than that max_event_in_iter = 4000 min_event_in_iter = 500 - combining_job = sys.maxsize gen_events_security = 1.00 - def __new__(cls, *args, **opts): + def __new__(cls, cmd, opts): cls.force_class = 'gridpack' - return super(gen_ximprove_gridpack, cls).__new__(cls, *args, **opts) + return super(gen_ximprove_gridpack, cls).__new__(cls, cmd, opts) - def __init__(self, *args, **opts): + def __init__(self, cmd, opts): self.ngran = -1 + self.nprocs = 1 self.gscalefact = {} self.readonly = False if 'ngran' in opts: @@ -1846,9 +1853,18 @@ def __init__(self, *args, **opts): # del opts['ngran'] if 'readonly' in opts: self.readonly = opts['readonly'] - super(gen_ximprove_gridpack,self).__init__(*args, **opts) + if 'nprocs' in opts: + 
self.nprocs = int(opts['nprocs']) + if 'maxevts' in opts and self.nprocs > 1: + self.max_request_event = int(opts['maxevts']) + super(gen_ximprove_gridpack,self).__init__(cmd, opts) if self.ngran == -1: self.ngran = 1 + + if self.nprocs > 1: + self.combining_job = 0 + else: + self.combining_job = sys.maxsize def find_job_for_event(self): """return the list of channel that need to be improved""" @@ -1876,8 +1892,8 @@ def find_job_for_event(self): continue # no event to generate events self.gscalefact[tag] = max(1, 1/(goal_lum * C.get('axsec')/ self.ngran)) #need to generate events - logger.debug('request events for ', C.get('name'), 'cross=', - C.get('axsec'), 'needed events = ', goal_lum * C.get('axsec')) + logger.debug('request events for %s cross=%d needed events = %d', + C.get('name'), C.get('axsec'), goal_lum * C.get('axsec')) to_refine.append(C) logger.info('need to improve %s channels' % len(to_refine)) @@ -1897,8 +1913,13 @@ def get_job_for_event(self): for C in to_refine: #1. Compute the number of points are needed to reach target needed_event = max(goal_lum*C.get('axsec'), self.ngran) - nb_split = 1 - + nb_split = int(max(1,((needed_event-1)// self.max_request_event) +1)) + if not self.split_channels: + nb_split = 1 + if nb_split > self.max_splitting: + nb_split = self.max_splitting + nb_split=max(1, nb_split) + #2. estimate how many points we need in each iteration if C.get('nunwgt') > 0: nevents = needed_event / nb_split * (C.get('nevents') / C.get('nunwgt')) @@ -1908,13 +1929,16 @@ def get_job_for_event(self): nevents = self.max_event_in_iter if nevents < self.min_event_in_iter: + nb_split = int(nb_split * nevents / self.min_event_in_iter) + 1 # sr dangerous? nevents = self.min_event_in_iter # # forbid too low/too large value nevents = max(self.min_event_in_iter, min(self.max_event_in_iter, nevents)) logger.debug("%s : need %s event. Need %s split job of %s points", C.name, needed_event, nb_split, nevents) - + # write the multi-job information + self.write_multijob(C, nb_split) + #create the info dict assume no splitting for the default info = {'name': self.cmd.results.current['run_name'], 'script_name': 'unknown', @@ -1925,7 +1949,7 @@ def get_job_for_event(self): 'nevents': nevents, #int(nevents*self.gen_events_security)+1, 'maxiter': self.max_iter, 'miniter': self.min_iter, - 'precision': -1*int(needed_event)/C.get('axsec'), + 'precision': -goal_lum/nb_split, # -1*int(needed_event)/C.get('axsec'), 'requested_event': needed_event, 'nhel': self.run_card['nhel'], 'channel': C.name.replace('G',''), @@ -1938,27 +1962,59 @@ def get_job_for_event(self): basedir = pjoin(os.path.dirname(__file__), '..','..','SubProcesses', info['P_dir'], info['directory']) info['base_directory'] = basedir - jobs.append(info) - + if nb_split == 1: + jobs.append(info) + else: + for i in range(nb_split): + new_info = dict(info) + new_info['offset'] = i+1 + new_info['directory'] += self.alphabet[i % 26] + str((i+1)//26) + new_info['base_directory'] = info['directory'] + jobs.append(new_info) write_dir = '.' 
if self.readonly else None self.create_ajob(pjoin(self.me_dir, 'SubProcesses', 'refine.sh'), jobs, write_dir) + if self.nprocs > 1: + nprocs_cluster = cluster.MultiCore(nb_core=self.nprocs) + gridpack_start = time.time() + def gridpack_wait_monitoring(Idle, Running, Done): + if Idle+Running+Done == 0: + return + logger.info("Gridpack event generation: %s Idle, %s Running, %s Done [%s]" + % (Idle, Running, Done, misc.format_time(time.time()-gridpack_start))) + done = [] for j in jobs: - if j['P_dir'] in done: - continue - done.append(j['P_dir']) + if self.nprocs == 1: + if j['P_dir'] in done: + continue + done.append(j['P_dir']) + # Give a little status. Sometimes these jobs run very long, and having hours without any + # console output can be a bit frightening and make users think we are looping. + if len(done)%5==0: + logger.info(f"Working on job {len(done)} of {len(jobs)}") + # set the working directory path. pwd = pjoin(os.getcwd(),j['P_dir']) if self.readonly else pjoin(self.me_dir, 'SubProcesses', j['P_dir']) - exe = pjoin(pwd, 'ajob1') + exe = pjoin(pwd, j['script_name']) st = os.stat(exe) os.chmod(exe, st.st_mode | stat.S_IEXEC) # run the code\ - cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + if self.nprocs == 1: + cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + else: + nprocs_cluster.cluster_submit(exe, cwd=pwd, packet_member=j['packet']) write_dir = '.' if self.readonly else pjoin(self.me_dir, 'SubProcesses') + if self.nprocs > 1: + nprocs_cluster.wait(self.me_dir, gridpack_wait_monitoring) + + if self.readonly: + combine_runs.CombineRuns(write_dir) + else: + combine_runs.CombineRuns(self.me_dir) self.check_events(goal_lum, to_refine, jobs, write_dir) def check_events(self, goal_lum, to_refine, jobs, Sdir): diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/hel_recycle.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/hel_recycle.py index 1471de4bcb..978ba6575e 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/hel_recycle.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/hel_recycle.py @@ -550,7 +550,7 @@ def get_jamp_lines(self, line): def get_amp2_lines(self, line): if line.startswith(' DO I = 1, NCOLOR'): self.in_amp2 = False - elif not line.isspace(): + elif not line.isspace() and 'DENOM' not in line: self.template_dict['amp2_lines'] += f'{line[0:6]} {self.add_indices(line[6:])}' def prepare_bools(self): diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/histograms.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/histograms.py index 51ae2914fc..6fbdd98100 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/histograms.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/histograms.py @@ -1149,11 +1149,8 @@ def parse_one_histo_from_stream(self, stream, all_weight_header, boundaries = [0.0,0.0] for j, weight in \ enumerate(HwU.histo_bin_weight_re.finditer(line_bin)): - if (j == len(weight_header)): - continue - if j == len(all_weight_header): - raise HwU.ParseError("There is more bin weights"+\ - " specified than expected (%i)"%len(weight_header)) + #if (j == len(weight_header)): + # continue if selected_central_weight == all_weight_header[j]: bin_weights['central'] = float(weight.group('weight')) if all_weight_header[j] == 'boundary_xmin': @@ -1858,6 +1855,8 @@ def parse_histos_from_PY8_XML_stream(self, stream, run_id=None, # If merging cut is negative, then pick only the one of the central scale # If not specified, then take them all but use the PDF and scale weight # of the central merging_scale for the variation. 
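# ----------------------------------------------------------------------------------------------------
# Note on the gen_ximprove.py gridpack hunk above (editor's sketch, not part of the patch): the new
# nb_split formula, int(max(1, ((needed_event - 1) // self.max_request_event) + 1)), is the usual
# integer ceiling ceil(needed_event / max_request_event), clamped to at least 1 and later capped at
# self.max_splitting. A minimal check of the idiom:
def nb_split(needed_event, max_request_event, max_splitting):
    n = max(1, (needed_event - 1) // max_request_event + 1)  # ceiling division
    return min(n, max_splitting)

assert nb_split(2500, 2500, 40) == 1    # exact multiple -> a single job
assert nb_split(2501, 2500, 40) == 2    # one event over -> one more job
assert nb_split(10**9, 2500, 40) == 40  # capped by max_splitting
# ----------------------------------------------------------------------------------------------------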
+ if not all_weights: + raise MadGraph5Error('No weights were found in the HwU XML source.') if merging_scale is None or merging_scale < 0.0: merging_scale_chosen = all_weights[2]['MERGING'] else: @@ -2480,14 +2479,14 @@ def get_main_central_plot_lines(HwU_name, block_position, color_index, # return [template_no_stat%rep_dic]+\ # ([template%rep_dic] if show_mc_uncertainties else []) - # The use of sqrt(-1) is just a trick to prevent the line to display + # The use of 1/0 is just a trick to prevent the line to display res = [] - rep_dic['data'] = '($3 < 0 ? sqrt(-1) : $3)' + rep_dic['data'] = '($3 < 0 ? 1/0 : $3)' res.append(template_no_stat%rep_dic) rep_dic['title'] = " title ''" if show_mc_uncertainties: res.append(template%rep_dic) - rep_dic['data'] = '($3 >= 0 ? sqrt(-1) : abs($3))' + rep_dic['data'] = '($3 >= 0 ? 1/0 : abs($3))' rep_dic['ls'] = ' ls %d'%(100+color_index) res.append(template_no_stat%rep_dic) if show_mc_uncertainties: @@ -2739,13 +2738,13 @@ def ratio_no_correlations(wgtsA, wgtsB): """#-- rendering subhistograms '%(subhistogram_type)s' %(unset label)s %(set_format_y)s +%(set_yscale)s set yrange [%(ymin).4e:%(ymax).4e] set origin %(origin_x).4e, %(origin_y).4e set size %(size_x).4e, %(size_y).4e set mytics %(mytics)d %(set_ytics)s %(set_format_x)s -%(set_yscale)s %(set_ylabel)s %(set_histo_label)s plot \\""" @@ -2878,7 +2877,7 @@ def ratio_no_correlations(wgtsA, wgtsB): # We decide to show uncertainties in the main plot only if they # are part of a monocolor band. Otherwise, they will only be - # shown in the first subplot. Notice that plotting 'sqrt(-1)' + # shown in the first subplot. Notice that plotting '1/0' # is just a trick so as to have only the key printed with no # line @@ -2890,7 +2889,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, scale variation'%title, band='scale' in use_band) else: uncertainty_plot_lines[-1]['scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] # And now PDF_variation if available if not PDF_var_pos is None and len(PDF_var_pos)>0: if 'pdf' in use_band: @@ -2899,7 +2898,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, PDF variation'%title, band='pdf' in use_band) else: uncertainty_plot_lines[-1]['pdf'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] # And now merging variation if available if not merging_var_pos is None and len(merging_var_pos)>0: if 'merging_scale' in use_band: @@ -2908,7 +2907,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, merging scale variation'%title, band='merging_scale' in use_band) else: uncertainty_plot_lines[-1]['merging_scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] # And now alpsfact variation if available if not alpsfact_var_pos is None and len(alpsfact_var_pos)>0: if 'alpsfact' in use_band: @@ -2917,7 +2916,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, alpsfact variation'%title, band='alpsfact' in use_band) else: uncertainty_plot_lines[-1]['alpsfact'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] # plot_lines.append( # "'%s' index %d using (($1+$2)/2):3 ls %d title '%s'"\ diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py 
b/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py index 0924927785..262d39a736 100644 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Aug 2023) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. import logging import os @@ -33,7 +33,7 @@ def compile(self, *args, **opts): if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') common_run_interface.CommonRunCmd.update_make_opts_full(path, - {'FPTYPE': self.run_card['floating_type'] }) + {'override FPTYPE': self.run_card['floating_type'] }) misc.sprint('FPTYPE checked') cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): @@ -76,7 +76,7 @@ def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + common_run_interface.CommonRunCmd.update_make_opts_full({'override FPTYPE': new_value}) else: raise Exception Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') @@ -133,7 +133,8 @@ def default_setup(self): super().default_setup() # change default value: self['cudacpp_backend'] = 'cuda' - self['vector_size'] = 16384 # already setup in default class (just change value) + self['vector_size'] = 32 # ZW: default to 32, might want to change to 64 to utilise AMD GPUs better as well # 16384 # already setup in default class (just change value) + self['nb_warp'] = 512 # number of warps per kernel call, for now setting to 16 384 / vector_size MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/lhe_parser.py index f6e47956cd..81d17f7cb1 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/lhe_parser.py @@ -1035,12 +1035,12 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): from_init = True if not from_init: - if group in grouped_cross: - grouped_cross[group] += self.allcross[i] - grouped_error[group] += self.error[i]**2 + if int(group) in grouped_cross: + grouped_cross[int(group)] += self.allcross[i] + grouped_error[int(group)] += self.error[i]**2 else: - grouped_cross[group] = self.allcross[i] - grouped_error[group] = self.error[i]**2 + grouped_cross[int(group)] = self.allcross[i] + grouped_error[int(group)] = self.error[i]**2 else: ban = banner_mod.Banner(ff.banner) for line in ban['init'].split('\n'): @@ -1048,11 +1048,11 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): if len(splitline)==4: cross, error, _, group = splitline if int(group) in grouped_cross: - grouped_cross[group] += float(cross) - grouped_error[group] += float(error)**2 + grouped_cross[int(group)] += float(cross) + grouped_error[int(group)] += float(error)**2 else: - grouped_cross[group] = float(cross) - grouped_error[group] = float(error)**2 + 
grouped_cross[int(group)] = float(cross) + grouped_error[int(group)] = float(error)**2 nb_group = len(grouped_cross) # compute the information for the first line @@ -1086,6 +1086,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): self.seek(0) if init_information["idbmup2"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup2"] = event[1].pdg self.seek(0) @@ -2166,10 +2168,13 @@ def check(self): abspz += abs(particle.pz) # check mass fourmass = FourMomentum(particle).mass - - if particle.mass and (abs(particle.mass) - fourmass)/ abs(particle.mass) > threshold: - raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) - + if particle.mass: + expected = (particle.E - math.sqrt(particle.E**2 -particle.mass**2))/particle.E + if expected > 1e-8: + mass_threshold = particle.E**2 - (particle.E-threshold)**2 + if (abs(particle.mass) - fourmass)/ mass_threshold > 5: + raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) + if E/absE > threshold: logger.critical(self) @@ -2953,8 +2958,8 @@ def pt(self): @property def pseudorapidity(self): - norm = math.sqrt(self.px**2 + self.py**2+self.pz**2) - return 0.5* math.log((norm - self.pz) / (norm + self.pz)) + norm = math.sqrt(self.px**2 + self.py**2 + self.pz**2) + return 0.5* math.log((norm + self.pz) / (norm - self.pz)) @property def rapidity(self): diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/madevent_interface.py index 85e5bcf5e3..4d5597c722 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/madevent_interface.py @@ -1171,10 +1171,10 @@ def check_survey(self, args, cmd='survey'): for opt,value in self._survey_options.items(): if arg.startswith('--%s=' % opt): exec('self.opts[\'%s\'] = %s(arg.split(\'=\')[-1])' % \ - (opt, value[0])) + (opt, value[0]), globals(), {'self':self, 'arg':arg}) arg = "" if arg != "": raise Exception - except Exception: + except Exception as error: self.help_survey() raise self.InvalidCmd('invalid %s argument'% arg) @@ -2827,10 +2827,10 @@ def print_results_in_shell(self, data): logger.info(" Nb of events after matching/merging : %d" % int(data['nb_event_pythia'])) if self.run_card['use_syst'] in self.true and \ (int(self.run_card['ickkw'])==1 or self.run_card['ktdurham']>0.0 - or self.run_card['ptlund']>0.0): + or self.run_card['ptlund']>0.0) and data['cross_pythia'] == -1: logger.info(" Notice that because Systematics computation is turned on, the merging did not veto events but modified their weights instead.\n"+\ " The resulting hepmc/stdhep file should therefore be use with those weights.") - else: + elif data['cross_pythia'] == -1: logger.info(" Nb of events after merging : %s" % data['nb_event_pythia']) logger.info(" " ) @@ -3055,6 +3055,7 @@ def do_multi_run(self, line): crossoversig = 0 inv_sq_err = 0 nb_event = 0 + madspin = False for i in range(nb_run): self.nb_refine = 0 self.exec_cmd('generate_events %s_%s -f' % (main_name, i), postcmd=False) @@ -3067,6 +3068,8 @@ def do_multi_run(self, line): inv_sq_err+=1.0/error**2 self.results[main_name][-1]['cross'] = crossoversig/inv_sq_err self.results[main_name][-1]['error'] = math.sqrt(1.0/inv_sq_err) + if 'decayed' in self.run_name: + madspin = True 
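# ----------------------------------------------------------------------------------------------------
# Check of the lhe_parser.py pseudorapidity fix above (editor's sketch, not part of the patch): the
# textbook definition is eta = 0.5*ln((|p|+pz)/(|p|-pz)), which is positive for a forward-going
# particle (pz > 0); the replaced line had the two terms swapped and therefore the opposite sign.
import math

def pseudorapidity(px, py, pz):
    norm = math.sqrt(px**2 + py**2 + pz**2)
    return 0.5 * math.log((norm + pz) / (norm - pz))  # fixed sign convention

# for a particle at polar angle theta, eta = -ln(tan(theta/2))
theta = 0.3
eta = pseudorapidity(math.sin(theta), 0.0, math.cos(theta))
assert abs(eta - (-math.log(math.tan(theta / 2)))) < 1e-12
assert eta > 0  # pz > 0 must give eta > 0
# ----------------------------------------------------------------------------------------------------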
self.results.def_current(main_name) self.run_name = main_name self.update_status("Merging LHE files", level='parton') @@ -3074,9 +3077,12 @@ def do_multi_run(self, line): os.mkdir(pjoin(self.me_dir,'Events', self.run_name)) except Exception: pass - os.system('%(bin)s/merge.pl %(event)s/%(name)s_*/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' + + os.system('%(bin)s/merge.pl %(event)s/%(name)s_*%(madspin)s/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' % {'bin': self.dirbin, 'event': pjoin(self.me_dir,'Events'), - 'name': self.run_name}) + 'name': self.run_name, + 'madspin': '_decayed_*' if madspin else '' + }) eradir = self.options['exrootanalysis_path'] if eradir and misc.is_executable(pjoin(eradir,'ExRootLHEFConverter')): @@ -3656,9 +3662,11 @@ def do_refine(self, line): else: self.refine_mode = "new" - cross, error = self.make_make_all_html_results() + cross, error, across = self.make_make_all_html_results(get_attr=('xsec','xerru','axsec')) + self.results.add_detail('cross', cross) self.results.add_detail('error', error) + self.results.add_detail('axsec', across) self.results.add_detail('run_statistics', dict(self.results.get_detail('run_statistics'))) @@ -3757,6 +3765,8 @@ def split(a, n): k, m = divmod(len(a), n) return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + Gdirs = self.remove_empty_events(Gdirs) + partials_info = [] if len(Gdirs) >= max_G: start_unweight= time.perf_counter() @@ -3786,7 +3796,7 @@ def split(a, n): for i, local_G in enumerate(split(Gdirs, nb_chunk)): line = [pjoin(self.me_dir, "Events", self.run_name, "partials%d.lhe.gz" % i)] line.append(pjoin(self.me_dir, 'Events', self.run_name, '%s_%s_banner.txt' % (self.run_name, tag))) - line.append(str(self.results.current['cross'])) + line.append(str(self.results.current.get('axsec'))) line += local_G partials_info.append(self.do_combine_events_partial(' '.join(line), preprocess_only=True)) mycluster.submit(sys.executable, @@ -4223,7 +4233,7 @@ def mg5amc_py8_interface_consistency_warning(options): return None - def setup_Pythia8RunAndCard(self, PY8_Card, run_type): + def setup_Pythia8RunAndCard(self, PY8_Card, run_type, use_mg5amc_py8_interface): """ Setup the Pythia8 Run environment and card. In particular all the process and run specific parameters of the card are automatically set here. This function returns the path where HEPMC events will be output, if any.""" @@ -4338,10 +4348,10 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.systemSet('Beams:setProductionScalesFromLHEF',True) # Automatically set qWeed to xqcut if not defined by the user. - if PY8_Card['SysCalc:qWeed']==-1.0: + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qWeed']==-1.0: PY8_Card.MadGraphSet('SysCalc:qWeed',self.run_card['xqcut'], force=True) - if PY8_Card['SysCalc:qCutList']=='auto': + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qCutList']=='auto': if self.run_card['use_syst']: if self.run_card['sys_matchscale']=='auto': qcut = PY8_Card['JetMatching:qCut'] @@ -4368,7 +4378,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): # Specific MLM settings # PY8 should not implement the MLM veto since the driver should do it # if merging scale variation is turned on - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. 
PY8_Card.MadGraphSet('JetMatching:doVeto',False) @@ -4444,7 +4454,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.MadGraphSet('SpaceShower:pTmaxMatch',1) PY8_Card.MadGraphSet('SpaceShower:rapidityOrder',False) # PY8 should not implement the CKKW veto since the driver should do it. - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('Merging:applyVeto',False) @@ -4516,6 +4526,12 @@ def do_pythia8(self, line): else: no_default = False + if '--old_interface' in args: + use_mg5amc_py8_interface = True + args.remove('--old_interface') + else: + use_mg5amc_py8_interface = False + if not self.run_name: self.check_pythia8(args) self.configure_directory(html_opening =False) @@ -4545,20 +4561,27 @@ def do_pythia8(self, line): #"Please use 'event_norm = average' in the run_card to avoid this problem.") - - if not self.options['mg5amc_py8_interface_path'] or not \ - os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface')): - raise self.InvalidCmd( -"""The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. -Please install this tool with the following MG5_aMC command: - MG5_aMC> install mg5amc_py8_interface_path""") + if use_mg5amc_py8_interface: + if not self.options['mg5amc_py8_interface_path'] or not \ + os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface')): + raise self.InvalidCmd( + """The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. + Please install this tool with the following MG5_aMC command: + MG5_aMC> install mg5amc_py8_interface_path""") + else: + pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface') + warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) + if warnings: + logger.warning(warnings) else: - pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface') - warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) - if warnings: - logger.warning(warnings) + pythia_main = pjoin(self.options['pythia8_path'], 'share', 'Pythia8', 'examples', 'main164') + if not os.path.exists(pythia_main): + pythia_main = pjoin(self.options['pythia8_path'], 'examples', 'main164') + if not os.path.exists(pythia_main): + logger.warning('main164 not found (or not compiled). Will try the old interface instead.') + return self.do_pythia8(line + ' --old_interface') self.results.add_detail('run_mode', 'madevent') @@ -4583,14 +4606,19 @@ def do_pythia8(self, line): run_type = 'CKKW' # Edit the card and run environment according to the run specification - HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type) + HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type, use_mg5amc_py8_interface=use_mg5amc_py8_interface) + + if not use_mg5amc_py8_interface and self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + PY8_Card['Main:numberOfEvents']= self.run_card['nevents'] + # Now write the card. 
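A note on the driver selection earlier in this do_pythia8 hunk, before the card writing continues below: unless the new --old_interface option is given, MadEvent now looks for the main164 example shipped with Pythia8 itself (first under share/Pythia8/examples, then under examples) and only falls back to the old MG5aMC_PY8_interface, by re-invoking do_pythia8 with --old_interface, when main164 is not found; main164 additionally needs a -c flag in front of the command card, which is why the wrappers below splice in '' or '-c' depending on use_mg5amc_py8_interface. A simplified sketch of the resolution order (resolve_pythia_main is a name invented here; the paths and behaviour mirror the hunk above):

    import os

    def resolve_pythia_main(pythia8_path, use_old_interface, mg5amc_py8_interface_path=None):
        """Sketch of the executable-selection order in do_pythia8 (illustration only)."""
        if use_old_interface:
            # old behaviour: require the separately installed MG5aMC_PY8_interface tool
            if not mg5amc_py8_interface_path:
                raise RuntimeError('The MG5aMC_PY8_interface tool cannot be found')
            return os.path.join(mg5amc_py8_interface_path, 'MG5aMC_PY8_interface')
        # new default: the main164 example bundled with Pythia8 (run as 'main164 -c <card>')
        for candidate in (os.path.join(pythia8_path, 'share', 'Pythia8', 'examples', 'main164'),
                          os.path.join(pythia8_path, 'examples', 'main164')):
            if os.path.exists(candidate):
                return candidate
        return None  # caller falls back to retrying with --old_interface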
pythia_cmd_card = pjoin(self.me_dir, 'Events', self.run_name , '%s_pythia8.cmd' % tag) cmd_card = StringIO.StringIO() PY8_Card.write(cmd_card,pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Now setup the preamble to make sure that everything will use the locally # installed tools (if present) even if the user did not add it to its @@ -4632,7 +4660,7 @@ def do_pythia8(self, line): " command '/usr/bin/env %s' exists and returns a valid path."%shell) exe_cmd = "#!%s\n%s"%(shell_exe,' '.join( - [preamble+pythia_main, + [preamble+pythia_main, '' if use_mg5amc_py8_interface else '-c', os.path.basename(pythia_cmd_card)])) wrapper.write(exe_cmd) @@ -4699,6 +4727,7 @@ def do_pythia8(self, line): n_cores = max(min(min_n_core,n_cores),1) if self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + # No need for parallelization anymore self.cluster = None logger.info('Follow Pythia8 shower by running the '+ @@ -4744,20 +4773,22 @@ def do_pythia8(self, line): ParallelPY8Card.subruns[0].systemSet('Beams:LHEF','events.lhe.gz') ParallelPY8Card.write(pjoin(parallelization_dir,'PY8Card.dat'), pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Write the wrapper wrapper_path = pjoin(parallelization_dir,'run_PY8.sh') wrapper = open(wrapper_path,'w') if self.options['cluster_temp_path'] is None: exe_cmd = \
"""#!%%s
./%%s %s PY8Card.dat >& PY8_log.txt
""" % ('' if use_mg5amc_py8_interface else '-c') + else: exe_cmd = \
"""#!%%s
ln -s ./events_$1.lhe.gz ./events.lhe.gz
-./%s PY8Card_$1.dat >& PY8_log.txt
+./%%s %s PY8Card_$1.dat >& PY8_log.txt
mkdir split_$1 if [ -f ./events.hepmc ]; then @@ -4776,7 +4807,7 @@ def do_pythia8(self, line): mv ./PY8_log.txt ./split_$1/ fi tar -czf split_$1.tar.gz split_$1
""" % ('' if use_mg5amc_py8_interface else '-c') exe_cmd = exe_cmd%(shell_exe,os.path.basename(pythia_main)) wrapper.write(exe_cmd) wrapper.close() @@ -4812,19 +4843,27 @@ def do_pythia8(self, line): pjoin(parallelization_dir,split_files[-1])) logger.info('Submitting Pythia8 jobs...') + for i, split_file in enumerate(split_files): # We must write a PY8Card tailored for each split so as to correct the normalization # HEPMCoutput:scaling of each weight since the showered LHE file will no longer contain the # same original number of events - split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat')) + split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat'), setter='user') + assert split_PY8_Card['JetMatching:nJetMax'] == PY8_Card['JetMatching:nJetMax'] + + + # Make sure to use the number of split_events determined during the splitting. - split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i]) + split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i], force=True) + assert split_PY8_Card['Main:numberOfEvents'] == partition_for_PY8[i] split_PY8_Card.systemSet('HEPMCoutput:scaling',split_PY8_Card['HEPMCoutput:scaling']* - (float(partition_for_PY8[i]))) + (float(partition_for_PY8[i])), force=True) # Add_missing set to False so as to be sure not to add any additional parameter w.r.t # the ones in the original PY8 param_card copied.
split_PY8_Card.write(pjoin(parallelization_dir,'PY8Card_%d.dat'%i), - pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False) + pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False, + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) in_files = [pjoin(parallelization_dir,os.path.basename(pythia_main)), pjoin(parallelization_dir,'PY8Card_%d.dat'%i), pjoin(parallelization_dir,split_file)] @@ -5073,7 +5112,7 @@ def wait_monitoring(Idle, Running, Done): # works both for fixed number of generated events and fixed accepted events self.results.add_detail('error_pythia', error_m) - if self.run_card['use_syst']: + if self.run_card['use_syst'] and use_mg5amc_py8_interface: self.results.add_detail('cross_pythia', -1) self.results.add_detail('error_pythia', 0) @@ -6132,7 +6171,102 @@ def get_Gdir(self, Pdir=None, symfact=None): mfactors[pjoin(P, "G%s" % tag)] = mfactor self.Gdirs = (Gdirs, mfactors) return self.get_Gdir(Pdir, symfact=symfact) + + ############################################################################ + def remove_empty_events(self, Gdir): + """return Gdirs stripped of those that produced an empty events.lhe file.""" + + reasons = collections.defaultdict(list) + Gdirs = Gdir[:] + for G in Gdirs[:]: + try: + size = os.path.getsize(pjoin(G, 'events.lhe')) + except Exception as error: + size = 0 + if size <10: + Gdirs.remove(G) + try: + log = misc.BackRead(pjoin(G, 'log.txt')) + except Exception as error: + log = misc.BackRead(pjoin(G, 'run1_app.log')) + found = -1 + for line in log: + if 'Deleting file events.lhe' in line: + found = 0 + elif "Impossible BW configuration" in line: + reasons['bwconfig'].append(G) + break + elif found < -150: + reasons['not found'].append(G) + Gdirs.append(G) + break + elif found < 0: + found -= 1 + elif 'Loosen cuts or increase max_events' in line: + reasons['cuts'].append(G) + break + elif 'all returned zero' in line: + reasons['zero'].append(G) + break + elif found > 5: + reasons['unknown'].append(G) + break + else: + found += 1 + + if len(reasons): + logger.debug('Reasons for empty events.lhe:') + if len(reasons['unknown']): + logger.debug(' - unknown: %s' % len(reasons['unknown'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['unknown'][:10]])) + if len(reasons['not found']): + logger.debug(' - not found in log: %s' % len(reasons['not found'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['not found'][:10]])) + if len(reasons['zero']): + logger.debug(' - zero amplitudes: %s' % len(reasons['zero'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['zero'][:10]])) + if len(reasons['bwconfig']): + critical_bwconfig = set() + for G in reasons['bwconfig']: + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_bwconfig.add(os.sep.join(base.rsplit(os.sep)[-2:])) + for G in critical_bwconfig: + logger.warning('Gdirectory %s has no events.lhe file (no possible BW configuration found).' % G) + + logger.debug(' - impossible BW configuration: %s' % len(reasons['bwconfig'])) + logger.debug(' - channel with no possible BW configuration: %s' % len(critical_bwconfig)) + + if len(reasons['cuts']): + critical_nb_cuts = collections.defaultdict(int) + for G in reasons['cuts']: + if '.' in os.path.basename(G): + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_nb_cuts[os.sep.join(base.rsplit(os.sep)[-2:])] += 1 + else: + critical_nb_cuts[''] += 1 + logger.warning('Gdirectory %s has no events.lhe file (no points passed the cuts).' % G) + for G, nb in critical_nb_cuts.items(): + if not G: + continue + else: + logger.warning('%s subchannels of %s.XXX have no events.lhe file (no points passed the cuts); no %s with events detected' % (nb, G, G)) + logger.debug(' - no points passed cuts: %s' % len(reasons['cuts'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['cuts'][:10]])) + logger.debug(' - without any BW handling (critical): %s' % critical_nb_cuts['']) + logger.debug(' - with BW but all zero (critical): %s' % sum([nb for v, nb in critical_nb_cuts.items() if v!=''], 0)) + #logger.debug(' - cuts (with BW conflict where other channel contributes): %s' % (len(reasons['cuts'])- critical_nb_cuts)) + + + return Gdirs + + ############################################################################ def set_run_name(self, name, tag=None, level='parton', reload_card=False, allow_new_tag=True): @@ -6749,7 +6883,7 @@ def get_subP_ids(path): class GridPackCmd(MadEventCmd): """The command for the gridpack --Those are not supposed to be used interactively--""" - def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **stdin): + def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, nprocs=1, maxevts=2500, *completekey, **stdin): """Initialize the command and directly run""" # Initialize properly @@ -6759,6 +6893,8 @@ def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **s self.random = seed self.random_orig = self.random self.granularity = gran + self.nprocs = nprocs + self.maxevts = maxevts self.options['automatic_html_opening'] = False #write the grid_card.dat on disk @@ -6874,7 +7010,7 @@ def launch(self, nb_event, seed): #misc.call([pjoin(self.me_dir,'bin','refine4grid'), # str(nb_event), '0', 'Madevent','1','GridRun_%s' % seed], # cwd=self.me_dir) - self.refine4grid(nb_event) + self.gridpack_cross = self.refine4grid(nb_event) # 3) Combine the events/pythia/... self.exec_cmd('combine_events') @@ -6902,6 +7038,8 @@ def refine4grid(self, nb_event): precision = nb_event + across= self.make_make_all_html_results(get_attr='axsec') + self.opts = dict([(key,value[1]) for (key,value) in \ self._survey_options.items()]) @@ -6915,8 +7053,9 @@ def refine4grid(self, nb_event): self.update_status('Refine results to %s' % precision, level=None) logger.info("Using random number seed offset = %s" % self.random) - refine_opt = {'err_goal': nb_event, 'split_channels': False, - 'ngran':self.granularity, 'readonly': self.readonly} + refine_opt = {'err_goal': nb_event, 'split_channels': True, - 'ngran':self.granularity, 'readonly': self.readonly} + 'ngran':self.granularity, 'readonly': self.readonly, + 'nprocs': self.nprocs, 'maxevts': self.maxevts} x_improve = gen_ximprove.gen_ximprove_gridpack(self, refine_opt) x_improve.launch() # create the ajob for the refinement and run those! self.gscalefact = x_improve.gscalefact #store jacobian associated to the gridpack @@ -6926,7 +7065,7 @@ def refine4grid(self, nb_event): #print 'run combine!!!'
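The GridPackCmd changes above introduce two new constructor arguments, nprocs and maxevts, which are stored on the instance and forwarded to gen_ximprove.gen_ximprove_gridpack through refine_opt (where split_channels is now True), and refine4grid now returns the refined axsec so that launch can store it as self.gridpack_cross for the event-target computation in do_combine_events below. A hedged usage sketch (the keyword values and the import path are illustrative assumptions; the signature itself is taken from the hunk above):

    # Illustrative only: how the new gridpack knobs might be passed through.
    # The import path mirrors the generated bin/ scripts and is an assumption here.
    from internal import madevent_interface as ME

    launcher = ME.GridPackCmd(me_dir='/path/to/gridpack',  # hypothetical directory
                              nb_event=10000, seed=33, gran=-1,
                              nprocs=4,       # new: number of parallel refine processes
                              maxevts=2500)   # new: cap on events per refine job (default)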
#combine_runs.CombineRuns(self.me_dir) - return + return across #update html output Presults = sum_html.collect_result(self) cross, error = Presults.xsec, Presults.xerru @@ -7051,10 +7190,13 @@ def do_combine_events(self, line): sum_axsec += result.get('axsec')*gscalefact[Gdir] if len(AllEvent) >= 80: #perform a partial unweighting - if self.results.current['cross'] == 0 and self.run_card['gridpack']: - nb_event= self.nb_event + if not self.results.current.get('axsec'): + if self.run_card['gridpack'] and self.gridpack_cross: + nb_event = min(abs(1.05*self.nb_event*sum_axsec/self.gridpack_cross),self.nb_event) + else: + nb_event= self.nb_event else: - nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current['cross']),self.run_card['nevents']) + nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current.get('axsec')),self.run_card['nevents'], self.nb_event, self.gridpack_cross, sum_axsec) AllEvent.unweight(pjoin(outdir, self.run_name, "partials%s.lhe.gz" % partials), get_wgt, log_level=5, trunc_error=1e-2, event_target=nb_event) AllEvent = lhe_parser.MultiEventFile() @@ -7068,6 +7210,7 @@ def do_combine_events(self, line): for data in partials_info: AllEvent.add(*data) + sum_xsec += data[1] if not hasattr(self,'proc_characteristic'): self.proc_characteristic = self.get_characteristics() diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/restore_data b/epochX/cudacpp/gq_ttq.mad/bin/internal/restore_data index 6205bb9567..407ed7aa91 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/restore_data +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/restore_data @@ -48,8 +48,17 @@ for i in `cat subproc.mg` ; do cd ../ done +# check if we are on a Mac, otherwise assume Linux +if [[ "$OSTYPE" == "darwin"* ]]; then + # no nproc on Mac, so use sysctl instead + # use -S1024 because there is a limit on the length of the command + xargs_opts="-P $(sysctl -n hw.ncpu) -S1024" +else + xargs_opts="-P $(nproc --all)" +fi + find . -mindepth 2 -maxdepth 2 -type d -name 'G*' -print0 \ - | xargs --null -P "$(nproc --all)" -I{} bash -c " + | xargs --null ${xargs_opts} -I{} bash -c " cd {} for j in $1_results.dat ; do if [[ -e \$j ]] ; then diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/sum_html.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/sum_html.py index 9dd5826f71..fb8dd3a74a 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/sum_html.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/sum_html.py @@ -770,7 +770,7 @@ def collect_result(cmd, folder_names=[], jobs=None, main_dir=None): return all -def make_all_html_results(cmd, folder_names = [], jobs=[]): +def make_all_html_results(cmd, folder_names = [], jobs=[], get_attr=None): """ folder_names and jobs have been added for the amcatnlo runs """ run = cmd.results.current['run_name'] if not os.path.exists(pjoin(cmd.me_dir, 'HTML', run)): @@ -794,7 +794,12 @@ def make_all_html_results(cmd, folder_names = [], jobs=[]): fsock.write('%s

' % Presults.get_html(run, unit, cmd.me_dir)) fsock.write('%s
' % P_text) - return Presults.xsec, Presults.xerru + if not get_attr: + return Presults.xsec, Presults.xerru + else: + if isinstance(get_attr, tuple): + return [getattr(Presults, _) for _ in get_attr] + return getattr(Presults, get_attr) diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/ufomodel/write_param_card.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/ufomodel/write_param_card.py index 57a85b0614..33a89259f8 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/ufomodel/write_param_card.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/ufomodel/write_param_card.py @@ -116,9 +116,10 @@ def write_param(self, param, lhablock): def write_dep_param_block(self, lhablock): import cmath from parameters import all_parameters + param_values = {'cmath':cmath} for parameter in all_parameters: try: - exec("%s = %s" % (parameter.name, parameter.value)) + exec("%s = %s" % (parameter.name, parameter.value), globals(), param_values) except Exception: pass text = "## Not dependent paramater.\n" @@ -134,7 +135,7 @@ def write_dep_param_block(self, lhablock): prefix = "DECAY " for part, param in data: if isinstance(param.value, str): - value = complex(eval(param.value)).real + value = complex(eval(param.value, globals(), param_values)).real else: value = param.value diff --git a/epochX/cudacpp/gq_ttq.mad/bin/madevent b/epochX/cudacpp/gq_ttq.mad/bin/madevent index dff9711b73..9c5363e682 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/madevent +++ b/epochX/cudacpp/gq_ttq.mad/bin/madevent @@ -178,6 +178,17 @@ force_run = False if (args and args[0] == 'treatcards'): force_run=True + +# check that madgraph is not in PYTHONPATH +try: + import madgraph +except ImportError: + pass +else: + logger.getLogger('madgraph').error('Looks like you do have madgraph in your PYTHONPATH (or you run this executable from the main MG5aMC directory). This executable will likely not work in such case.') + + + # Call the cmd interface main loop try: if '-h' in args or '--help' in args: diff --git a/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h b/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h index a304fc85c8..616eab36fd 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h @@ -2,13 +2,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Sep 2010) for the MG5aMC backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -860,7 +860,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ INLINE void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -872,7 +872,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ INLINE void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -885,7 +885,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ INLINE void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -898,7 +898,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ INLINE void FFV1P0_3( const fptype allF1[], const fptype allF2[], @@ -911,7 +911,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6] - template + template __device__ INLINE void VVV1_0( const fptype allV1[], const fptype allV2[], @@ -923,7 +923,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -936,7 +936,7 @@ namespace mg5amcCpu const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const cxtype_sv TMP0 = ( F1[2] * ( F2[4] * ( V3[2] + V3[5] ) + F2[5] * ( V3[3] + cI * V3[4] ) ) + ( F1[3] * ( F2[4] * ( V3[3] - cI * V3[4] ) + F2[5] * ( V3[2] - V3[5] ) ) + ( F1[4] * ( F2[2] * ( V3[2] - V3[5] ) - F2[3] * ( V3[3] + cI * V3[4] ) ) + F1[5] * ( F2[2] * ( -V3[3] + cI * V3[4] ) + F2[3] * ( V3[2] + V3[5] ) ) ) ) ); @@ -948,7 +948,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -961,7 +961,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F1 = W_ACCESS::kernelAccess( allF1 ); const cxtype cI = cxmake( 0., 1. 
); F1[0] = +F2[0] + V3[0]; @@ -980,7 +980,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -993,7 +993,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F2 = W_ACCESS::kernelAccess( allF2 ); const cxtype cI = cxmake( 0., 1. ); F2[0] = +F1[0] + V3[0]; @@ -1012,7 +1012,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ void FFV1P0_3( const fptype allF1[], const fptype allF2[], @@ -1025,7 +1025,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V3 = W_ACCESS::kernelAccess( allV3 ); const cxtype cI = cxmake( 0., 1. ); V3[0] = +F1[0] + F2[0]; @@ -1043,7 +1043,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6] - template + template __device__ void VVV1_0( const fptype allV1[], const fptype allV2[], @@ -1056,7 +1056,7 @@ namespace mg5amcCpu const cxtype_sv* V1 = W_ACCESS::kernelAccessConst( allV1 ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P1[4] = { +cxreal( V1[0] ), +cxreal( V1[1] ), +cximag( V1[1] ), +cximag( V1[0] ) }; diff --git a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc index 998cb505a0..1dfc54c553 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h index 1565ed5888..a4ace231d8 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -300,7 +300,7 @@ namespace mg5amcCpu #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #endif // Compute the output couplings (e.g. gc10 and gc11) from the input gs - template + template __device__ inline void G2COUP( const fptype gs[], fptype couplings[], @@ -310,10 +310,10 @@ namespace mg5amcCpu using namespace Parameters_sm_dependentCouplings; const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr ); - fptype* GC_11s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); - fptype* GC_10s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); - cxtype_sv_ref GC_11s_sv = C_ACCESS::kernelAccess( GC_11s ); - cxtype_sv_ref GC_10s_sv = C_ACCESS::kernelAccess( GC_10s ); + fptype* GC_11s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); + fptype* GC_10s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); + cxtype_sv_ref GC_11s_sv = CD_ACCESS::kernelAccess( GC_11s ); + cxtype_sv_ref GC_10s_sv = CD_ACCESS::kernelAccess( GC_10s ); GC_11s_sv = couplings_sv.GC_11; GC_10s_sv = couplings_sv.GC_10; mgDebug( 1, __FUNCTION__ ); diff --git a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h index 7c6a082392..ca859a602e 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for generating random numbers +// For both CUDA and HIP, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] 
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuCxtypes.h index 92d74fd6db..e98e925f2a 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -717,12 +717,24 @@ namespace mg5amcCpu : m_preal( &r ), m_pimag( &i ) {} // copy (create from) const refs cxtype_ref& operator=( const cxtype_ref& ) = delete; //__host__ __device__ cxtype_ref& operator=( cxtype_ref&& c ) {...} // REMOVED! Should copy refs or copy values? No longer needed in cxternary - __host__ __device__ cxtype_ref& operator=( const cxtype& c ) + __host__ __device__ cxtype_ref& operator=( const cxtype& c ) // copy (assign) const values { *m_preal = cxreal( c ); *m_pimag = cximag( c ); return *this; - } // copy (assign) non-const values + } + __host__ __device__ cxtype_ref& operator+=( const cxtype& c ) + { + *m_preal += cxreal( c ); + *m_pimag += cximag( c ); + return *this; + } + __host__ __device__ cxtype_ref& operator-=( const cxtype& c ) + { + *m_preal -= cxreal( c ); + *m_pimag -= cximag( c ); + return *this; + } __host__ __device__ operator cxtype() const { return cxmake( *m_preal, *m_pimag ); } private: fptype* const m_preal; // const pointer to non-const fptype R diff --git a/epochX/cudacpp/gq_ttq.mad/test/cudacpp_test.mk b/epochX/cudacpp/gq_ttq.mad/test/cudacpp_test.mk index f703a1ae7c..1f9f8bbc46 100644 --- a/epochX/cudacpp/gq_ttq.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/gq_ttq.mad/test/cudacpp_test.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. 
+# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) @@ -19,7 +19,7 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt index 8249ac5d67..948a9250a9 100644 --- a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.3 2025-06-12 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -49,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -57,7 +57,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006358146667480469  +DEBUG: model prefixing takes 0.005482912063598633  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -166,13 +166,13 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. 
-8 processes with 40 diagrams generated in 0.078 s +8 processes with 40 diagrams generated in 0.079 s Total: 8 processes with 40 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gq_ttq Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g u > t t~ u WEIGHTED<=3 @1 INFO: Processing color information for process: g u > t t~ u @1 @@ -184,45 +184,45 @@ INFO: Processing color information for process: g u~ > t t~ u~ @1 INFO: Combined process g c~ > t t~ c~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Combined process g d~ > t t~ d~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Combined process g s~ > t t~ s~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 218]  -DEBUG: type(subproc_group)= [output.py at line 219]  -DEBUG: type(fortran_model)= [output.py at line 220]  -DEBUG: type(me)= me=0 [output.py at line 221]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 222]  -INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/. 
-DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 218]  -DEBUG: type(subproc_group)= [output.py at line 219]  -DEBUG: type(fortran_model)= [output.py at line 220]  -DEBUG: type(me)= me=1 [output.py at line 221]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 222]  -INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/. 
-Generated helas calls for 2 subprocesses (10 diagrams) in 0.031 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  +DEBUG: type(subproc_group)= [output.py at line 223]  +DEBUG: type(fortran_model)= [output.py at line 224]  +DEBUG: type(me)= me=0 [output.py at line 225]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'diagram_boilerplate.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  +INFO: Creating files in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/. 
+DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  +DEBUG: type(subproc_group)= [output.py at line 223]  +DEBUG: type(fortran_model)= [output.py at line 224]  +DEBUG: type(me)= me=1 [output.py at line 225]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'diagram_boilerplate.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  +INFO: Creating files in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/. +Generated helas calls for 2 subprocesses (10 diagrams) in 0.030 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.145 s +ALOHA: aloha creates 2 routines in 0.141 s FFV1 FFV1 FFV1 FFV1 VVV1 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. quit -real 0m0.670s -user 0m0.588s -sys 0m0.061s -Code generation completed in 1 seconds +real 0m0.684s +user 0m0.589s +sys 0m0.066s +Code generation completed in 0 seconds diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h index 87aa648dd2..a45024704a 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -255,11 +255,15 @@ namespace mg5amcCpu throw std::logic_error( "Bridge constructor: FIXME! cannot choose gputhreads" ); // this should never happen! m_gpublocks = m_nevt / m_gputhreads; } +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters @@ -290,8 +294,10 @@ namespace mg5amcCpu throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! 
Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -347,7 +353,9 @@ namespace mg5amcCpu if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -400,7 +408,9 @@ namespace mg5amcCpu } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc index f463977c1a..a68ae314eb 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
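(A note on the GpuRuntime.h hunk above: checkGpuBlas is the BLAS analogue of checkGpu, asserting whenever a gpuBlasStatus_t differs from GPUBLAS_STATUS_SUCCESS. Together with the vendor-neutral gpuBlas*/gpuStream* macros added in GpuAbstraction.h, one plausible usage sketch is the following; this is illustrative only, not generated code, and the 'handle'/'stream' names are hypothetical.)

gpuStream_t stream;
gpuStreamCreate( &stream ); // error checking via checkGpu is already built into this macro
#ifndef MGONGPU_HAS_NO_BLAS
gpuBlasHandle_t handle;
checkGpuBlas( gpuBlasCreate( &handle ) ); // prints file/line info and asserts unless GPUBLAS_STATUS_SUCCESS
checkGpuBlas( gpuBlasSetStream( handle, stream ) ); // BLAS work through 'handle' now runs on 'stream'
// ... enqueue gpuBlasTgemm / gpuBlasTgemmStridedBatched calls here ...
checkGpuBlas( gpuBlasDestroy( handle ) );
#endif
gpuStreamDestroy( stream ); // error checking via checkGpu is already built into this macro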
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,28 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() + , m_pHelWfs() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_helBlasHandles() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +353,82 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); + // Create the "one-helicity" wavefunction buffer that will be used for helicity filtering + m_pHelWfs.reset( new DeviceBufferSimple( CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? 
+ std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { +#ifndef MGONGPU_HAS_NO_BLAS + if( m_helBlasHandles[ihel] ) gpuBlasDestroy( m_helBlasHandles[ihel] ); +#endif + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +445,61 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. 
Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity + // Attach a different stream to each cuBLAS/hipBLAS handle + if( m_blasColorSum ) + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + checkGpuBlas( gpuBlasCreate( &m_helBlasHandles[ighel] ) ); + checkGpuBlas( gpuBlasSetStream( m_helBlasHandles[ighel], m_helStreams[ighel] ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_helBlasHandles[ighel], CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelWfs.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +507,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* ghelBlasHandles = ( m_blasColorSum ? m_helBlasHandles : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* ghelBlasHandles = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +527,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? 
m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h index 7acff4b308..c901874333 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] - static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,24 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + + // The super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelWfs; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +220,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary 
buffers + std::unique_ptr m_pHelBlasTmp; + + // The array of cuBLAS/hipBLAS handles (one for each good helicity) + gpuBlasHandle_t m_helBlasHandles[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessAmplitudes.h index 0d92f69c43..a49f041e05 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessAmplitudes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessAmplitudes_H #define MemoryAccessAmplitudes_H 1 @@ -10,10 +10,6 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_AMPLITUDES 1 - // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -23,120 +19,11 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // A class describing the internal layout of memory buffers for amplitudes - // This implementation uses an AOSOA[npagA][nx2][neppA] where nevt=npagA*neppA - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessAmplitudesBase //_AOSOAv1 - { - public: - - // Number of Events Per Page in the amplitude AOSOA memory buffer layout - static constexpr int neppA = 1; // AOS (just a test...) 
- - private: - - friend class MemoryAccessHelper; - friend class KernelAccessHelper; - friend class KernelAccessHelper; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) - { - const int ipagA = ievt / neppA; // #event "A-page" - const int ieppA = ievt % neppA; // #event in the current event A-page - constexpr int ix2 = 0; - return &( buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA] ); // AOSOA[ipagA][ix2][ieppA] - } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... args" to "const int ix2" and rename "Field" as "Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int ix2 ) - { - constexpr int ipagA = 0; - constexpr int ieppA = 0; - return buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA]; // AOSOA[ipagA][ix2][ieppA] - } - }; - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessAmplitudes : public MemoryAccessAmplitudesBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2 = MemoryAccessHelper::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2Const = - MemoryAccessHelper::template decodeRecordConst; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt, 
const int ix2 ) <===] - static constexpr auto ieventAccessIx2 = - MemoryAccessHelper::template ieventAccessField; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===] - static constexpr auto ieventAccessIx2Const = - MemoryAccessHelper::template ieventAccessFieldConst; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations + // A class providing trivial access to amplitude memory buffers template class KernelAccessAmplitudes { public: - -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2 = - KernelAccessHelper::template kernelAccessField; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2Const = - KernelAccessHelper::template kernelAccessFieldConst; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { @@ -148,8 +35,6 @@ namespace mg5amcCpu { return reinterpret_cast( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES }; //---------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessCouplings.h index 55504a2b90..caee99a7fd 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessCouplings.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. 
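(For orientation on the color-sum offload in the MatrixElementKernels hunks above: per event and per helicity, the quantity delegated to cuBLAS/hipBLAS is the quadratic form |M|^2 = sum_{i,j} conj(jamp[i]) * CF[i][j] * jamp[j] over the ncolor leading-color amplitudes, with CF a constant real color matrix (normalization assumed folded in). A naive single-event reference is sketched below; it is illustrative only, with double standing in for fptype and a row-major CF layout assumed.)

#include <complex>
inline double colorSumReference( const std::complex<double>* jamp, // input: ncolor partial amplitudes for one event and helicity
                                 const double* CF,                 // input: ncolor*ncolor color matrix, row-major
                                 const int ncolor )
{
  double me2 = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    std::complex<double> ztemp = 0;
    for( int jcol = 0; jcol < ncolor; jcol++ )
      ztemp += CF[icol * ncolor + jcol] * jamp[jcol]; // row icol of CF times the jamp vector
    me2 += ( std::conj( jamp[icol] ) * ztemp ).real(); // contract with conj(jamp)
  }
  return me2; // |M|^2 summed over leading colors
}

(Batched over nevt events and nGoodHel helicities, these inner products become the dense matrix products behind the gpuBlasTgemm/gpuBlasTgemmStridedBatched macros, which is what motivates the per-helicity jamp super-buffers, streams and BLAS handles introduced above.)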
#ifndef MemoryAccessCouplings_H #define MemoryAccessCouplings_H 1 @@ -235,7 +235,7 @@ namespace mg5amcCpu /* fptype_sv& real = kernelAccessIx2( buffer, 0 ); fptype_sv& imag = kernelAccessIx2( buffer, 1 ); - printf( "C_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv_ref( real, imag ); */ return cxtype_sv_ref( kernelAccessIx2( buffer, 0 ), @@ -250,7 +250,7 @@ namespace mg5amcCpu /* const fptype_sv& real = kernelAccessIx2Const( buffer, 0 ); const fptype_sv& imag = kernelAccessIx2Const( buffer, 1 ); - printf( "C_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv( real, imag ); */ return cxtype_sv( kernelAccessIx2Const( buffer, 0 ), diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessWavefunctions.h index 9f4c620bc7..bbffc1fb36 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessWavefunctions.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessWavefunctions_H #define MemoryAccessWavefunctions_H 1 @@ -10,9 +10,7 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 +#include "CPPProcess.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL @@ -23,147 +21,44 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // A class describing the internal layout of memory buffers for wavefunctions - // This implementation uses an AOSOA[npagW][nw6][nx2][neppW] where nevt=npagW*neppW - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessWavefunctionsBase //_AOSOAv1 +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessWavefunctions { public: - - // Number of Events Per Page in the wavefunction AOSOA memory buffer layout - static constexpr int neppW = 1; // AOS (just a test...) 
- - private: - - friend class MemoryAccessHelper; - friend class KernelAccessHelper; - friend class KernelAccessHelper; - - // The number of components of a (fermion or vector) wavefunction - static constexpr int nw6 = mgOnGpu::nw6; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) + static __host__ __device__ inline cxtype_sv* + kernelAccess( fptype* buffer ) { - const int ipagW = ievt / neppW; // #event "W-page" - const int ieppW = ievt % neppW; // #event in the current event W-page - constexpr int iw6 = 0; - constexpr int ix2 = 0; - return &( buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW] ); // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... 
args" to "const int iw6, const int ix2" and rename "Field" as "Iw6Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int iw6, - const int ix2 ) + static __host__ __device__ inline const cxtype_sv* + kernelAccessConst( const fptype* buffer ) { - constexpr int ipagW = 0; - constexpr int ieppW = 0; - return buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW]; // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } }; +#endif //---------------------------------------------------------------------------- - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessWavefunctions : public MemoryAccessWavefunctionsBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2 = MemoryAccessHelper::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2Const = - MemoryAccessHelper::template decodeRecordConst; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIw6Ix2( fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2 = - MemoryAccessHelper::template ieventAccessField; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIw6Ix2Const( const fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2Const = - MemoryAccessHelper::template ieventAccessFieldConst; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations - template - class KernelAccessWavefunctions + class HostAccessWavefunctions { public: - -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes 
(input) - // [Signature (non-const) ===> fptype& kernelAccessIw6Ix2( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2 = - KernelAccessHelper::template kernelAccessField; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIw6Ix2Const( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2Const = - KernelAccessHelper::template kernelAccessFieldConst; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { return reinterpret_cast( buffer ); } - static __host__ __device__ inline const cxtype_sv* kernelAccessConst( const fptype* buffer ) { return reinterpret_cast( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS }; //---------------------------------------------------------------------------- - typedef KernelAccessWavefunctions HostAccessWavefunctions; - typedef KernelAccessWavefunctions DeviceAccessWavefunctions; - - //---------------------------------------------------------------------------- - } // end namespace mg5amcGpu/mg5amcCpu #endif // MemoryAccessWavefunctions_H diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h index 65a101888d..c8db607db6 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. 
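(On the MemoryAccessWavefunctions.h rewrite just above: with kernel splitting, wavefunctions move from per-thread local storage into a global device buffer, and the new DeviceAccessWavefunctions accessor locates each event's record simply at an offset of ievt * nw6 * nx2 fptypes, i.e. an AOS over events with one record of nw6 complex components per event. A host-side sketch of that indexing follows; it is illustrative only, with double standing in for fptype, and the AOS-record interpretation is inferred from the accessor code shown above.)

// Return the start of the wavefunction record of event ievt (nw6 complex components, stored as nw6*nx2 reals)
inline double* wfRecord( double* buffer, const int ievt, const int nw6 = 6, const int nx2 = 2 )
{
  return buffer + ievt * nw6 * nx2; // AOS over events: one [nw6][nx2] record per event
}
// Component iw6 (0..nw6-1) and real/imaginary part ix2 (0..1) of event ievt are then at
//   wfRecord( buffer, ievt )[ iw6 * nx2 + ix2 ]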
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer 
PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc index 81ab8669a5..1ee522dbfd 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,20 +101,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 4; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,57 +169,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the 
current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - 
using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -229,328 +282,143 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 8; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
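For readers following this refactoring, the TRIVIAL vs non-trivial access distinction in these comments is an indexing convention: after the kernel split, each CUDA diagram kernel must find the wavefunctions written by the previous kernel in the global allWfs buffer rather than in registers or on the stack. Below is a minimal sketch of one plausible structure-of-arrays accessor for the wf[nwf*nw6*2*nevt] layout; the helper name wfAccess, the typedef and the exact index ordering are illustrative assumptions, not the plugin's actual accessor.

// Illustrative sketch only (assumed layout, not the generated code):
// wavefunction iwf, component iw6, real/imaginary part reim,
// for the current GPU thread's event ievt; events are contiguous
// in the innermost index so that accesses are coalesced.
typedef double fptype;  // assumption: the plugin's fptype may be float or double
constexpr int nw6 = 6;  // dimension of spin-1/2 or spin-1 wavefunctions (see comments above)
__device__ inline fptype&
wfAccess( fptype* allWfs, const int iwf, const int iw6, const int reim )
{
  const int nevt = gridDim.x * blockDim.x;
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  return allWfs[( ( iwf * nw6 + iw6 ) * 2 + reim ) * nevt + ievt];
}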
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g.
<<warning #186-D: pointless comparison of unsigned integer with zero>> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 5 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - ixxxxx( momenta, 0., cHel[ihel][1], +1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - oxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); - - FFV1_2( w_fp[1], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[5] ); - FFV1P0_3( w_fp[3], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[6] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[5], w_fp[4], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel
support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[2] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 2 OF 5 *** - - // Wavefunction(s) for diagram number 2 - FFV1_1( w_fp[2], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[1], w_fp[5] ); - FFV1P0_3( w_fp[1], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[7] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[3], w_fp[5], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[1] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 3 OF 5 *** - - // Wavefunction(s) for diagram number 3 - FFV1_2( w_fp[3], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[1], w_fp[5] ); - // Amplitude(s) for diagram number 3 - FFV1_0( w_fp[5], w_fp[2], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - jamp_sv[1] += 1. / 6. * amp_sv[0]; - jamp_sv[3] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 4 OF 5 *** - - // Wavefunction(s) for diagram number 4 - FFV1_1( w_fp[4], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[5] ); - - // Amplitude(s) for diagram number 4 - FFV1_0( w_fp[1], w_fp[5], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[2] += 1. / 6. * amp_sv[0]; - jamp_sv[3] -= 1. / 2.
* amp_sv[0]; - - // *** DIAGRAM 5 OF 5 *** - // Wavefunction(s) for diagram number 5 - // (none) - - // Amplitude(s) for diagram number 5 - VVV1_0( w_fp[0], w_fp[7], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gu_ttxu()?) - - // The color denominators (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] - - // The color matrix (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 12, 4, 4, 0 }, - { 4, 12, 0, 4 }, - { 4, 0, 12, 4 }, - { 0, 4, 4, 12 } }; // 2-D array[4][4] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
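The color algebra being removed here (and reinstated in the new color_sum.cc further below) is easier to digest in isolation. The following is a minimal self-contained sketch, not the generated code, of the same triangular color sum for the ncolor=4 matrix above, written for a single scalar event with plain double instead of the plugin's fptype2_sv vector types:

// Minimal sketch (assumption: scalar doubles, one event):
// |M|^2 += sum_{i,j} jamp_i^* cf[i][j] jamp_j / denom[i],
// folding the real symmetric cf into an upper-triangular pass.
#include <complex>
static constexpr int ncolor = 4;
static constexpr double denom[ncolor] = { 1, 1, 1, 1 };
static constexpr double cf[ncolor][ncolor] = {
  { 12, 4, 4, 0 }, { 4, 12, 0, 4 }, { 4, 0, 12, 4 }, { 0, 4, 4, 12 } };
double colorSum( const std::complex<double> jamp[ncolor] )
{
  double deltaME = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    // Diagonal term first, then doubled off-diagonal terms (cf is symmetric)
    double ztempR = cf[i][i] * jamp[i].real();
    double ztempI = cf[i][i] * jamp[i].imag();
    for( int j = i + 1; j < ncolor; j++ )
    {
      ztempR += 2 * cf[i][j] * jamp[j].real();
      ztempI += 2 * cf[i][j] * jamp[j].imag();
    }
    deltaME += ( jamp[i].real() * ztempR + jamp[i].imag() * ztempI ) / denom[i];
  }
  return deltaME;
}

The upper-triangular folding bakes the factor 2 and the 1/denom normalization into a compile-time matrix, which is exactly what the constexpr TriangularNormalizedColorMatrix above does.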
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv *
cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 5 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif -#endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** return; } @@ -605,7 +473,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -639,6 +511,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in
file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -680,6 +556,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -782,26 +662,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -809,25 +689,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) + __global__ void + add_and_select_hel( int* allselhel, const fptype* allrndhel, fptype* ghelAllMEs, fptype* allMEs, const int nevt ) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +
__global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -972,13 +1056,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -990,17 +1068,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1026,93 +1107,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for
the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators and denominators for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } - } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) Then, in multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR!
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? 
&( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1154,7 +1205,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1177,7 +1228,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1186,25 +1237,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1214,8 +1271,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1231,11 +1290,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1337,14 +1397,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h index b501a9772e..3956ab144a 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The 
MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include <vector> @@ -78,17 +79,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 32; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 5; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 4; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 8; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 5; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?!
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 5; //static const int ncomb = 32; // CPPProcess::ncomb @@ -125,23 +126,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -155,34 +159,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/color_sum.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/color_sum.cc new file mode 100644 index 0000000000..389d8f6535 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/color_sum.cc @@ -0,0 +1,385 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin.
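The new color_sum.cc factors the color algebra out of the generated CPPProcess.cc. The underlying operation is unchanged: one helicity's contribution to |M|^2 is the quadratic form jamp-dagger times (cf/denom) times jamp over the ncolor leading-color amplitudes. A minimal reference sketch of that computation, using the ncolor=4 color matrix and unit denominators hardcoded in this file (illustrative standalone code, not the plugin's API):

```cpp
// Sketch only: reference color sum for one event and one helicity,
// using the ncolor=4 color matrix and denominators from this patch.
#include <array>
#include <complex>

double colorSumRef( const std::array<std::complex<double>, 4>& jamp )
{
  static constexpr double denom[4] = { 1, 1, 1, 1 };
  static constexpr double cf[4][4] = { { 12, 4, 4, 0 },
                                       { 4, 12, 0, 4 },
                                       { 4, 0, 12, 4 },
                                       { 0, 4, 4, 12 } };
  double me2 = 0; // this helicity's contribution to |M|^2
  for( int i = 0; i < 4; i++ )
  {
    std::complex<double> ztemp = 0;
    for( int j = 0; j < 4; j++ ) ztemp += cf[i][j] * jamp[j];
    me2 += ( std::conj( jamp[i] ) * ztemp ).real() / denom[i];
  }
  return me2;
}
```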
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] + + // The color matrix (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 12, 4, 4, 0 }, + { 4, 12, 0, 4 }, + { 4, 0, 12, 4 }, + { 0, 4, 4, 12 } }; // 2-D array[4][4] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template<typename T> + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can expand the quadratic form (A-iB)(M)(A+iB) as AMA + iAMB - iBMA + BMB = AMA + BMB (the imaginary cross terms cancel because M is also symmetric, i.e. AMB = BMA) + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the upper triangle of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
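The comment block above compresses two separate facts worth spelling out: because the color matrix M is real and symmetric, the Hermitian form (A-iB)M(A+iB) reduces to AMA + BMB, and the symmetric double loop can then be folded onto the upper triangle with doubled off-diagonal coefficients. A self-contained numerical check of that fold (a sketch, assuming unit color denominators as in this process):

```cpp
// Sketch only: check that the folded upper-triangle evaluation used in
// color_sum_cpu matches the naive full-matrix quadratic form AMA + BMB.
#include <cassert>
#include <cmath>

int main()
{
  constexpr int n = 4;
  constexpr double denom[n] = { 1, 1, 1, 1 };
  constexpr double cf[n][n] = { { 12, 4, 4, 0 }, { 4, 12, 0, 4 }, { 4, 0, 12, 4 }, { 0, 4, 4, 12 } };
  const double A[n] = { 0.1, -0.2, 0.3, 0.4 };  // Re(jamp)
  const double B[n] = { -0.5, 0.6, -0.7, 0.8 }; // Im(jamp)
  double full = 0, folded = 0;
  // Naive evaluation: AMA + BMB over the full matrix
  for( int i = 0; i < n; i++ )
    for( int j = 0; j < n; j++ )
      full += ( A[i] * cf[i][j] * A[j] + B[i] * cf[i][j] * B[j] ) / denom[i];
  // Folded evaluation: upper triangle only, off-diagonal terms doubled
  for( int i = 0; i < n; i++ )
  {
    double ztR = cf[i][i] / denom[i] * A[i], ztI = cf[i][i] / denom[i] * B[i];
    for( int j = i + 1; j < n; j++ )
    {
      ztR += 2 * cf[i][j] / denom[i] * A[j];
      ztI += 2 * cf[i][j] / denom[i] * B[j];
    }
    folded += A[i] * ztR + B[i] * ztI;
  }
  assert( std::abs( full - folded ) < 1e-12 );
  return 0;
}
```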
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 
s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = 
allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e.
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/color_sum.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/diagram_boilerplate.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/diagrams.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/diagrams.h new file mode 100644 index 0000000000..186e3362ee --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/diagrams.h @@ -0,0 +1,169 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
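The kernels in this new diagrams.h split the former monolithic calculate_wavefunctions into one __global__ kernel per Feynman diagram: diagram1 also computes the external wavefunctions, while the later diagrams reuse the internal wavefunctions left in the shared global-memory wfs buffer by their predecessors. This works because launches on the same CUDA stream execute in order. A self-contained toy illustration of that ordering guarantee (dummy stand-in kernels, not the generated ones):

```cpp
// Sketch only: per-diagram kernels chained on one stream; in-order stream
// semantics guarantee diagram2 sees the wavefunctions written by diagram1.
#include <cuda_runtime.h>

__global__ void diagram1( float* wfs, float* jamps ) { wfs[threadIdx.x] = 1.f; }
__global__ void diagram2( float* wfs, float* jamps ) { jamps[threadIdx.x] += wfs[threadIdx.x]; }

int main()
{
  float *wfs, *jamps;
  cudaMalloc( (void**)&wfs, 32 * sizeof( float ) );
  cudaMalloc( (void**)&jamps, 32 * sizeof( float ) );
  cudaMemset( jamps, 0, 32 * sizeof( float ) );
  cudaStream_t stream;
  cudaStreamCreate( &stream );
  diagram1<<<1, 32, 0, stream>>>( wfs, jamps ); // computes the wavefunctions
  diagram2<<<1, 32, 0, stream>>>( wfs, jamps ); // reuses them: same stream, ordered
  cudaStreamSynchronize( stream );
  cudaStreamDestroy( stream );
  cudaFree( wfs );
  cudaFree( jamps );
  return 0;
}
```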
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb-1) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 5 *** + // Wavefunction(s) for diagram number 1 + vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); + ixxxxx( momenta, 0., cHel[ihel][1], +1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + oxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); + FFV1_2( w_fp[1], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[5] ); + FFV1P0_3( w_fp[3], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[6] ); + // Amplitude(s) for diagram number 1 + FFV1_0( w_fp[5], w_fp[4], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 6.
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 5 *** + // Wavefunction(s) for diagram number 2 + FFV1_1( w_fp[2], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[1], w_fp[5] ); + FFV1P0_3( w_fp[1], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[7] ); + // Amplitude(s) for diagram number 2 + FFV1_0( w_fp[3], w_fp[5], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 5 *** + // Wavefunction(s) for diagram number 3 + FFV1_2( w_fp[3], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[1], w_fp[5] ); + // Amplitude(s) for diagram number 3 + FFV1_0( w_fp[5], w_fp[2], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2.
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 5 *** + // Wavefunction(s) for diagram number 4 + FFV1_1( w_fp[4], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[5] ); + // Amplitude(s) for diagram number 4 + FFV1_0( w_fp[1], w_fp[5], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 5 *** + // Wavefunction(s) for diagram number 5 + // (none) + // Amplitude(s) for diagram number 5 + VVV1_0( w_fp[0], w_fp[7], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2.
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc index c1c42990a2..50d05d273c 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,20 +101,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 4; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,57 +169,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the 
current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - 
using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -229,328 +282,143 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 8; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g.
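In the mixed-precision build (FPTYPE=m: double precision amplitudes, single precision color algebra), nParity=2 and the loop below processes two neppV-wide double event pages whose jamps are merged by fpvmerge into a single float vector of twice the width, so that one float color-sum pass covers both pages. The idea in plain scalar form (a sketch assuming neppV=4, not the plugin's SIMD types):

```cpp
// Sketch only: the idea behind fpvmerge in the mixed-precision (FPTYPE=m) path.
// Two SIMD pages of doubles are narrowed into one float vector of twice the
// width, so a single float color-sum pass covers two double event pages.
#include <array>

std::array<float, 8> fpvmergeSketch( const std::array<double, 4>& page0,
                                     const std::array<double, 4>& page1 )
{
  std::array<float, 8> out; // narrowing may underflow/lose precision, cf. issue #831
  for( int i = 0; i < 4; i++ ) out[i] = static_cast<float>( page0[i] );
  for( int i = 0; i < 4; i++ ) out[4 + i] = static_cast<float>( page1[i] );
  return out;
}
```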
<<warning #186-D: pointless comparison of unsigned integer with zero>> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 5 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - ixxxxx( momenta, 0., cHel[ihel][4], -1, w_fp[4], 4 ); - - FFV1_2( w_fp[4], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[5] ); - FFV1P0_3( w_fp[3], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[6] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel
support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= 1. / 6. * amp_sv[0]; - jamp_sv[3] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 2 OF 5 *** - - // Wavefunction(s) for diagram number 2 - FFV1_1( w_fp[2], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[1], w_fp[5] ); - FFV1P0_3( w_fp[4], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[7] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[3], w_fp[5], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - jamp_sv[2] -= 1. / 6. * amp_sv[0]; - jamp_sv[3] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 3 OF 5 *** - - // Wavefunction(s) for diagram number 3 - FFV1_2( w_fp[3], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[1], w_fp[5] ); - // Amplitude(s) for diagram number 3 - FFV1_0( w_fp[5], w_fp[2], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - jamp_sv[0] += 1. / 2. * amp_sv[0]; - jamp_sv[2] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 4 OF 5 *** - - // Wavefunction(s) for diagram number 4 - FFV1_1( w_fp[1], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[5] ); - - // Amplitude(s) for diagram number 4 - FFV1_0( w_fp[4], w_fp[5], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[0] += 1. / 2. * amp_sv[0]; - jamp_sv[1] -= 1. / 6.
* amp_sv[0]; - - // *** DIAGRAM 5 OF 5 *** - // Wavefunction(s) for diagram number 5 - // (none) - - // Amplitude(s) for diagram number 5 - VVV1_0( w_fp[0], w_fp[7], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gux_ttxux()?) - - // The color denominators (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] - - // The color matrix (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 12, 4, 4, 0 }, - { 4, 12, 0, 4 }, - { 4, 0, 12, 4 }, - { 0, 4, 4, 12 } }; // 2-D array[4][4] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
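The in-kernel color algebra removed above is what color_sum_blas (in the new color_sum.cc earlier in this diff) re-expresses as BLAS calls on GPU: Step 1 is one GEMM per real/imaginary component, Step 2 a strided-batched GEMM of 1x1 dot products with beta=1 to accumulate into the running MEs. A plain-loop emulation of the two steps for one component, under the same "new1" jamp[icol*nevt+ievt] striding (sketch only, not the actual cuBLAS/hipBLAS calls):

```cpp
// Sketch only: what the gpuBlasTgemm / gpuBlasTgemmStridedBatched pair computes
// for one component (real or imaginary) of the jamps.
// jamp[icol*nevt + ievt]: input color amplitudes ("new1" striding)
// cnorm[i*ncolor + j]: normalized color matrix cf[i][j] / denom[i]
#include <vector>

void colorSumBlasEmulation( double* mes, const double* jamp, const double* cnorm,
                            int ncolor, int nevt )
{
  // Step 1 (GEMM): ztemp[i][ievt] = sum_j cnorm[i][j] * jamp[j][ievt]
  std::vector<double> ztemp( ncolor * nevt );
  for( int i = 0; i < ncolor; i++ )
    for( int ievt = 0; ievt < nevt; ievt++ )
    {
      double sum = 0;
      for( int j = 0; j < ncolor; j++ ) sum += cnorm[i * ncolor + j] * jamp[j * nevt + ievt];
      ztemp[i * nevt + ievt] = sum;
    }
  // Step 2 (batched 1x1 GEMMs with beta=1): mes[ievt] += sum_i jamp[i][ievt] * ztemp[i][ievt]
  for( int ievt = 0; ievt < nevt; ievt++ )
    for( int i = 0; i < ncolor; i++ )
      mes[ievt] += jamp[i * nevt + ievt] * ztemp[i * nevt + ievt];
}
```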
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv *
cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used even #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 5 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif -#endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -605,7 +473,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -639,6 +511,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in
file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -680,6 +556,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -782,26 +662,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings, bsmIndepParam ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -809,25 +689,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt < maxtry0) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt < maxtry0) + __global__ void + add_and_select_hel( int* allselhel, // output: helicity selection[nevt] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* allMEs, // output: allMEs[nevt], |M|^2 running sum over helicities + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +
__global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -972,13 +1056,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -990,17 +1068,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1026,93 +1107,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for
the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes (jamps) for each helicity + // In multichannel mode, also compute the per-helicity numerators and denominators for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) Then, in multichannel mode, also compute the running sums over helicities of the squared jamps (jamp2s) within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR!
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ?
&( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1154,7 +1205,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1177,7 +1228,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1186,25 +1237,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1214,8 +1271,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1231,11 +1290,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1337,14 +1397,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h index d658e0394e..d3f0d16633 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The 
MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -78,17 +79,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 32; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 5; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 4; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 8; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 5; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 5; //static const int ncomb = 32; // CPPProcess::ncomb @@ -125,23 +126,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array (GPU implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -155,34 +159,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/color_sum.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/color_sum.cc new file mode 100644 index 0000000000..389d8f6535 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/color_sum.cc @@ -0,0 +1,385 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
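+// This file collects the color-sum implementations used by sigmaKin: a SIMD C++ version (color_sum_cpu), a plain GPU kernel (color_sum_kernel) and a cuBLAS/hipBLAS version (color_sum_blas), with color_sum_gpu dispatching between the two GPU paths at runtime. +// For one helicity, each implementation adds to the running |M|^2 the color sum deltaME = sum_{icol,jcol} ( Re(jamp[icol]) * M[icol][jcol] * Re(jamp[jcol]) + Im(jamp[icol]) * M[icol][jcol] * Im(jamp[jcol]) ), where M[icol][jcol] = colorMatrix[icol][jcol] / colorDenom[icol] is the normalized color matrix defined below.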
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] + + // The color matrix (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 12, 4, 4, 0 }, + { 4, 12, 0, 4 }, + { 4, 0, 12, 4 }, + { 0, 4, 4, 12 } }; // 2-D array[4][4] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 
s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = 
allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e.
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/color_sum.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/diagram_boilerplate.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/diagrams.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/diagrams.h new file mode 100644 index 0000000000..cf3655728d --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/diagrams.h @@ -0,0 +1,169 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
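+// This file defines one kernel per Feynman diagram of this process (diagram1 to diagram5), all sharing a uniform signature (wfs, jamps, channelIds, couplings or COUPs, numerators, denominators); diagram1 additionally takes the momenta and the helicity ihel, because it also computes the external-particle wavefunctions that the subsequent diagram kernels reuse through the shared wfs buffer.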
+
+/* clang-format off */
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ const fptype* momenta, // input: momenta[npar*4*nevtORneppV]
+ const int ihel ) // input: helicity (0 to ncomb)
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+#ifdef MGONGPUCPP_GPUIMPL
+ using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events
+#else
+ using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events
+#endif
+ // *** DIAGRAM 1 OF 5 ***
+ // Wavefunction(s) for diagram number 1
+ vxxxxx<M_ACCESS, W_ACCESS>( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 );
+ oxxxxx<M_ACCESS, W_ACCESS>( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 );
+ oxxxxx<M_ACCESS, W_ACCESS>( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 );
+ ixxxxx<M_ACCESS, W_ACCESS>( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 );
+ ixxxxx<M_ACCESS, W_ACCESS>( momenta, 0., cHel[ihel][4], -1, w_fp[4], 4 );
+ FFV1_2<W_ACCESS, CD_ACCESS>( w_fp[4], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[5] );
+ FFV1P0_3<W_ACCESS, CD_ACCESS>( w_fp[3], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[6] );
+ // Amplitude(s) for diagram number 1
+ FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 6. * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2.
* amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 2 OF 5 ***
+ // Wavefunction(s) for diagram number 2
+ FFV1_1<W_ACCESS, CD_ACCESS>( w_fp[2], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[1], w_fp[5] );
+ FFV1P0_3<W_ACCESS, CD_ACCESS>( w_fp[4], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[7] );
+ // Amplitude(s) for diagram number 2
+ FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[3], w_fp[5], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 6. * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 3 OF 5 ***
+ // Wavefunction(s) for diagram number 3
+ FFV1_2<W_ACCESS, CD_ACCESS>( w_fp[3], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[1], w_fp[5] );
+ // Amplitude(s) for diagram number 3
+ FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[5], w_fp[2], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 6.
* amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 4 OF 5 ***
+ // Wavefunction(s) for diagram number 4
+ FFV1_1<W_ACCESS, CD_ACCESS>( w_fp[1], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[5] );
+ // Amplitude(s) for diagram number 4
+ FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[4], w_fp[5], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 6. * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 5 OF 5 ***
+ // Wavefunction(s) for diagram number 5
+ // (none)
+ // Amplitude(s) for diagram number 5
+ VVV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[0], w_fp[7], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2.
* cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+/* clang-format on */
diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/color_sum.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/color_sum.h
new file mode 100644
index 0000000000..71b5b1eaad
--- /dev/null
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/color_sum.h
@@ -0,0 +1,105 @@
+// Copyright (C) 2020-2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin.
+
+#ifndef COLOR_SUM_H
+#define COLOR_SUM_H 1
+
+#include "mgOnGpuConfig.h"
+
+#include "mgOnGpuVectors.h"
+
+#include "CPPProcess.h"
+#include "GpuAbstraction.h"
+
+#ifdef MGONGPUCPP_GPUIMPL
+namespace mg5amcGpu
+#else
+namespace mg5amcCpu
+#endif
+{
+ //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+ class DeviceAccessJamp
+ {
+ public:
+ static __device__ inline cxtype_ref
+ kernelAccessIcol( fptype* buffer, const int icol )
+ {
+ const int ncolor = CPPProcess::ncolor; // the number of leading colors
+ const int nevt = gridDim.x * blockDim.x;
+ const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+ // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last)
+ //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old"
+ // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last)
+ // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS
+ return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1"
+ // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last)
+ //return cxtype_ref( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2"
+ }
+ static __device__ inline const cxtype
+ kernelAccessIcolConst( const fptype* buffer, const int icol )
+ {
+ const int ncolor = CPPProcess::ncolor; // the number of leading colors
+ const int nevt = gridDim.x * blockDim.x;
+ const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+ // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last)
+ //return cxtype( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old"
+ // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last)
+ // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS
+ return cxtype( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1"
+ // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last)
+ //return cxtype( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2"
+ }
+ };
+#else
+ class HostAccessJamp
+ {
+ public:
+ static inline cxtype_sv&
+ kernelAccessIcol( cxtype_sv* buffer, const int icol )
+ {
+ return buffer[icol];
+ }
+ static inline cxtype_sv&
+ kernelAccessIcol( fptype* buffer, const int icol )
+ {
+ return reinterpret_cast<cxtype_sv*>( buffer )[icol];
+ }
+ };
+#endif
+
+ 
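The commented-out alternatives in DeviceAccessJamp above document three candidate memory layouts for the jamp buffer ("old", "new1", "new2"). A small standalone C++ sketch (toy ncolor/nevt values, not the plugin's constants) makes the flat-index arithmetic of each layout explicit:

#include <cstdio>

int main()
{
  const int ncolor = 4, nevt = 8; // toy sizes, not the plugin's
  const int icol = 2, ievt = 5;
  // "old": ncolor blocks of 2*nevt, real then imag inside each color block
  const int oldRe = icol * 2 * nevt + ievt;
  const int oldIm = icol * 2 * nevt + nevt + ievt;
  // "new1": real plane then imag plane, each an ncolor x nevt matrix (ievt fastest)
  // - the cuBLAS-friendly layout: each plane is one contiguous matrix
  const int new1Re = 0 * ncolor * nevt + icol * nevt + ievt;
  const int new1Im = 1 * ncolor * nevt + icol * nevt + ievt;
  // "new2": real plane then imag plane, each an nevt x ncolor matrix (icol fastest)
  const int new2Re = 0 * nevt * ncolor + ievt * ncolor + icol;
  const int new2Im = 1 * nevt * ncolor + ievt * ncolor + icol;
  printf( "old=(%d,%d) new1=(%d,%d) new2=(%d,%d)\n", oldRe, oldIm, new1Re, new1Im, new2Re, new2Im );
  return 0;
}

All three layouts store the same ncolor*2*nevt values; "new1" keeps ievt fastest within each plane, which preserves coalesced per-thread access in the CUDA kernels while also exposing each plane as a single dense matrix to cuBLAS.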
//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
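The color_sum_gpu declaration above sizes the scratch buffer as blasTmp[ncolor*2*nevt] in uniform precision and blasTmp[(2*ncolor*2+1)*nevt] in mixed precision. A short standalone sketch (toy sizes, hypothetical names) spells out where that arithmetic comes from, mirroring the pointer offsets used in the BLAS implementation:

#include <cstdio>

int main()
{
  // Toy sizes (illustrative, not the plugin's): nx2=2 for real+imag parts
  const int ncolor = 4, nevt = 16, nx2 = 2;
  // Uniform-precision mode: blasTmp only needs the Ztemp work area
  const int plainSize = ncolor * nx2 * nevt;
  // Mixed-precision mode: Ztemp + a float copy of the jamps + float MEs
  const int mixedSize = ( 2 * ncolor * nx2 + 1 ) * nevt;
  // Offsets mirroring the pointer arithmetic in the mixed-precision branch
  const int offZtemp = 0;                         // first ncolor*2*nevt values
  const int offJampsFpt2 = ncolor * nx2 * nevt;   // second ncolor*2*nevt values
  const int offMEsFpt2 = 2 * ncolor * nx2 * nevt; // last nevt values
  printf( "plain=%d mixed=%d offsets: ztemp=%d jamps=%d mes=%d\n",
          plainSize, mixedSize, offZtemp, offJampsFpt2, offMEsFpt2 );
  return 0;
}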
#=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html)
#=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts
@@ -114,7 +114,7 @@ export CXXFLAGS
 override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null))
 # Set HIP_HOME from the path to hipcc, if it exists
-override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null))
+override HIP_HOME = $(shell hipconfig --rocmpath)
 # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965)
 ifeq ($(CUDA_HOME),)
@@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda)
 else ifeq ($(BACKEND),hip)
+ # example architecture values MI200:gfx90a, MI300X:gfx942
+ MADGRAPH_HIP_ARCHITECTURE ?= gfx942
 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists)
 GPUCC = $(HIP_HOME)/bin/hipcc
 XCOMPILERFLAG =
@@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip)
 ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY
 # AMD HIP architecture flags
- GPUARCHFLAGS = --offload-arch=gfx90a
+ GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE}
 GPUFLAGS += $(GPUARCHFLAGS)
 # Other AMD-specific flags
@@ -477,6 +479,34 @@ endif
 #-------------------------------------------------------------------------------
+#=== Configure defaults and check if user-defined choices exist for HASBLAS
+
+# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS
+
+ifeq ($(HASBLAS),)
+ ifeq ($(GPUCC),) # CPU-only build
+ override HASBLAS = hasNoBlas
+ else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+ ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),)
+ # cuBLAS headers do not exist??
+ override HASBLAS = hasNoBlas
+ else
+ override HASBLAS = hasBlas
+ endif
+ else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build
+ ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),)
+ # hipBLAS headers do not exist??
+ override HASBLAS = hasNoBlas
+ else
+ override HASBLAS = hasBlas
+ endif
+ else
+ override HASBLAS = hasNoBlas
+ endif
+endif
+
+#-------------------------------------------------------------------------------
+
 #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD
 # Set the build flags appropriate to OMPFLAGS
@@ -597,6 +627,30 @@ endif
 #$(info RNDCXXFLAGS=$(RNDCXXFLAGS))
 #$(info RNDLIBFLAGS=$(RNDLIBFLAGS))
+#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS
+
+$(info HASBLAS=$(HASBLAS))
+override BLASCXXFLAGS=
+override BLASLIBFLAGS=
+
+# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas")
+ifeq ($(HASBLAS),hasNoBlas)
+ override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS
+else ifeq ($(HASBLAS),hasBlas)
+ ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+ override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas
+ else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build
+ override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas
+ endif
+else
+ $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported)
+endif
+CXXFLAGS += $(BLASCXXFLAGS)
+GPUFLAGS += $(BLASCXXFLAGS)
+
+#$(info BLASCXXFLAGS=$(BLASCXXFLAGS))
+#$(info BLASLIBFLAGS=$(BLASLIBFLAGS))
+
 #-------------------------------------------------------------------------------
 #=== Configure Position-Independent Code
@@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 ifneq ($(GPUCC),)
 MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX)
-gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o
+gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o
 gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o
 endif
@@ -799,7 +853,7 @@ ifneq ($(GPUCC),)
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib)
- $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB)
+ $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS)
 # Bypass std::filesystem completely to ease portability on LUMI #803
 #ifneq ($(findstring hipcc,$(GPUCC)),)
 # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs
@@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/diagram_boilerplate.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/diagram_boilerplate.h new file mode 100644 index 0000000000..96a34fb1bf --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/diagram_boilerplate.h @@ -0,0 +1,103 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin. 
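The HASBLAS makefile machinery above either links -lcublas/-lhipblas or defines -DMGONGPU_HAS_NO_BLAS. A minimal standalone C++ sketch (illustrative names only, not the plugin's types) of how such a macro typically gates a BLAS code path at compile time while keeping the plain-kernel fallback available at runtime:

#include <cstdio>

//#define MGONGPU_HAS_NO_BLAS 1 // emulate a HASBLAS=hasNoBlas build by uncommenting

#ifndef MGONGPU_HAS_NO_BLAS
struct BlasHandle {}; // stand-in for cublasHandle_t/hipblasHandle_t
static void colorSumBlas( BlasHandle* ) { printf( "BLAS color sum\n" ); }
#endif

static void colorSum( void* pBlasHandle )
{
#ifndef MGONGPU_HAS_NO_BLAS
  if( pBlasHandle != nullptr )
  {
    colorSumBlas( static_cast<BlasHandle*>( pBlasHandle ) );
    return;
  }
#else
  (void)pBlasHandle; // BLAS support compiled out: only the kernel path exists
#endif
  printf( "kernel color sum\n" ); // fallback path, always available
}

int main()
{
#ifndef MGONGPU_HAS_NO_BLAS
  BlasHandle handle;
  colorSum( &handle ); // BLAS path
#endif
  colorSum( nullptr ); // kernel path
  return 0;
}

This mirrors the null-handle dispatch in color_sum_gpu earlier in the patch: a null handle selects the kernel reduction, a non-null handle selects the BLAS implementation.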
+ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-local-typedefs" // for CI_ACCESS and CD_ACCESS + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + + //------------- + // GPU only + //------------- + + //using namespace mg5amcGpu; + using W_ACCESS = DeviceAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = DeviceAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( channelIds ); +#endif + + // Wavefunctions + // Buffer wfs for one helicity and nevt events is a DeviceBufferSimple with ( nwf * nevt * nw6 * nx2 ) fptypes + // The striding between the nwf wavefunction buffers is ( nevt * nw6 * nx2 ) fptypes + // Internally diagramXXX methods pass a w_fp[iwf] to ixx/FFV methods (as argument 'fptype wavefunctions[]') + // Internally ixx/FFV methods call 'cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions )' and then use fi[iw6] + // This means that the fi pointer must point to a [RIRIRIRIRIRI] contiguous buffer of size nw6*nx2=12 + // The striding between events is nw6*nx2=12 and this is what W_ACCESS::kernelAccess must respect + // (En passant, note that this means that events cannot be contiguous in the present code, memory is not coalesced) + const int nevt = gridDim.x * blockDim.x; + fptype* w_fp[nwf]; + for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = wfs + iwf * nevt * nw6 * mgOnGpu::nx2; + + // Couplings + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic push +#pragma nv_diag_suppress 186 // e.g. 
<>
+#endif
+ // Dependent couplings, vary event-by-event
+ for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
+ allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup );
+ // Independent couplings, fixed for all events
+ for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup)
+ allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup );
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
+#pragma nv_diagnostic pop
+#endif
+ const fptype* COUPs[nxcoup];
+ for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
+
+#else
+
+ //-------------
+ // C++ only
+ //-------------
+
+ //using namespace mg5amcCpu;
+ using W_ACCESS = HostAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events
+ using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event
+ using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events
+ using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event
+ using J_ACCESS = HostAccessJamp;
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events
+ using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events
+ // SCALAR channelId for the current SIMD event page (C++)
+ unsigned int channelId = *channelIds;
+#endif
+
+ // Wavefunctions
+ // Reinterpret wfs as "cxtype_sv w_sv[nwf][nw6]" and build "fptype* w_fp[nwf]" where "w_fp[iwf] = (fptype*)( w_sv[iwf] )"
+ fptype (*w_fp)[nw6 * neppV * mgOnGpu::nx2] = (fptype (*)[nw6 * neppV * mgOnGpu::nx2])(wfs);
+
+#endif
+
+ //-------------
+ // GPU or C++
+ //-------------
+
+ // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV)
+ cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram
+ fptype* amp_fp; // proof of concept for using fptype* in the interface
+ amp_fp = reinterpret_cast<fptype*>( amp_sv );
+
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
+ fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+ fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
+#else
+ assert( channelIds == nullptr );
+ assert( numerators == nullptr );
+ assert( denominators == nullptr );
+#endif /* clang-format on */
+
+#pragma GCC diagnostic pop
diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/runTest.cc
index 4eec5db13c..216a90a302 100644
--- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/runTest.cc
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/runTest.cc
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.
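The boilerplate above documents the GPU wavefunction buffer layout: nwf slabs of nevt*nw6*nx2 fptypes, with each event owning one contiguous [RIRIRIRIRIRI] run of nw6*nx2 = 12 values inside its slab. A standalone C++ sketch (toy sizes, not the real nwf of this process) of the same carving arithmetic:

#include <cstdio>
#include <vector>

int main()
{
  const int nwf = 3, nw6 = 6, nx2 = 2, nevt = 4; // toy sizes, not the process's
  std::vector<double> wfs( nwf * nevt * nw6 * nx2 );
  // Carve the flat buffer into nwf slabs, one per wavefunction (stride nevt*nw6*nx2)
  double* w_fp[nwf];
  for( int i = 0; i < nwf; i++ ) w_fp[i] = wfs.data() + i * nevt * nw6 * nx2;
  // Within a slab, event ievt owns one contiguous run of nw6*nx2 = 12 values
  const int iwf = 1, ievt = 2;
  const double* fi = w_fp[iwf] + ievt * nw6 * nx2;
  printf( "flat offset of (iwf=%d,ievt=%d) = %ld\n", iwf, ievt, (long)( fi - wfs.data() ) );
  return 0;
}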
//----------------------------------------------------------------------------
 // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests
 //----------------------------------------------------------------------------
@@ -22,6 +22,8 @@
 #endif
 #include "epoch_process_id.h"
+#include
+
 #ifdef MGONGPUCPP_GPUIMPL
 using namespace mg5amcGpu;
 #else
diff --git a/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h b/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h
index a304fc85c8..616eab36fd 100644
--- a/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h
+++ b/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h
@@ -2,13 +2,13 @@
 // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors.
 // Created by: J. Alwall (Sep 2010) for the MG5aMC backend.
 //==========================================================================
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
@@ -860,7 +860,7 @@ namespace mg5amcCpu
 //==========================================================================
 // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6]
- template<class W_ACCESS, class A_ACCESS, class C_ACCESS>
+ template<class W_ACCESS, class A_ACCESS, class CD_ACCESS>
 __device__ INLINE void
 FFV1_0( const fptype allF1[],
 const fptype allF2[],
@@ -872,7 +872,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
 // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6]
- template<class W_ACCESS, class C_ACCESS>
+ template<class W_ACCESS, class CD_ACCESS>
 __device__ INLINE void
 FFV1_1( const fptype allF2[],
 const fptype allV3[],
@@ -885,7 +885,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
 // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6]
- template<class W_ACCESS, class C_ACCESS>
+ template<class W_ACCESS, class CD_ACCESS>
 __device__ INLINE void
 FFV1_2( const fptype allF1[],
 const fptype allV3[],
@@ -898,7 +898,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
 // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6]
- template<class W_ACCESS, class C_ACCESS>
+ template<class W_ACCESS, class CD_ACCESS>
 __device__ INLINE void
 FFV1P0_3( const fptype allF1[],
 const fptype allF2[],
@@ -911,7 +911,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
 // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6]
- template<class W_ACCESS, class A_ACCESS, class C_ACCESS>
+ template<class W_ACCESS, class A_ACCESS, class CD_ACCESS>
 __device__ INLINE void
 VVV1_0( const fptype allV1[],
 const fptype allV2[],
@@ -923,7 +923,7 @@ namespace mg5amcCpu
 //==========================================================================
 // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6]
- template<class W_ACCESS, class A_ACCESS, class C_ACCESS>
+ template<class W_ACCESS, class A_ACCESS, class CD_ACCESS>
 __device__ void
 FFV1_0( const fptype allF1[],
 const fptype allF2[],
@@ -936,7 +936,7 @@ namespace mg5amcCpu
 const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1
);
 const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 );
 const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 );
- const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP );
+ const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP );
 cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes );
 const cxtype cI = cxmake( 0., 1. );
 const cxtype_sv TMP0 = ( F1[2] * ( F2[4] * ( V3[2] + V3[5] ) + F2[5] * ( V3[3] + cI * V3[4] ) ) + ( F1[3] * ( F2[4] * ( V3[3] - cI * V3[4] ) + F2[5] * ( V3[2] - V3[5] ) ) + ( F1[4] * ( F2[2] * ( V3[2] - V3[5] ) - F2[3] * ( V3[3] + cI * V3[4] ) ) + F1[5] * ( F2[2] * ( -V3[3] + cI * V3[4] ) + F2[3] * ( V3[2] + V3[5] ) ) ) ) );
@@ -948,7 +948,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
 // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6]
- template<class W_ACCESS, class C_ACCESS>
+ template<class W_ACCESS, class CD_ACCESS>
 __device__ void
 FFV1_1( const fptype allF2[],
 const fptype allV3[],
@@ -961,7 +961,7 @@ namespace mg5amcCpu
 mgDebug( 0, __FUNCTION__ );
 const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 );
 const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 );
- const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP );
+ const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP );
 cxtype_sv* F1 = W_ACCESS::kernelAccess( allF1 );
 const cxtype cI = cxmake( 0., 1. );
 F1[0] = +F2[0] + V3[0];
@@ -980,7 +980,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
 // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6]
- template<class W_ACCESS, class C_ACCESS>
+ template<class W_ACCESS, class CD_ACCESS>
 __device__ void
 FFV1_2( const fptype allF1[],
 const fptype allV3[],
@@ -993,7 +993,7 @@ namespace mg5amcCpu
 mgDebug( 0, __FUNCTION__ );
 const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 );
 const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 );
- const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP );
+ const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP );
 cxtype_sv* F2 = W_ACCESS::kernelAccess( allF2 );
 const cxtype cI = cxmake( 0., 1. );
 F2[0] = +F1[0] + V3[0];
@@ -1012,7 +1012,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
 // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6]
- template<class W_ACCESS, class C_ACCESS>
+ template<class W_ACCESS, class CD_ACCESS>
 __device__ void
 FFV1P0_3( const fptype allF1[],
 const fptype allF2[],
@@ -1025,7 +1025,7 @@ namespace mg5amcCpu
 mgDebug( 0, __FUNCTION__ );
 const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 );
 const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 );
- const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP );
+ const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP );
 cxtype_sv* V3 = W_ACCESS::kernelAccess( allV3 );
 const cxtype cI = cxmake( 0., 1.
);
 V3[0] = +F1[0] + F2[0];
@@ -1043,7 +1043,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
 // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6]
- template<class W_ACCESS, class A_ACCESS, class C_ACCESS>
+ template<class W_ACCESS, class A_ACCESS, class CD_ACCESS>
 __device__ void
 VVV1_0( const fptype allV1[],
 const fptype allV2[],
@@ -1056,7 +1056,7 @@ namespace mg5amcCpu
 const cxtype_sv* V1 = W_ACCESS::kernelAccessConst( allV1 );
 const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 );
 const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 );
- const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP );
+ const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP );
 cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes );
 const cxtype cI = cxmake( 0., 1. );
 const fptype_sv P1[4] = { +cxreal( V1[0] ), +cxreal( V1[1] ), +cximag( V1[1] ), +cximag( V1[0] ) };
diff --git a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc
index 998cb505a0..1dfc54c553 100644
--- a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc
+++ b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc
@@ -1,13 +1,13 @@
 // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors.
 // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend.
 //==========================================================================
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h
index 1565ed5888..a4ace231d8 100644
--- a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h
+++ b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h
@@ -1,13 +1,13 @@
 // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors.
 // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend.
 //==========================================================================
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+// MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
@@ -300,7 +300,7 @@ namespace mg5amcCpu
 #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <>
 #endif
 // Compute the output couplings (e.g. gc10 and gc11) from the input gs
- template<class G_ACCESS, class C_ACCESS>
+ template<class G_ACCESS, class CD_ACCESS>
 __device__ inline void
 G2COUP( const fptype gs[],
 fptype couplings[],
@@ -310,10 +310,10 @@ namespace mg5amcCpu
 using namespace Parameters_sm_dependentCouplings;
 const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs );
 DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr );
- fptype* GC_11s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 );
- fptype* GC_10s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 );
- cxtype_sv_ref GC_11s_sv = C_ACCESS::kernelAccess( GC_11s );
- cxtype_sv_ref GC_10s_sv = C_ACCESS::kernelAccess( GC_10s );
+ fptype* GC_11s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 );
+ fptype* GC_10s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 );
+ cxtype_sv_ref GC_11s_sv = CD_ACCESS::kernelAccess( GC_11s );
+ cxtype_sv_ref GC_10s_sv = CD_ACCESS::kernelAccess( GC_10s );
 GC_11s_sv = couplings_sv.GC_11;
 GC_10s_sv = couplings_sv.GC_10;
 mgDebug( 1, __FUNCTION__ );
diff --git a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h
index d3c4ca5695..8a2804aa04 100644
--- a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.
 #ifndef MGONGPUCONFIG_H
 #define MGONGPUCONFIG_H 1
@@ -74,6 +74,7 @@
 #define MGONGPU_FPTYPE2_DOUBLE 1 // default
 //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster
 #endif
+
 // Choose whether to inline all HelAmps functions
 // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229)
 // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS
@@ -108,10 +109,23 @@
 #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float)
 #endif
+// Choose if cuBLAS and hipBLAS are supported for computing color sums
+// For both CUDA and HIP, by default, assume that cuBLAS/hipBLAS are available, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS
+// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?)
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuCxtypes.h index 92d74fd6db..e98e925f2a 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -717,12 +717,24 @@ namespace mg5amcCpu : m_preal( &r ), m_pimag( &i ) {} // copy (create from) const refs cxtype_ref& operator=( const cxtype_ref& ) = delete; //__host__ __device__ cxtype_ref& operator=( cxtype_ref&& c ) {...} // REMOVED! Should copy refs or copy values? 
No longer needed in cxternary - __host__ __device__ cxtype_ref& operator=( const cxtype& c ) + __host__ __device__ cxtype_ref& operator=( const cxtype& c ) // copy (assign) const values { *m_preal = cxreal( c ); *m_pimag = cximag( c ); return *this; - } // copy (assign) non-const values + } + __host__ __device__ cxtype_ref& operator+=( const cxtype& c ) + { + *m_preal += cxreal( c ); + *m_pimag += cximag( c ); + return *this; + } + __host__ __device__ cxtype_ref& operator-=( const cxtype& c ) + { + *m_preal -= cxreal( c ); + *m_pimag -= cximag( c ); + return *this; + } __host__ __device__ operator cxtype() const { return cxmake( *m_preal, *m_pimag ); } private: fptype* const m_preal; // const pointer to non-const fptype R diff --git a/epochX/cudacpp/gq_ttq.sa/test/cudacpp_test.mk b/epochX/cudacpp/gq_ttq.sa/test/cudacpp_test.mk index f703a1ae7c..1f9f8bbc46 100644 --- a/epochX/cudacpp/gq_ttq.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/gq_ttq.sa/test/cudacpp_test.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) @@ -19,7 +19,7 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt b/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt index c46ef95a65..56d680a7fe 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt +++ b/epochX/cudacpp/heft_gg_bb.mad/CODEGEN_mad_heft_gg_bb_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.3 2025-06-12 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -49,14 +49,14 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F set auto_convert_model T save options auto_convert_model -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model heft INFO: Restrict model heft with file models/heft/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  @@ -128,16 +128,16 @@ Total: 1 processes with 4 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_heft_gg_bb --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4334]  +DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4166]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  INFO: initialize a new directory: CODEGEN_mad_heft_gg_bb INFO: remove old information in CODEGEN_mad_heft_gg_bb -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Processing color information for process: g g > b b~ HIG<=1 HIW<=1 @1 @@ -149,59 +149,59 @@ FileWriter b b~ HIG<=1 HIW<=1 @1 INFO: Finding symmetric diagrams for subprocess group gg_bbx -DEBUG: len(subproc_diagrams_for_config) =  4 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  4 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4} [model_handling.py at line 1665]  Generated helas calls for 1 subprocesses (4 diagrams) in 0.009 s -Wrote files for 12 helas calls in 0.076 s +Wrote files for 12 helas calls in 0.077 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFS2 routines -ALOHA: aloha creates 4 routines in 0.266 s +ALOHA: aloha creates 4 routines in 0.257 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFS2 routines -ALOHA: aloha creates 8 routines in 0.250 s +ALOHA: aloha creates 8 routines in 0.240 s VVS3 VVV1 FFV1 FFV1 FFV1 FFS2 -FileWriter for 
/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./HelAmps_heft.h -INFO: Created file HelAmps_heft.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./HelAmps_heft.h +INFO: Created file HelAmps_heft.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./Parameters_heft.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./Parameters_heft.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./Parameters_heft.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/./Parameters_heft.cc INFO: Created files Parameters_heft.h and Parameters_heft.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb; patch -p4 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file SubProcesses/makefile -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/SubProcesses/P1_gg_bbx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/SubProcesses/P1_gg_bbx; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f -DEBUG: p.returncode =  0 [output.py at line 263]  -Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb done. +DEBUG: p.returncode =  0 [output.py at line 268]  +Output to directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb done. 
Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/README +/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/README Run "open index.html" to see more information about this process. quit -real 0m2.141s -user 0m1.860s -sys 0m0.270s +real 0m2.155s +user 0m1.886s +sys 0m0.264s Code generation completed in 2 seconds ************************************************************ * * @@ -215,7 +215,7 @@ Code generation completed in 2 seconds * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.3 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -223,9 +223,9 @@ Code generation completed in 2 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt @@ -245,7 +245,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.3 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -253,9 +253,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_heft_gg_bb/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt diff --git a/epochX/cudacpp/heft_gg_bb.mad/Cards/me5_configuration.txt b/epochX/cudacpp/heft_gg_bb.mad/Cards/me5_configuration.txt index 68b4c46295..07d8d59d1b 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/heft_gg_bb.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/heft_gg_bb.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/heft_gg_bb.mad/Cards/proc_card_mg5.dat index 92581deeee..8b5e2d66c2 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/heft_gg_bb.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.6.0 2024-09-30 * +#* VERSION 3.6.3 2025-06-12 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/heft_gg_bb.mad/Cards/run_card.dat b/epochX/cudacpp/heft_gg_bb.mad/Cards/run_card.dat index 8af20dc4e4..3802880982 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Cards/run_card.dat +++ b/epochX/cudacpp/heft_gg_bb.mad/Cards/run_card.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/heft_gg_bb.mad/Cards/run_card_default.dat b/epochX/cudacpp/heft_gg_bb.mad/Cards/run_card_default.dat index 0815703ee4..6917ce597f 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/heft_gg_bb.mad/Cards/run_card_default.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! 
maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/heft_gg_bb.mad/MGMEVersion.txt b/epochX/cudacpp/heft_gg_bb.mad/MGMEVersion.txt index 084e244cea..1ac53bb4bd 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/MGMEVersion.txt +++ b/epochX/cudacpp/heft_gg_bb.mad/MGMEVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.3 \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_bb.mad/Source/.make_opts b/epochX/cudacpp/heft_gg_bb.mad/Source/.make_opts index de3864242b..56ba259c56 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Source/.make_opts +++ b/epochX/cudacpp/heft_gg_bb.mad/Source/.make_opts @@ -102,6 +102,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf + alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -113,10 +114,11 @@ ifneq ($(lhapdf),) endif else alfas_functions=alfas_functions + alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif # Helper function to check MG5 version define CHECK_MG5AMC_VERSION python -c 'import re; from distutils.version import StrictVersion; print StrictVersion("$(MG5AMC_VERSION)") >= StrictVersion("$(1)") if re.match("^[\d\.]+$$","$(MG5AMC_VERSION)") else True;' -endef \ No newline at end of file +endef diff --git a/epochX/cudacpp/heft_gg_bb.mad/Source/alfas_functions.f b/epochX/cudacpp/heft_gg_bb.mad/Source/alfas_functions.f index bb69a6384e..84aeff369c 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Source/alfas_functions.f +++ b/epochX/cudacpp/heft_gg_bb.mad/Source/alfas_functions.f @@ -188,6 +188,10 @@ SUBROUTINE NEWTON1(T,A_IN,A_OUT,NLOOP,NF) A_OUT=A_IN/(1D0+A_IN*B0(NF)*T) IF (NLOOP .EQ. 1) RETURN + if (1D0+A_IN*B0(NF)*T.le.0d0)THEN + A_OUT = 9d98 + RETURN + ENDIF A_OUT=A_IN/(1D0+B0(NF)*A_IN*T+C1(NF)*A_IN*LOG(1D0+A_IN*B0(NF)*T)) IF (A_OUT .LT. 
0D0) AS=0.3D0 30 AS=A_OUT diff --git a/epochX/cudacpp/heft_gg_bb.mad/Source/cuts.inc b/epochX/cudacpp/heft_gg_bb.mad/Source/cuts.inc index 23d099e5f7..a8ccc7420d 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Source/cuts.inc +++ b/epochX/cudacpp/heft_gg_bb.mad/Source/cuts.inc @@ -37,7 +37,7 @@ C REAL*8 misset,missetmax,ptheavy REAL*8 ptllmin,ptllmax integer maxjetflavor - REAl*8 dsqrt_shat + REAl*8 dsqrt_shat,dsqrt_shatmax COMMON /to_min_max_cuts/ & PTJmax,PTBmax,PTAmax,PTLmax, @@ -60,7 +60,7 @@ C & ht2max,ht3max,ht4max, & htjmin,htjmax,ihtmin,ihtmax, & misset,missetmax,ptheavy, - & ptllmin,ptllmax,dsqrt_shat, + & ptllmin,ptllmax,dsqrt_shat,dsqrt_shatmax, & maxjetflavor C diff --git a/epochX/cudacpp/heft_gg_bb.mad/Source/make_opts b/epochX/cudacpp/heft_gg_bb.mad/Source/make_opts index e4b87ee6ad..f10336e42e 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Source/make_opts +++ b/epochX/cudacpp/heft_gg_bb.mad/Source/make_opts @@ -103,6 +103,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf +alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -114,6 +115,7 @@ endif endif else alfas_functions=alfas_functions +alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif diff --git a/epochX/cudacpp/heft_gg_bb.mad/Source/makefile b/epochX/cudacpp/heft_gg_bb.mad/Source/makefile index 291ca907ee..87a9e61723 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Source/makefile +++ b/epochX/cudacpp/heft_gg_bb.mad/Source/makefile @@ -37,10 +37,12 @@ all: $(LIBRARIES) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDI $(LIBDIR)libdsample.$(libext): $(DSAMPLE) $(call CREATELIB, $@, $^) $(LIBDIR)libgeneric.$(libext): $(GENERIC) + rm -f $@ 2>/dev/null $(call CREATELIB, $@, $^) + rm -f $(alfas_to_clean) 2>/dev/null $(LIBDIR)libdhelas.$(libext): DHELAS cd DHELAS; make; cd .. -$(LIBDIR)libpdf.$(libext): PDF make_opts +$(LIBDIR)libpdf.$(libext): PDF $(alfas_functions).o cd PDF; make; cd .. ifneq (,$(filter edff chff, $(pdlabel1) $(pdlabel2))) $(LIBDIR)libgammaUPC.$(libext): PDF/gammaUPC @@ -73,6 +75,7 @@ $(BINDIR)gensudgrid: $(GENSUDGRID) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUP # Dependencies dsample.o: DiscreteSampler.o dsample.f genps.inc StringCast.o vector.inc +pawgraph.o: vector.inc DiscreteSampler.o: StringCast.o invarients.o: invarients.f genps.inc gen_ximprove.o: gen_ximprove.f run_config.inc run_card.inc diff --git a/epochX/cudacpp/heft_gg_bb.mad/Source/run_card.inc b/epochX/cudacpp/heft_gg_bb.mad/Source/run_card.inc index 1a1bc782bd..8bd5f73840 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/Source/run_card.inc +++ b/epochX/cudacpp/heft_gg_bb.mad/Source/run_card.inc @@ -88,6 +88,8 @@ DSQRT_SHAT = 0.000000000000000D+00 + DSQRT_SHATMAX = -1 + LIMHEL = 0.000000000000000D+00 PTJ = 2.000000000000000D+01 diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/Bridge.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/Bridge.h index 87aa648dd2..a45024704a 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. 
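The alfas_functions.f hunk above guards the two-loop Newton step: if 1D0+A_IN*B0(NF)*T is not positive, the LOG in the following statement would be undefined, so alpha_s is instead flagged with the 9d98 sentinel and the routine returns early. A minimal C++ sketch of the same control flow (the names aIn, t, b0, c1 stand in for A_IN, T, B0(NF), C1(NF); this is an illustration, not code from this diff):

#include <cmath>
// Guarded one-/two-loop Newton step for the running of alpha_s (sketch)
double newtonStep( double aIn, double t, double b0, double c1, int nloop )
{
  const double u = 1. + aIn * b0 * t;
  const double aOneLoop = aIn / u;  // one-loop running
  if( nloop == 1 ) return aOneLoop;
  if( u <= 0. ) return 9e98;        // sentinel: log( u ) below would be undefined
  return aIn / ( 1. + b0 * aIn * t + c1 * aIn * std::log( u ) ); // two-loop correction
}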
+// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -255,11 +255,15 @@ namespace mg5amcCpu throw std::logic_error( "Bridge constructor: FIXME! cannot choose gputhreads" ); // this should never happen! m_gpublocks = m_nevt / m_gputhreads; } +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters @@ -290,8 +294,10 @@ namespace mg5amcCpu throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -347,7 +353,9 @@ namespace mg5amcCpu if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -400,7 +408,9 @@ namespace mg5amcCpu } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
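The Bridge.h hunk above wraps the "Instantiate device/host Bridge" banners, the set_gpugrid warning and the abnormal-ME flagging in #ifdef MGONGPUCPP_VERBOSE, so production runs are quiet by default. A minimal sketch of the gating pattern, assuming the macro is supplied on the compile line (e.g. -DMGONGPUCPP_VERBOSE; the build-system hook itself is not part of this diff):

#include <iostream>
// Quiet by default; verbose only when MGONGPUCPP_VERBOSE is defined at build time (sketch)
void reportBridge( int nevt )
{
#ifdef MGONGPUCPP_VERBOSE
  std::cout << "WARNING! Instantiate host Bridge (nevt=" << nevt << ")" << std::endl;
#else
  (void)nevt; // silence unused-parameter warnings in quiet builds
#endif
}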
 #ifndef MG5AMC_GPUABSTRACTION_H
 #define MG5AMC_GPUABSTRACTION_H 1
+#include "mgOnGpuConfig.h"
+
 #include <cassert>

//--------------------------------------------------------------------------

 #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)

+#ifndef MGONGPU_HAS_NO_BLAS
+#include "cublas_v2.h"
+#endif
+
 #define gpuError_t cudaError_t
 #define gpuPeekAtLastError cudaPeekAtLastError
 #define gpuGetErrorString cudaGetErrorString
@@ -21,24 +27,61 @@
 #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) )
 #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) )
 #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
 #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
+#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice
 #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) )
 #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
 #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
+#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) )
+
 #define gpuSetDevice cudaSetDevice
 #define gpuDeviceSynchronize cudaDeviceSynchronize
 #define gpuDeviceReset cudaDeviceReset

 #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
-#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ )
+
+#define gpuStream_t cudaStream_t
+#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) )
+#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) )
+
+#define gpuBlasStatus_t cublasStatus_t
+#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS
+#ifndef MGONGPU_HAS_NO_BLAS
+#define gpuBlasHandle_t cublasHandle_t
+#else
+#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds
+#endif
+#define gpuBlasCreate cublasCreate
+#define gpuBlasDestroy cublasDestroy
+#define gpuBlasSetStream cublasSetStream
+
+#define gpuBlasSaxpy cublasSaxpy
+#define gpuBlasSdot cublasSdot
+#define gpuBlasSgemv cublasSgemv
+#define gpuBlasSgemm cublasSgemm
+#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched
+#define gpuBlasDaxpy cublasDaxpy
+#define gpuBlasDdot cublasDdot
+#define gpuBlasDgemv cublasDgemv
+#define gpuBlasDgemm cublasDgemm
+#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched
+#define GPUBLAS_OP_N CUBLAS_OP_N
+#define GPUBLAS_OP_T CUBLAS_OP_T

//--------------------------------------------------------------------------

 #elif defined __HIPCC__

+#ifndef MGONGPU_HAS_NO_BLAS
+#include "hipblas/hipblas.h"
+#endif
+
 #define gpuError_t hipError_t
 #define gpuPeekAtLastError hipPeekAtLastError
 #define gpuGetErrorString hipGetErrorString
@@ -48,22 +91,69 @@
 #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )
 #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) )
 #define gpuMemcpyHostToDevice hipMemcpyHostToDevice
 #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice
 #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) )
 #define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
 #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
+#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) )
+
 #define gpuSetDevice hipSetDevice
 #define gpuDeviceSynchronize hipDeviceSynchronize
 #define gpuDeviceReset hipDeviceReset

 #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
-#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ )
+
+#define gpuStream_t hipStream_t
+#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) )
+#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) )
+
+#define gpuBlasStatus_t hipblasStatus_t
+#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
+#ifndef MGONGPU_HAS_NO_BLAS
+#define gpuBlasHandle_t hipblasHandle_t
+#else
+#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds
+#endif
+#define gpuBlasCreate hipblasCreate
+#define gpuBlasDestroy hipblasDestroy
+#define gpuBlasSetStream hipblasSetStream
+
+#define gpuBlasSaxpy hipblasSaxpy
+#define gpuBlasSdot hipblasSdot
+#define gpuBlasSgemv hipblasSgemv
+#define gpuBlasSgemm hipblasSgemm
+#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched
+#define gpuBlasDaxpy hipblasDaxpy
+#define gpuBlasDdot hipblasDdot
+#define gpuBlasDgemv hipblasDgemv
+#define gpuBlasDgemm hipblasDgemm
+#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched
+#define GPUBLAS_OP_N HIPBLAS_OP_N
+#define GPUBLAS_OP_T HIPBLAS_OP_T
+
+#endif

//--------------------------------------------------------------------------

+#ifdef MGONGPU_FPTYPE2_FLOAT
+#define gpuBlasTaxpy gpuBlasSaxpy
+#define gpuBlasTdot gpuBlasSdot
+#define gpuBlasTgemv gpuBlasSgemv
+#define gpuBlasTgemm gpuBlasSgemm
+#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched
+#else
+#define gpuBlasTaxpy gpuBlasDaxpy
+#define gpuBlasTdot gpuBlasDdot
+#define gpuBlasTgemv gpuBlasDgemv
+#define gpuBlasTgemm gpuBlasDgemm
+#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched
 #endif

 #endif // MG5AMC_GPUABSTRACTION_H
diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin.
-// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin.
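The gpuBlasT* aliases at the end of GpuAbstraction.h pick the single- or double-precision BLAS symbol according to the MGONGPU_FPTYPE2_FLOAT build flag, on top of the CUDA/HIP mapping above. A sketch of a color-sum style matrix product routed through these aliases (the ncolor-by-nevt column-major layout and the buffer names are illustrative assumptions, not the plugin's actual color_sum_blas implementation, which only appears by name later in this diff):

#include "GpuAbstraction.h"
#ifndef MGONGPU_HAS_NO_BLAS
// tmp = cf * jamps, one GEMM over all events of one helicity (sketch)
void colorGemmSketch( gpuBlasHandle_t handle, int ncolor, int nevt,
                      const fptype2* devCf,    // ncolor x ncolor color matrix
                      const fptype2* devJamps, // ncolor x nevt jamp buffer
                      fptype2* devTmp )        // ncolor x nevt intermediate result
{
  const fptype2 alpha = 1;
  const fptype2 beta = 0;
  // Resolves to cublasSgemm/cublasDgemm (CUDA) or hipblasSgemm/hipblasDgemm (HIP)
  gpuBlasTgemm( handle, GPUBLAS_OP_N, GPUBLAS_OP_N,
                ncolor, nevt, ncolor,
                &alpha, devCf, ncolor, devJamps, ncolor,
                &beta, devTmp, ncolor );
}
#endif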
#ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MGVersion.txt index 084e244cea..1ac53bb4bd 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.3 \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.cc index f463977c1a..a68ae314eb 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
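GpuRuntime.h above gains checkGpuBlas, the BLAS counterpart of checkGpu: it prints the failing status code with file and line, then asserts on anything other than GPUBLAS_STATUS_SUCCESS. A short usage sketch, mirroring how MatrixElementKernels.cc later in this diff creates one handle per good helicity (the handle and stream names are illustrative):

#include "GpuAbstraction.h"
#include "GpuRuntime.h"
#if defined( MGONGPUCPP_GPUIMPL ) && !defined( MGONGPU_HAS_NO_BLAS )
void blasHandleSketch()
{
  gpuStream_t stream;
  gpuStreamCreate( &stream );                         // the macro already wraps checkGpu
  gpuBlasHandle_t handle;
  checkGpuBlas( gpuBlasCreate( &handle ) );           // gpuBlasCreate is not wrapped, so check it here
  checkGpuBlas( gpuBlasSetStream( handle, stream ) ); // bind the handle to its own stream
  checkGpuBlas( gpuBlasDestroy( handle ) );
  gpuStreamDestroy( stream );
}
#endif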
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,28 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() + , m_pHelWfs() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_helBlasHandles() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +353,82 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); + // Create the "one-helicity" wavefunction buffer that will be used for helicity filtering + m_pHelWfs.reset( new DeviceBufferSimple( CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? 
+ std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { +#ifndef MGONGPU_HAS_NO_BLAS + if( m_helBlasHandles[ihel] ) gpuBlasDestroy( m_helBlasHandles[ihel] ); +#endif + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +445,61 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. 
Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity + // Attach a different stream to each cuBLAS/hipBLAS handle + if( m_blasColorSum ) + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + checkGpuBlas( gpuBlasCreate( &m_helBlasHandles[ighel] ) ); + checkGpuBlas( gpuBlasSetStream( m_helBlasHandles[ighel], m_helStreams[ighel] ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_helBlasHandles[ighel], CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelWfs.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +507,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* ghelBlasHandles = ( m_blasColorSum ? m_helBlasHandles : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* ghelBlasHandles = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +527,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? 
m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.h index 7acff4b308..c901874333 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] - static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,24 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + + // The super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelWfs; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +220,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel 
cuBLAS/hipBLAS temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The array of cuBLAS/hipBLAS handles (one for each good helicity) + gpuBlasHandle_t m_helBlasHandles[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryAccessAmplitudes.h index 0d92f69c43..a49f041e05 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessAmplitudes_H #define MemoryAccessAmplitudes_H 1 @@ -10,10 +10,6 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_AMPLITUDES 1 - // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -23,120 +19,11 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // A class describing the internal layout of memory buffers for amplitudes - // This implementation uses an AOSOA[npagA][nx2][neppA] where nevt=npagA*neppA - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessAmplitudesBase //_AOSOAv1 - { - public: - - // Number of Events Per Page in the amplitude AOSOA memory buffer layout - static constexpr int neppA = 1; // AOS (just a test...) 
- - private: - - friend class MemoryAccessHelper; - friend class KernelAccessHelper; - friend class KernelAccessHelper; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) - { - const int ipagA = ievt / neppA; // #event "A-page" - const int ieppA = ievt % neppA; // #event in the current event A-page - constexpr int ix2 = 0; - return &( buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA] ); // AOSOA[ipagA][ix2][ieppA] - } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... args" to "const int ix2" and rename "Field" as "Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int ix2 ) - { - constexpr int ipagA = 0; - constexpr int ieppA = 0; - return buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA]; // AOSOA[ipagA][ix2][ieppA] - } - }; - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessAmplitudes : public MemoryAccessAmplitudesBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2 = MemoryAccessHelper::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2Const = - MemoryAccessHelper::template decodeRecordConst; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt, 
const int ix2 ) <===] - static constexpr auto ieventAccessIx2 = - MemoryAccessHelper::template ieventAccessField; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===] - static constexpr auto ieventAccessIx2Const = - MemoryAccessHelper::template ieventAccessFieldConst; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations + // A class providing trivial access to amplitude memory buffers template class KernelAccessAmplitudes { public: - -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2 = - KernelAccessHelper::template kernelAccessField; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2Const = - KernelAccessHelper::template kernelAccessFieldConst; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { @@ -148,8 +35,6 @@ namespace mg5amcCpu { return reinterpret_cast( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES }; //---------------------------------------------------------------------------- diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryAccessCouplings.h index e98a172df1..4e154ca3bc 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryAccessCouplings.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. 
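The MemoryAccessAmplitudes.h hunk above removes the never-used AOSOA indexing machinery (the MGONGPU_TRIVIAL_AMPLITUDES branch was always taken) and keeps only the trivial access path, a reinterpretation of the flat fptype buffer as complex SIMD values in place. A minimal sketch of what remains (the onDevice template parameter follows the plugin's convention and is an assumption; cxtype_sv and fptype are the plugin's types):

// Trivial amplitude access: no per-event record decoding step is needed (sketch)
template<bool onDevice>
class KernelAccessAmplitudesSketch
{
public:
  static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer )
  {
    return reinterpret_cast<cxtype_sv*>( buffer ); // same memory, complex view
  }
  static __host__ __device__ inline const cxtype_sv* kernelAccessConst( const fptype* buffer )
  {
    return reinterpret_cast<const cxtype_sv*>( buffer );
  }
};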
#ifndef MemoryAccessCouplings_H #define MemoryAccessCouplings_H 1 @@ -235,7 +235,7 @@ namespace mg5amcCpu /* fptype_sv& real = kernelAccessIx2( buffer, 0 ); fptype_sv& imag = kernelAccessIx2( buffer, 1 ); - printf( "C_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv_ref( real, imag ); */ return cxtype_sv_ref( kernelAccessIx2( buffer, 0 ), @@ -250,7 +250,7 @@ namespace mg5amcCpu /* const fptype_sv& real = kernelAccessIx2Const( buffer, 0 ); const fptype_sv& imag = kernelAccessIx2Const( buffer, 1 ); - printf( "C_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv( real, imag ); */ return cxtype_sv( kernelAccessIx2Const( buffer, 0 ), diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryAccessWavefunctions.h index 9f4c620bc7..bbffc1fb36 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessWavefunctions_H #define MemoryAccessWavefunctions_H 1 @@ -10,9 +10,7 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 +#include "CPPProcess.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL @@ -23,147 +21,44 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // A class describing the internal layout of memory buffers for wavefunctions - // This implementation uses an AOSOA[npagW][nw6][nx2][neppW] where nevt=npagW*neppW - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessWavefunctionsBase //_AOSOAv1 +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessWavefunctions { public: - - // Number of Events Per Page in the wavefunction AOSOA memory buffer layout - static constexpr int neppW = 1; // AOS (just a test...) 
- - private: - - friend class MemoryAccessHelper; - friend class KernelAccessHelper; - friend class KernelAccessHelper; - - // The number of components of a (fermion or vector) wavefunction - static constexpr int nw6 = mgOnGpu::nw6; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) + static __host__ __device__ inline cxtype_sv* + kernelAccess( fptype* buffer ) { - const int ipagW = ievt / neppW; // #event "W-page" - const int ieppW = ievt % neppW; // #event in the current event W-page - constexpr int iw6 = 0; - constexpr int ix2 = 0; - return &( buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW] ); // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... 
args" to "const int iw6, const int ix2" and rename "Field" as "Iw6Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int iw6, - const int ix2 ) + static __host__ __device__ inline const cxtype_sv* + kernelAccessConst( const fptype* buffer ) { - constexpr int ipagW = 0; - constexpr int ieppW = 0; - return buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW]; // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } }; +#endif //---------------------------------------------------------------------------- - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessWavefunctions : public MemoryAccessWavefunctionsBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2 = MemoryAccessHelper::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2Const = - MemoryAccessHelper::template decodeRecordConst; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIw6Ix2( fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2 = - MemoryAccessHelper::template ieventAccessField; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIw6Ix2Const( const fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2Const = - MemoryAccessHelper::template ieventAccessFieldConst; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations - template - class KernelAccessWavefunctions + class HostAccessWavefunctions { public: - -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes 
(input) - // [Signature (non-const) ===> fptype& kernelAccessIw6Ix2( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2 = - KernelAccessHelper::template kernelAccessField; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIw6Ix2Const( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2Const = - KernelAccessHelper::template kernelAccessFieldConst; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { return reinterpret_cast( buffer ); } - static __host__ __device__ inline const cxtype_sv* kernelAccessConst( const fptype* buffer ) { return reinterpret_cast( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS }; //---------------------------------------------------------------------------- - typedef KernelAccessWavefunctions HostAccessWavefunctions; - typedef KernelAccessWavefunctions DeviceAccessWavefunctions; - - //---------------------------------------------------------------------------- - } // end namespace mg5amcGpu/mg5amcCpu #endif // MemoryAccessWavefunctions_H diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryBuffers.h index 90075da66e..110b93643f 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. 
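MemoryAccessWavefunctions.h above splits the old kernel-templated access in two: on the host the trivial in-place cast survives, while on the device each thread now offsets into its own slice of the per-helicity wavefunction super-buffer before casting. A sketch of the device-side indexing, assuming the buffer holds nevt consecutive slices of CPPProcess::nw6 * mgOnGpu::nx2 fptype values per event:

// One GPU thread owns one event and its contiguous wavefunction slice (sketch)
static __device__ inline cxtype_sv* deviceWfAccessSketch( fptype* buffer )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // global thread index = event index
  return reinterpret_cast<cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 );
}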
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_heft_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase<T>( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase<T>( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template<typename T, size_t sizePerEvent> - class DeviceBuffer : public DeviceBufferBase<T>, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase<T>, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase<T>( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase<T>( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer<fptype, 1> DeviceBufferSimple; + typedef DeviceBuffer<fptype2, 1> DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase<fptype> BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer<fptype, sizePerEventNumerators> HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer<fptype, sizePerEventNumerators> PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer<fptype, sizePerEventNumerators> DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer<fptype, sizePerEventDenominators> HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer<fptype, sizePerEventDenominators> PinnedHostBufferDenominators;
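As a reading aid for the buffer classes above: each typedef pins an element type and a per-event size onto a RAII wrapper, so device memory is released when the buffer leaves scope. A much-simplified sketch of that pattern (CUDA; SimpleDeviceBuffer is a hypothetical name, not the plugin's class hierarchy):

```cpp
// Simplified RAII sketch under stated assumptions; not the plugin classes.
#include <cstddef>
#include <cuda_runtime.h>

template<typename T, std::size_t sizePerEvent>
class SimpleDeviceBuffer
{
public:
  explicit SimpleDeviceBuffer( std::size_t nevt )
    : m_size( sizePerEvent * nevt ), m_data( nullptr )
  {
    cudaMalloc( (void**)&m_data, m_size * sizeof( T ) ); // acquire on construction
  }
  ~SimpleDeviceBuffer() { cudaFree( m_data ); } // release when leaving scope
  T* data() { return m_data; }
  std::size_t size() const { return m_size; }
private:
  std::size_t m_size;
  T* m_data;
};

// Like the typedefs above, a name fixes element type and per-event size
typedef SimpleDeviceBuffer<double, 1> SimpleDeviceBufferNumerators;
```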
- // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer<fptype, sizePerEventDenominators> DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer<fptype, sizePerEventCouplings> HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer<fptype, sizePerEventCouplings> PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer<fptype, sizePerEventCouplings> DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for jamps + typedef DeviceBuffer<fptype, sizePerEventJamps> DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template<class Tdst, class Tsrc> void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.cc b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.cc index 5d6a4e1f06..1f301c5523 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.cc +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_heft.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,20 +98,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_heft_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_heft_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 3; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,57 +166,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 
*** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - 
using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -226,311 +279,141 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 5; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structures for wavefunctions/amplitudes in all events (no kernel splitting yet)!
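The comment block above is the heart of the kernel-splitting change: in C++ the wavefunctions can stay in a small stack array that lives only for one SIMD event page, whereas the split per-diagram GPU kernels must share them through a global-memory buffer. A minimal sketch of that storage choice (illustrative only: double in place of fptype, and a runtime flag in place of the MGONGPUCPP_GPUIMPL compile-time switch):

```cpp
// Minimal sketch under stated assumptions; not the generated code.
constexpr int nwf = 5; // #wavefunctions for this process (external + internal)
constexpr int nw6 = 6; // components per wavefunction
constexpr int nx2 = 2; // real and imaginary parts

// C++ keeps TRIVIAL access to a stack array that lives only for one event page;
// the GPU build must use the global allWfs buffer instead, because the split
// per-diagram kernels launched later all need to see the same wavefunctions.
inline double* selectWfsBuffer( double ( &w_local )[nwf][nw6 * nx2], double* allWfs, bool onGpu )
{
  return onGpu ? allWfs : &w_local[0][0];
}
```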
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g. <<warning #186-D: pointless comparison of unsigned integer with zero>> -#endif
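The BUG #823 / FIX #823 bookkeeping that follows is easier to see in isolation: the coupling pointer table has ndcoup event-dependent entries followed by the independent ones, and the fix sizes the tail by nIPC instead of nicoup. A standalone sketch (illustrative sizes, and plain pointer arrays in place of the CD_ACCESS/CI_ACCESS accessor classes):

```cpp
// Standalone sketch under stated assumptions: ndcoup=2 running couplings
// (depend on alphas event-by-event), nIPC=1 coupling fixed for all events.
#include <cstddef>
constexpr std::size_t ndcoup = 2, nIPC = 1;
constexpr std::size_t nxcoup = ndcoup + nIPC; // FIX #823: size the tail by nIPC, not nicoup

void buildCoupTable( const double* dependent[ndcoup],  // vary event-by-event
                     const double* independent[nIPC],  // fixed for all events
                     const double* COUPs[nxcoup] )
{
  for( std::size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
    COUPs[idcoup] = dependent[idcoup]; // dependent couplings first
  for( std::size_t iicoup = 0; iicoup < nIPC; iicoup++ )
    COUPs[ndcoup + iicoup] = independent[iicoup]; // independent couplings appended
}
```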
- for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 4 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - VVS3_3( w_fp[0], w_fp[1], COUPs[0], 1.0, cIPD[1], cIPD[2], w_fp[4] ); - - // Amplitude(s) for diagram number 1 - FFS2_0( w_fp[3], w_fp[2], w_fp[4], COUPs[ndcoup + 0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); #endif - jamp_sv[2] -= 2. * amp_sv[0]; - - // *** DIAGRAM 2 OF 4 *** - - // Wavefunction(s) for diagram number 2 - VVV1P0_1( w_fp[0], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[4] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[2], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); #endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 3 OF 4 *** - - // Wavefunction(s) for diagram number 3 - FFV1_1( w_fp[2], w_fp[0], COUPs[2], 1.0, cIPD[0], 0., w_fp[4] ); - - // Amplitude(s) for diagram number 3 - FFV1_0( w_fp[3], w_fp[4], w_fp[1], COUPs[2], 1.0, &amp_fp[0] ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + allCOUPs[idcoup] =
CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif
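One detail of the JAMPS hunk above that is easy to misread: in C++ the output pointer is steered by iParity, so in mixed-precision mode the two event pages land in the first and second ncolor-sized halves of the caller's jamp_sv array. A sketch of just that pointer arithmetic (illustrative: a double[2] pair stands in for one complex cxtype_sv entry, ncolor=3 as in this process):

```cpp
// Minimal sketch under stated assumptions; not the generated code.
constexpr int ncolor = 3;
typedef double cx[2]; // one complex value: { re, im }

inline double* jampsForParity( cx* jamp_sv, int iParity ) // caller array cx[nParity * ncolor]
{
  // iParity==0 writes into jamp_sv[0..ncolor-1], iParity==1 into jamp_sv[ncolor..2*ncolor-1]
  return reinterpret_cast<double*>( iParity == 0 ? jamp_sv : &jamp_sv[ncolor] );
}
```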
- jamp_sv[0] -= amp_sv[0]; - - // *** DIAGRAM 4 OF 4 *** - - // Wavefunction(s) for diagram number 4 - FFV1_2( w_fp[3], w_fp[0], COUPs[2], 1.0, cIPD[0], 0., w_fp[4] ); - // Amplitude(s) for diagram number 4 - FFV1_0( w_fp[4], w_fp[2], w_fp[1], COUPs[2], 1.0, &amp_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_bbx()?) - - // The color denominators (initialize all array elements, with ncolor=3) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3, 1 }; // 1-D array[3] - - // The color matrix (initialize all array elements, with ncolor=3) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 16, -2, 6 }, - { -2, 16, 6 }, - { 2, 2, 6 } }; // 2-D array[3][3] +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif #else + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif #else + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
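For reference, the color algebra that the deleted block above implements (and that now moves to the color_sum kernels) is the quadratic form deltaME = sum_ij conj(jamp_i) * cf[i][j]/denom[i] * jamp_j, where the normalized matrix cf[i][j]/denom[i] is real and symmetric, so only the diagonal plus the upper triangle with a factor 2 is needed (#475). A self-contained numerical sketch for this process's ncolor=3 matrix (std::complex<double> in place of the SIMD vector types):

```cpp
// Self-contained color-sum sketch using the cf/denom arrays from the hunk above.
#include <complex>
#include <cstdio>
constexpr int ncolor = 3;
static constexpr double denom[ncolor] = { 3, 3, 1 };
static constexpr double cf[ncolor][ncolor] = { { 16, -2, 6 }, { -2, 16, 6 }, { 2, 2, 6 } };

double colorSum( const std::complex<double> jamp[ncolor] )
{
  double deltaME = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztR = cf[i][i] * jamp[i].real();
    double ztI = cf[i][i] * jamp[i].imag();
    for( int j = i + 1; j < ncolor; j++ )
    {
      ztR += 2 * cf[i][j] * jamp[j].real(); // factor 2: symmetric off-diagonal terms
      ztI += 2 * cf[i][j] * jamp[j].imag();
    }
    deltaME += ( jamp[i].real() * ztR + jamp[i].imag() * ztI ) / denom[i];
  }
  return deltaME;
}

int main()
{
  const std::complex<double> jamp[ncolor] = { { 1, 0 }, { 0, 1 }, { 1, 1 } };
  std::printf( "deltaME = %f\n", colorSum( jamp ) );
  return 0;
}
```

With jamp = (1, i, 1+i) this prints deltaME = 30.666667, matching the full double sum over i and j, which illustrates why the triangular form halves the work without changing the result.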
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * 
cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 4 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -569,7 +452,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -602,6 +489,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->mdl_MB ); m_masses.push_back( m_pars->mdl_MB ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MB, (fptype)m_pars->mdl_MH, (fptype)m_pars->mdl_WH }; @@ -643,6 +534,10 @@ namespace mg5amcCpu m_masses.push_back( 
Parameters_heft::ZERO ); m_masses.push_back( Parameters_heft::mdl_MB ); m_masses.push_back( Parameters_heft::mdl_MB ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -745,26 +640,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -772,25 +667,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt < maxtry0) [...] assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt < maxtry0) [...] +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + add_and_select_hel( int* allselhel, // output: helicity selection[nevt] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + const int nevt ) // input: #events + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output:
allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -928,20 +1027,14 @@ namespace mg5amcCpu // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -953,17 +1046,20 @@ #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype )
); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -989,93 +1085,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // (1b) Then, In multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } - } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 
(fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) 
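The scheduling pattern introduced above (one CUDA/HIP stream per good helicity, slicing each helicity's work out of the ghel* super-buffers, then a device-wide synchronize before helicity and color selection) can be reduced to a few lines. A toy CUDA sketch with a stand-in kernel (workForHelicity is hypothetical; the real code launches the generated diagram and color-sum kernels on each stream):

```cpp
// Toy stream-per-helicity sketch under stated assumptions; not the plugin code.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void workForHelicity( float* buf, int ighel ) { buf[threadIdx.x] += ighel; }

int main()
{
  constexpr int nGoodHel = 4, nthreads = 32;
  cudaStream_t streams[nGoodHel];
  float* superBuf = nullptr; // one slice per good helicity, as for ghelAllJamps/ghelAllWfs
  cudaMalloc( (void**)&superBuf, nGoodHel * nthreads * sizeof( float ) );
  cudaMemset( superBuf, 0, nGoodHel * nthreads * sizeof( float ) );
  for( int ighel = 0; ighel < nGoodHel; ighel++ ) cudaStreamCreate( &streams[ighel] );
  // Independent helicities proceed concurrently, one per stream, each on its own slice
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
    workForHelicity<<<1, nthreads, 0, streams[ighel]>>>( superBuf + ighel * nthreads, ighel );
  cudaDeviceSynchronize(); // as in sigmaKin: wait before helicity/color selection
  for( int ighel = 0; ighel < nGoodHel; ighel++ ) cudaStreamDestroy( streams[ighel] );
  cudaFree( superBuf );
  std::printf( "done\n" );
  return 0;
}
```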
+#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (deferred until after the helicity loop to avoid breaking parallelism across streams) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1117,7 +1183,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1140,7 +1206,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId!
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1149,25 +1215,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1177,8 +1249,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1194,11 +1268,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1300,14 +1375,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.h index 30c5663297..c519f81e85 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.h +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development 
team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_heft.h" #include <vector> @@ -75,17 +76,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 16; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 4; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 3; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 5; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 4; // CPPProcess::npar (nexternal was nioparticles) //static const int nwavefuncs = 6; // (?!?!
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 4; //static const int ncomb = 16; // CPPProcess::ncomb @@ -122,23 +123,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -152,34 +156,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/auto_dsig.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/auto_dsig.f index 0b39d55964..252d00c684 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/auto_dsig.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/auto_dsig1.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/auto_dsig1.f index c57e06d578..3fef361dd9 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/auto_dsig1.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -137,14 +137,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -219,7 +219,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -290,6 +290,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -373,12 +377,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -484,6 +488,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/color_sum.cc b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/color_sum.cc new file mode 100644 index 0000000000..6b493df318 --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/color_sum.cc @@ -0,0 +1,384 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=3) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3, 1 }; // 1-D array[3] + + // The color matrix (initialize all array elements, with ncolor=3) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 16, -2, 6 }, + { -2, 16, 6 }, + { 2, 2, 6 } }; // 2-D array[3][3] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template<typename T> + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA + iAMB - iBMA + BMB = AMA + BMB (the imaginary terms cancel because M is symmetric, i.e. AMB = BMA) + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the upper triangular part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
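[Editorial aside, not part of the patch] The comment block above compresses the key algebra of color_sum_cpu: for jamps J = A + iB and a real symmetric (normalized) color matrix M, the color sum Re(J^H M J) equals A^T M A + B^T M B, and the symmetric double loop can be folded onto the upper triangle with doubled off-diagonal entries. A minimal standalone C++ sketch of both identities, using the normalized 3x3 matrix of this process (colorMatrix rows divided by colorDenom); names and values below are illustrative only:

```cpp
// Standalone check of the identities used by color_sum_cpu (illustrative sketch only)
#include <cassert>
#include <cmath>
#include <complex>

int main()
{
  constexpr int ncol = 3;
  // Normalized color matrix (real, symmetric): colorMatrix[i][j] / colorDenom[i]
  const double M[ncol][ncol] = { { 16. / 3, -2. / 3, 2 }, { -2. / 3, 16. / 3, 2 }, { 2, 2, 6 } };
  const std::complex<double> J[ncol] = { { 1.1, -0.3 }, { -0.7, 2.2 }, { 0.5, 0.9 } }; // arbitrary jamps
  // Reference: full complex quadratic form conj(J)^T M J
  std::complex<double> ref = 0;
  for( int i = 0; i < ncol; i++ )
    for( int j = 0; j < ncol; j++ )
      ref += std::conj( J[i] ) * M[i][j] * J[j];
  // Folded form: upper triangle only, off-diagonal terms doubled, real/imag parts kept separate
  double folded = 0;
  for( int i = 0; i < ncol; i++ )
  {
    double ztR = M[i][i] * J[i].real();
    double ztI = M[i][i] * J[i].imag();
    for( int j = i + 1; j < ncol; j++ )
    {
      ztR += 2 * M[i][j] * J[j].real();
      ztI += 2 * M[i][j] * J[j].imag();
    }
    folded += J[i].real() * ztR + J[i].imag() * ztI; // the A^T M A + B^T M B contribution
  }
  assert( std::abs( ref.imag() ) < 1e-12 );          // imaginary part cancels for symmetric M
  assert( std::abs( ref.real() - folded ) < 1e-12 ); // folded loop reproduces the full sum
  return 0;
}
```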
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 
s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! However, the same striding as in compute_jamps and cuBLAS is used here, just in case this is better for performance + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth =
allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e.
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/color_sum.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/diagram_boilerplate.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/diagrams.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/diagrams.h new file mode 100644 index 0000000000..24cc14c39e --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/diagrams.h @@ -0,0 +1,138 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
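[Editorial aside, not part of the patch] color_sum_blas above evaluates ME[ievt] += J(ievt)^T M J(ievt) in two BLAS steps per component (real and imaginary): first one GEMM Z = M * J^T giving an ncolor x nevt matrix, then nevt batched 1x1 GEMMs (dot products J(ievt,:) . Z(:,ievt)) with beta=1 so that the imaginary plane accumulates on top of the real one. A plain C++ sketch of the same decomposition without any BLAS dependency; the dimensions, layout and values are hypothetical:

```cpp
// Two-step color sum (GEMM + batched dot products) spelled out as plain loops
#include <cassert>
#include <cmath>
#include <vector>

int main()
{
  constexpr int ncolor = 3, nevt = 4; // hypothetical small dimensions
  const double M[ncolor][ncolor] = { { 16. / 3, -2. / 3, 2 }, { -2. / 3, 16. / 3, 2 }, { 2, 2, 6 } };
  // Jamps in the "new1" striding: one ncolor*nevt plane per component, index [icol * nevt + ievt]
  std::vector<double> jR( ncolor * nevt ), jI( ncolor * nevt );
  for( int k = 0; k < ncolor * nevt; k++ ) { jR[k] = 0.1 * k - 0.5; jI[k] = 0.07 * k + 0.2; }
  std::vector<double> MEs( nevt, 0 );
  for( const auto* J : { &jR, &jI } ) // real plane first, then imaginary plane
  {
    // Step 1 (the gpuBlasTgemm call): Z[icol][ievt] = sum_jcol M[icol][jcol] * J[jcol][ievt]
    std::vector<double> Z( ncolor * nevt, 0 );
    for( int i = 0; i < ncolor; i++ )
      for( int e = 0; e < nevt; e++ )
        for( int j = 0; j < ncolor; j++ )
          Z[i * nevt + e] += M[i][j] * ( *J )[j * nevt + e];
    // Step 2 (the gpuBlasTgemmStridedBatched call): one dot product per event; beta=1 accumulates
    for( int e = 0; e < nevt; e++ )
      for( int i = 0; i < ncolor; i++ )
        MEs[e] += ( *J )[i * nevt + e] * Z[i * nevt + e];
  }
  // Cross-check the first event against the direct quadratic form
  double ref = 0;
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ )
      ref += M[i][j] * ( jR[i * nevt] * jR[j * nevt] + jI[i * nevt] * jI[j * nevt] );
  assert( std::abs( MEs[0] - ref ) < 1e-12 );
  return 0;
}
```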
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 4 *** + // Wavefunction(s) for diagram number 1 + vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); + vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + VVS3_3( w_fp[0], w_fp[1], COUPs[0], 1.0, cIPD[1], cIPD[2], w_fp[4] ); + // Amplitude(s) for diagram number 1 + FFS2_0( w_fp[3], w_fp[2], w_fp[4], COUPs[ndcoup + 0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 2.
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 4 *** + // Wavefunction(s) for diagram number 2 + VVV1P0_1( w_fp[0], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[4] ); + // Amplitude(s) for diagram number 2 + FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 4 *** + // Wavefunction(s) for diagram number 3 + FFV1_1( w_fp[2], w_fp[0], COUPs[2], 1.0, cIPD[0], 0., w_fp[4] ); + // Amplitude(s) for diagram number 3 + FFV1_0( w_fp[3], w_fp[4], w_fp[1], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to
disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 4 *** + // Wavefunction(s) for diagram number 4 + FFV1_2( w_fp[3], w_fp[0], COUPs[2], 1.0, cIPD[0], 0., w_fp[4] ); + // Amplitude(s) for diagram number 4 + FFV1_0( w_fp[4], w_fp[2], w_fp[1], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/driver.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/driver.f index ec5722702a..c08048ad0e 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/driver.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/matrix1.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/matrix1.f index 598338d03e..2a2fccda40 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/matrix1.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -307,7 +307,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -350,7 +350,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(0) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -393,26 +394,26 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 3) /5.333333333333333D+00, - $ -6.666666666666666D-01,2.000000000000000D+00/ + DATA DENOM/3/ + DATA (CF(I),I= 1, 3) /16,-4,12/ C 1 T(1,2,3,4) - DATA (CF(I, 2),I= 1, 3) /-6.666666666666666D-01 - $ ,5.333333333333333D+00,2.000000000000000D+00/ + DATA (CF(I),I= 4, 5) /16,12/ C 1 T(2,1,3,4) - DATA (CF(I, 3),I= 1, 3) /2.000000000000000D+00 - $ ,2.000000000000000D+00,6.000000000000000D+00/ + DATA (CF(I),I= 6, 6) /18/ C 1 T(3,4) Tr(1,2) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(MDL_MB - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WH.NE.0D0) FK_MDL_WH = SIGN(MAX(ABS(MDL_WH), ABS(MDL_MH - $ *SMALL_WIDTH_TREATMENT)), MDL_WH) + FK_ZERO = 0D0 + IF(MDL_WH.NE.0D0) THEN + FK_MDL_WH = SIGN(MAX(ABS(MDL_WH), ABS(MDL_MH + $ *SMALL_WIDTH_TREATMENT)), MDL_WH) + ELSE + FK_MDL_WH = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -455,10 +456,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -467,6 +470,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/addmothers.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/addmothers.f index 9a31ed201d..6f32477b9e 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/addmothers.f @@ -21,7 +21,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, integer icol ! color selected integer isym(nexternal,99), jsym - integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,nc,ic + integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,ic integer mo_color,da_color(2),itmp integer ito(-nexternal+3:nexternal),iseed,maxcolor,maxorg integer icolalt(2,-nexternal+2:2*nexternal-3) @@ -113,14 +113,15 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif lconfig = vec_igraph1(ivec) endif - + is_LC=.true. + maxcolor=0 c c Choose a color flow which is certain to work with the propagator c structure of the chosen diagram and use that as an alternative c if (icol.eq.0) then do i=1,nexternal - icolalt(1,i)=0 + icolalt(1,i)=0 icolalt(2,i)=0 enddo else diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cluster.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cluster.f index b8995283ed..907894ea89 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cluster.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cluster.f @@ -556,6 +556,8 @@ logical function cluster(p, ivec) jwin = 0 cluster=.false. clustered=.false. 
+ iwin =0 + jwin =0 do i=0,3 pcmsp(i)=0 enddo @@ -665,8 +667,11 @@ logical function cluster(p, ivec) c initialize graph storage igraphs(0)=0 nleft=nexternal -c cluster - if (iwin.eq.0.or.jwin.eq.0) stop 21 + if(iwin.eq.0.or.jwin.eq.0)then + cluster=.false. + return + endif +c cluster do n=1,nexternal-2 c combine winner imocl(n)=imap(iwin,2)+imap(jwin,2) diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/color_sum.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/color_sum.h new file mode 100644 index 0000000000..71b5b1eaad --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/color_sum.h @@ -0,0 +1,105 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype_ref( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + static __device__ inline const cxtype + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + }; +#else + class HostAccessJamp + { + public: + static inline cxtype_sv& + 
kernelAccessIcol( cxtype_sv* buffer, const int icol ) + { + return buffer[icol]; + } + static inline cxtype_sv& + kernelAccessIcol( fptype* buffer, const int icol ) + { + return reinterpret_cast<cxtype_sv*>( buffer )[icol]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.
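[Editorial aside, not part of the patch] The "old"/"new1"/"new2" stridings commented in DeviceAccessJamp above differ only in index arithmetic; "new1" keeps all real parts of one color contiguous across events, which is what lets color_sum_blas view the buffer as two ncolor x nevt matrices without copies. A small host-side sketch of the three layouts (the ncolor/nevt values are hypothetical):

```cpp
// Index arithmetic for the three jamp buffer layouts discussed in DeviceAccessJamp
#include <cstdio>

int main()
{
  constexpr int ncolor = 3, nevt = 4;
  // "old"  : one 2 x nevt block per color          -> [icol][reim][ievt]
  auto oldIdx = []( int reim, int icol, int ievt ) { return icol * 2 * nevt + reim * nevt + ievt; };
  // "new1" : one ncolor x nevt plane per component -> [reim][icol][ievt] (cuBLAS-friendly)
  auto new1Idx = []( int reim, int icol, int ievt ) { return reim * ncolor * nevt + icol * nevt + ievt; };
  // "new2" : one nevt x ncolor plane per component -> [reim][ievt][icol]
  auto new2Idx = []( int reim, int icol, int ievt ) { return reim * nevt * ncolor + ievt * ncolor + icol; };
  std::printf( "jamp(icol=1,ievt=2).imag -> old=%d new1=%d new2=%d\n",
               oldIdx( 1, 1, 2 ), new1Idx( 1, 1, 2 ), new2Idx( 1, 1, 2 ) );
  return 0;
}
```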
#=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI300X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist??
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 
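As a reading aid for the HASBLAS logic configured above: the default is derived from the GPU compiler and from the presence of the vendor BLAS header, and can always be overridden on the command line (e.g. "make HASBLAS=hasNoBlas", the example quoted in the makefile comment). The following Python paraphrase of the decision tree is only illustrative (the function and argument names are invented here; the makefile remains the authoritative logic):

    import os

    def default_hasblas(gpucc, cuda_home='', hip_home=''):
        # CPU-only build: no GPU BLAS at all
        if not gpucc:
            return 'hasNoBlas'
        # Nvidia build: enable cuBLAS only if its header is installed
        if 'nvcc' in gpucc:
            header = os.path.join(cuda_home, 'include', 'cublas_v2.h')
        # AMD build: enable hipBLAS only if its header is installed
        elif 'hipcc' in gpucc:
            header = os.path.join(hip_home, 'include', 'hipblas', 'hipblas.h')
        else:
            return 'hasNoBlas'
        return 'hasBlas' if os.path.isfile(header) else 'hasNoBlas'

    # e.g. default_hasblas('/usr/local/cuda/bin/nvcc', cuda_home='/usr/local/cuda')

With hasBlas, BLASLIBFLAGS then adds -lcublas (CUDA) or -lhipblas (HIP) to the link lines below; with hasNoBlas, the code is compiled with -DMGONGPU_HAS_NO_BLAS.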
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cuts.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cuts.f index 7898714201..bd50ab1357 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cuts.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cuts.f @@ -307,12 +307,18 @@ LOGICAL FUNCTION PASSCUTS(P, VECSIZE_USED) c c Limit S_hat c - if (dsqrt_shat.ne.0d0)then - if (nincoming.eq.2.and.sumdot(p(0,1),p(0,2),1d0) .lt. dsqrt_shat**2) then - passcuts=.false. - return - endif - endif + if(nincoming.eq.2) then + if (dsqrt_shat.ne.0d0.or.dsqrt_shatmax.ne.-1d0)then + xvar = sumdot(p(0,1),p(0,2),1d0) + if (xvar .lt. 
dsqrt_shat**2)then + passcuts=.false. + return + else if (dsqrt_shatmax.ne.-1d0 .and. xvar .gt. dsqrt_shatmax**2)then + passcuts = .false. + return + endif + endif + endif C $B$ DESACTIVATE_CUT $E$ !This is a tag for MadWeight if(debug) write (*,*) '=============================' diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/diagram_boilerplate.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/diagram_boilerplate.h new file mode 100644 index 0000000000..96a34fb1bf --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/diagram_boilerplate.h @@ -0,0 +1,103 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin. + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-local-typedefs" // for CI_ACCESS and CD_ACCESS + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + + //------------- + // GPU only + //------------- + + //using namespace mg5amcGpu; + using W_ACCESS = DeviceAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = DeviceAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( channelIds ); +#endif + + // Wavefunctions + // Buffer wfs for one helicity and nevt events is a DeviceBufferSimple with ( nwf * nevt * nw6 * nx2 ) fptypes + // The striding between the nwf wavefunction buffers is ( nevt * nw6 * nx2 ) fptypes + // Internally diagramXXX methods pass a w_fp[iwf] to ixx/FFV methods (as argument 'fptype wavefunctions[]') + // Internally ixx/FFV methods call 'cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions )' and then use fi[iw6] + // This means that the fi pointer must point to a [RIRIRIRIRIRI] contiguous buffer of size nw6*nx2=12 + // The striding between events is nw6*nx2=12 and this is what W_ACCESS::kernelAccess must respect + // (En passant, note that this means that events cannot be contiguous in the present code, memory is not coalesced) + const int nevt = gridDim.x * blockDim.x; + fptype* w_fp[nwf]; + for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = wfs + iwf * nevt * nw6 * mgOnGpu::nx2; + + // Couplings + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic push +#pragma nv_diag_suppress 186 // e.g. 
<<warning #186-D: pointless comparison of unsigned integer with zero>> +#endif + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic pop +#endif + const fptype* COUPs[nxcoup]; + for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; + +#else + + //------------- + // C++ only + //------------- + + //using namespace mg5amcCpu; + using W_ACCESS = HostAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = HostAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current SIMD event page (C++) + unsigned int channelId = *channelIds; +#endif + + // Wavefunctions + // Reinterpret wfs as "cxtype_sv w_sv[nwf][nw6]" and build "fptype* w_fp[nwf]" where "w_fp[iwf] = (fptype*)( w_sv[iwf] )" + fptype (*w_fp)[nw6 * neppV * mgOnGpu::nx2] = (fptype (*)[nw6 * neppV * mgOnGpu::nx2])(wfs); + +#endif + + //------------- + // GPU or C++ + //------------- + + // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) + cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram + fptype* amp_fp; // proof of concept for using fptype* in the interface + amp_fp = reinterpret_cast<fptype*>( amp_sv ); + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) + fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); +#else + assert( channelIds == nullptr ); + assert( numerators == nullptr ); + assert( denominators == nullptr ); +#endif /* clang-format on */ + +#pragma GCC diagnostic pop diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/genps.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/genps.f index 1c32e93f5d..8503bdbec8 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/genps.f @@ -1373,6 +1373,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) double precision smin,smax,spole,swidth,s,jac double precision x logical pass + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' c c Local c @@ -1384,6 +1388,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1d0)then + smax = min(smax, dsqrt_shatmax**2) + endif + pass=.true. if (jac .eq. 0 .and. .not. 
warned0) then print*,'Input jacobian 0 in genps' @@ -1628,7 +1636,10 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) DOUBLE PRECISION ETA,ETAMIN,ETAMAX logical warned data warned/.false./ - + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' C------------ C BEGIN CODE C------------ @@ -1645,7 +1656,11 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) C IF THERE IS NO S CHANNEL POLE USE BELOW: TAUMIN = 0d0 !SMIN/S !keep scale fix - TAUMAX = 1D0 + if (dsqrt_shatmax.ne.-1d0)then + TAUMAX=dsqrt_shatmax**2/S + else + TAUMAX = 1D0 + endif TAU = (TAUMAX-TAUMIN)*X(1)+TAUMIN SJACOBI= sjacobi*(TAUMAX-TAUMIN) @@ -1915,7 +1930,7 @@ double precision function get_channel_cut(p, config) if(sde_strat.eq.2)then t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) - get_channel_cut = get_channel_cut / ((t-Mass)*(t+Mass)+stot*1d-10)**2 + get_channel_cut = get_channel_cut / (t-Mass**2+stot*1d-10)**2 endif c write(*,*) i, "t, Mass, fact", t, Mass, ((t-Mass)*(t+Mass))**2,get_channel_cut t = t/stot @@ -1930,9 +1945,9 @@ double precision function get_channel_cut(p, config) t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) Width = prwidth(-i, config) - tmp = (t-Mass)*(t+Mass) + tmp = (t-Mass**2) tmp2 = Mass*Width - get_channel_cut = get_channel_cut* (tmp**2 - tmp2**2)/(tmp**2 + tmp2**2)**2 + get_channel_cut = get_channel_cut/(tmp**2 + tmp2**2) endif c write(*,*) i, "s, Mass, Width, fact", t, Mass, Width, (((t-Mass)*(t+Mass) )**2 + Width**2*Mass**2), get_channel_cut endif diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/myamp.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/myamp.f index 9e5f8d44dd..5360566ef4 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/myamp.f @@ -231,6 +231,7 @@ subroutine set_peaks double precision x1,x2,xk(nexternal) double precision dr,mtot,etot,xqfact double precision spmass + double precision stot ! technically the min of the physical stot and dsqrt_shatmax**2 integer i, iconfig, l1, l2, j, nt, nbw, iproc, k integer iden_part(-nexternal+1:nexternal) @@ -285,8 +286,8 @@ subroutine set_peaks integer lbw(0:nexternal) !Use of B.W. common /to_BW/ lbw - double precision stot,m1,m2 - common/to_stot/stot,m1,m2 + double precision real_stot,m1,m2 + common/to_stot/real_stot,m1,m2 include 'coupl.inc' ! 
needs VECSIZE_MEMMAX (defined in vector.inc) include 'cuts.inc' @@ -309,6 +310,12 @@ subroutine set_peaks c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1)then + stot = min(real_stot, dsqrt_shatmax**2) + else + stot = real_stot + endif + iconfig = this_config c needs to be initialise to avoid segfault do i = -nexternal,-1 diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/reweight.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/reweight.f index 0a0bafa7c1..189ba41449 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/reweight.f @@ -1186,7 +1186,7 @@ logical function setclscales(p, keepq2bck, ivec) if(nexternal.gt.3) pt2ijcl(nexternal-3)=q2fact(2) else if(.not.fixed_fac_scale1) q2fact(1)=scalefact**2*pt2ijcl(nexternal-2) - if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*q2fact(1) + if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*pt2ijcl(nexternal-2) endif elseif(jcentral(1).eq.0)then if(.not.fixed_fac_scale1) q2fact(1) = scalefact**2*pt2ijcl(jfirst(1)) diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/runTest.cc b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/runTest.cc index 4eec5db13c..216a90a302 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/symmetry.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/symmetry.f index 309540a0a2..2afd9e9f75 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/symmetry.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/symmetry.f @@ -232,7 +232,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, c write(*,*) 'mapping',ic,mapconfig(i),icode if (icode .eq. 
0) then c Create format string based on number of digits - write(formstr,'(a,i1,a)') '(I',nconf,'$)' + write(formstr,'(a,i1,a)') '(I',nconf,',$)' write(*,formstr) mapconfig(i) c Write symmetry factors write(formstr2,'(a,i2,a)') '(2i',nsym,')' @@ -242,10 +242,10 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode if(nconf+ncode+1.lt.10) then write(formstr,'(a,i1,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' else write(formstr,'(a,i2,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' endif write(*,formstr) dconfig c Write symmetry factors @@ -260,7 +260,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode write(27,formstr2) dconfig,use_config(i) endif - write(*,'(a$)') ' ' + write(*,'(a,$)') ' ' 100 call bw_increment_array(iarray,imax,ibase,done) enddo else diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/unwgt.f b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/unwgt.f index f602511c94..d1247f1849 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/unwgt.f +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/unwgt.f @@ -497,6 +497,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer ip, np, ic, nc integer ida(2),ito(-nexternal+3:nexternal),ns,nres,ires,icloop integer iseed + double precision beam_mass double precision pboost(0:3) double precision beta, get_betaz double precision ebi(0:3), ebo(0:3) @@ -506,7 +507,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer idup(nexternal,maxproc,maxsproc) integer mothup(2,nexternal) integer icolup(2,nexternal,maxflow,maxsproc) - + double precision eta integer nsym integer ievent @@ -638,21 +639,20 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) if (nincoming.eq.2) then if (xbk(1) .gt. 0d0 .and. xbk(1) .le. 1d0 .and. $ xbk(2) .gt. 0d0 .and. xbk(2) .le. 1d0) then - if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0).and.xbk(2).ne.1d0) then - ! construct the beam momenta in each frame and compute the related (z)boost - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4).and.ebeam(1).gt.10d0*m1)then - local_mass = 0d0 - else - local_mass = m1 - endif + if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0)) then + if((abs(lpp(1)).gt.2.and.abs(lpp(1)).ne.9).or.xbk(1).eq.1d0)then + beam_mass = pmass(1) + else + beam_mass = m1 + endif ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(1) ebo(1) = 0 ebo(2) = 0 - ebo(3) = DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(1).eq.1d0) then pb(0,isym(1,jsym)) = ebo(0) @@ -668,20 +668,19 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo else - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4.and.ebeam(2).gt.10d0*m2))then - local_mass = 0d0 - else - local_mass = m2 - endif - ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam + if((abs(lpp(2)).gt.2.and.abs(lpp(2)).ne.9).or.xbk(2).eq.1d0)then + beam_mass = pmass(2) + else + beam_mass = m2 + endif ebi(0) = p(0,2)/xbk(2) ! 
this assumes that particle 2 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = -1d0*DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = -1d0*DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(2) ebo(1) = 0 ebo(2) = 0 - ebo(3) = -1d0*DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = -1d0*DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(2).eq.1d0) then pb(0,isym(2,jsym)) = ebo(0) @@ -701,6 +700,21 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) write(*,*) 'Warning bad x1 or x2 in write_leshouche', $ xbk(1),xbk(2) endif + do j=1,nexternal + call zboost_with_beta(p(0,j),beta,pb(0,isym(j,jsym))) + pb(4,isym(j,jsym))=pmass(j) + enddo + + ! check for numerical_accuracy + if (pb(0,1).gt.ebeam(1).or.pb(0,2).gt.ebeam(2))then + ! go back to old method --more accurate when boosting with xbk close to one-- + eta = sqrt(xbk(1)*ebeam(1)/(xbk(2)*ebeam(2))) + pboost(0)=p(0,1)*(eta + 1d0/eta) + pboost(3)=p(0,1)*(eta - 1d0/eta) + do j=1,nexternal + call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) + enddo + endif else do j=1,nexternal call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) @@ -709,6 +723,8 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo endif + + if (IMIRROR.eq.2.and.pmass(1).ne.pmass(2)) then c Note that in this context isym(1,jsym) should never be "2" since the mass differ pb(4,isym(1,jsym))=pmass(2) diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/Gridpack/gridrun b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/Gridpack/gridrun index 8c8f7d3940..01d4ab53f5 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/Gridpack/gridrun +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/Gridpack/gridrun @@ -91,7 +91,7 @@ import internal.madevent_interface as cmd_interface try: - cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2]) + cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2], nprocs=args[3], maxevts=args[4]) except KeyboardInterrupt: print('Quit on KeyboardInterrupt') diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/Gridpack/run.sh b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/Gridpack/run.sh index 20adf572c2..2d149f96be 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/Gridpack/run.sh +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/Gridpack/run.sh @@ -14,6 +14,18 @@ # USAGE : run [num_events] [iseed] ## ############################################################################# +function usage() { + local retcode="${1:-1}" # default return code is 1 + echo "Usage:" + echo " run.sh [options] [num events] [seed]" + echo " run.sh [options] [num events] [seed] [granularity]" + echo "Options:" + echo " -h, --help print this message and exit" + echo " -p, --parallel [num procs] number of processes to run in parallel" + echo " -m, --maxevts [num events] maximum number of unweighted events per job" + exit $retcode +} + if [[ -d ./madevent ]]; then DIR='./madevent' else @@ -32,23 +44,46 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib # For Mac OS X export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib +pos_args=() +nprocs=1 +maxevts=2500 -if [[ ($1 != "") && ("$2" != "") && ("$3" == "") ]]; then - num_events=$1 - seed=$2 - gran=1 -elif [[ ($1 != "") && ("$2" != "") && ("$3" != "") ]]; then - num_events=$1 - seed=$2 - gran=$3 -else - echo "Warning: input is not correct. 
script requires two arguments: NB_EVENT SEED" -fi +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage 0 ;; + -p|--parallel) + nprocs="$2" && shift && shift ;; + -m|--maxevts) + maxevts="$2" && shift && shift ;; + -*) + echo "Error: Unknown option $1" && usage ;; + *) + pos_args+=("$1") && shift ;; + esac +done + +case `echo "${pos_args[@]}" | wc -w | tr -d " "` in + "2") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=1 + ;; + "3") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=${pos_args[2]} + ;; + *) + echo "Error: number of arguments is not correct" + usage + ;; +esac -echo "Now generating $num_events events with random seed $seed and granularity $gran" +echo "Now generating $num_events events with random seed $seed and granularity $gran using $nprocs processes" ############ RUN THE PYTHON CODE ##################### -${DIR}/bin/gridrun $num_events $seed $gran +${DIR}/bin/gridrun $num_events $seed $gran $nprocs $maxevts ######################################################## ########### POSTPROCESSING ##################### diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/banner.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/banner.py index 42d82818d0..2efb5954a6 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/banner.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/banner.py @@ -353,7 +353,7 @@ def modify_init_cross(self, cross, allow_zero=False): assert "init" in self cross = dict(cross) - for key in cross.keys(): + for key in list(cross.keys()): if isinstance(key, str) and key.isdigit() and int(key) not in cross: cross[int(key)] = cross[key] @@ -1991,6 +1991,11 @@ def default_setup(self): self.add_param("PartonLevel:FSRinResonances", True, hidden=True, always_write_to_card=False, comment="Do not allow shower to run from decay product of unstable particle") self.add_param("ProcessLevel:resonanceDecays", True, hidden=True, always_write_to_card=False, comment="Do not allow unstable particle to decay.") + # Parameters only needed for main164 type of run (not pythia8/MG5 interface) + self.add_param("Main:HepMC", True, hidden=True, always_write_to_card=False, + comment="""Specify the type of output to be used by the main164 run. """) + self.add_param("HepMC:output", 'hepmc.gz', hidden=True, always_write_to_card=False, + comment="Specify the HepMC output file to be used by the main164 run.") # Add parameters controlling the subruns execution flow. # These parameters should not be part of PY8SubRun daughter. self.add_default_subruns('parameters') @@ -2087,8 +2092,10 @@ def MadGraphSet(self, name, value, **opts): force = False if name.lower() not in self or (force or name.lower() not in self.user_set): self.__setitem__(name, value, change_userdefine=False, **opts) - self.system_set.add(name.lower()) - + self.system_set.add(name.lower()) + else: + raise Exception("The parameter %s is already set to %s. You can not change it." 
% (name, self[name])) + def defaultSet(self, name, value, **opts): self.__setitem__(name, value, change_userdefine=False, **opts) @@ -2144,9 +2151,19 @@ def pythia8_formatting(value, formatv=None): else: return ','.join([PY8Card.pythia8_formatting(arg) for arg in value]) + # change of naming convention between the old MG5 interface and Pythia8's main164 + interface_to_164 = {'HEPMCoutput:file': 'HepMC:output', + 'SysCalc:fullCutVariation': '!SysCalc:fullCutVariation (not supported with 164)', + 'SysCalc:qCutList': '!SysCalc:qCutList (not supported with 164)', + 'SysCalc:qWeed': '!SysCalc:qWeed (not supported with 164)', + 'SysCalc:tmsList': '!SysCalc:tmsList (not supported with 164)', + 'HEPMCoutput:scaling' : '!HEPMCoutput:scaling (not supported with 164)', + 'LHEFInputs:nSubruns' : 'Main:numberOfSubruns'} + def write(self, output_file, template, read_subrun=False, - print_only_visible=False, direct_pythia_input=False, add_missing=True): + print_only_visible=False, direct_pythia_input=False, add_missing=True, + use_mg5amc_py8_interface=False): """ Write the card to output_file using a specific template. > 'print_only_visible' specifies whether or not the hidden parameters should be written out if they are in the hidden_params_to_always_write @@ -2155,7 +2172,12 @@ def write(self, output_file, template, read_subrun=False, in the self.visible_params_to_always_write list and are not user_set or system_set are commented. > If 'add_missing' is False then parameters that should be written_out but are absent - from the template will not be written out. + from the template will not be written out. + > use_mg5amc_py8_interface is a flag indicating whether the MG5aMC-PY8 interface is used; + if it is not used, some parameters need to be translated from the old convention to the new one + """ + + self.use_mg5amc_py8_interface = use_mg5amc_py8_interface # First list the visible parameters visible_param = [p for p in self if p.lower() not in self.hidden_param @@ -2297,7 +2319,16 @@ def group_params(params): else: # Just copy parameters which don't need to be specified if param.lower() not in self.params_to_never_write: - output.write(line) + + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param.strip()] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + output.write('%s=%s\n'%(param_entry,new_value)) + else: + output.write(line) else: output.write('! The following parameter was forced to be commented out by MG5aMC.\n') output.write('! 
%s'%line) @@ -2313,6 +2344,7 @@ def group_params(params): if ((not direct_pythia_input) or (param.lower() in self.visible_params_to_always_write) or (param.lower() in self.user_set) or + (param.lower() in self.hidden_params_to_always_write) or (param.lower() in self.system_set)): template = '%s=%s' else: @@ -2321,6 +2353,19 @@ def group_params(params): # then they shouldn't be passed to Pythia template = '!%s=%s' + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + if 'Main:InternalAnalysis'.lower() in self.user_set and \ + self['Main:InternalAnalysis'].lower() == 'on': + output.write('InternalAnalysis:output = ./djrs.dat\n') + + #elif param in self.interface_to_164.values() and not direct_pythia_input: + # misc.sprint(use_mg5amc_py8_interface, direct_pythia_input,param) + # raise Exception('The parameter %s is not supported in the MG5aMC-PY8 interface. Please use the new interface.'%param_entry output.write(template%(param_entry, value_entry.replace(value,new_value))) @@ -2365,6 +2410,8 @@ def group_params(params): comment = '\n'.join('! %s'%c for c in self.comments[param.lower()].split('\n')) output.write(comment+'\n') + if not use_mg5amc_py8_interface and param in self.interface_to_164: + continue output.write('%s=%s\n'%(param,PY8Card.pythia8_formatting(self[param]))) # Don't close the file if we were reading a subrun, but simply write @@ -3318,7 +3365,6 @@ def retro_compatible_custom_fct(lines, mode=None): for i,line in enumerate(lines[:]): if search and re.search(include_pat, line): name = re.findall(include_pat, line)[0] - misc.sprint('DETECTED INCLUDE', name) if 'vector.inc' in name: search = False if 'run.inc' in name: @@ -3326,7 +3372,6 @@ def retro_compatible_custom_fct(lines, mode=None): search = False sol.append(line) if re.search(function_pat, line): - misc.sprint("DETECTED FCT") search = True return sol @@ -4050,8 +4095,8 @@ def post_set_fixed_fac_scale(card, value, change_userdefine, raiseerror, **opt): if 'fixed_fac_scale2' in card.user_set: card.user_set.remove('fixed_fac_scale2') - # #card['pdlabel1'] = value - # #card['pdlabel2'] = value + dict.__setitem__(card, 'fixed_fac_scale1', card['fixed_fac_scale']) + dict.__setitem__(card, 'fixed_fac_scale2', card['fixed_fac_scale']) @staticmethod def post_set(card, value, change_userdefine, raiseerror, name='unknown', **opt): @@ -4201,6 +4246,7 @@ def default_setup(self): self.add_param("bwcutoff", 15.0) self.add_param("cut_decays", False, cut='d') self.add_param('dsqrt_shat',0., cut=True) + self.add_param('dsqrt_shatmax', -1, cut=True) self.add_param("nhel", 0, include=False) self.add_param("limhel", 1e-8, hidden=True, comment="threshold to determine if an helicity contributes when not MC over helicity.") #pt cut @@ -4451,11 +4497,11 @@ def check_validity(self): time.sleep(5) if self['drjj'] != 0: if 'drjj' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjj\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjj\' to 0') self['drjj'] = 0 if self['drjl'] != 0: if 'drjl' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjl\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjl\' to 0') self['drjl'] = 0 if not self['auto_ptj_mjj']: if self['mmjj'] > self['xqcut']: @@ -4753,7 +4799,6 @@ def create_default_for_process(self, 
proc_characteristic, history, proc_def): self['fixed_fac_scale1'] = True self['nhel'] = 1 for i in beam_id_split[1]: - exit if abs(i) == 11: self['lpp1'] = -math.copysign(3,i) self['lpp2'] = math.copysign(3,i) @@ -5577,6 +5622,9 @@ def default_setup(self): #technical self.add_param('folding', [1,1,1], include=False) + + #bias + self.add_param('flavour_bias',[5,1], hidden=True, comment="Example: '5,100' means that the probability to generate an event with a bottom (or anti-bottom) quark is increased by a factor 100, but the weight of those events is reduced by a factor 100. Requires that the 'event_norm' is set to 'bias'.") #merging self.add_param('ickkw', 0, allowed=[-1,0,3,4], comment=" - 0: No merging\n - 3: FxFx Merging : http://amcatnlo.cern.ch/FxFx_merging.htm\n - 4: UNLOPS merging (No interface within MG5aMC)\n - -1: NNLL+NLO jet-veto computation. See arxiv:1412.8408 [hep-ph]") @@ -5790,6 +5838,17 @@ def check_validity(self): if self['mcatnlo_delta'] and not self['parton_shower'].lower() == 'pythia8': raise InvalidRunCard("MC@NLO-DELTA only possible with matching to Pythia8") + # check that the flavour_bias is consistent + if len(self['flavour_bias']) != 2: + raise InvalidRunCard("'flavour_bias' should contain exactly two numbers: the abs(PDG) of the flavour to enhance, and the enhancement multiplication factor.") + for i in self['flavour_bias']: + if i < 0: + raise InvalidRunCard("flavour and multiplication factor should be positive in the flavour_bias parameter") + if self['flavour_bias'][1] != 1 and self['event_norm'] != 'bias': + logger.warning('Non-trivial flavour enhancement factor: setting event normalisation to "bias"') + self['event_norm']='bias' + + # check that ebeam is bigger than the proton mass. for i in [1,2]: # do not for proton mass if not proton PDF (or when scan initialization) diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/check_param_card.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/check_param_card.py index bc785b5de6..a34705f6bc 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/check_param_card.py @@ -1092,11 +1092,11 @@ def write_summary(self, path, order=None, lastline=False, nbcol=20): to_print = self.cross[-1:] for info in to_print: name = info['run_name'] - bench = info['bench'] + bench = [float(x) for x in info['bench']] data = [] for k in keys: if k in info: - data.append(info[k]) + data.append(float(info[k])) else: data.append(0.) 
ff.write(formatting % tuple([name] + bench + data)) diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/cluster.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/cluster.py index 95ef45b5f3..66069293d2 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/cluster.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/cluster.py @@ -602,6 +602,7 @@ def __init__(self, *args, **opt): self.submitted = six.moves.queue.Queue() # one entry by job submitted self.stoprequest = threading.Event() #flag to ensure everything to close self.demons = [] + self.gpus_list = [] self.nb_done =0 if 'nb_core' in opt: self.nb_core = opt['nb_core'] @@ -623,23 +624,46 @@ def __init__(self, *args, **opt): self.done_pid_queue = six.moves.queue.Queue() self.fail_msg = None + mg5_gpu_env_str = 'MG5_GPU_VISIBLE_DEVICES' + gpu_variables = [['NVIDIA_VISIBLE_DEVICES', 'CUDA_VISIBLE_DEVICES'], + ['ROCR_VISIBLE_DEVICES', 'HIP_VISIBLE_DEVICES'],] + if mg5_gpu_env_str in os.environ: + new_var = os.environ[mg5_gpu_env_str].split(',') + if len(new_var) == 2: + gpu_variables.insert(0, new_var) + else: + logger.error('Invalid format for %s=%s, it should be a comma-separated list of two elements' % (mg5_gpu_env_str, os.environ[mg5_gpu_env_str])) + for get_var,set_var in gpu_variables: + if get_var in os.environ: + self.gpus_list = os.environ.get(get_var).split(',') + self.gpu_set_var = set_var + self.gpus_count = len(self.gpus_list) + logger.info('Found %s GPUs: %s' % (self.gpus_count, self.gpus_list)) def start_demon(self): import threading - t = threading.Thread(target=self.worker) + env2 = None + if len(self.gpus_list): + env2 = os.environ.copy() + this_gpu_idx = len(self.demons) % self.gpus_count + env2[self.gpu_set_var] = self.gpus_list[this_gpu_idx] + t = threading.Thread(target=self.worker, kwargs={'env2': env2}) + else: + t = threading.Thread(target=self.worker) t.daemon = True t.start() self.demons.append(t) - def worker(self): + def worker(self, env2=None): import six.moves.queue import six.moves._thread while not self.stoprequest.isSet(): try: args = self.queue.get(timeout=10) tag, exe, arg, opt = args + opt['env'] = env2 try: # check for executable case if isinstance(exe,str): diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/common_run_interface.py index 9ff7390cf5..69291df0d4 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/common_run_interface.py @@ -750,8 +750,8 @@ def __init__(self, me_dir, options, *args, **opts): else: self.ninitial = self.proc_characteristics['ninitial'] - def make_make_all_html_results(self, folder_names = [], jobs=[]): - return sum_html.make_all_html_results(self, folder_names, jobs) + def make_make_all_html_results(self, folder_names = [], jobs=[], get_attr=None): + return sum_html.make_all_html_results(self, folder_names, jobs, get_attr) def write_RunWeb(self, me_dir): @@ -1463,11 +1463,15 @@ def create_plot(self, mode='parton', event_path=None, output=None, tag=None): self.run_name, '%s_pts.dat' % tag) for observable_name, data_path in [('djr',djr_path), ('pt',pt_path)]: - if not self.generate_Pythia8_HwU_plots( + try: + if not self.generate_Pythia8_HwU_plots( PY8_plots_root_path, merging_scale_name, observable_name,data_path): - return False - + return False + except Exception as error: + if os.path.exists(data_path): + logger.info('plot information present in %s' % data_path) + return True if mode == 'Pythia8': plot_files = 
glob.glob(pjoin(PY8_plots_root_path,'*.gnuplot')) if not misc.which('gnuplot'): @@ -1964,12 +1968,16 @@ def do_systematics(self, line): self.cluster.wait(os.path.dirname(output), update_status, update_first=update_status) except Exception: self.cluster.remove() + for i in range(nb_submit): + os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) old_run_mode = self.options['run_mode'] self.options['run_mode'] =0 + out =False try: out = self.do_systematics(line) finally: self.options['run_mode'] = old_run_mode + return out #collect the data all_cross = [] for i in range(nb_submit): @@ -1995,18 +2003,21 @@ def do_systematics(self, line): self.run_card['event_norm'] in ['unity']: all_cross= [cross/nb_event for cross in all_cross] - sys_obj = systematics.call_systematics([input, None] + opts, - log=lambda x: logger.info(str(x)), - result=result_file, - running=False - ) + + sys_obj = systematics.call_systematics([input, None] + opts, + log=lambda x: logger.info(str(x)), + result=result_file, + running=False + ) + sys_obj.print_cross_sections(all_cross, nb_event, result_file) - + #concatenate the output file subprocess.call(['cat']+\ ['./tmp_%s_%s' % (i, os.path.basename(output)) for i in range(nb_submit)], stdout=open(output,'w'), cwd=os.path.dirname(output)) + for i in range(nb_submit): os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) # os.remove('%s/log_sys_%s.txt' % (os.path.dirname(output),i)) @@ -3831,7 +3842,7 @@ def store_scan_result(self): """return the information that need to be kept for the scan summary. Auto-width are automatically added.""" - return {'cross': self.results.current['cross']} + return {'cross': self.results.current['cross'], 'error': self.results.current['error']} def add_error_log_in_html(self, errortype=None): @@ -5135,10 +5146,10 @@ def init_run(self, cards): self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), - 'lhc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), - 'lcc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), @@ -6740,7 +6751,15 @@ def postcmd(self, stop, line): return ending_question - + def help_update(self): + logger.info(""" syntax: update dependent: Change the mass/width of particles which are not free parameter for the model. + update missing: add to the current param_card missing blocks/parameters. + update to_slha1: pass SLHA2 card to SLHA1 convention. (beta) + update to_slha2: pass SLHA1 card to SLHA2 convention. 
(beta) + update to_full [run_card] + update XXX [where XXX correspond to a hidden block of the run_card]: + supported block are %s + """, ', '.join(self.update_block)) def do_update(self, line, timer=0): @@ -6756,6 +6775,8 @@ def do_update(self, line, timer=0): logger.warning('miss an argument (dependent or missing). Please retry') return + args[0] = args[0].lower() + if args[0] == 'dependent': if not self.mother_interface: logger.warning('Failed to update dependent parameter. This might create trouble for external program (like MadSpin/shower/...)') @@ -6805,10 +6826,11 @@ def do_update(self, line, timer=0): self.modified_card.add('run') # delay writting of the run_card logger.info('add optional block %s to the run_card', args[0]) else: - self.help_update() + self.do_help('update') logger.warning('unvalid options for update command. Please retry') + def update_to_full(self, line): """ trigger via update to_full LINE""" diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/extended_cmd.py index 789976beee..c321fd88e5 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/extended_cmd.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/extended_cmd.py @@ -1317,6 +1317,8 @@ def nice_error_handling(self, error, line): debug_file = open(self.debug_output, 'a') traceback.print_exc(file=debug_file) + if __debug__: + traceback.print_exc() if hasattr(error, 'filename'): debug_file.write("Related File: %s\n" % error.filename) # Create a nice error output @@ -1928,7 +1930,8 @@ def do_display(self, line, output=sys.stdout): for i, name in enumerate(split): try: __import__('.'.join(split[:i+1])) - exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1]))) + tmp = {} + exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1])), globals(),tmp) except ImportError: try: var = eval(args[1]) @@ -1939,7 +1942,7 @@ def do_display(self, line, output=sys.stdout): outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) else: - var = eval(args[1]) + var = eval(args[1], globals(), tmp) outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/file_writers.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/file_writers.py index 526756129f..74ba0d195c 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/file_writers.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/file_writers.py @@ -140,10 +140,6 @@ def preprocess_template(self, input_lines, context={}): else: raise self.FileWriterError("%s not string" % repr(input_lines)) - # Setup the contextual environment - for contextual_variable, value in context.items(): - exec('%s=%s'%(str(contextual_variable),repr(value))) - res = [] # The variable below tracks the conditional statements structure if_stack = [] @@ -166,7 +162,7 @@ def preprocess_template(self, input_lines, context={}): # Treat an if statement elif preproc_command.group('command')=='if': try: - if_stack.append(eval(preproc_command.group('body'))==True) + if_stack.append(eval(preproc_command.group('body'), globals(), context)==True) except Exception as e: raise self.FilePreProcessingError('Could not evaluate'+\ "python expression '%s' given the context %s provided."%\ diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/files.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/files.py index 551b71ddb6..3061b007e7 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/files.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/files.py 
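The extended_cmd.py and file_writers.py hunks above share one pattern: contextual variables are no longer exec()-ed into the local scope (an idiom that Python 3 does not reliably reflect in locals() inside a function); instead they are kept in an explicit dict that is passed to eval() as its namespace. A minimal sketch with invented variable names:

    # explicit namespace for the template condition
    context = {'helas_calls': 12, 'madloop': False}

    # old, fragile style: exec("helas_calls = 12") then eval("helas_calls > 4")
    # new style: hand the context dict to eval() directly
    result = eval('helas_calls > 4 and not madloop', globals(), context)
    print(result)  # True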
@@ -147,9 +147,14 @@ def cp(path1, path2, log=True, error=False): path2 = format_path(path2) try: shutil.copy(path1, path2) + except shutil.Error as why: + logger.debug('no cp since identical: %s', why) + return except IOError as why: import madgraph.various.misc as misc try: + if 'same file' in str(why): + return if os.path.exists(path2): path2 = os.path.join(path2, os.path.split(path1)[1]) misc.copytree(path1, path2) @@ -157,12 +162,10 @@ def cp(path1, path2, log=True, error=False): if error: raise if log: - logger.warning(why) + logger.warning("fail to cp %s %s: %s", path1, path2, why) else: - misc.sprint("fail to cp", why) - except shutil.Error: - # idetical file - pass + misc.sprint("fail to cp",path1,path2, why) + def rm(path, log=True): """removes path, that can be a single element or a list""" diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_cardhtml-pl b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_cardhtml-pl index 1810c6c082..6e0e06533d 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_cardhtml-pl +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_cardhtml-pl @@ -137,7 +137,7 @@ until($listpos>$#incard){ print PAGE " Model: $model \n"; print PAGE " \n \n
\n"; print PAGE " \n"; - print PAGE "\"\" \n"; + print PAGE "\"\" \n"; print PAGE "
\n"; print PAGE " \n \n \n"; print PAGE " \n"; diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_crossxhtml.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_crossxhtml.py index 681bf9d09b..3114a4350c 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_crossxhtml.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_crossxhtml.py @@ -133,7 +133,7 @@ class AllResults(dict): web = False - _run_entries = ['cross', 'error','nb_event_pythia','run_mode','run_statistics', + _run_entries = ['cross', 'error','axsec','nb_event_pythia','run_mode','run_statistics', 'nb_event','cross_pythia','error_pythia', 'nb_event_pythia8','cross_pythia8','error_pythia8', 'shower_dir'] diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_jpeg-pl b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_jpeg-pl index 87d03da394..31b7e9fe55 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_jpeg-pl +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_jpeg-pl @@ -1,16 +1,16 @@ #!/usr/bin/perl -w #--------------------------------------------------------------------- -# Run GS to create jpeg files defined as $gs +# Run GS to create PNG files defined as $gs #--------------------------------------------------------------------- -system("/bin/bash -c \"rm -f matrix*.jpg\" "); +system("/bin/bash -c \"rm -f matrix*.png\" "); $imatrix = ""; if (! -e "matrix.ps") {$imatrix = 1;} -$max_jpg = 2; -if ($imatrix eq "") {$max_jpg = 5;} -# add 1 to max_jpg, to get max_jpg pages -$max_jpg += 1; +$max_png = 2; +if ($imatrix eq "") {$max_png = 5;} +# add 1 to max_png, to get max_png pages +$max_png += 1; open(PAGE,"> diagrams.html") || die "Error creating diagrams.html"; print PAGE "\ \n"; print PAGE "\ \n"; @@ -21,22 +21,22 @@ while ( -e "matrix$imatrix.ps"){ open(IN, "< matrix$imatrix.ps") || die "No file matrix$imatrix.ps"; open(OUT, "> matrix-1.ps") || die "Could not open file matrix-1.ps"; while () { - if ($_ =~ m/^%%Page: $max_jpg $max_jpg/) {last;} + if ($_ =~ m/^%%Page: $max_png $max_png/) {last;} else {print OUT $_, "\n";} } close(OUT); close(IN); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=matrix$imatrix\%00d.jpg \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-r150 \-sOutputFile\=matrix$imatrix\%00d.png \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; system "rm -f matrix-1.ps"; -# Determine how many jpg files we have +# Determine how many png files we have $pages=1; - while(-e "matrix$imatrix$pages.jpg"){ + while(-e "matrix$imatrix$pages.png"){ $pages++; }#end of while #reduce it by one - if ($pages > $max_jpg){ + if ($pages > $max_png){ $pages -= 1; } # Find name of process @@ -45,24 +45,24 @@ while ( -e "matrix$imatrix.ps"){ if ($proc =~ /Process: (.+?)(\s\w+=\d+)*$/) { $proc = $1; } print PAGE "

-    if (-e "matrix$imatrix$max_jpg.jpg" ) {
-	print PAGE "<BR><BR> To save bandwidth not all diagrams were converted to jpeg.";
+    if (-e "matrix$imatrix$max_png.png" ) {
+	print PAGE "<BR><BR> To save bandwidth not all diagrams were converted to PNG.";
 	print PAGE "<BR><BR> 
To view all diagrams click on "; print PAGE "\ postscript. \<\/A\> \ \n"; # # Delete files which aren't included in diagrams.html # - system ("/bin/bash -c \"rm -f matrix$max_jpg.jpg\" "); + system ("/bin/bash -c \"rm -f matrix$max_png.png\" "); } # -# Now create jpeg file for card +# Now create PNG file for card # - if (! -e "../../HTML/card.jpg") { + if (! -e "../../HTML/card.png") { system ("/bin/bash -c \"head -352 matrix$imatrix.ps >& junk.ps\" "); open(JUNK,">> junk.ps") || die "Error opening junk.ps"; @@ -72,7 +72,7 @@ while ( -e "matrix$imatrix.ps"){ system ("/bin/bash -c \"cat matrix$imatrix.ps | sed 1,352d >> junk.ps\" "); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=card.jpg \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.jpg ../../HTML/card.jpg > /dev/null\" "; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-sOutputFile\=card.png \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.png ../../HTML/card.png > /dev/null\" "; } if ($imatrix eq "") {$imatrix = 0;} $imatrix = $imatrix + 1; @@ -82,3 +82,4 @@ print PAGE "\n"; print PAGE "\<\/BODY\> \n"; print PAGE "\<\/HTML\> \n"; close(PAGE); + diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_ximprove.py index 415ecc9de0..d5d7fc8faf 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/gen_ximprove.py @@ -30,6 +30,7 @@ import stat import sys import six +import time from six.moves import range from six.moves import zip @@ -304,6 +305,7 @@ def get_helicity(self, to_submit=True, clean=True): logger.debug('(%s) nb_hel: %s zero amp: %s bad_amps_hel: %s/%s', split_file[-1], len(good_hels),len(bad_amps),len(bad_amps_perhel), len(good_hels)*nb_amp ) if len(good_hels) == 1: files.cp(matrix_file, matrix_file.replace('orig','optim')) + files.cp(matrix_file.replace('.f','.o'), matrix_file.replace('orig','optim').replace('.f','.o')) continue # avoid optimization if onlye one helicity gauge = self.cmd.proc_characteristics['gauge'] @@ -1059,6 +1061,7 @@ def __init__(self, cmd, opt=None): # parameter for the gridpack run self.nreq = 2000 self.iseed = 4321 + self.maxevts = 2500 # placeholder for information self.results = 0 #updated in launch/update_html @@ -1200,6 +1203,10 @@ def reset_multijob(self): def write_multijob(self, Channel, nb_split): """ """ if nb_split <=1: + try: + os.remove(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat')) + except OSError: + pass return f = open(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat'), 'w') f.write('%i\n' % nb_split) @@ -1828,17 +1835,17 @@ class gen_ximprove_gridpack(gen_ximprove_v4): max_request_event = 1e12 # split jobs if a channel if it needs more than that max_event_in_iter = 4000 min_event_in_iter = 500 - combining_job = sys.maxsize gen_events_security = 1.00 - def __new__(cls, *args, **opts): + def __new__(cls, cmd, opts): cls.force_class = 'gridpack' - return super(gen_ximprove_gridpack, cls).__new__(cls, *args, **opts) + return super(gen_ximprove_gridpack, cls).__new__(cls, cmd, opts) - def __init__(self, *args, **opts): + def __init__(self, cmd, opts): self.ngran = -1 + self.nprocs = 1 self.gscalefact = {} self.readonly = False if 'ngran' in opts: @@ -1846,9 +1853,18 @@ def __init__(self, *args, **opts): # del opts['ngran'] if 'readonly' in opts: self.readonly = opts['readonly'] - super(gen_ximprove_gridpack,self).__init__(*args, **opts) + if 
'nprocs' in opts: + self.nprocs = int(opts['nprocs']) + if 'maxevts' in opts and self.nprocs > 1: + self.max_request_event = int(opts['maxevts']) + super(gen_ximprove_gridpack,self).__init__(cmd, opts) if self.ngran == -1: self.ngran = 1 + + if self.nprocs > 1: + self.combining_job = 0 + else: + self.combining_job = sys.maxsize def find_job_for_event(self): """return the list of channel that need to be improved""" @@ -1876,8 +1892,8 @@ def find_job_for_event(self): continue # no event to generate events self.gscalefact[tag] = max(1, 1/(goal_lum * C.get('axsec')/ self.ngran)) #need to generate events - logger.debug('request events for ', C.get('name'), 'cross=', - C.get('axsec'), 'needed events = ', goal_lum * C.get('axsec')) + logger.debug('request events for %s cross=%d needed events = %d', + C.get('name'), C.get('axsec'), goal_lum * C.get('axsec')) to_refine.append(C) logger.info('need to improve %s channels' % len(to_refine)) @@ -1897,8 +1913,13 @@ def get_job_for_event(self): for C in to_refine: #1. Compute the number of points are needed to reach target needed_event = max(goal_lum*C.get('axsec'), self.ngran) - nb_split = 1 - + nb_split = int(max(1,((needed_event-1)// self.max_request_event) +1)) + if not self.split_channels: + nb_split = 1 + if nb_split > self.max_splitting: + nb_split = self.max_splitting + nb_split=max(1, nb_split) + #2. estimate how many points we need in each iteration if C.get('nunwgt') > 0: nevents = needed_event / nb_split * (C.get('nevents') / C.get('nunwgt')) @@ -1908,13 +1929,16 @@ def get_job_for_event(self): nevents = self.max_event_in_iter if nevents < self.min_event_in_iter: + nb_split = int(nb_split * nevents / self.min_event_in_iter) + 1 # sr dangerous? nevents = self.min_event_in_iter # # forbid too low/too large value nevents = max(self.min_event_in_iter, min(self.max_event_in_iter, nevents)) logger.debug("%s : need %s event. Need %s split job of %s points", C.name, needed_event, nb_split, nevents) - + # write the multi-job information + self.write_multijob(C, nb_split) + #create the info dict assume no splitting for the default info = {'name': self.cmd.results.current['run_name'], 'script_name': 'unknown', @@ -1925,7 +1949,7 @@ def get_job_for_event(self): 'nevents': nevents, #int(nevents*self.gen_events_security)+1, 'maxiter': self.max_iter, 'miniter': self.min_iter, - 'precision': -1*int(needed_event)/C.get('axsec'), + 'precision': -goal_lum/nb_split, # -1*int(needed_event)/C.get('axsec'), 'requested_event': needed_event, 'nhel': self.run_card['nhel'], 'channel': C.name.replace('G',''), @@ -1938,27 +1962,59 @@ def get_job_for_event(self): basedir = pjoin(os.path.dirname(__file__), '..','..','SubProcesses', info['P_dir'], info['directory']) info['base_directory'] = basedir - jobs.append(info) - + if nb_split == 1: + jobs.append(info) + else: + for i in range(nb_split): + new_info = dict(info) + new_info['offset'] = i+1 + new_info['directory'] += self.alphabet[i % 26] + str((i+1)//26) + new_info['base_directory'] = info['directory'] + jobs.append(new_info) write_dir = '.' 
if self.readonly else None self.create_ajob(pjoin(self.me_dir, 'SubProcesses', 'refine.sh'), jobs, write_dir) + if self.nprocs > 1: + nprocs_cluster = cluster.MultiCore(nb_core=self.nprocs) + gridpack_start = time.time() + def gridpack_wait_monitoring(Idle, Running, Done): + if Idle+Running+Done == 0: + return + logger.info("Gridpack event generation: %s Idle, %s Running, %s Done [%s]" + % (Idle, Running, Done, misc.format_time(time.time()-gridpack_start))) + done = [] for j in jobs: - if j['P_dir'] in done: - continue - done.append(j['P_dir']) + if self.nprocs == 1: + if j['P_dir'] in done: + continue + done.append(j['P_dir']) + # Give a little status. Sometimes these jobs run very long, and having hours without any + # console output can be a bit frightening and make users think we are looping. + if len(done)%5==0: + logger.info(f"Working on job {len(done)} of {len(jobs)}") + # set the working directory path. pwd = pjoin(os.getcwd(),j['P_dir']) if self.readonly else pjoin(self.me_dir, 'SubProcesses', j['P_dir']) - exe = pjoin(pwd, 'ajob1') + exe = pjoin(pwd, j['script_name']) st = os.stat(exe) os.chmod(exe, st.st_mode | stat.S_IEXEC) # run the code\ - cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + if self.nprocs == 1: + cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + else: + nprocs_cluster.cluster_submit(exe, cwd=pwd, packet_member=j['packet']) write_dir = '.' if self.readonly else pjoin(self.me_dir, 'SubProcesses') + if self.nprocs > 1: + nprocs_cluster.wait(self.me_dir, gridpack_wait_monitoring) + + if self.readonly: + combine_runs.CombineRuns(write_dir) + else: + combine_runs.CombineRuns(self.me_dir) self.check_events(goal_lum, to_refine, jobs, write_dir) def check_events(self, goal_lum, to_refine, jobs, Sdir): diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/hel_recycle.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/hel_recycle.py index 1471de4bcb..978ba6575e 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/hel_recycle.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/hel_recycle.py @@ -550,7 +550,7 @@ def get_jamp_lines(self, line): def get_amp2_lines(self, line): if line.startswith(' DO I = 1, NCOLOR'): self.in_amp2 = False - elif not line.isspace(): + elif not line.isspace() and 'DENOM' not in line: self.template_dict['amp2_lines'] += f'{line[0:6]} {self.add_indices(line[6:])}' def prepare_bools(self): diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/histograms.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/histograms.py index 51ae2914fc..6fbdd98100 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/histograms.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/histograms.py @@ -1149,11 +1149,8 @@ def parse_one_histo_from_stream(self, stream, all_weight_header, boundaries = [0.0,0.0] for j, weight in \ enumerate(HwU.histo_bin_weight_re.finditer(line_bin)): - if (j == len(weight_header)): - continue - if j == len(all_weight_header): - raise HwU.ParseError("There is more bin weights"+\ - " specified than expected (%i)"%len(weight_header)) + #if (j == len(weight_header)): + # continue if selected_central_weight == all_weight_header[j]: bin_weights['central'] = float(weight.group('weight')) if all_weight_header[j] == 'boundary_xmin': @@ -1858,6 +1855,8 @@ def parse_histos_from_PY8_XML_stream(self, stream, run_id=None, # If merging cut is negative, then pick only the one of the central scale # If not specified, then take them all but use the PDF and scale weight # of the central merging_scale for the 
variation. + if not all_weights: + raise MadGraph5Error('No weights were found in the HwU XML source.') if merging_scale is None or merging_scale < 0.0: merging_scale_chosen = all_weights[2]['MERGING'] else: @@ -2480,14 +2479,14 @@ def get_main_central_plot_lines(HwU_name, block_position, color_index, # return [template_no_stat%rep_dic]+\ # ([template%rep_dic] if show_mc_uncertainties else []) - # The use of sqrt(-1) is just a trick to prevent the line to display + # The use of 1/0 is just a trick to prevent the line to display res = [] - rep_dic['data'] = '($3 < 0 ? sqrt(-1) : $3)' + rep_dic['data'] = '($3 < 0 ? 1/0 : $3)' res.append(template_no_stat%rep_dic) rep_dic['title'] = " title ''" if show_mc_uncertainties: res.append(template%rep_dic) - rep_dic['data'] = '($3 >= 0 ? sqrt(-1) : abs($3))' + rep_dic['data'] = '($3 >= 0 ? 1/0 : abs($3))' rep_dic['ls'] = ' ls %d'%(100+color_index) res.append(template_no_stat%rep_dic) if show_mc_uncertainties: @@ -2739,13 +2738,13 @@ def ratio_no_correlations(wgtsA, wgtsB): """#-- rendering subhistograms '%(subhistogram_type)s' %(unset label)s %(set_format_y)s +%(set_yscale)s set yrange [%(ymin).4e:%(ymax).4e] set origin %(origin_x).4e, %(origin_y).4e set size %(size_x).4e, %(size_y).4e set mytics %(mytics)d %(set_ytics)s %(set_format_x)s -%(set_yscale)s %(set_ylabel)s %(set_histo_label)s plot \\""" @@ -2878,7 +2877,7 @@ def ratio_no_correlations(wgtsA, wgtsB): # We decide to show uncertainties in the main plot only if they # are part of a monocolor band. Otherwise, they will only be - # shown in the first subplot. Notice that plotting 'sqrt(-1)' + # shown in the first subplot. Notice that plotting '1/0' # is just a trick so as to have only the key printed with no # line @@ -2890,7 +2889,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, scale variation'%title, band='scale' in use_band) else: uncertainty_plot_lines[-1]['scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] # And now PDF_variation if available if not PDF_var_pos is None and len(PDF_var_pos)>0: if 'pdf' in use_band: @@ -2899,7 +2898,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, PDF variation'%title, band='pdf' in use_band) else: uncertainty_plot_lines[-1]['pdf'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] # And now merging variation if available if not merging_var_pos is None and len(merging_var_pos)>0: if 'merging_scale' in use_band: @@ -2908,7 +2907,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, merging scale variation'%title, band='merging_scale' in use_band) else: uncertainty_plot_lines[-1]['merging_scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] # And now alpsfact variation if available if not alpsfact_var_pos is None and len(alpsfact_var_pos)>0: if 'alpsfact' in use_band: @@ -2917,7 +2916,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, alpsfact variation'%title, band='alpsfact' in use_band) else: uncertainty_plot_lines[-1]['alpsfact'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] # plot_lines.append( # "'%s' index %d using (($1+$2)/2):3 ls %d title '%s'"\ diff --git 
a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/launch_plugin.py index 0924927785..262d39a736 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/launch_plugin.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Aug 2023) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. import logging import os @@ -33,7 +33,7 @@ def compile(self, *args, **opts): if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') common_run_interface.CommonRunCmd.update_make_opts_full(path, - {'FPTYPE': self.run_card['floating_type'] }) + {'override FPTYPE': self.run_card['floating_type'] }) misc.sprint('FPTYPE checked') cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): @@ -76,7 +76,7 @@ def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + common_run_interface.CommonRunCmd.update_make_opts_full({'override FPTYPE': new_value}) else: raise Exception Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') @@ -133,7 +133,8 @@ def default_setup(self): super().default_setup() # change default value: self['cudacpp_backend'] = 'cuda' - self['vector_size'] = 16384 # already setup in default class (just change value) + self['vector_size'] = 32 # ZW: default to 32, might want to change to 64 to utilise AMD GPUs better as well # 16384 # already setup in default class (just change value) + self['nb_warp'] = 512 # number of warps per kernel call, for now setting to 16 384 / vector_size MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/lhe_parser.py index f6e47956cd..81d17f7cb1 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/lhe_parser.py @@ -1035,12 +1035,12 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): from_init = True if not from_init: - if group in grouped_cross: - grouped_cross[group] += self.allcross[i] - grouped_error[group] += self.error[i]**2 + if int(group) in grouped_cross: + grouped_cross[int(group)] += self.allcross[i] + grouped_error[int(group)] += self.error[i]**2 else: - grouped_cross[group] = self.allcross[i] - grouped_error[group] = self.error[i]**2 + grouped_cross[int(group)] = self.allcross[i] + grouped_error[int(group)] = self.error[i]**2 else: ban = banner_mod.Banner(ff.banner) for line in ban['init'].split('\n'): @@ -1048,11 +1048,11 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): if len(splitline)==4: cross, error, _, group = splitline if int(group) in grouped_cross: - grouped_cross[group] += float(cross) - grouped_error[group] += float(error)**2 + grouped_cross[int(group)] += float(cross) + grouped_error[int(group)] += float(error)**2 else: - 
grouped_cross[group] = float(cross) - grouped_error[group] = float(error)**2 + grouped_cross[int(group)] = float(cross) + grouped_error[int(group)] = float(error)**2 nb_group = len(grouped_cross) # compute the information for the first line @@ -1086,6 +1086,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): self.seek(0) if init_information["idbmup2"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup2"] = event[1].pdg self.seek(0) @@ -2166,10 +2168,13 @@ def check(self): abspz += abs(particle.pz) # check mass fourmass = FourMomentum(particle).mass - - if particle.mass and (abs(particle.mass) - fourmass)/ abs(particle.mass) > threshold: - raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) - + if particle.mass: + expected = (particle.E - math.sqrt(particle.E**2 -particle.mass**2))/particle.E + if expected > 1e-8: + mass_threshold = particle.E**2 - (particle.E-threshold)**2 + if (abs(particle.mass) - fourmass)/ mass_threshold > 5: + raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) + if E/absE > threshold: logger.critical(self) @@ -2953,8 +2958,8 @@ def pt(self): @property def pseudorapidity(self): - norm = math.sqrt(self.px**2 + self.py**2+self.pz**2) - return 0.5* math.log((norm - self.pz) / (norm + self.pz)) + norm = math.sqrt(self.px**2 + self.py**2 + self.pz**2) + return 0.5* math.log((norm + self.pz) / (norm - self.pz)) @property def rapidity(self): diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/madevent_interface.py index 85e5bcf5e3..4d5597c722 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/madevent_interface.py @@ -1171,10 +1171,10 @@ def check_survey(self, args, cmd='survey'): for opt,value in self._survey_options.items(): if arg.startswith('--%s=' % opt): exec('self.opts[\'%s\'] = %s(arg.split(\'=\')[-1])' % \ - (opt, value[0])) + (opt, value[0]), globals(), {'self':self, 'arg':arg}) arg = "" if arg != "": raise Exception - except Exception: + except Exception as error: self.help_survey() raise self.InvalidCmd('invalid %s argument'% arg) @@ -2827,10 +2827,10 @@ def print_results_in_shell(self, data): logger.info(" Nb of events after matching/merging : %d" % int(data['nb_event_pythia'])) if self.run_card['use_syst'] in self.true and \ (int(self.run_card['ickkw'])==1 or self.run_card['ktdurham']>0.0 - or self.run_card['ptlund']>0.0): + or self.run_card['ptlund']>0.0) and data['cross_pythia'] == -1: logger.info(" Notice that because Systematics computation is turned on, the merging did not veto events but modified their weights instead.\n"+\ " The resulting hepmc/stdhep file should therefore be use with those weights.") - else: + elif data['cross_pythia'] == -1: logger.info(" Nb of events after merging : %s" % data['nb_event_pythia']) logger.info(" " ) @@ -3055,6 +3055,7 @@ def do_multi_run(self, line): crossoversig = 0 inv_sq_err = 0 nb_event = 0 + madspin = False for i in range(nb_run): self.nb_refine = 0 self.exec_cmd('generate_events %s_%s -f' % (main_name, i), postcmd=False) @@ -3067,6 +3068,8 @@ def do_multi_run(self, line): inv_sq_err+=1.0/error**2 self.results[main_name][-1]['cross'] = crossoversig/inv_sq_err 
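The pseudorapidity fix above swaps the sign in the log argument: the old (norm - pz)/(norm + pz) ordering returned -eta, so a forward-going particle (pz > 0) came out backward. A minimal standalone sketch of the corrected formula, cross-checked against the equivalent -ln(tan(theta/2)) definition (the momentum components below are arbitrary illustrative values, not from the patch):

import math

def pseudorapidity(px, py, pz):
    # eta = 0.5 * ln((|p| + pz) / (|p| - pz)), i.e. -ln(tan(theta/2))
    norm = math.sqrt(px**2 + py**2 + pz**2)
    return 0.5 * math.log((norm + pz) / (norm - pz))

# a forward-going momentum (pz > 0) must now give eta > 0
px, py, pz = 1.0, 2.0, 3.0
theta = math.acos(pz / math.sqrt(px**2 + py**2 + pz**2))
assert pseudorapidity(px, py, pz) > 0
assert abs(pseudorapidity(px, py, pz) + math.log(math.tan(theta / 2))) < 1e-12
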
self.results[main_name][-1]['error'] = math.sqrt(1.0/inv_sq_err) + if 'decayed' in self.run_name: + madspin = True self.results.def_current(main_name) self.run_name = main_name self.update_status("Merging LHE files", level='parton') @@ -3074,9 +3077,12 @@ def do_multi_run(self, line): os.mkdir(pjoin(self.me_dir,'Events', self.run_name)) except Exception: pass - os.system('%(bin)s/merge.pl %(event)s/%(name)s_*/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' + + os.system('%(bin)s/merge.pl %(event)s/%(name)s_*%(madspin)s/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' % {'bin': self.dirbin, 'event': pjoin(self.me_dir,'Events'), - 'name': self.run_name}) + 'name': self.run_name, + 'madspin': '_decayed_*' if madspin else '' + }) eradir = self.options['exrootanalysis_path'] if eradir and misc.is_executable(pjoin(eradir,'ExRootLHEFConverter')): @@ -3656,9 +3662,11 @@ def do_refine(self, line): else: self.refine_mode = "new" - cross, error = self.make_make_all_html_results() + cross, error, across = self.make_make_all_html_results(get_attr=('xsec','xerru','axsec')) + self.results.add_detail('cross', cross) self.results.add_detail('error', error) + self.results.add_detail('axsec', across) self.results.add_detail('run_statistics', dict(self.results.get_detail('run_statistics'))) @@ -3757,6 +3765,8 @@ def split(a, n): k, m = divmod(len(a), n) return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + Gdirs = self.remove_empty_events(Gdirs) + partials_info = [] if len(Gdirs) >= max_G: start_unweight= time.perf_counter() @@ -3786,7 +3796,7 @@ def split(a, n): for i, local_G in enumerate(split(Gdirs, nb_chunk)): line = [pjoin(self.me_dir, "Events", self.run_name, "partials%d.lhe.gz" % i)] line.append(pjoin(self.me_dir, 'Events', self.run_name, '%s_%s_banner.txt' % (self.run_name, tag))) - line.append(str(self.results.current['cross'])) + line.append(str(self.results.current.get('axsec'))) line += local_G partials_info.append(self.do_combine_events_partial(' '.join(line), preprocess_only=True)) mycluster.submit(sys.executable, @@ -4223,7 +4233,7 @@ def mg5amc_py8_interface_consistency_warning(options): return None - def setup_Pythia8RunAndCard(self, PY8_Card, run_type): + def setup_Pythia8RunAndCard(self, PY8_Card, run_type, use_mg5amc_py8_interface): """ Setup the Pythia8 Run environment and card. In particular all the process and run specific parameters of the card are automatically set here. This function returns the path where HEPMC events will be output, if any.""" @@ -4338,10 +4348,10 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.systemSet('Beams:setProductionScalesFromLHEF',True) # Automatically set qWeed to xqcut if not defined by the user. 
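For reference, a standalone sketch of the conditional '_decayed_*' infix that do_multi_run now splices into the merge.pl glob when MadSpin-decayed runs are detected (the directory and run names below are hypothetical placeholders):

from os.path import join as pjoin

events_dir, run_name = '/tmp/Events', 'run_01'  # hypothetical placeholders
for madspin in (False, True):
    pattern = pjoin(events_dir,
                    '%s_*%s' % (run_name, '_decayed_*' if madspin else ''),
                    'unweighted_events.lhe.gz')
    print(pattern)
# -> /tmp/Events/run_01_*/unweighted_events.lhe.gz
# -> /tmp/Events/run_01_*_decayed_*/unweighted_events.lhe.gz
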
- if PY8_Card['SysCalc:qWeed']==-1.0: + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qWeed']==-1.0: PY8_Card.MadGraphSet('SysCalc:qWeed',self.run_card['xqcut'], force=True) - if PY8_Card['SysCalc:qCutList']=='auto': + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qCutList']=='auto': if self.run_card['use_syst']: if self.run_card['sys_matchscale']=='auto': qcut = PY8_Card['JetMatching:qCut'] @@ -4368,7 +4378,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): # Specific MLM settings # PY8 should not implement the MLM veto since the driver should do it # if merging scale variation is turned on - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('JetMatching:doVeto',False) @@ -4444,7 +4454,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.MadGraphSet('SpaceShower:pTmaxMatch',1) PY8_Card.MadGraphSet('SpaceShower:rapidityOrder',False) # PY8 should not implement the CKKW veto since the driver should do it. - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('Merging:applyVeto',False) @@ -4516,6 +4526,12 @@ def do_pythia8(self, line): else: no_default = False + if '--old_interface' in args: + use_mg5amc_py8_interface = True + args.remove('--old_interface') + else: + use_mg5amc_py8_interface = False + if not self.run_name: self.check_pythia8(args) self.configure_directory(html_opening =False) @@ -4545,20 +4561,27 @@ def do_pythia8(self, line): #"Please use 'event_norm = average' in the run_card to avoid this problem.") - - if not self.options['mg5amc_py8_interface_path'] or not \ - os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface')): - raise self.InvalidCmd( -"""The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. -Please install this tool with the following MG5_aMC command: - MG5_aMC> install mg5amc_py8_interface_path""") + if use_mg5amc_py8_interface: + if not self.options['mg5amc_py8_interface_path'] or not \ + os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface')): + raise self.InvalidCmd( + """The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. + Please install this tool with the following MG5_aMC command: + MG5_aMC> install mg5amc_py8_interface_path""") + else: + pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface') + warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) + if warnings: + logger.warning(warnings) else: - pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface') - warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) - if warnings: - logger.warning(warnings) + pythia_main = pjoin(self.options['pythia8_path'], 'share', 'Pythia8', 'examples', 'main164') + if not os.path.exists(pythia_main): + pythia_main = pjoin(self.options['pythia8_path'], 'examples', 'main164') + if not os.path.exists(pythia_main): + logger.warning('main164 not found (or not compiled). 
Will try the old interface instead.') + return self.do_pythia8(line + ' --old_interface') self.results.add_detail('run_mode', 'madevent') @@ -4583,14 +4606,19 @@ def do_pythia8(self, line): run_type = 'CKKW' # Edit the card and run environment according to the run specification - HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type) + HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type, use_mg5amc_py8_interface=use_mg5amc_py8_interface) + + if not use_mg5amc_py8_interface and self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + PY8_Card['Main:numberOfEvents']= self.run_card['nevents'] + # Now write the card. pythia_cmd_card = pjoin(self.me_dir, 'Events', self.run_name , '%s_pythia8.cmd' % tag) cmd_card = StringIO.StringIO() PY8_Card.write(cmd_card,pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Now setup the preamble to make sure that everything will use the locally # installed tools (if present) even if the user did not add it to its @@ -4632,7 +4660,7 @@ def do_pythia8(self, line): " command '/usr/bin/env %s' exists and returns a valid path."%shell) exe_cmd = "#!%s\n%s"%(shell_exe,' '.join( - [preamble+pythia_main, + [preamble+pythia_main, '' if use_mg5amc_py8_interface else '-c', os.path.basename(pythia_cmd_card)])) wrapper.write(exe_cmd) @@ -4699,6 +4727,7 @@ def do_pythia8(self, line): n_cores = max(min(min_n_core,n_cores),1) if self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + # No need for parallelization anymore self.cluster = None logger.info('Follow Pythia8 shower by running the '+ @@ -4744,20 +4773,22 @@ def do_pythia8(self, line): ParallelPY8Card.subruns[0].systemSet('Beams:LHEF','events.lhe.gz') ParallelPY8Card.write(pjoin(parallelization_dir,'PY8Card.dat'), pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Write the wrapper wrapper_path = pjoin(parallelization_dir,'run_PY8.sh') wrapper = open(wrapper_path,'w') if self.options['cluster_temp_path'] is None: exe_cmd = \ -"""#!%s -./%s PY8Card.dat >& PY8_log.txt -""" +"""#!%%s +./%%s %s PY8Card.dat >& PY8_log.txt +""" % ('' if use_mg5amc_py8_interface else '-c') + else: exe_cmd = \ -"""#!%s +"""#!%%s ln -s ./events_$1.lhe.gz ./events.lhe.gz -./%s PY8Card_$1.dat >& PY8_log.txt +./%%s %s PY8Card_$1.dat >& PY8_log.txt mkdir split_$1 if [ -f ./events.hepmc ]; then @@ -4776,7 +4807,7 @@ def do_pythia8(self, line): mv ./PY8_log.txt ./split_$1/ fi tar -czf split_$1.tar.gz split_$1 -""" +""" % ('' if use_mg5amc_py8_interface else '-c') exe_cmd = exe_cmd%(shell_exe,os.path.basename(pythia_main)) wrapper.write(exe_cmd) wrapper.close() @@ -4812,19 +4843,27 @@ def do_pythia8(self, line): pjoin(parallelization_dir,split_files[-1])) logger.info('Submitting Pythia8 jobs...') + for i, split_file in enumerate(split_files): # We must write a PY8Card tailored for each split so as to correct the normalization # HEPMCoutput:scaling of each weight since the lhe showered will not longer contain the # same original number of events - split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat')) + split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat'), setter='user') + assert split_PY8_Card['JetMatching:nJetMax'] == PY8_Card['JetMatching:nJetMax'] + + + # Make sure to 
sure the number of split_events determined during the splitting. - split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i]) + split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i], force=True) + assert split_PY8_Card['Main:numberOfEvents'] == partition_for_PY8[i] split_PY8_Card.systemSet('HEPMCoutput:scaling',split_PY8_Card['HEPMCoutput:scaling']* - (float(partition_for_PY8[i]))) + (float(partition_for_PY8[i])), force=True) # Add_missing set to False so as to be sure not to add any additional parameter w.r.t # the ones in the original PY8 param_card copied. split_PY8_Card.write(pjoin(parallelization_dir,'PY8Card_%d.dat'%i), - pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False) + pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False, + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) in_files = [pjoin(parallelization_dir,os.path.basename(pythia_main)), pjoin(parallelization_dir,'PY8Card_%d.dat'%i), pjoin(parallelization_dir,split_file)] @@ -5073,7 +5112,7 @@ def wait_monitoring(Idle, Running, Done): # works both for fixed number of generated events and fixed accepted events self.results.add_detail('error_pythia', error_m) - if self.run_card['use_syst']: + if self.run_card['use_syst'] and use_mg5amc_py8_interface: self.results.add_detail('cross_pythia', -1) self.results.add_detail('error_pythia', 0) @@ -6132,7 +6171,102 @@ def get_Gdir(self, Pdir=None, symfact=None): mfactors[pjoin(P, "G%s" % tag)] = mfactor self.Gdirs = (Gdirs, mfactors) return self.get_Gdir(Pdir, symfact=symfact) + + ############################################################################ + def remove_empty_events(self, Gdir): + """return Gdir strip from the one providing empty events.lhe files.""" + + reasons = collections.defaultdict(list) + Gdirs = Gdir[:] + for G in Gdirs[:]: + try: + size = os.path.getsize(pjoin(G, 'events.lhe')) + except Exception as error: + size = 0 + if size <10: + Gdirs.remove(G) + try: + log = misc.BackRead(pjoin(G, 'log.txt')) + except Exception as error: + log = misc.BackRead(pjoin(G, 'run1_app.log')) + found = -1 + for line in log: + if 'Deleting file events.lhe' in line: + found = 0 + elif "Impossible BW configuration" in line: + reasons['bwconfig'].append(G) + break + elif found < -150: + reasons['not found'].append(G) + Gdirs.append(G) + break + elif found < 0: + found -= 1 + elif 'Loosen cuts or increase max_events' in line: + reasons['cuts'].append(G) + break + elif 'all returned zero' in line: + reasons['zero'].append(G) + break + elif found > 5: + reasons['unknown'].append(G) + break + else: + found += 1 + + if len(reasons): + logger.debug('Reasons for empty events.lhe:') + if len(reasons['unknown']): + logger.debug(' - unknown: %s' % len(reasons['unknown'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['unknown'][:10]])) + if len(reasons['not found']): + logger.debug(' - not found in log: %s' % len(reasons['not found'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['not found'][:10]])) + if len(reasons['zero']): + logger.debug(' - zero amplitudes: %s' % len(reasons['zero'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit( os.sep)[-2:]) for G in reasons['zero'][:10]])) + if len(reasons['bwconfig']): + critical_bwconfig = set() + for G in reasons['bwconfig']: + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_bwconfig.add(os.sep.join(base.rsplit(os.sep)[-2:])) + 
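The bookkeeping in remove_empty_events above amounts to scanning each channel log backwards (via misc.BackRead) for known marker strings and grouping the G directories by reason. A stripped-down sketch of that classification step, with made-up marker lines and channel names:

import collections

MARKERS = {  # marker substring -> reason key, as in the scan above
    'Impossible BW configuration': 'bwconfig',
    'Loosen cuts or increase max_events': 'cuts',
    'all returned zero': 'zero',
}

def classify(log_lines):
    # log_lines is assumed newest-first, as misc.BackRead would yield them
    for line in log_lines:
        for marker, reason in MARKERS.items():
            if marker in line:
                return reason
    return 'unknown'

reasons = collections.defaultdict(list)
fake_logs = {'P1/G1': ['... all returned zero ...'],
             'P1/G2.1': ['... Impossible BW configuration ...']}
for gdir, lines in fake_logs.items():
    reasons[classify(lines)].append(gdir)
assert reasons == {'zero': ['P1/G1'], 'bwconfig': ['P1/G2.1']}
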
for G in critical_bwconfig: + logger.warning('Gdirectory %s has no events.lhe file. (no BW config found %s times)' % G) + + logger.debug(' - impossible BW configuration: %s' % len(reasons['bwconfig'])) + logger.debug(' - channel with no possible BW configuration: %s' % len(critical_bwconfig)) + + if len(reasons['cuts']): + critical_nb_cuts = collections.defaultdict(int) + for G in reasons['cuts']: + if '.' in os.path.basename(G): + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_nb_cuts[os.sep.join(base.rsplit(os.sep)[-2:])] += 1 + else: + critical_nb_cuts[''] += 1 + logger.warning('Gdirectory %s has no events.lhe file. (no points passed cuts found)' % G) + for G, nb in critical_nb_cuts.items(): + if not G: + continue + else: + logger.warning('%s channel %s.XXX has no events.lhe file. (no points passed cuts). No %s with events detected' % (nb, G, G)) + logger.debug(' - no points passed cuts: %s' % len(reasons['cuts'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['cuts'][:10]])) + logger.debug(' - without any BW handling (critical): %s' % critical_nb_cuts['']) + logger.debug(' - with BW but all zero (critical): %s' % sum([nb for v, nb in critical_nb_cuts.items() if v!=''], 0)) + #logger.debug(' - cuts (with BW conflict where other channel contributes): %s' % (len(reasons['cuts'])- critical_nb_cuts)) + + + return Gdirs + + ############################################################################ def set_run_name(self, name, tag=None, level='parton', reload_card=False, allow_new_tag=True): @@ -6749,7 +6883,7 @@ def get_subP_ids(path): class GridPackCmd(MadEventCmd): """The command for the gridpack --Those are not suppose to be use interactively--""" - def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **stdin): + def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, nprocs=1, maxevts=2500, *completekey, **stdin): """Initialize the command and directly run""" # Initialize properly @@ -6759,6 +6893,8 @@ def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **s self.random = seed self.random_orig = self.random self.granularity = gran + self.nprocs = nprocs + self.maxevts = maxevts self.options['automatic_html_opening'] = False #write the grid_card.dat on disk @@ -6874,7 +7010,7 @@ def launch(self, nb_event, seed): #misc.call([pjoin(self.me_dir,'bin','refine4grid'), # str(nb_event), '0', 'Madevent','1','GridRun_%s' % seed], # cwd=self.me_dir) - self.refine4grid(nb_event) + self.gridpack_cross = self.refine4grid(nb_event) # 3) Combine the events/pythia/... self.exec_cmd('combine_events') @@ -6902,6 +7038,8 @@ def refine4grid(self, nb_event): precision = nb_event + across= self.make_make_all_html_results(get_attr='axsec') + self.opts = dict([(key,value[1]) for (key,value) in \ self._survey_options.items()]) @@ -6915,8 +7053,9 @@ def refine4grid(self, nb_event): self.update_status('Refine results to %s' % precision, level=None) logger.info("Using random number seed offset = %s" % self.random) - refine_opt = {'err_goal': nb_event, 'split_channels': False, - 'ngran':self.granularity, 'readonly': self.readonly} + refine_opt = {'err_goal': nb_event, 'split_channels': True, + 'ngran':self.granularity, 'readonly': self.readonly, + 'nprocs': self.nprocs, 'maxevts': self.maxevts} x_improve = gen_ximprove.gen_ximprove_gridpack(self, refine_opt) x_improve.launch() # create the ajob for the refinment and run those! 
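With split_channels now enabled for the gridpack refine, the number of subjobs per channel follows the ceiling-division rule from get_job_for_event above, with maxevts feeding max_request_event through the options dict. A distilled sketch of just that rule (the real code also forces a single job when split_channels is off and re-splits on low nevents):

def n_jobs(needed_event, max_request_event, max_splitting):
    # ceiling division, clamped to [1, max_splitting]
    nb_split = max(1, (needed_event - 1) // max_request_event + 1)
    return min(nb_split, max_splitting)

assert n_jobs(2500, 2500, 100) == 1     # fits in a single job
assert n_jobs(2501, 2500, 100) == 2     # one event over -> second job
assert n_jobs(10**9, 2500, 100) == 100  # clamped at max_splitting
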
self.gscalefact = x_improve.gscalefact #store jacobian associate to the gridpack @@ -6926,7 +7065,7 @@ def refine4grid(self, nb_event): #print 'run combine!!!' #combine_runs.CombineRuns(self.me_dir) - return + return across #update html output Presults = sum_html.collect_result(self) cross, error = Presults.xsec, Presults.xerru @@ -7051,10 +7190,13 @@ def do_combine_events(self, line): sum_axsec += result.get('axsec')*gscalefact[Gdir] if len(AllEvent) >= 80: #perform a partial unweighting - if self.results.current['cross'] == 0 and self.run_card['gridpack']: - nb_event= self.nb_event + if not self.results.current.get('axsec'): + if self.run_card['gridpack'] and self.gridpack_cross: + nb_event = min(abs(1.05*self.nb_event*sum_axsec/self.gridpack_cross),self.nb_event) + else: + nb_event= self.nb_event else: - nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current['cross']),self.run_card['nevents']) + nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current.get('axsec')),self.run_card['nevents'], self.nb_event, self.gridpack_cross, sum_axsec) AllEvent.unweight(pjoin(outdir, self.run_name, "partials%s.lhe.gz" % partials), get_wgt, log_level=5, trunc_error=1e-2, event_target=nb_event) AllEvent = lhe_parser.MultiEventFile() @@ -7068,6 +7210,7 @@ def do_combine_events(self, line): for data in partials_info: AllEvent.add(*data) + sum_xsec += data[1] if not hasattr(self,'proc_characteristic'): self.proc_characteristic = self.get_characteristics() diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/restore_data b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/restore_data index 6205bb9567..407ed7aa91 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/restore_data +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/restore_data @@ -48,8 +48,17 @@ for i in `cat subproc.mg` ; do cd ../ done +# check if we are on a Mac, otherwise assume Linux +if [[ "$OSTYPE" == "darwin"* ]]; then + # no nproc on Mac, so use sysctl instead + # use -S1024 because there is a limit on the length of the command + xargs_opts="-P $(sysctl -n hw.ncpu) -S1024" +else + xargs_opts="-P $(nproc --all)" +fi + find . -mindepth 2 -maxdepth 2 -type d -name 'G*' -print0 \ - | xargs --null -P "$(nproc --all)" -I{} bash -c " + | xargs --null ${xargs_opts} -I{} bash -c " cd {} for j in $1_results.dat ; do if [[ -e \$j ]] ; then diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/sum_html.py b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/sum_html.py index 9dd5826f71..fb8dd3a74a 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/internal/sum_html.py +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/internal/sum_html.py @@ -770,7 +770,7 @@ def collect_result(cmd, folder_names=[], jobs=None, main_dir=None): return all -def make_all_html_results(cmd, folder_names = [], jobs=[]): +def make_all_html_results(cmd, folder_names = [], jobs=[], get_attr=None): """ folder_names and jobs have been added for the amcatnlo runs """ run = cmd.results.current['run_name'] if not os.path.exists(pjoin(cmd.me_dir, 'HTML', run)): @@ -794,7 +794,12 @@ def make_all_html_results(cmd, folder_names = [], jobs=[]): fsock.write('%s
' % Presults.get_html(run, unit, cmd.me_dir)) fsock.write('%s
' % P_text) - return Presults.xsec, Presults.xerru + if not get_attr: + return Presults.xsec, Presults.xerru + else: + if isinstance(get_attr, tuple): + return [getattr(Presults, _) for _ in get_attr] + return getattr(Presults, get_attr) diff --git a/epochX/cudacpp/heft_gg_bb.mad/bin/madevent b/epochX/cudacpp/heft_gg_bb.mad/bin/madevent index dff9711b73..9c5363e682 100755 --- a/epochX/cudacpp/heft_gg_bb.mad/bin/madevent +++ b/epochX/cudacpp/heft_gg_bb.mad/bin/madevent @@ -178,6 +178,17 @@ force_run = False if (args and args[0] == 'treatcards'): force_run=True + +# check that madgraph is not in PYTHONPATH +try: + import madgraph +except ImportError: + pass +else: + logger.getLogger('madgraph').error('Looks like you do have madgraph in your PYTHONPATH (or you run this executable from the main MG5aMC directory). This executable will likely not work in such case.') + + + # Call the cmd interface main loop try: if '-h' in args or '--help' in args: diff --git a/epochX/cudacpp/heft_gg_bb.mad/src/HelAmps_heft.h b/epochX/cudacpp/heft_gg_bb.mad/src/HelAmps_heft.h index 1b04401547..9e5b1e3584 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/src/HelAmps_heft.h +++ b/epochX/cudacpp/heft_gg_bb.mad/src/HelAmps_heft.h @@ -2,13 +2,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Sep 2010) for the MG5aMC backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -860,7 +860,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output wavefunction 'S3[6]' from the input wavefunctions V1[6], V2[6] - template + template __device__ INLINE void VVS3_3( const fptype allV1[], const fptype allV2[], @@ -873,7 +873,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ INLINE void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -886,7 +886,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ INLINE void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -898,7 +898,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ INLINE void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -911,7 +911,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ INLINE void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -924,7 +924,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], S3[6] - template + template __device__ INLINE void FFS2_0( const fptype allF1[], const fptype allF2[], @@ -936,7 +936,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output wavefunction 'S3[6]' from the input wavefunctions V1[6], V2[6] - template + template __device__ void VVS3_3( const fptype allV1[], const fptype allV2[], @@ -949,7 +949,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* V1 = W_ACCESS::kernelAccessConst( allV1 ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* S3 = W_ACCESS::kernelAccess( allS3 ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P1[4] = { +cxreal( V1[0] ), +cxreal( V1[1] ), +cximag( V1[1] ), +cximag( V1[0] ) }; @@ -970,7 +970,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -983,7 +983,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. 
); const fptype_sv P2[4] = { +cxreal( V2[0] ), +cxreal( V2[1] ), +cximag( V2[1] ), +cximag( V2[0] ) }; @@ -1008,7 +1008,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -1021,7 +1021,7 @@ namespace mg5amcCpu const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const cxtype_sv TMP8 = ( F1[2] * ( F2[4] * ( V3[2] + V3[5] ) + F2[5] * ( V3[3] + cI * V3[4] ) ) + ( F1[3] * ( F2[4] * ( V3[3] - cI * V3[4] ) + F2[5] * ( V3[2] - V3[5] ) ) + ( F1[4] * ( F2[2] * ( V3[2] - V3[5] ) - F2[3] * ( V3[3] + cI * V3[4] ) ) + F1[5] * ( F2[2] * ( -V3[3] + cI * V3[4] ) + F2[3] * ( V3[2] + V3[5] ) ) ) ) ); @@ -1033,7 +1033,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -1046,7 +1046,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F1 = W_ACCESS::kernelAccess( allF1 ); const cxtype cI = cxmake( 0., 1. ); F1[0] = +F2[0] + V3[0]; @@ -1065,7 +1065,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -1078,7 +1078,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F2 = W_ACCESS::kernelAccess( allF2 ); const cxtype cI = cxmake( 0., 1. ); F2[0] = +F1[0] + V3[0]; @@ -1097,7 +1097,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], S3[6] - template + template __device__ void FFS2_0( const fptype allF1[], const fptype allF2[], @@ -1110,7 +1110,7 @@ namespace mg5amcCpu const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* S3 = W_ACCESS::kernelAccessConst( allS3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. 
); const cxtype_sv TMP9 = ( F1[2] * F2[2] + F1[3] * F2[3] + F1[4] * F2[4] + F1[5] * F2[5] ); diff --git a/epochX/cudacpp/heft_gg_bb.mad/src/Parameters_heft.cc b/epochX/cudacpp/heft_gg_bb.mad/src/Parameters_heft.cc index 0fa5a34cf0..a9b14b3a06 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/src/Parameters_heft.cc +++ b/epochX/cudacpp/heft_gg_bb.mad/src/Parameters_heft.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/heft_gg_bb.mad/src/Parameters_heft.h b/epochX/cudacpp/heft_gg_bb.mad/src/Parameters_heft.h index 0faa7bb71e..45c7bd04c2 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/src/Parameters_heft.h +++ b/epochX/cudacpp/heft_gg_bb.mad/src/Parameters_heft.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -335,7 +335,7 @@ namespace mg5amcCpu #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #endif // Compute the output couplings (e.g. 
gc10 and gc11) from the input gs - template + template __device__ inline void G2COUP( const fptype gs[], fptype couplings[], @@ -345,12 +345,12 @@ namespace mg5amcCpu using namespace Parameters_heft_dependentCouplings; const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr ); - fptype* GC_13s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_13 ); - fptype* GC_10s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); - fptype* GC_11s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); - cxtype_sv_ref GC_13s_sv = C_ACCESS::kernelAccess( GC_13s ); - cxtype_sv_ref GC_10s_sv = C_ACCESS::kernelAccess( GC_10s ); - cxtype_sv_ref GC_11s_sv = C_ACCESS::kernelAccess( GC_11s ); + fptype* GC_13s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_13 ); + fptype* GC_10s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); + fptype* GC_11s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); + cxtype_sv_ref GC_13s_sv = CD_ACCESS::kernelAccess( GC_13s ); + cxtype_sv_ref GC_10s_sv = CD_ACCESS::kernelAccess( GC_10s ); + cxtype_sv_ref GC_11s_sv = CD_ACCESS::kernelAccess( GC_11s ); GC_13s_sv = couplings_sv.GC_13; GC_10s_sv = couplings_sv.GC_10; GC_11s_sv = couplings_sv.GC_11; diff --git a/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuConfig.h index 7c6a082392..ca859a602e 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for generating random numbers +// For both CUDA and HIP, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! 
#else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuCxtypes.h index 92d74fd6db..e98e925f2a 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -717,12 +717,24 @@ namespace mg5amcCpu : m_preal( &r ), m_pimag( &i ) {} // copy (create from) const refs cxtype_ref& operator=( const cxtype_ref& ) = delete; //__host__ __device__ cxtype_ref& operator=( cxtype_ref&& c ) {...} // REMOVED! Should copy refs or copy values? 
No longer needed in cxternary - __host__ __device__ cxtype_ref& operator=( const cxtype& c ) + __host__ __device__ cxtype_ref& operator=( const cxtype& c ) // copy (assign) const values { *m_preal = cxreal( c ); *m_pimag = cximag( c ); return *this; - } // copy (assign) non-const values + } + __host__ __device__ cxtype_ref& operator+=( const cxtype& c ) + { + *m_preal += cxreal( c ); + *m_pimag += cximag( c ); + return *this; + } + __host__ __device__ cxtype_ref& operator-=( const cxtype& c ) + { + *m_preal -= cxreal( c ); + *m_pimag -= cximag( c ); + return *this; + } __host__ __device__ operator cxtype() const { return cxmake( *m_preal, *m_pimag ); } private: fptype* const m_preal; // const pointer to non-const fptype R diff --git a/epochX/cudacpp/heft_gg_bb.mad/test/cudacpp_test.mk b/epochX/cudacpp/heft_gg_bb.mad/test/cudacpp_test.mk index f703a1ae7c..1f9f8bbc46 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/heft_gg_bb.mad/test/cudacpp_test.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) @@ -19,7 +19,7 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt b/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt index 04039fcd14..bfa5f8322b 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt +++ b/epochX/cudacpp/heft_gg_bb.sa/CODEGEN_cudacpp_heft_gg_bb_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.3 2025-06-12 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -49,15 +49,20 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F set auto_convert_model T save options auto_convert_model -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model heft +INFO: load particles +INFO: load vertices +WARNING: coupling GC_13=-(complex(0,1)*GH) has direct dependence in aS but has QCD order set to 0. 
Automatic computation of scale uncertainty can be wrong for such model.  +WARNING: coupling GC_16=(complex(0,1)*Gphi)/8. has direct dependence in aS but has QCD order set to 0. Automatic computation of scale uncertainty can be wrong for such model.  +DEBUG: model prefixing takes 0.00581049919128418  INFO: Restrict model heft with file models/heft/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: s u w+ at order: QED=1  @@ -127,45 +132,45 @@ INFO: Process has 4 diagrams Total: 1 processes with 4 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_heft_gg_bb Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > b b~ HIG<=1 HIW<=1 @1 INFO: Processing color information for process: g g > b b~ HIG<=1 HIW<=1 @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 218]  -DEBUG: type(subproc_group)= [output.py at line 219]  -DEBUG: type(fortran_model)= [output.py at line 220]  -DEBUG: type(me)= me=0 [output.py at line 221]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 222]  -INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/./CPPProcess.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory 
/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/. +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  +DEBUG: type(subproc_group)= [output.py at line 223]  +DEBUG: type(fortran_model)= [output.py at line 224]  +DEBUG: type(me)= me=0 [output.py at line 225]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'diagram_boilerplate.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  +INFO: Creating files in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/./CPPProcess.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/SubProcesses/P1_Sigma_heft_gg_bbx/. Generated helas calls for 1 subprocesses (4 diagrams) in 0.008 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFS2 routines -ALOHA: aloha creates 4 routines in 0.261 s +ALOHA: aloha creates 4 routines in 0.259 s VVS3 VVV1 FFV1 FFV1 FFV1 FFS2 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./HelAmps_heft.h -INFO: Created file HelAmps_heft.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./HelAmps_heft.h +INFO: Created file HelAmps_heft.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. 
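The operator+= and operator-= overloads added to cxtype_ref at the start of this section implement compound assignment through a reference proxy that points at separately stored real and imaginary parts. A minimal standalone sketch of that pattern, with simplified hypothetical names (cxref_sketch) and std::complex standing in for the plugin's cxtype:

#include <cassert>
#include <complex>

using fptype = double;
using cxtype = std::complex<fptype>; // stand-in for the plugin's cxtype

class cxref_sketch // hypothetical name, not the plugin's class
{
public:
  cxref_sketch( fptype& r, fptype& i ) : m_preal( &r ), m_pimag( &i ) {}
  // Compound assignment updates the referenced storage in place,
  // mirroring the operator+=/-= added to cxtype_ref in this diff
  cxref_sketch& operator+=( const cxtype& c )
  {
    *m_preal += c.real();
    *m_pimag += c.imag();
    return *this;
  }
  cxref_sketch& operator-=( const cxtype& c )
  {
    *m_preal -= c.real();
    *m_pimag -= c.imag();
    return *this;
  }
  operator cxtype() const { return cxtype( *m_preal, *m_pimag ); }
private:
  fptype* const m_preal; // const pointer to non-const real part
  fptype* const m_pimag; // const pointer to non-const imaginary part
};

int main()
{
  fptype re = 1., im = 2.;
  cxref_sketch ref( re, im );
  ref += cxtype( 3., 4. ); // accumulate through the proxy
  assert( re == 4. && im == 6. );
  return 0;
}

The proxy makes it possible to accumulate complex values directly into buffers whose real and imaginary parts live at separate addresses.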
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./Parameters_heft.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./Parameters_heft.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./Parameters_heft.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/./Parameters_heft.cc INFO: Created files Parameters_heft.h and Parameters_heft.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_bb/src/. quit -real 0m0.646s -user 0m0.583s +real 0m0.671s +user 0m0.604s sys 0m0.051s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/Bridge.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/Bridge.h index 87aa648dd2..a45024704a 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -255,11 +255,15 @@ namespace mg5amcCpu throw std::logic_error( "Bridge constructor: FIXME! cannot choose gputhreads" ); // this should never happen! m_gpublocks = m_nevt / m_gputhreads; } +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters @@ -290,8 +294,10 @@ namespace mg5amcCpu throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! 
Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -347,7 +353,9 @@ namespace mg5amcCpu if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> ) @@ -400,7 +408,9 @@ } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ...
) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ...
) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR!
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.cc index f463977c1a..a68ae314eb 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
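The GpuAbstraction.h and GpuRuntime.h hunks above build a single gpu*/gpuBlas* macro vocabulary that resolves to CUDA or HIP at compile time, with assert-style error checking. A reduced, self-contained sketch of the same pattern (only a handful of macros; the scale kernel and buffer sizes are hypothetical, not the plugin's code):

#include <cstdio>
#ifdef __CUDACC__
#include <cuda_runtime.h>
#define gpuError_t cudaError_t
#define gpuSuccess cudaSuccess
#define gpuGetErrorString cudaGetErrorString
#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) )
#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
#define gpuDeviceSynchronize cudaDeviceSynchronize
#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
#elif defined __HIPCC__
#include <hip/hip_runtime.h>
#define gpuError_t hipError_t
#define gpuSuccess hipSuccess
#define gpuGetErrorString hipGetErrorString
#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )
#define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
#define gpuDeviceSynchronize hipDeviceSynchronize
#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
#endif

// Assert-style error check, following the checkGpu/assertGpu pattern above
#define checkGpu( code ) assertGpu( code, __FILE__, __LINE__ )
inline void assertGpu( gpuError_t code, const char* file, int line )
{
  if( code != gpuSuccess ) printf( "ERROR! assertGpu: '%s' in %s:%d\n", gpuGetErrorString( code ), file, line );
}

__global__ void scale( double* data, double factor ) // hypothetical kernel
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  data[ievt] *= factor;
}

int main()
{
  double* d = nullptr;
  gpuMalloc( &d, 256 * sizeof( double ) ); // the same source compiles under nvcc or hipcc
  gpuLaunchKernel( scale, 2, 128, d, 0.5 );
  checkGpu( gpuDeviceSynchronize() );
  gpuFree( d );
  return 0;
}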
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,28 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() + , m_pHelWfs() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_helBlasHandles() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +353,82 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); + // Create the "one-helicity" wavefunction buffer that will be used for helicity filtering + m_pHelWfs.reset( new DeviceBufferSimple( CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? 
+ std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { +#ifndef MGONGPU_HAS_NO_BLAS + if( m_helBlasHandles[ihel] ) gpuBlasDestroy( m_helBlasHandles[ihel] ); +#endif + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +445,61 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. 
Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity + // Attach a different stream to each cuBLAS/hipBLAS handle + if( m_blasColorSum ) + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + checkGpuBlas( gpuBlasCreate( &m_helBlasHandles[ighel] ) ); + checkGpuBlas( gpuBlasSetStream( m_helBlasHandles[ighel], m_helStreams[ighel] ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_helBlasHandles[ighel], CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelWfs.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +507,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* ghelBlasHandles = ( m_blasColorSum ? m_helBlasHandles : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* ghelBlasHandles = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +527,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? 
m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.h index 7acff4b308..c901874333 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include <memory> #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] - static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,24 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelJamps; + + // The super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelWfs; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +220,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel
cuBLAS/hipBLAS temporary buffers + std::unique_ptr<DeviceBufferSimple2> m_pHelBlasTmp; + + // The array of cuBLAS/hipBLAS handles (one for each good helicity) + gpuBlasHandle_t m_helBlasHandles[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryAccessAmplitudes.h index 0d92f69c43..a49f041e05 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryAccessAmplitudes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessAmplitudes_H #define MemoryAccessAmplitudes_H 1 @@ -10,10 +10,6 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_AMPLITUDES 1 - // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -23,120 +19,11 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // A class describing the internal layout of memory buffers for amplitudes - // This implementation uses an AOSOA[npagA][nx2][neppA] where nevt=npagA*neppA - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessAmplitudesBase //_AOSOAv1 - { - public: - - // Number of Events Per Page in the amplitude AOSOA memory buffer layout - static constexpr int neppA = 1; // AOS (just a test...)
- - private: - - friend class MemoryAccessHelper<MemoryAccessAmplitudesBase>; - friend class KernelAccessHelper<MemoryAccessAmplitudesBase, true>; - friend class KernelAccessHelper<MemoryAccessAmplitudesBase, false>; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) - { - const int ipagA = ievt / neppA; // #event "A-page" - const int ieppA = ievt % neppA; // #event in the current event A-page - constexpr int ix2 = 0; - return &( buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA] ); // AOSOA[ipagA][ix2][ieppA] - } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... args" to "const int ix2" and rename "Field" as "Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int ix2 ) - { - constexpr int ipagA = 0; - constexpr int ieppA = 0; - return buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA]; // AOSOA[ipagA][ix2][ieppA] - } - }; - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessAmplitudes : public MemoryAccessAmplitudesBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper<MemoryAccessAmplitudesBase>::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper<MemoryAccessAmplitudesBase>::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2 = MemoryAccessHelper<MemoryAccessAmplitudesBase>::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2Const = - MemoryAccessHelper<MemoryAccessAmplitudesBase>::template decodeRecordConst<int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt,
const int ix2 ) <===] - static constexpr auto ieventAccessIx2 = - MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessField<int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===] - static constexpr auto ieventAccessIx2Const = - MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessFieldConst<int>; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations + // A class providing trivial access to amplitude memory buffers template<bool onDevice> class KernelAccessAmplitudes { public: - -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2 = - KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessField<int>; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2Const = - KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessFieldConst<int>; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { @@ -148,8 +35,6 @@ namespace mg5amcCpu { return reinterpret_cast<const cxtype_sv*>( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES }; //---------------------------------------------------------------------------- diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryAccessCouplings.h index e98a172df1..4e154ca3bc 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryAccessCouplings.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
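For reference, the AOSOA[npagA][nx2][neppA] layout encoded by the deleted MemoryAccessAmplitudesBase reduces to a single flattening formula; a small illustration of the index arithmetic (neppA=4 is an arbitrary choice for this sketch, while the deleted class used neppA=1, i.e. plain AOS):

#include <cassert>

constexpr int nx2 = 2;   // real and imaginary components
constexpr int neppA = 4; // events per page (illustrative value)

// Flatten AOSOA[ipagA][ix2][ieppA] for event ievt and component ix2,
// exactly as the deleted ieventAccessRecord/decodeRecord pair did
inline int aosoaIndex( int ievt, int ix2 )
{
  const int ipagA = ievt / neppA; // which page
  const int ieppA = ievt % neppA; // event slot within the page
  return ipagA * nx2 * neppA + ix2 * neppA + ieppA;
}

int main()
{
  // Event 5 is page 1, slot 1; its imaginary part (ix2=1) sits after
  // the page's 4 real components: 1*2*4 + 1*4 + 1 = 13
  assert( aosoaIndex( 5, 1 ) == 13 );
  // With neppA=1 the layout degenerates to AOS: index = ievt*nx2 + ix2
  return 0;
}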
#ifndef MemoryAccessCouplings_H #define MemoryAccessCouplings_H 1 @@ -235,7 +235,7 @@ namespace mg5amcCpu /* fptype_sv& real = kernelAccessIx2( buffer, 0 ); fptype_sv& imag = kernelAccessIx2( buffer, 1 ); - printf( "C_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv_ref( real, imag ); */ return cxtype_sv_ref( kernelAccessIx2( buffer, 0 ), @@ -250,7 +250,7 @@ namespace mg5amcCpu /* const fptype_sv& real = kernelAccessIx2Const( buffer, 0 ); const fptype_sv& imag = kernelAccessIx2Const( buffer, 1 ); - printf( "C_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv( real, imag ); */ return cxtype_sv( kernelAccessIx2Const( buffer, 0 ), diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryAccessWavefunctions.h index 9f4c620bc7..bbffc1fb36 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryAccessWavefunctions.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessWavefunctions_H #define MemoryAccessWavefunctions_H 1 @@ -10,9 +10,7 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 +#include "CPPProcess.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL @@ -23,147 +21,44 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // A class describing the internal layout of memory buffers for wavefunctions - // This implementation uses an AOSOA[npagW][nw6][nx2][neppW] where nevt=npagW*neppW - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessWavefunctionsBase //_AOSOAv1 +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessWavefunctions { public: - - // Number of Events Per Page in the wavefunction AOSOA memory buffer layout - static constexpr int neppW = 1; // AOS (just a test...) 
- - private: - - friend class MemoryAccessHelper<MemoryAccessWavefunctionsBase>; - friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, true>; - friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, false>; - - // The number of components of a (fermion or vector) wavefunction - static constexpr int nw6 = mgOnGpu::nw6; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) + static __host__ __device__ inline cxtype_sv* + kernelAccess( fptype* buffer ) { - const int ipagW = ievt / neppW; // #event "W-page" - const int ieppW = ievt % neppW; // #event in the current event W-page - constexpr int iw6 = 0; - constexpr int ix2 = 0; - return &( buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW] ); // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast<cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts...
args" to "const int iw6, const int ix2" and rename "Field" as "Iw6Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int iw6, - const int ix2 ) + static __host__ __device__ inline const cxtype_sv* + kernelAccessConst( const fptype* buffer ) { - constexpr int ipagW = 0; - constexpr int ieppW = 0; - return buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW]; // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } }; +#endif //---------------------------------------------------------------------------- - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessWavefunctions : public MemoryAccessWavefunctionsBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2 = MemoryAccessHelper::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2Const = - MemoryAccessHelper::template decodeRecordConst; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIw6Ix2( fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2 = - MemoryAccessHelper::template ieventAccessField; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIw6Ix2Const( const fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2Const = - MemoryAccessHelper::template ieventAccessFieldConst; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations - template - class KernelAccessWavefunctions + class HostAccessWavefunctions { public: - -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes 
(input) - // [Signature (const) ===> const fptype& ieventAccessIw6Ix2Const( const fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2Const = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessFieldConst<int, int>; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations - template<bool onDevice> - class KernelAccessWavefunctions + class HostAccessWavefunctions { public: - -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const) ===> fptype& kernelAccessIw6Ix2( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2 = - KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessField<int, int>; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIw6Ix2Const( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2Const = - KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessFieldConst<int, int>; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { return reinterpret_cast<cxtype_sv*>( buffer ); } - static __host__ __device__ inline const cxtype_sv* kernelAccessConst( const fptype* buffer ) { return reinterpret_cast<const cxtype_sv*>( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS }; //---------------------------------------------------------------------------- - typedef KernelAccessWavefunctions<false> HostAccessWavefunctions; - typedef KernelAccessWavefunctions<true> DeviceAccessWavefunctions; - - //---------------------------------------------------------------------------- - } // end namespace mg5amcGpu/mg5amcCpu #endif // MemoryAccessWavefunctions_H diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryBuffers.h index 90075da66e..110b93643f 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
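The new DeviceAccessWavefunctions above replaces that machinery with a fixed per-event stride: each device thread derives its event index from its grid coordinates and offsets into a flat buffer. A sketch of the access pattern (thrust::complex stands in for the plugin's cxtype_sv; nw6=6 and nx2=2 as in mgOnGpu; the zeroFirstComponent kernel is hypothetical):

#include <thrust/complex.h>

using fptype = double;
using cxtype = thrust::complex<fptype>;
constexpr int nw6 = 6; // components per wavefunction
constexpr int nx2 = 2; // real+imag

// Each GPU thread handles one event: its wavefunction record starts at a
// fixed stride from the buffer base, so no AOSOA page arithmetic is needed
__device__ inline cxtype* wfAccess( fptype* buffer )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  return reinterpret_cast<cxtype*>( buffer + ievt * nw6 * nx2 );
}

__global__ void zeroFirstComponent( fptype* buffer )
{
  cxtype* w = wfAccess( buffer );
  w[0] = cxtype( 0, 0 ); // touch only this event's own record
}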
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_heft_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase<T>( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase<T>( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template<typename T, size_t sizePerEvent> - class DeviceBuffer : public DeviceBufferBase<T>, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase<T>, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase<T>( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase<T>( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer<fptype, 1> DeviceBufferSimple; + typedef DeviceBuffer<fptype2, 1> DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase<fptype> BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer<fptype, sizePerEventNumerators> HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer<fptype, sizePerEventNumerators> PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer<fptype, sizePerEventNumerators> DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer<fptype, sizePerEventDenominators> HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef PinnedHostBuffer<fptype, sizePerEventDenominators>
PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer<fptype, sizePerEventDenominators> DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer<fptype, sizePerEventCouplings> HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer<fptype, sizePerEventCouplings> PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer<fptype, sizePerEventCouplings> DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer<fptype, sizePerEventJamps> DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template<class Tdst, class Tsrc> void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.cc b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.cc index b9f394434a..1f301c5523 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.cc +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_heft.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,20 +98,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_heft_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_heft_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 3; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,57 +166,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } +#endif + return channelId; + } #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 
*** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - 
using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -226,307 +279,141 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 5; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
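The removed NB comments here, and the WAVEFUNCTION BUFFERS block that follows, are the heart of this change: once each Feynman diagram runs in its own GPU kernel, intermediate wavefunctions can no longer live in per-thread local arrays and must be staged in device-global memory (allWfs). A minimal sketch of that constraint, with hypothetical kernel and buffer names:

#include <cstdio>
#include <cuda_runtime.h>

__global__ void diagramA( double* wfs ) // fills one intermediate "wavefunction" per event
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  wfs[ievt] = 2. * ievt; // stands in for vxxxxx/oxxxxx/... writing into allWfs
}

__global__ void diagramB( const double* wfs, double* jamps ) // consumes it in a later launch
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  jamps[ievt] = wfs[ievt] + 1.; // stands in for an amplitude update of the jamps
}

int main()
{
  const int nevt = 8;
  double *wfs = nullptr, *jamps = nullptr;
  cudaMalloc( &wfs, nevt * sizeof( double ) );
  cudaMalloc( &jamps, nevt * sizeof( double ) );
  diagramA<<<1, nevt>>>( wfs );        // kernel #1: thread-local variables would not survive this launch
  diagramB<<<1, nevt>>>( wfs, jamps ); // kernel #2: reads what kernel #1 wrote to global memory
  cudaDeviceSynchronize();
  double out[8];
  cudaMemcpy( out, jamps, sizeof( out ), cudaMemcpyDeviceToHost );
  printf( "jamps[3] = %f\n", out[3] ); // prints 7.0
  cudaFree( wfs );
  cudaFree( jamps );
  return 0;
}

In the C++ branch nothing changes conceptually: the diagrams remain plain function calls within one stack frame, so w_sv can stay a local array.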
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g.
<> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 4 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - VVS3_3( w_fp[0], w_fp[1], COUPs[0], 1.0, cIPD[1], cIPD[2], w_fp[4] ); - - // Amplitude(s) for diagram number 1 - FFS2_0( w_fp[3], w_fp[2], w_fp[4], COUPs[ndcoup + 0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) + allCOUPs[idcoup] = 
CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - jamp_sv[2] -= 2. * amp_sv[0]; - - // *** DIAGRAM 2 OF 4 *** - // Wavefunction(s) for diagram number 2 - VVV1P0_1( w_fp[0], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[4] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 3 OF 4 *** - - // Wavefunction(s) for diagram number 3 - FFV1_1( w_fp[2], w_fp[0], COUPs[2], 1.0, cIPD[0], 0., w_fp[4] ); - - // Amplitude(s) for diagram number 3 - FFV1_0( w_fp[3], w_fp[4], w_fp[1], COUPs[2], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast( iParity == 0 ? 
jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[0] -= amp_sv[0]; - - // *** DIAGRAM 4 OF 4 *** - - // Wavefunction(s) for diagram number 4 - FFV1_2( w_fp[3], w_fp[0], COUPs[2], 1.0, cIPD[0], 0., w_fp[4] ); - // Amplitude(s) for diagram number 4 - FFV1_0( w_fp[4], w_fp[2], w_fp[1], COUPs[2], 1.0, &_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_bbx()?) - - // The color denominators (initialize all array elements, with ncolor=3) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3, 1 }; // 1-D array[3] - - // The color matrix (initialize all array elements, with ncolor=3) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 16, -2, 6 }, - { -2, 16, 6 }, - { 2, 2, 6 } }; // 2-D array[3][3] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
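The removed code around this point is the in-kernel color sum that the patch relocates behind color_sum.h (color_sum_gpu/color_sum_cpu). Its key identity is spelled out in the comments above: the normalized color matrix M[i][j] = cf[i][j]/denom[i] is real (and symmetric), so the quadratic form (A-iB)M(A+iB) over the jamps J = A+iB collapses to AMA + BMB, with no cross terms. A small numerical sketch using the denom and cf values quoted in this hunk (the jamp inputs are made up):

#include <complex>
#include <cstdio>

int main()
{
  constexpr int ncolor = 3;
  constexpr double denom[ncolor] = { 3, 3, 1 };
  constexpr double cf[ncolor][ncolor] = { { 16, -2, 6 }, { -2, 16, 6 }, { 2, 2, 6 } };
  const std::complex<double> jamp[ncolor] = { { 0.1, 0.2 }, { -0.3, 0.4 }, { 0.5, -0.6 } };
  double deltaME = 0; // this helicity's contribution to the running |M|^2
  for( int icol = 0; icol < ncolor; icol++ )
  {
    double ztempR = 0, ztempI = 0; // row icol of M*A and M*B (before the 1/denom)
    for( int jcol = 0; jcol < ncolor; jcol++ )
    {
      ztempR += cf[icol][jcol] * jamp[jcol].real();
      ztempI += cf[icol][jcol] * jamp[jcol].imag();
    }
    deltaME += ( ztempR * jamp[icol].real() + ztempI * jamp[icol].imag() ) / denom[icol]; // AMA + BMB
  }
  printf( "deltaME = %f\n", deltaME );
  return 0;
}

The C++ branch above additionally folds the factor 2 and the 1/denom[icol] into the constexpr triangular matrix cf2 so that only the upper triangle is visited; the sketch keeps the straightforward full loop, which is what the CUDA branch computes.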
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * 
cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 4 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -565,7 +452,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -598,6 +489,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->mdl_MB ); m_masses.push_back( m_pars->mdl_MB ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MB, (fptype)m_pars->mdl_MH, (fptype)m_pars->mdl_WH }; @@ -639,6 +534,10 @@ namespace mg5amcCpu m_masses.push_back( 
Parameters_heft::ZERO ); m_masses.push_back( Parameters_heft::mdl_MB ); m_masses.push_back( Parameters_heft::mdl_MB ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -741,26 +640,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings, bsmIndepParam ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -768,25 +667,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: 
allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection 
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -931,13 +1034,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -949,17 +1046,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -985,93 +1085,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for 
the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // (1b) Then, In multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } - } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? 
&( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1113,7 +1183,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1136,7 +1206,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1145,25 +1215,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1173,8 +1249,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1190,11 +1268,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1296,14 +1375,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.h index 30c5663297..c519f81e85 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.h +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 
2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_heft.h" #include @@ -75,17 +76,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 16; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 4; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 3; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 5; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 4; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 4; //static const int ncomb = 16; // CPPProcess::ncomb @@ -122,23 +123,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -152,34 +156,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators 
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/color_sum.cc b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/color_sum.cc new file mode 100644 index 0000000000..6b493df318 --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/color_sum.cc @@ -0,0 +1,384 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
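+//
+// Overview (summary of the implementations below): this file computes the color sum
+//   |M|^2 += sum_ij jamp_i^* * ( colorMatrix[i][j] / colorDenom[i] ) * jamp_j
+// for one helicity, in three variants: color_sum_cpu (C++/SIMD, using a constexpr triangular
+// normalized color matrix), color_sum_kernel (a plain GPU kernel, using the device copy
+// s_pNormalizedColorMatrix2 of the normalized color matrix) and color_sum_blas (two
+// cuBLAS/hipBLAS GEMM steps); color_sum_gpu dispatches between the last two at runtime,
+// depending on whether a BLAS handle was created (HASBLAS=hasBlas builds with CUDACPP_RUNTIME_BLASCOLORSUM set).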
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=3) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3, 1 }; // 1-D array[3] + + // The color matrix (initialize all array elements, with ncolor=3) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 16, -2, 6 }, + { -2, 16, 6 }, + { 2, 2, 6 } }; // 2-D array[3][3] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 
s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = 
allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer
+    fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt;   // start of second fptype2[ncolor*2*nevt] buffer
+    fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer
+    // Convert jamps from double to float
+    gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps );
+    // Real and imaginary components
+    const fptype2* allJampsReal = allJampsFpt2;
+    const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt;
+#else
+    static_assert( std::is_same<fptype, fptype2>::value ); // sanity check
+    // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer
+    fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer
+    fptype2* allMEsFpt2 = allMEs;
+    // Real and imaginary components
+    const fptype2* allJampsReal = allJamps;                 // this is not a cast (the two types are identical)
+    const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical)
+#endif
+    // Real and imaginary components
+    fptype2* allZtempReal = allZtempBoth;
+    fptype2* allZtempImag = allZtempBoth + ncolor * nevt;
+
+    // Note, new striding for cuBLAS from DeviceAccessJamp:
+    // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1"
+    // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1"
+
+    // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag
+    // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp
+    fptype2 alpha1 = 1;
+    fptype2 beta1 = 0;
+    const int ncolorM = ncolor;
+    const int nevtN = nevt;
+    const int ncolorK = ncolor;
+    checkGpuBlas( gpuBlasTgemm( *pBlasHandle,
+                                GPUBLAS_OP_N, // do not transpose ColMat
+                                GPUBLAS_OP_T, // transpose JampsV (new1)
+                                ncolorM, nevtN, ncolorK,
+                                &alpha1,
+                                devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK
+                                allJampsReal, nevtN,    // JampsV is nevtN x ncolorK (new1)
+                                &beta1,
+                                allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN
+    checkGpuBlas( gpuBlasTgemm( *pBlasHandle,
+                                GPUBLAS_OP_N, // do not transpose ColMat
+                                GPUBLAS_OP_T, // transpose JampsV (new1)
+                                ncolorM, nevtN, ncolorK,
+                                &alpha1,
+                                devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK
+                                allJampsImag, nevtN,    // JampsV is nevtN x ncolorK (new1)
+                                &beta1,
+                                allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN
+
+    // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt]
+    // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME
+    // Use cublasSgemmStridedBatched to perform these batched dot products in one call
+    fptype2 alpha2 = 1;
+    fptype2 beta2 = 1;
+    checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle,
+                                              GPUBLAS_OP_N, // do not transpose JampsV (new1)
+                                              GPUBLAS_OP_N, // do not transpose Tmp
+                                              1, 1, ncolor, // result is 1x1 (dot product)
+                                              &alpha2,
+                                              allJampsReal, nevt, 1,        // allJamps is nevt x ncolor, stride 1 for each ievt column (new1)
+                                              allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column
+                                              &beta2,
+                                              allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e.
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/color_sum.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/diagram_boilerplate.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/diagrams.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/diagrams.h new file mode 100644 index 0000000000..ea1cf69605 --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/P1_Sigma_heft_gg_bbx/diagrams.h @@ -0,0 +1,134 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. 
Valassi (2025) for the MG5aMC CUDACPP plugin.
+
+/* clang-format off */
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators,           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+            const fptype* momenta,          // input: momenta[npar*4*nevtORneppV]
+            const int ihel )                // input: helicity (0 to ncomb-1)
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+#ifdef MGONGPUCPP_GPUIMPL
+    using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events
+#else
+    using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events
+#endif
+    // *** DIAGRAM 1 OF 4 ***
+    // Wavefunction(s) for diagram number 1
+    vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 );
+    vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 );
+    oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 );
+    ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 );
+    VVS3_3( w_fp[0], w_fp[1], COUPs[0], 1.0, cIPD[1], cIPD[2], w_fp[4] );
+    // Amplitude(s) for diagram number 1
+    FFS2_0( w_fp[3], w_fp[2], w_fp[4], COUPs[ndcoup + 0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 2. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram2( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 2 OF 4 ***
+    // Wavefunction(s) for diagram number 2
+    VVV1P0_1( w_fp[0], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[4] );
+    // Amplitude(s) for diagram number 2
+    FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram3( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 3 OF 4 ***
+    // Wavefunction(s) for diagram number 3
+    FFV1_1( w_fp[2], w_fp[0], COUPs[2], 1.0, cIPD[0], 0., w_fp[4] );
+    // Amplitude(s) for diagram number 3
+    FFV1_0( w_fp[3], w_fp[4], w_fp[1], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram4( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 4 OF 4 ***
+    // Wavefunction(s) for diagram number 4
+    FFV1_2( w_fp[3], w_fp[0], COUPs[2], 1.0, cIPD[0], 0., w_fp[4] );
+    // Amplitude(s) for diagram number 4
+    FFV1_0( w_fp[4], w_fp[2], w_fp[1], COUPs[2], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+/* clang-format on */
diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/color_sum.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/color_sum.h
new file mode 100644
index 0000000000..71b5b1eaad
--- /dev/null
+++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/color_sum.h
@@ -0,0 +1,105 @@
+// Copyright (C) 2020-2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin.
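+//
+// Jamp buffer layout (the "new1" striding, see DeviceAccessJamp below): on the GPU, for one
+// helicity, allJamps[ncolor*2*nevt] stores all real parts first and then all imaginary parts,
+// each as an ncolor x nevt matrix with ievt as the fastest-running index:
+//   Re( jamp[icol][ievt] ) = allJamps[0 * ncolor * nevt + icol * nevt + ievt]
+//   Im( jamp[icol][ievt] ) = allJamps[1 * ncolor * nevt + icol * nevt + ievt]
+// e.g. for ncolor=3 and nevt=4, Im( jamp[1][2] ) is at allJamps[12 + 4 + 2] = allJamps[18].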
+
+#ifndef COLOR_SUM_H
+#define COLOR_SUM_H 1
+
+#include "mgOnGpuConfig.h"
+
+#include "mgOnGpuVectors.h"
+
+#include "CPPProcess.h"
+#include "GpuAbstraction.h"
+
+#ifdef MGONGPUCPP_GPUIMPL
+namespace mg5amcGpu
+#else
+namespace mg5amcCpu
+#endif
+{
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  class DeviceAccessJamp
+  {
+  public:
+    static __device__ inline cxtype_ref
+    kernelAccessIcol( fptype* buffer, const int icol )
+    {
+      const int ncolor = CPPProcess::ncolor; // the number of leading colors
+      const int nevt = gridDim.x * blockDim.x;
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last)
+      //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old"
+      // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last)
+      // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS
+      return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1"
+      // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last)
+      //return cxtype_ref( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2"
+    }
+    static __device__ inline const cxtype
+    kernelAccessIcolConst( const fptype* buffer, const int icol )
+    {
+      const int ncolor = CPPProcess::ncolor; // the number of leading colors
+      const int nevt = gridDim.x * blockDim.x;
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last)
+      //return cxtype( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old"
+      // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last)
+      // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS
+      return cxtype( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1"
+      // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last)
+      //return cxtype( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2"
+    }
+  };
+#else
+  class HostAccessJamp
+  {
+  public:
+    static inline cxtype_sv&
+    kernelAccessIcol( cxtype_sv* buffer, const int icol )
+    {
+      return buffer[icol];
+    }
+    static inline cxtype_sv&
+    kernelAccessIcol( fptype* buffer, const int icol )
+    {
+      return reinterpret_cast<cxtype_sv*>( buffer )[icol];
+    }
+  };
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  void createNormalizedColorMatrix();
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifndef MGONGPUCPP_GPUIMPL
+  void
+  color_sum_cpu( fptype* allMEs,              // output: allMEs[nevt], add |M|^2 for this specific helicity
+                 const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity
+                 const int ievt0 );           // input: first event number in current C++ event page (for CUDA, ievt depends on threadid)
+#endif
+
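+  // Illustrative usage of color_sum_cpu (a sketch of the helicity loop in sigmaKin, see CPPProcess.cc):
+  //   cxtype_sv jamp_sv[nParity * ncolor] = {};           // jamps for one good helicity ihel
+  //   calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 );
+  //   color_sum_cpu( allMEs, jamp_sv, ievt00 );           // ADDS |M|^2 for this helicity to the running sum
+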
//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/diagram_boilerplate.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/diagram_boilerplate.h new file mode 100644 index 0000000000..96a34fb1bf --- /dev/null +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/diagram_boilerplate.h @@ -0,0 +1,103 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin. 
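+//
+// This header is #included at the top of each diagramXXX function in diagrams.h. For both the
+// GPU and C++ builds it defines the accessor typedefs (W_ACCESS, A_ACCESS, CD_ACCESS, CI_ACCESS,
+// J_ACCESS and, with multichannel support, NUM_ACCESS/DEN_ACCESS), the wavefunction pointers
+// w_fp[nwf], the coupling pointers COUPs (GPU only; C++ receives COUPs as an argument), the local
+// amplitude buffer amp_sv/amp_fp and, with multichannel support, the scalar channelId and the
+// numerators_sv/denominators_sv references used for single-diagram-enhancement weights.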
+ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-local-typedefs" // for CI_ACCESS and CD_ACCESS + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + + //------------- + // GPU only + //------------- + + //using namespace mg5amcGpu; + using W_ACCESS = DeviceAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = DeviceAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( channelIds ); +#endif + + // Wavefunctions + // Buffer wfs for one helicity and nevt events is a DeviceBufferSimple with ( nwf * nevt * nw6 * nx2 ) fptypes + // The striding between the nwf wavefunction buffers is ( nevt * nw6 * nx2 ) fptypes + // Internally diagramXXX methods pass a w_fp[iwf] to ixx/FFV methods (as argument 'fptype wavefunctions[]') + // Internally ixx/FFV methods call 'cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions )' and then use fi[iw6] + // This means that the fi pointer must point to a [RIRIRIRIRIRI] contiguous buffer of size nw6*nx2=12 + // The striding between events is nw6*nx2=12 and this is what W_ACCESS::kernelAccess must respect + // (En passant, note that this means that events cannot be contiguous in the present code, memory is not coalesced) + const int nevt = gridDim.x * blockDim.x; + fptype* w_fp[nwf]; + for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = wfs + iwf * nevt * nw6 * mgOnGpu::nx2; + + // Couplings + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic push +#pragma nv_diag_suppress 186 // e.g. 
<> +#endif + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic pop +#endif + const fptype* COUPs[nxcoup]; + for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; + +#else + + //------------- + // C++ only + //------------- + + //using namespace mg5amcCpu; + using W_ACCESS = HostAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = HostAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current SIMD event page (C++) + unsigned int channelId = *channelIds; +#endif + + // Wavefunctions + // Reinterpret wfs as "cxtype_sv w_sv[nwf][nw6]" and build "fptype* w_fp[nwf]" where "w_fp[iwf] = (fptype*)( w_sv[iwf] )" + fptype (*w_fp)[nw6 * neppV * mgOnGpu::nx2] = (fptype (*)[nw6 * neppV * mgOnGpu::nx2])(wfs); + +#endif + + //------------- + // GPU or C++ + //------------- + + // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) + cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram + fptype* amp_fp; // proof of concept for using fptype* in the interface + amp_fp = reinterpret_cast( amp_sv ); + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) + fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); +#else + assert( channelIds == nullptr ); + assert( numerators == nullptr ); + assert( denominators == nullptr ); +#endif /* clang-format on */ + +#pragma GCC diagnostic pop diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/runTest.cc b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/runTest.cc index 4eec5db13c..216a90a302 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/heft_gg_bb.sa/src/HelAmps_heft.h b/epochX/cudacpp/heft_gg_bb.sa/src/HelAmps_heft.h index 1b04401547..9e5b1e3584 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/src/HelAmps_heft.h +++ b/epochX/cudacpp/heft_gg_bb.sa/src/HelAmps_heft.h @@ -2,13 +2,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Sep 2010) for the MG5aMC backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -860,7 +860,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output wavefunction 'S3[6]' from the input wavefunctions V1[6], V2[6] - template + template __device__ INLINE void VVS3_3( const fptype allV1[], const fptype allV2[], @@ -873,7 +873,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ INLINE void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -886,7 +886,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ INLINE void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -898,7 +898,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ INLINE void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -911,7 +911,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ INLINE void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -924,7 +924,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], S3[6] - template + template __device__ INLINE void FFS2_0( const fptype allF1[], const fptype allF2[], @@ -936,7 +936,7 @@ namespace mg5amcCpu 
//========================================================================== // Compute the output wavefunction 'S3[6]' from the input wavefunctions V1[6], V2[6] - template + template __device__ void VVS3_3( const fptype allV1[], const fptype allV2[], @@ -949,7 +949,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* V1 = W_ACCESS::kernelAccessConst( allV1 ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* S3 = W_ACCESS::kernelAccess( allS3 ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P1[4] = { +cxreal( V1[0] ), +cxreal( V1[1] ), +cximag( V1[1] ), +cximag( V1[0] ) }; @@ -970,7 +970,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -983,7 +983,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P2[4] = { +cxreal( V2[0] ), +cxreal( V2[1] ), +cximag( V2[1] ), +cximag( V2[0] ) }; @@ -1008,7 +1008,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -1021,7 +1021,7 @@ namespace mg5amcCpu const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const cxtype_sv TMP8 = ( F1[2] * ( F2[4] * ( V3[2] + V3[5] ) + F2[5] * ( V3[3] + cI * V3[4] ) ) + ( F1[3] * ( F2[4] * ( V3[3] - cI * V3[4] ) + F2[5] * ( V3[2] - V3[5] ) ) + ( F1[4] * ( F2[2] * ( V3[2] - V3[5] ) - F2[3] * ( V3[3] + cI * V3[4] ) ) + F1[5] * ( F2[2] * ( -V3[3] + cI * V3[4] ) + F2[3] * ( V3[2] + V3[5] ) ) ) ) ); @@ -1033,7 +1033,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -1046,7 +1046,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F1 = W_ACCESS::kernelAccess( allF1 ); const cxtype cI = cxmake( 0., 1. 
); F1[0] = +F2[0] + V3[0]; @@ -1065,7 +1065,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -1078,7 +1078,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F2 = W_ACCESS::kernelAccess( allF2 ); const cxtype cI = cxmake( 0., 1. ); F2[0] = +F1[0] + V3[0]; @@ -1097,7 +1097,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], S3[6] - template + template __device__ void FFS2_0( const fptype allF1[], const fptype allF2[], @@ -1110,7 +1110,7 @@ namespace mg5amcCpu const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* S3 = W_ACCESS::kernelAccessConst( allS3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const cxtype_sv TMP9 = ( F1[2] * F2[2] + F1[3] * F2[3] + F1[4] * F2[4] + F1[5] * F2[5] ); diff --git a/epochX/cudacpp/heft_gg_bb.sa/src/Parameters_heft.cc b/epochX/cudacpp/heft_gg_bb.sa/src/Parameters_heft.cc index 0fa5a34cf0..a9b14b3a06 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/src/Parameters_heft.cc +++ b/epochX/cudacpp/heft_gg_bb.sa/src/Parameters_heft.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/heft_gg_bb.sa/src/Parameters_heft.h b/epochX/cudacpp/heft_gg_bb.sa/src/Parameters_heft.h index 0faa7bb71e..45c7bd04c2 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/src/Parameters_heft.h +++ b/epochX/cudacpp/heft_gg_bb.sa/src/Parameters_heft.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. 
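
The recurring change in the HelAmps_heft.h hunks above is a rename of the coupling access-policy template parameter from C_ACCESS to CD_ACCESS, documenting that these vertex kernels consume dependent (event-varying) couplings. The underlying pattern is compile-time policy injection: one kernel body serves both host and device because the access class passed as a template argument hides the memory layout. A minimal sketch under simplified types (ToyHostAccessCouplings and toyVertex are illustrative, not the generated code):

#include <complex>
using fptype = double;
using cxtype_sv = std::complex<fptype>; // scalar stand-in for the real SIMD/GPU vector type

// Toy policy: a dependent coupling stored as an [RI] pair for the current event
struct ToyHostAccessCouplings
{
  static cxtype_sv kernelAccessConst( const fptype* buffer )
  {
    return cxtype_sv( buffer[0], buffer[1] );
  }
};

// A vertex kernel parameterized on the access policy, as in the generated HelAmps code
template<class CD_ACCESS>
void toyVertex( const fptype allCOUP[], cxtype_sv& out )
{
  const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); // the policy decides the layout
  out = COUP; // real kernels combine COUP with the input wavefunctions here
}

// Usage: toyVertex<ToyHostAccessCouplings>( buf, result );
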
// Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -335,7 +335,7 @@ namespace mg5amcCpu #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #endif // Compute the output couplings (e.g. gc10 and gc11) from the input gs - template + template __device__ inline void G2COUP( const fptype gs[], fptype couplings[], @@ -345,12 +345,12 @@ namespace mg5amcCpu using namespace Parameters_heft_dependentCouplings; const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr ); - fptype* GC_13s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_13 ); - fptype* GC_10s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); - fptype* GC_11s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); - cxtype_sv_ref GC_13s_sv = C_ACCESS::kernelAccess( GC_13s ); - cxtype_sv_ref GC_10s_sv = C_ACCESS::kernelAccess( GC_10s ); - cxtype_sv_ref GC_11s_sv = C_ACCESS::kernelAccess( GC_11s ); + fptype* GC_13s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_13 ); + fptype* GC_10s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); + fptype* GC_11s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); + cxtype_sv_ref GC_13s_sv = CD_ACCESS::kernelAccess( GC_13s ); + cxtype_sv_ref GC_10s_sv = CD_ACCESS::kernelAccess( GC_10s ); + cxtype_sv_ref GC_11s_sv = CD_ACCESS::kernelAccess( GC_11s ); GC_13s_sv = couplings_sv.GC_13; GC_10s_sv = couplings_sv.GC_10; GC_11s_sv = couplings_sv.GC_11; diff --git a/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuConfig.h index d3c4ca5695..8a2804aa04 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. 
-DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for generating random numbers +// For both CUDA and HIP, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuCxtypes.h index 92d74fd6db..e98e925f2a 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. 
+// Further modified by: J. Teig, A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -717,12 +717,24 @@ namespace mg5amcCpu : m_preal( &r ), m_pimag( &i ) {} // copy (create from) const refs cxtype_ref& operator=( const cxtype_ref& ) = delete; //__host__ __device__ cxtype_ref& operator=( cxtype_ref&& c ) {...} // REMOVED! Should copy refs or copy values? No longer needed in cxternary - __host__ __device__ cxtype_ref& operator=( const cxtype& c ) + __host__ __device__ cxtype_ref& operator=( const cxtype& c ) // copy (assign) const values { *m_preal = cxreal( c ); *m_pimag = cximag( c ); return *this; - } // copy (assign) non-const values + } + __host__ __device__ cxtype_ref& operator+=( const cxtype& c ) + { + *m_preal += cxreal( c ); + *m_pimag += cximag( c ); + return *this; + } + __host__ __device__ cxtype_ref& operator-=( const cxtype& c ) + { + *m_preal -= cxreal( c ); + *m_pimag -= cximag( c ); + return *this; + } __host__ __device__ operator cxtype() const { return cxmake( *m_preal, *m_pimag ); } private: fptype* const m_preal; // const pointer to non-const fptype R diff --git a/epochX/cudacpp/heft_gg_bb.sa/test/cudacpp_test.mk b/epochX/cudacpp/heft_gg_bb.sa/test/cudacpp_test.mk index f703a1ae7c..1f9f8bbc46 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/heft_gg_bb.sa/test/cudacpp_test.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) @@ -19,7 +19,7 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt b/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt index 11380fe474..4faaccb09b 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt +++ b/epochX/cudacpp/nobm_pp_ttW.mad/CODEGEN_mad_nobm_pp_ttW_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.3 2025-06-12 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -49,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -57,7 +57,7 @@ set zerowidth_tchannel F import model sm-no_b_mass INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.006017446517944336  +DEBUG: model prefixing takes 0.005433559417724609  INFO: Restrict model sm-no_b_mass with file models/sm/restrict_no_b_mass.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -181,7 +181,7 @@ INFO: Process u~ d > t t~ w- added to mirror process d u~ > t t~ w- INFO: Process c~ s > t t~ w- added to mirror process s c~ > t t~ w- INFO: Process d~ u > t t~ w+ added to mirror process u d~ > t t~ w+ INFO: Process s~ c > t t~ w+ added to mirror process c s~ > t t~ w+ -4 processes with 8 diagrams generated in 0.107 s +4 processes with 8 diagrams generated in 0.110 s Total: 4 processes with 8 diagrams add process p p > t t~ w j @1 INFO: Checking for minimal orders which gives processes. @@ -223,21 +223,21 @@ INFO: Process d~ g > t t~ w+ u~ added to mirror process g d~ > t t~ w+ u~ INFO: Process d~ u > t t~ w+ g added to mirror process u d~ > t t~ w+ g INFO: Process s~ g > t t~ w+ c~ added to mirror process g s~ > t t~ w+ c~ INFO: Process s~ c > t t~ w+ g added to mirror process c s~ > t t~ w+ g -12 processes with 144 diagrams generated in 0.640 s +12 processes with 144 diagrams generated in 0.647 s Total: 16 processes with 152 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_nobm_pp_ttW --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4334]  +DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4166]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  INFO: initialize a new directory: CODEGEN_mad_nobm_pp_ttW INFO: remove old information in CODEGEN_mad_nobm_pp_ttW -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses  INFO: Organizing processes 
into subprocess groups INFO: Generating Helas calls for process: g u > t t~ w+ d WEIGHTED<=5 @1 INFO: Processing color information for process: g u > t t~ w+ d @1 @@ -271,9 +271,9 @@ FileWriter t t~ w+ d WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxwpd -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1665]  INFO: Creating files in directory P1_gd_ttxwmu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -282,9 +282,9 @@ FileWriter t t~ w- u WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gd_ttxwmu -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1665]  INFO: Creating files in directory P1_gux_ttxwmdx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -293,9 +293,9 @@ FileWriter t t~ w- d~ WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxwmdx -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1665]  INFO: Creating files in directory P1_gdx_ttxwpux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -304,9 +304,9 @@ FileWriter t t~ w+ u~ WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gdx_ttxwpux -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1665]  INFO: Creating files in directory P1_udx_ttxwpg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -315,9 +315,9 @@ FileWriter t t~ w+ g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group udx_ttxwpg -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1665]  INFO: Creating files in directory P1_dux_ttxwmg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -326,9 +326,9 @@ FileWriter t t~ w- g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group dux_ttxwmg -DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  12 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12} [model_handling.py at line 1665]  INFO: Creating files in directory P0_udx_ttxwp DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -337,9 +337,9 @@ FileWriter t t~ w+ WEIGHTED<=4 INFO: Finding symmetric diagrams for subprocess group udx_ttxwp -DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1665]  INFO: Creating files in directory P0_dux_ttxwm DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -348,21 +348,21 @@ FileWriter t t~ w- WEIGHTED<=4 INFO: Finding symmetric diagrams for subprocess group dux_ttxwm -DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1552]  -Generated helas calls for 8 subprocesses (76 diagrams) in 0.202 s -Wrote files for 212 helas calls in 0.830 s +DEBUG: len(subproc_diagrams_for_config) =  2 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2} [model_handling.py at line 1665]  +Generated helas calls for 8 subprocesses (76 diagrams) in 0.200 s +Wrote files for 212 helas calls in 0.918 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates VVV1 set of routines with options: P0 -ALOHA: aloha creates 3 routines in 0.204 s +ALOHA: aloha creates 3 routines in 0.198 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates VVV1 set of routines with options: P0 -ALOHA: aloha creates 6 routines in 0.200 s +ALOHA: aloha creates 6 routines in 0.195 s FFV1 FFV1 FFV1 @@ -370,73 +370,73 @@ ALOHA: aloha creates 6 routines in 0.200 s FFV2 FFV2 VVV1 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./HelAmps_sm_no_b_mass.h -INFO: Created file HelAmps_sm_no_b_mass.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./HelAmps_sm_no_b_mass.h +INFO: Created file HelAmps_sm_no_b_mass.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./Parameters_sm_no_b_mass.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./Parameters_sm_no_b_mass.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./Parameters_sm_no_b_mass.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/./Parameters_sm_no_b_mass.cc INFO: Created files Parameters_sm_no_b_mass.h and Parameters_sm_no_b_mass.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. 
+INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW; patch -p4 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file SubProcesses/makefile -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses/P0_dux_ttxwm; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses/P0_dux_ttxwm; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #1 succeeded at 72 (offset 1 line). Hunk #2 succeeded at 268 (offset 41 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses/P0_udx_ttxwp; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses/P0_udx_ttxwp; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #1 succeeded at 72 (offset 1 line). Hunk #2 succeeded at 268 (offset 41 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses/P1_dux_ttxwmg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses/P1_dux_ttxwmg; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #1 succeeded at 72 (offset 1 line). Hunk #2 succeeded at 316 (offset 89 lines). 
-DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses/P1_gd_ttxwmu; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses/P1_gd_ttxwmu; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #1 succeeded at 72 (offset 1 line). Hunk #2 succeeded at 316 (offset 89 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses/P1_gdx_ttxwpux; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses/P1_gdx_ttxwpux; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #1 succeeded at 72 (offset 1 line). Hunk #2 succeeded at 316 (offset 89 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses/P1_gu_ttxwpd; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses/P1_gu_ttxwpd; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #1 succeeded at 72 (offset 1 line). Hunk #2 succeeded at 316 (offset 89 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses/P1_gux_ttxwmdx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses/P1_gux_ttxwmdx; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #1 succeeded at 72 (offset 1 line). Hunk #2 succeeded at 316 (offset 89 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses/P1_udx_ttxwpg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/SubProcesses/P1_udx_ttxwpg; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #1 succeeded at 72 (offset 1 line). Hunk #2 succeeded at 316 (offset 89 lines). -DEBUG: p.returncode =  0 [output.py at line 263]  -Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW done. +DEBUG: p.returncode =  0 [output.py at line 268]  +Output to directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW done. 
Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/README +/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/README Run "open index.html" to see more information about this process. quit -real 0m4.658s -user 0m4.105s -sys 0m0.537s +real 0m4.835s +user 0m4.190s +sys 0m0.553s Code generation completed in 5 seconds ************************************************************ * * @@ -450,7 +450,7 @@ Code generation completed in 5 seconds * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.3 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -458,9 +458,9 @@ Code generation completed in 5 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt @@ -480,7 +480,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.3 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -488,9 +488,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_nobm_pp_ttW/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/me5_configuration.txt b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/me5_configuration.txt index 68b4c46295..07d8d59d1b 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/proc_card_mg5.dat index 72b31976a0..9d9e01b7c5 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.6.0 2024-09-30 * +#* VERSION 3.6.3 2025-06-12 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/run_card.dat b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/run_card.dat index 5eca3e3f2b..48beb899d9 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/run_card.dat +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/run_card.dat @@ -127,6 +127,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/run_card_default.dat b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/run_card_default.dat index 3b445d02a0..c22a9e0249 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Cards/run_card_default.dat @@ -127,6 +127,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! 
maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/MGMEVersion.txt b/epochX/cudacpp/nobm_pp_ttW.mad/MGMEVersion.txt index 084e244cea..1ac53bb4bd 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/MGMEVersion.txt +++ b/epochX/cudacpp/nobm_pp_ttW.mad/MGMEVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.3 \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Source/.make_opts b/epochX/cudacpp/nobm_pp_ttW.mad/Source/.make_opts index de3864242b..56ba259c56 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Source/.make_opts +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Source/.make_opts @@ -102,6 +102,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf + alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -113,10 +114,11 @@ ifneq ($(lhapdf),) endif else alfas_functions=alfas_functions + alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif # Helper function to check MG5 version define CHECK_MG5AMC_VERSION python -c 'import re; from distutils.version import StrictVersion; print StrictVersion("$(MG5AMC_VERSION)") >= StrictVersion("$(1)") if re.match("^[\d\.]+$$","$(MG5AMC_VERSION)") else True;' -endef \ No newline at end of file +endef diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Source/alfas_functions.f b/epochX/cudacpp/nobm_pp_ttW.mad/Source/alfas_functions.f index bb69a6384e..84aeff369c 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Source/alfas_functions.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Source/alfas_functions.f @@ -188,6 +188,10 @@ SUBROUTINE NEWTON1(T,A_IN,A_OUT,NLOOP,NF) A_OUT=A_IN/(1D0+A_IN*B0(NF)*T) IF (NLOOP .EQ. 1) RETURN + if (1D0+A_IN*B0(NF)*T.le.0d0)THEN + A_OUT = 9d98 + RETURN + ENDIF A_OUT=A_IN/(1D0+B0(NF)*A_IN*T+C1(NF)*A_IN*LOG(1D0+A_IN*B0(NF)*T)) IF (A_OUT .LT. 
0D0) AS=0.3D0 30 AS=A_OUT diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Source/cuts.inc b/epochX/cudacpp/nobm_pp_ttW.mad/Source/cuts.inc index 23d099e5f7..a8ccc7420d 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Source/cuts.inc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Source/cuts.inc @@ -37,7 +37,7 @@ C REAL*8 misset,missetmax,ptheavy REAL*8 ptllmin,ptllmax integer maxjetflavor - REAl*8 dsqrt_shat + REAl*8 dsqrt_shat,dsqrt_shatmax COMMON /to_min_max_cuts/ & PTJmax,PTBmax,PTAmax,PTLmax, @@ -60,7 +60,7 @@ C & ht2max,ht3max,ht4max, & htjmin,htjmax,ihtmin,ihtmax, & misset,missetmax,ptheavy, - & ptllmin,ptllmax,dsqrt_shat, + & ptllmin,ptllmax,dsqrt_shat,dsqrt_shatmax, & maxjetflavor C diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Source/make_opts b/epochX/cudacpp/nobm_pp_ttW.mad/Source/make_opts index e4b87ee6ad..f10336e42e 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Source/make_opts +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Source/make_opts @@ -103,6 +103,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf +alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -114,6 +115,7 @@ endif endif else alfas_functions=alfas_functions +alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Source/makefile b/epochX/cudacpp/nobm_pp_ttW.mad/Source/makefile index 291ca907ee..87a9e61723 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Source/makefile +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Source/makefile @@ -37,10 +37,12 @@ all: $(LIBRARIES) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDI $(LIBDIR)libdsample.$(libext): $(DSAMPLE) $(call CREATELIB, $@, $^) $(LIBDIR)libgeneric.$(libext): $(GENERIC) + rm -f $@ 2>/dev/null $(call CREATELIB, $@, $^) + rm -f $(alfas_to_clean) 2>/dev/null $(LIBDIR)libdhelas.$(libext): DHELAS cd DHELAS; make; cd .. -$(LIBDIR)libpdf.$(libext): PDF make_opts +$(LIBDIR)libpdf.$(libext): PDF $(alfas_functions).o cd PDF; make; cd .. ifneq (,$(filter edff chff, $(pdlabel1) $(pdlabel2))) $(LIBDIR)libgammaUPC.$(libext): PDF/gammaUPC @@ -73,6 +75,7 @@ $(BINDIR)gensudgrid: $(GENSUDGRID) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUP # Dependencies dsample.o: DiscreteSampler.o dsample.f genps.inc StringCast.o vector.inc +pawgraph.o: vector.inc DiscreteSampler.o: StringCast.o invarients.o: invarients.f genps.inc gen_ximprove.o: gen_ximprove.f run_config.inc run_card.inc diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/Source/run_card.inc b/epochX/cudacpp/nobm_pp_ttW.mad/Source/run_card.inc index 2588190439..e169c1f193 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/Source/run_card.inc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/Source/run_card.inc @@ -88,6 +88,8 @@ DSQRT_SHAT = 0.000000000000000D+00 + DSQRT_SHATMAX = -1 + LIMHEL = 0.000000000000000D+00 PTJ = 2.000000000000000D+01 diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/Bridge.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/Bridge.h index 87aa648dd2..a45024704a 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. 
+// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -255,11 +255,15 @@ namespace mg5amcCpu throw std::logic_error( "Bridge constructor: FIXME! cannot choose gputhreads" ); // this should never happen! m_gpublocks = m_nevt / m_gputhreads; } +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters @@ -290,8 +294,10 @@ namespace mg5amcCpu throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -347,7 +353,9 @@ namespace mg5amcCpu if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> ) @@ -400,7 +408,9 @@ namespace mg5amcCpu } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.
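
The Bridge.h hunks above wrap the "WARNING! Instantiate ... Bridge" printouts and the flagAbnormalMEs calls in a new MGONGPUCPP_VERBOSE guard, so this per-instantiation chatter is compiled out of default builds. A minimal sketch of the idiom (toyInstantiate is an illustrative function, not plugin code):

#include <iostream>

// Compile with -DMGONGPUCPP_VERBOSE to restore the diagnostic printout;
// in default builds the message (and its formatting cost) is absent from the binary.
void toyInstantiate( int nevt )
{
#ifdef MGONGPUCPP_VERBOSE
  std::cout << "WARNING! Instantiate host Bridge (nevt=" << nevt << ")" << std::endl;
#endif
  // ... kernel setup continues unconditionally ...
}
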
#ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, 
type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin.
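The gpuBlas* aliases above let one color-sum implementation compile against cuBLAS under nvcc and hipBLAS under hipcc, with the gpuBlasT* names resolving to the single- or double-precision entry points according to MGONGPU_FPTYPE2_FLOAT. A minimal usage sketch under stated assumptions: d_A, d_B, d_C are device arrays allocated elsewhere, fptype2 is the plugin's color-sum floating-point type, and checkGpuBlas is the checker from the GpuRuntime.h hunk that follows.

#ifndef MGONGPU_HAS_NO_BLAS
void gemmSketch( const fptype2* d_A, const fptype2* d_B, fptype2* d_C, int n )
{
  gpuBlasHandle_t handle;
  checkGpuBlas( gpuBlasCreate( &handle ) ); // cublasCreate or hipblasCreate, depending on the build
  const fptype2 alpha = 1, beta = 0;
  // C = A * B^T: resolves to cublas[SD]gemm or hipblas[SD]gemm via gpuBlasTgemm
  checkGpuBlas( gpuBlasTgemm( handle, GPUBLAS_OP_N, GPUBLAS_OP_T, n, n, n, &alpha, d_A, n, d_B, n, &beta, d_C, n ) );
  checkGpuBlas( gpuBlasDestroy( handle ) );
}
#endif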
#ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MGVersion.txt index 084e244cea..1ac53bb4bd 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.3 \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.cc index f463977c1a..a68ae314eb 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
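Since setUp and tearDown above now default to debug=false, the one-line initialization traces disappear from standard runs; callers can still opt back in explicitly. A usage sketch, assuming the enclosing GpuRuntime helper struct from the hunk above:

void runWithTraces() // hypothetical caller for illustration only
{
  mg5amcGpu::GpuRuntime::setUp( /*debug=*/true );    // re-enables the setup trace
  // ... launch kernels here ...
  mg5amcGpu::GpuRuntime::tearDown( /*debug=*/true ); // prints "__GpuRuntime: calling GpuDeviceReset()"
}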
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,28 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() + , m_pHelWfs() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_helBlasHandles() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +353,82 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); + // Create the "one-helicity" wavefunction buffer that will be used for helicity filtering + m_pHelWfs.reset( new DeviceBufferSimple( CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? 
+ std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { +#ifndef MGONGPU_HAS_NO_BLAS + if( m_helBlasHandles[ihel] ) gpuBlasDestroy( m_helBlasHandles[ihel] ); +#endif + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +445,61 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. 
Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity + // Attach a different stream to each cuBLAS/hipBLAS handle + if( m_blasColorSum ) + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + checkGpuBlas( gpuBlasCreate( &m_helBlasHandles[ighel] ) ); + checkGpuBlas( gpuBlasSetStream( m_helBlasHandles[ighel], m_helStreams[ighel] ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_helBlasHandles[ighel], CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelWfs.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +507,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* ghelBlasHandles = ( m_blasColorSum ? m_helBlasHandles : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* ghelBlasHandles = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +527,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? 
m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.h index 7acff4b308..c901874333 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include <memory> #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] - static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,24 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelJamps; + + // The super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelWfs; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +220,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel 
cuBLAS/hipBLAS temporary buffers + std::unique_ptr<DeviceBufferSimple2> m_pHelBlasTmp; + + // The array of cuBLAS/hipBLAS handles (one for each good helicity) + gpuBlasHandle_t m_helBlasHandles[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryAccessAmplitudes.h index 0d92f69c43..a49f041e05 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessAmplitudes_H #define MemoryAccessAmplitudes_H 1 @@ -10,10 +10,6 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_AMPLITUDES 1 - // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -23,120 +19,11 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // A class describing the internal layout of memory buffers for amplitudes - // This implementation uses an AOSOA[npagA][nx2][neppA] where nevt=npagA*neppA - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessAmplitudesBase //_AOSOAv1 - { - public: - - // Number of Events Per Page in the amplitude AOSOA memory buffer layout - static constexpr int neppA = 1; // AOS (just a test...) 
- - private: - - friend class MemoryAccessHelper<MemoryAccessAmplitudesBase>; - friend class KernelAccessHelper<MemoryAccessAmplitudesBase, true>; - friend class KernelAccessHelper<MemoryAccessAmplitudesBase, false>; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) - { - const int ipagA = ievt / neppA; // #event "A-page" - const int ieppA = ievt % neppA; // #event in the current event A-page - constexpr int ix2 = 0; - return &( buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA] ); // AOSOA[ipagA][ix2][ieppA] - } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... args" to "const int ix2" and rename "Field" as "Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int ix2 ) - { - constexpr int ipagA = 0; - constexpr int ieppA = 0; - return buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA]; // AOSOA[ipagA][ix2][ieppA] - } - }; - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessAmplitudes : public MemoryAccessAmplitudesBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper<MemoryAccessAmplitudesBase>::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper<MemoryAccessAmplitudesBase>::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2 = MemoryAccessHelper<MemoryAccessAmplitudesBase>::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2Const = - MemoryAccessHelper<MemoryAccessAmplitudesBase>::template decodeRecordConst<int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt, 
const int ix2 ) <===] - static constexpr auto ieventAccessIx2 = - MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessField<int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===] - static constexpr auto ieventAccessIx2Const = - MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessFieldConst<int>; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations + // A class providing trivial access to amplitude memory buffers template<bool onDevice> class KernelAccessAmplitudes { public: - -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2 = - KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessField<int>; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2Const = - KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessFieldConst<int>; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { @@ -148,8 +35,6 @@ namespace mg5amcCpu { return reinterpret_cast<const cxtype_sv*>( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES }; //---------------------------------------------------------------------------- diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryAccessCouplings.h index 7b12b981ec..8a49e698cb 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryAccessCouplings.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
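With MGONGPU_TRIVIAL_AMPLITUDES gone, amplitude access reduces to the reinterpret-cast accessors kept above: a one-event amplitude buffer is read directly as complex SIMD values, with no AOSOA index arithmetic. A self-contained sketch of the surviving pattern (the Sketch suffix marks the class as illustrative; fptype and cxtype_sv are the plugin's scalar and complex SIMD types):

template<bool onDevice>
class KernelAccessAmplitudesSketch
{
public:
  static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer )
  {
    return reinterpret_cast<cxtype_sv*>( buffer ); // one-event buffer: no event indexing needed
  }
  static __host__ __device__ inline const cxtype_sv* kernelAccessConst( const fptype* buffer )
  {
    return reinterpret_cast<const cxtype_sv*>( buffer );
  }
};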
#ifndef MemoryAccessCouplings_H #define MemoryAccessCouplings_H 1 @@ -235,7 +235,7 @@ namespace mg5amcCpu /* fptype_sv& real = kernelAccessIx2( buffer, 0 ); fptype_sv& imag = kernelAccessIx2( buffer, 1 ); - printf( "C_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv_ref( real, imag ); */ return cxtype_sv_ref( kernelAccessIx2( buffer, 0 ), @@ -250,7 +250,7 @@ namespace mg5amcCpu /* const fptype_sv& real = kernelAccessIx2Const( buffer, 0 ); const fptype_sv& imag = kernelAccessIx2Const( buffer, 1 ); - printf( "C_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv( real, imag ); */ return cxtype_sv( kernelAccessIx2Const( buffer, 0 ), diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryAccessWavefunctions.h index 9f4c620bc7..bbffc1fb36 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessWavefunctions_H #define MemoryAccessWavefunctions_H 1 @@ -10,9 +10,7 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 +#include "CPPProcess.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL @@ -23,147 +21,44 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // A class describing the internal layout of memory buffers for wavefunctions - // This implementation uses an AOSOA[npagW][nw6][nx2][neppW] where nevt=npagW*neppW - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessWavefunctionsBase //_AOSOAv1 +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessWavefunctions { public: - - // Number of Events Per Page in the wavefunction AOSOA memory buffer layout - static constexpr int neppW = 1; // AOS (just a test...) 
- - private: - - friend class MemoryAccessHelper<MemoryAccessWavefunctionsBase>; - friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, true>; - friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, false>; - - // The number of components of a (fermion or vector) wavefunction - static constexpr int nw6 = mgOnGpu::nw6; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) + static __host__ __device__ inline cxtype_sv* + kernelAccess( fptype* buffer ) { - const int ipagW = ievt / neppW; // #event "W-page" - const int ieppW = ievt % neppW; // #event in the current event W-page - constexpr int iw6 = 0; - constexpr int ix2 = 0; - return &( buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW] ); // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast<cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... 
args" to "const int iw6, const int ix2" and rename "Field" as "Iw6Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int iw6, - const int ix2 ) + static __host__ __device__ inline const cxtype_sv* + kernelAccessConst( const fptype* buffer ) { - constexpr int ipagW = 0; - constexpr int ieppW = 0; - return buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW]; // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast<const cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } }; +#endif //---------------------------------------------------------------------------- - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessWavefunctions : public MemoryAccessWavefunctionsBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2 = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2Const = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template decodeRecordConst<int, int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIw6Ix2( fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2 = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessField<int, int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIw6Ix2Const( const fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2Const = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessFieldConst<int, int>; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations - template<bool onDevice> - class KernelAccessWavefunctions + class HostAccessWavefunctions { public: -
(input) - // [Signature (non-const) ===> fptype& kernelAccessIw6Ix2( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2 = - KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessField<int, int>; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIw6Ix2Const( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2Const = - KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessFieldConst<int, int>; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { return reinterpret_cast<cxtype_sv*>( buffer ); } - static __host__ __device__ inline const cxtype_sv* kernelAccessConst( const fptype* buffer ) { return reinterpret_cast<const cxtype_sv*>( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS }; //---------------------------------------------------------------------------- - typedef KernelAccessWavefunctions<false> HostAccessWavefunctions; - typedef KernelAccessWavefunctions<true> DeviceAccessWavefunctions; - - //---------------------------------------------------------------------------- - } // end namespace mg5amcGpu/mg5amcCpu #endif // MemoryAccessWavefunctions_H diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryBuffers.h index 1e7cc050f7..129dd0150a 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
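The new DeviceAccessWavefunctions above replaces the AOSOA helper machinery with a flat per-event stride: each GPU thread finds its wavefunction record at an offset of nw6*nx2 fptypes times its event index. A host-side sketch of the same arithmetic, assuming nw6=6 components per wavefunction and nx2=2 real/imaginary parts (the values behind CPPProcess::nw6 and mgOnGpu::nx2):

inline const double* eventWavefunctionRecord( const double* buffer, int ievt )
{
  constexpr int nw6 = 6; // components per wavefunction (see CPPProcess::nw6)
  constexpr int nx2 = 2; // real and imaginary parts (see mgOnGpu::nx2)
  return buffer + ievt * nw6 * nx2; // start of the record for event ievt
}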
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_sm_no_b_mass_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase<T>( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase<T>( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template<typename T, size_t sizePerEvent> - class DeviceBuffer : public DeviceBufferBase<T>, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase<T>, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase<T>( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase<T>( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer<fptype, 1> DeviceBufferSimple; + typedef DeviceBuffer<fptype2, 1> DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase<fptype> BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer<fptype, sizePerEventNumerators> HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer<fptype, sizePerEventNumerators> PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer<fptype, sizePerEventNumerators> DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer<fptype, sizePerEventDenominators> HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef 
PinnedHostBuffer<fptype, sizePerEventDenominators> PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer<fptype, sizePerEventDenominators> DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer<fptype, sizePerEventCouplings> HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer<fptype, sizePerEventCouplings> PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer<fptype, sizePerEventCouplings> DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer<fptype, sizePerEventJamps> DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template<class Tdst, class Tsrc> void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.cc index 97050f0aa2..6099d099a9 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm_no_b_mass.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -97,20 +99,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_no_b_mass_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_no_b_mass_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 2; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -169,57 +167,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR 
channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef 
MGONGPUCPP_GPUIMPL - using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -227,288 +280,137 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 7; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
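For illustration, a minimal standalone C++ sketch (hypothetical names, not part of the generated code) of the "non-trivial access" pattern used by accessor classes such as DeviceAccessJamp2 above: a quantity indexed by color and event is stored as a structure-of-arrays buffer[icol * nevt + ievt], so that the event index runs fastest and consecutive GPU threads (consecutive ievt) touch consecutive memory addresses, while "trivial access" is a plain local buffer for one event only.

#include <cassert>

// Hypothetical host-side emulation of the kernelAccessIcol indexing above, for illustration only
inline double& accessIcolIevt( double* buffer, int icol, int ievt, int nevt )
{
  return buffer[icol * nevt + ievt]; // same formula as DeviceAccessJamp2::kernelAccessIcol, with nevt passed explicitly
}

int main()
{
  const int nevt = 4;
  double jamp2s[2 * nevt] = {}; // ncolor=2 colors times nevt=4 events, color-major
  accessIcolIevt( jamp2s, 1, 3, nevt ) = 42.; // write the jamp2 of color 1 for event 3
  assert( jamp2s[1 * nevt + 3] == 42. );
  return 0;
}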
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g. 
<> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); + // --------------- + // --- 
MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - - // *** DIAGRAM 1 OF 2 *** - - // Wavefunction(s) for diagram number 1 - ixxxxx( momenta, 0., cHel[ihel][0], +1, w_fp[0], 0 ); - - oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - vxxxxx( momenta, cIPD[1], cHel[ihel][4], +1, w_fp[4], 4 ); - - FFV2_2( w_fp[0], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[5] ); - FFV1P0_3( w_fp[3], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[6] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[0] += 1. / 6. * amp_sv[0]; - jamp_sv[1] -= 1. / 2. * amp_sv[0]; - // *** DIAGRAM 2 OF 2 *** - - // Wavefunction(s) for diagram number 2 - FFV2_1( w_fp[1], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[5] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[0], w_fp[5], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 6. * amp_sv[0]; - jamp_sv[1] -= 1. / 2. * amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_dux_ttxwm()?) 
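For illustration, a minimal scalar C++ sketch (hypothetical helper, not the generated SIMD/CUDA code) of the color-flow bookkeeping visible in the removed lines above: each Feynman diagram yields one invariant amplitude, which is folded into the ncolor=2 leading-color partial amplitudes jamp[icol] with process-specific rational coefficients (here 1/6 and -1/2 for both diagrams of this process).

#include <complex>

using cxtype = std::complex<double>; // scalar stand-in for the cxtype_sv SIMD vector type

// Hypothetical helper: fold one diagram's invariant amplitude into the two color flows
void addDiagramToJamps( cxtype jamp[2], const cxtype& amp )
{
  jamp[0] += 1. / 6. * amp; // coefficient of this diagram in color flow 0
  jamp[1] -= 1. / 2. * amp; // coefficient of this diagram in color flow 1
}

int main()
{
  cxtype jamp[2] = {}; // reset color flows, then accumulate both diagrams
  addDiagramToJamps( jamp, cxtype( 1., 0. ) ); // amplitude of diagram 1 (dummy value)
  addDiagramToJamps( jamp, cxtype( 0., 1. ) ); // amplitude of diagram 2 (dummy value)
  return 0;
}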
- - // The color denominators (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1 }; // 1-D array[2] - - // The color matrix (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 9, 3 }, - { 3, 9 } }; // 2-D array[2][2] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... 
icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef) + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined (#ifndef) + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - 
// NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 2 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -579,7 +481,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -613,6 +519,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MW ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_MW, (fptype)m_pars->mdl_WT }; @@ -655,6 +565,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm_no_b_mass::mdl_MT ); m_masses.push_back( Parameters_sm_no_b_mass::mdl_MT ); m_masses.push_back( Parameters_sm_no_b_mass::mdl_MW ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -757,26 +671,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = 
MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings, bsmIndepParam ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -784,25 +698,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! 
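+ // (A plain 'kernelAccessIcol( colAllJamp2s, icol ) += ...' would be a non-atomic read-modify-write:
+ // with one good helicity per stream, several instances of this kernel may run concurrently and
+ // accumulate into the same colAllJamp2s super-buffer, so the per-color running sum over helicities
+ // below must use an atomic update to avoid losing contributions from other streams)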
+ atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection 
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -938,22 +1056,16 @@ namespace mg5amcCpu // These variable are not used anywhere else in the code and their scope is limited to this sanity check { // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) - constexpr int nprocesses = 2; + constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -965,17 +1077,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 
0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1001,93 +1116,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) 
Then, In multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? 
ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1129,7 +1214,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1152,7 +1237,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1161,25 +1246,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1189,8 +1280,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1206,11 +1299,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1312,14 +1406,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.h index 9d6c262053..3837ab2e64 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The 
MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm_no_b_mass.h" #include @@ -76,17 +77,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 48; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 2; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 2; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 7; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 5; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 2; //static const int ncomb = 48; // CPPProcess::ncomb @@ -123,23 +124,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -153,34 +157,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators 
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/auto_dsig.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/auto_dsig.f index 7f7324dc0b..bcd37d9641 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/auto_dsig.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/auto_dsig1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/auto_dsig1.f index 08dd1f728a..27f2a0aae3 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/auto_dsig1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -138,7 +138,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF D1=PDG2PDF(LPP(IB(1)),1, IB(1),XBK(IB(1)), QSCALE) S1=PDG2PDF(LPP(IB(1)),3, IB(1),XBK(IB(1)), QSCALE) @@ -146,7 +146,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) UX2=PDG2PDF(LPP(IB(2)),-2, IB(2),XBK(IB(2)), QSCALE) @@ -225,7 +225,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -297,6 +297,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -380,16 +384,16 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) D1(IVEC)=PDG2PDF(LPP(IB(1)),1, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) S1(IVEC)=PDG2PDF(LPP(IB(1)),3, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) UX2(IVEC)=PDG2PDF(LPP(IB(2)),-2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -501,6 +505,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/color_sum.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/color_sum.cc new file mode 100644 index 0000000000..bbee00495a --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/color_sum.cc @@ -0,0 +1,383 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. 
Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + --------------------------------------------------------------------------
 + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1 }; // 1-D array[2] + + // The color matrix (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 9, 3 }, + { 3, 9 } }; // 2-D array[2][2] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template<typename T> + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the upper diagonal part of the matrix.
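[As a standalone cross-check of the identity in the comment above, the following minimal host-only program is an editorial sketch (arbitrary jamp test values, double precision, no SIMD; it is not part of the generated color_sum.cc). It verifies that the triangular form with doubled off-diagonal terms reproduces the full quadratic form.]

#include <cassert>
#include <cmath>
#include <complex>

int main()
{
  constexpr int ncolor = 2;
  constexpr double colorDenom[ncolor] = { 1, 1 };
  constexpr double colorMatrix[ncolor][ncolor] = { { 9, 3 }, { 3, 9 } };
  const std::complex<double> jamp[ncolor] = { { 0.1, -0.2 }, { 0.3, 0.4 } }; // arbitrary test values
  // Full quadratic form: Re[ sum_ij conj(jamp_i) * M_ij * jamp_j / denom_i ]
  double meFull = 0;
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ )
      meFull += ( std::conj( jamp[i] ) * colorMatrix[i][j] * jamp[j] ).real() / colorDenom[i];
  // Triangular form: diagonal terms once, off-diagonal terms doubled (M is real and symmetric)
  double meTri = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztempR = colorMatrix[i][i] / colorDenom[i] * jamp[i].real();
    double ztempI = colorMatrix[i][i] / colorDenom[i] * jamp[i].imag();
    for( int j = i + 1; j < ncolor; j++ )
    {
      ztempR += 2 * colorMatrix[i][j] / colorDenom[i] * jamp[j].real();
      ztempI += 2 * colorMatrix[i][j] / colorDenom[i] * jamp[j].imag();
    }
    meTri += jamp[i].real() * ztempR + jamp[i].imag() * ztempI;
  }
  assert( std::abs( meFull - meTri ) < 1e-12 ); // AMA + BMB: the imaginary cross terms cancel
  return 0;
}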
+ // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = 
jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one 
fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/color_sum.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/diagram_boilerplate.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/diagrams.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/diagrams.h new file mode 100644 index 0000000000..bb3b936ca7 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/diagrams.h @@ -0,0 +1,83 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
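[One remark on the two BLAS calls in color_sum_blas above before the per-diagram kernels: step 1 is a GEMM computing Ztemp = NormColMat * Jamps (real and imaginary parts separately), step 2 is a batched 1x1 GEMM with beta=1 adding the per-event dot product Jamp(:,ievt) . Ztemp(:,ievt) to the running MEs. The host-side reference below is an editorial sketch under the "new1" striding jamps[ix2*ncolor*nevt + icol*nevt + ievt]; colorSumReference is a hypothetical name, double precision only, and the normalized color matrix is assumed symmetric here so its row/column-major layout does not matter.]

#include <vector>

void colorSumReference( double* allMEs,           // in/out: allMEs[nevt], running sum over helicities
                        const double* allJamps,   // input: jamps[2*ncolor*nevt], "new1" striding (real block, then imag block)
                        const double* normColMat, // input: normalized color matrix [ncolor*ncolor]
                        int ncolor,
                        int nevt )
{
  std::vector<double> ztemp( 2 * ncolor * nevt );
  for( int ix2 = 0; ix2 < 2; ix2++ ) // 0 = real parts, 1 = imaginary parts
    for( int ievt = 0; ievt < nevt; ievt++ )
    {
      // Step 1 (GEMM): Ztemp(icol,ievt) = sum_jcol NormColMat(icol,jcol) * Jamp(jcol,ievt)
      for( int icol = 0; icol < ncolor; icol++ )
      {
        double sum = 0;
        for( int jcol = 0; jcol < ncolor; jcol++ )
          sum += normColMat[icol * ncolor + jcol] * allJamps[ix2 * ncolor * nevt + jcol * nevt + ievt];
        ztemp[ix2 * ncolor * nevt + icol * nevt + ievt] = sum;
      }
      // Step 2 (batched 1x1 GEMM with beta=1): ME(ievt) += Jamp(:,ievt) dot Ztemp(:,ievt)
      double dot = 0;
      for( int icol = 0; icol < ncolor; icol++ )
        dot += allJamps[ix2 * ncolor * nevt + icol * nevt + ievt] * ztemp[ix2 * ncolor * nevt + icol * nevt + ievt];
      allMEs[ievt] += dot;
    }
}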
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb-1) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 2 *** + // Wavefunction(s) for diagram number 1 + ixxxxx( momenta, 0., cHel[ihel][0], +1, w_fp[0], 0 ); + oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + vxxxxx( momenta, cIPD[1], cHel[ihel][4], +1, w_fp[4], 4 ); + FFV2_2( w_fp[0], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[5] ); + FFV1P0_3( w_fp[3], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[6] ); + // Amplitude(s) for diagram number 1 + FFV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2.
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 2 *** + // Wavefunction(s) for diagram number 2 + FFV2_1( w_fp[1], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[5] ); + // Amplitude(s) for diagram number 2 + FFV1_0( w_fp[0], w_fp[5], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/driver.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/driver.f index 531dfa0771..8963914a5c 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/driver.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/matrix1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/matrix1.f index 5c47e1c729..f8745a68c0 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/matrix1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_dux_ttxwm/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -355,7 +355,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -399,7 +399,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(1) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -444,23 +445,31 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 2) /9.000000000000000D+00 - $ ,3.000000000000000D+00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 2) /9,6/ C 1 T(2,1) T(3,4) - DATA (CF(I, 2),I= 1, 2) /3.000000000000000D+00 - $ ,9.000000000000000D+00/ + DATA (CF(I),I= 3, 3) /9/ C 1 T(2,4) T(3,1) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) - IF(MDL_WW.NE.0D0) FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW - $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + + IF(MDL_WW.NE.0D0) THEN + FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW + $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + ELSE + FK_MDL_WW = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -497,10 +506,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -509,6 +520,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.cc index 57246ba1e7..3a07e52836 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm_no_b_mass.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -97,20 +99,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_no_b_mass_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_no_b_mass_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 2; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -169,57 +167,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR 
channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef 
MGONGPUCPP_GPUIMPL - using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -227,288 +280,137 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 7; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g. 
<> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); + // --------------- + // --- 
MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - - // *** DIAGRAM 1 OF 2 *** - - // Wavefunction(s) for diagram number 1 - ixxxxx( momenta, 0., cHel[ihel][0], +1, w_fp[0], 0 ); - - oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - vxxxxx( momenta, cIPD[1], cHel[ihel][4], +1, w_fp[4], 4 ); - - FFV2_2( w_fp[0], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[5] ); - FFV1P0_3( w_fp[3], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[6] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[0] += 1. / 6. * amp_sv[0]; - jamp_sv[1] -= 1. / 2. * amp_sv[0]; - // *** DIAGRAM 2 OF 2 *** - - // Wavefunction(s) for diagram number 2 - FFV2_1( w_fp[1], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[5] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[0], w_fp[5], w_fp[6], COUPs[0], 1.0, &_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 6. * amp_sv[0]; - jamp_sv[1] -= 1. / 2. * amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_udx_ttxwp()?) 
- - // The color denominators (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1 }; // 1-D array[2] - - // The color matrix (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 9, 3 }, - { 3, 9 } }; // 2-D array[2][2] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... 
icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - 
- } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 2 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -579,7 +481,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -613,6 +519,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MW ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_MW, (fptype)m_pars->mdl_WT }; @@ -655,6 +565,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm_no_b_mass::mdl_MT ); m_masses.push_back( Parameters_sm_no_b_mass::mdl_MT ); m_masses.push_back( Parameters_sm_no_b_mass::mdl_MW ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -757,26 +671,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -784,25 +698,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + assert( nevt >= neppV ); + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt < maxtry0) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + add_and_select_hel( int* allselhel, // output: helicity selection[nevt] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* allMEs, // output: allMEs[nevt], |M|^2 summed over good helicities + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //--------------------------------------------------------------------------
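(A self-contained sketch of the cumulative-sum random selection implemented by add_and_select_hel above, and reused for color selection in select_col below; names are illustrative. The first candidate whose running sum, taken as a fraction of the total, exceeds the random number r in [0,1) is selected:)

// Illustrative only: returns the 0-based index selected among n weights w[0..n-1]
static int sketch_selectByCumulative( const double r, const double* w, const int n )
{
  double total = 0;
  for( int i = 0; i < n; i++ ) total += w[i];
  double running = 0;
  for( int i = 0; i < n; i++ )
  {
    running += w[i];
    if( r < running / total ) return i;
  }
  return n - 1; // guard against rounding when r is very close to 1
}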
+ +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //--------------------------------------------------------------------------
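(A self-contained sketch of the per-color accumulation performed by the update_jamp2s kernel above, written for plain double arrays for one event and one helicity; cxabs2(z) is Re(z)^2 + Im(z)^2:)

// Illustrative only: jamp2 keeps the running sum of |jamp|^2 over helicities for each color
static void sketch_update_jamp2s( const double* jampRe, const double* jampIm, double* jamp2, const int ncolor )
{
  for( int icol = 0; icol < ncolor; icol++ )
    jamp2[icol] += jampRe[icol] * jampRe[icol] + jampIm[icol] * jampIm[icol]; // += cxabs2( jamp[icol] )
}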
+ +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -938,22 +1056,16 @@ namespace mg5amcCpu // These variable are not used anywhere else in the code and their scope is limited to this sanity check { // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) - constexpr int nprocesses = 2; + constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -965,17 +1077,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] =
0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1001,93 +1116,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes (jamps) for each helicity + // In multichannel mode, also compute the running sums over helicities of the multichannel numerators and denominators for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } }
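(A sketch of the per-helicity slicing of the ghelAll* super-buffers computed in the loop above, assuming mgOnGpu::nx2 == 2 counts the real and imaginary parts; the helper name is illustrative:)

// Illustrative only: good helicity ighel owns a contiguous [ncolor * nx2 * nevt] slice of the jamp super-buffer
static double* sketch_jampSliceForHelicity( double* ghelAllJamps, const int ighel, const int nevt, const int ncolor )
{
  const int nx2 = 2; // number of components (real and imaginary parts)
  return ghelAllJamps + ighel * nevt * ncolor * nx2; // same offset arithmetic as hAllJamps above
}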
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) Then, in multichannel mode, also compute the running sums over helicities of the squared jamps (jamp2s) within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? 
ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1129,7 +1214,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1152,7 +1237,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1161,25 +1246,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1189,8 +1280,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1206,11 +1299,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1312,14 +1406,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.h index cd8edd3e39..7ffb85326e 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The 
MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm_no_b_mass.h" #include @@ -76,17 +77,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 48; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 2; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 2; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 7; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 5; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 2; //static const int ncomb = 48; // CPPProcess::ncomb @@ -123,23 +124,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -153,34 +157,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators 
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/auto_dsig.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/auto_dsig.f index 2e439af0a3..b5e5d182dd 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/auto_dsig.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/auto_dsig1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/auto_dsig1.f index 0808ce67ce..6abc3b39fd 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/auto_dsig1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -138,7 +138,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF U1=PDG2PDF(LPP(IB(1)),2, IB(1),XBK(IB(1)), QSCALE) C1=PDG2PDF(LPP(IB(1)),4, IB(1),XBK(IB(1)), QSCALE) @@ -146,7 +146,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) DX2=PDG2PDF(LPP(IB(2)),-1, IB(2),XBK(IB(2)), QSCALE) @@ -225,7 +225,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -297,6 +297,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -380,16 +384,16 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) U1(IVEC)=PDG2PDF(LPP(IB(1)),2, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) C1(IVEC)=PDG2PDF(LPP(IB(1)),4, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -501,6 +505,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/color_sum.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/color_sum.cc new file mode 100644 index 0000000000..bbee00495a --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/color_sum.cc @@ -0,0 +1,383 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. 
Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1 }; // 1-D array[2] + + // The color matrix (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 9, 3 }, + { 3, 9 } }; // 2-D array[2][2] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template <typename T> + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the upper diagonal part of the matrix. 
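+ // (Worked illustration for ncolor=2: with the real symmetric M = { { 9, 3 }, { 3, 9 } } and jamp J = A + iB, + // conj(J)^T M J = A^T M A + B^T M B + i * ( A^T M B - B^T M A ), where the imaginary cross term vanishes + // because A^T M B == B^T M A for a symmetric M, so only the Re/Re and Im/Im products need to be summed.)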
+ // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = 
jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one 
fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use gemmStridedBatched (cuBLAS/hipBLAS) to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/color_sum.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/diagram_boilerplate.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/diagrams.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/diagrams.h new file mode 100644 index 0000000000..bb3b936ca7 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/diagrams.h @@ -0,0 +1,83 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
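As a reference for the algebra that color_sum_kernel and color_sum_blas above both implement: per event and per helicity, the two GEMM calls compute Ztemp = M*J for the real and imaginary parts separately, and the two strided-batched GEMMs then take the per-event dot products J.(M*J), exploiting the fact that for a real symmetric color matrix M the quadratic form (A-iB)ᵀM(A+iB) reduces to AᵀMA + BᵀMB. A minimal single-event sketch of the same computation (illustrative only, hypothetical helper name, not part of this patch):

```cpp
// Illustrative sketch: the color sum for ONE event, matching the loop structure
// of color_sum_kernel above. 'cm' is the normalized color matrix, i.e.
// cm[icol*ncolor+jcol] = cf[icol][jcol]/denom[icol]; jampR/jampI hold the real
// and imaginary parts of the ncolor partial amplitudes for one helicity.
double colorSumOneEvent( int ncolor, const double* cm, const double* jampR, const double* jampI )
{
  double deltaME = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    double ztempR = 0, ztempI = 0;
    for( int jcol = 0; jcol < ncolor; jcol++ )
    {
      ztempR += cm[icol * ncolor + jcol] * jampR[jcol]; // real part of (M*J)[icol]
      ztempI += cm[icol * ncolor + jcol] * jampI[jcol]; // imag part of (M*J)[icol]
    }
    deltaME += ztempR * jampR[icol] + ztempI * jampI[icol]; // J.(M*J), real by symmetry of M
  }
  return deltaME; // |M|^2 contribution of this helicity for this event
}
```

The BLAS path simply batches the inner matrix-vector product over all events in one GEMM, and the final dot products over all events in one strided-batched GEMM.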
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/color_sum.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/color_sum.h
new file mode 120000
index 0000000000..24b0157011
--- /dev/null
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/color_sum.h
@@ -0,0 +1 @@
+../color_sum.h
\ No newline at end of file
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/diagram_boilerplate.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/diagram_boilerplate.h
new file mode 120000
index 0000000000..e657b15c20
--- /dev/null
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/diagram_boilerplate.h
@@ -0,0 +1 @@
+../diagram_boilerplate.h
\ No newline at end of file
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/diagrams.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/diagrams.h
new file mode 100644
index 0000000000..bb3b936ca7
--- /dev/null
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/diagrams.h
@@ -0,0 +1,83 @@
+// Copyright (C) 2020-2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin.
+
+/* clang-format off */
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators,           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+            const fptype* momenta,          // input: momenta[npar*4*nevtORneppV]
+            const int ihel )                // input: helicity (0 to ncomb-1)
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+#ifdef MGONGPUCPP_GPUIMPL
+    using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events
+#else
+    using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events
+#endif
+    // *** DIAGRAM 1 OF 2 ***
+    // Wavefunction(s) for diagram number 1
+    ixxxxx( momenta, 0., cHel[ihel][0], +1, w_fp[0], 0 );
+    oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 );
+    oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 );
+    ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 );
+    vxxxxx( momenta, cIPD[1], cHel[ihel][4], +1, w_fp[4], 4 );
+    FFV2_2( w_fp[0], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[5] );
+    FFV1P0_3( w_fp[3], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[6] );
+    // Amplitude(s) for diagram number 1
+    FFV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 6. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram2( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 2 OF 2 ***
+    // Wavefunction(s) for diagram number 2
+    FFV2_1( w_fp[1], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[5] );
+    // Amplitude(s) for diagram number 2
+    FFV1_0( w_fp[0], w_fp[5], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 6. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+/* clang-format on */
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/driver.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/driver.f
index 531dfa0771..8963914a5c 100644
--- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/driver.f
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/driver.f
@@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened)
       fopened=.false.
       tempname=filename
       fine=index(tempname,' ')
-c     fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)"
       if(fine.eq.0) fine=len(tempname)
       open(unit=lun,file=tempname,status='old',ERR=20)
       fopened=.true.
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/matrix1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/matrix1.f
index bbf708250a..beb5bf4d5c 100644
--- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/matrix1.f
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P0_udx_ttxwp/matrix1.f
@@ -1,7 +1,7 @@
       SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
     $ ICOL)
C
-C     Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+C     Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
C     By the MadGraph5_aMC@NLO Development Team
C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
C
@@ -355,7 +355,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
       REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
C
-C     Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+C     Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
C     By the MadGraph5_aMC@NLO Development Team
C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
C
@@ -399,7 +399,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
C
       INTEGER I,J,M,N
       COMPLEX*16 ZTEMP, TMP_JAMP(1)
-      REAL*8 CF(NCOLOR,NCOLOR)
+      INTEGER CF(NCOLOR*(NCOLOR+1)/2)
+      INTEGER DENOM, CF_INDEX
       COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO)
       COMPLEX*16 W(6,NWAVEFUNCS)
C     Needed for v4 models
@@ -444,23 +445,31 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
C
C     COLOR DATA
C
-      DATA (CF(I,  1),I=  1,  2) /9.000000000000000D+00
-     $ ,3.000000000000000D+00/
+      DATA DENOM/1/
+      DATA (CF(I),I=  1,  2) /9,6/
C     1 T(2,1) T(3,4)
-      DATA (CF(I,  2),I=  1,  2) /3.000000000000000D+00
-     $ ,9.000000000000000D+00/
+      DATA (CF(I),I=  3,  3) /9/
C     1 T(2,4) T(3,1)
C     ----------
C     BEGIN CODE
C     ----------
      IF (FIRST) THEN
        FIRST=.FALSE.
-        IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO
-     $   *SMALL_WIDTH_TREATMENT)), ZERO)
-        IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT
-     $   *SMALL_WIDTH_TREATMENT)), MDL_WT)
-        IF(MDL_WW.NE.0D0) FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW
-     $   *SMALL_WIDTH_TREATMENT)), MDL_WW)
+        FK_ZERO = 0D0
+        IF(MDL_WT.NE.0D0) THEN
+          FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT
+     $     *SMALL_WIDTH_TREATMENT)), MDL_WT)
+        ELSE
+          FK_MDL_WT = 0D0
+        ENDIF
+
+        IF(MDL_WW.NE.0D0) THEN
+          FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW
+     $     *SMALL_WIDTH_TREATMENT)), MDL_WW)
+        ELSE
+          FK_MDL_WW = 0D0
+        ENDIF
+
        IF(INIT_MODE) THEN
          ZEROAMP_1(:,:) = .TRUE.
@@ -497,10 +506,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
      MATRIX1 = 0.D0
      DO M = 1, NAMPSO
+        CF_INDEX = 0
        DO I = 1, NCOLOR
          ZTEMP = (0.D0,0.D0)
-          DO J = 1, NCOLOR
-            ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M)
+          DO J = I, NCOLOR
+            CF_INDEX = CF_INDEX + 1
+            ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M)
          ENDDO
          DO N = 1, NAMPSO
@@ -509,6 +520,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
          ENDDO
        ENDDO
      ENDDO
+      MATRIX1 = MATRIX1/DENOM
      IF(SDE_STRAT.EQ.1)THEN
        AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1))
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.cc
index 3261780672..6c8a1c5d5b 100644
--- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.cc
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.cc
@@ -1,13 +1,13 @@
 // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors.
 // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend.
 //==========================================================================
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
@@ -16,6 +16,7 @@
 #include "mgOnGpuConfig.h"
 
+#include "GpuRuntime.h"
 #include "HelAmps_sm_no_b_mass.h"
 #include "MemoryAccessAmplitudes.h"
 #include "MemoryAccessChannelIds.h"
@@ -25,6 +26,7 @@
 #include "MemoryAccessMatrixElements.h"
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
+#include "color_sum.h"
 
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -97,20 +99,16 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
-  constexpr int nw6 = CPPProcess::nw6;     // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors)
-  constexpr int npar = CPPProcess::npar;   // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu-
-  constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar)
-
-  // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)]
-  //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z)
+  constexpr int nw6 = CPPProcess::nw6;       // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors)
+  constexpr int npar = CPPProcess::npar;     // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu-
+  constexpr int ncomb = CPPProcess::ncomb;   // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar)
+  constexpr int nwf = CPPProcess::nwf;       // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z)
+  constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors
 
   using Parameters_sm_no_b_mass_dependentCouplings::ndcoup;   // #couplings that vary event by event (depend on running alphas QCD)
   using Parameters_sm_no_b_mass_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD)
 
-  // The number of colors
-  constexpr int ncolor = 4;
-
-  // The number of SIMD vectors of events processed by calculate_wavefunction
+  // The number of SIMD vectors of events processed by calculate_jamps
 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
   constexpr int nParity = 2;
 #else
@@ -169,57 +167,112 @@ namespace mg5amcCpu
   // Helicity combinations (and filtering of "good" helicity combinations)
 #ifdef MGONGPUCPP_GPUIMPL
   __device__ __constant__ short cHel[ncomb][npar];
-  __device__ __constant__ int cNGoodHel;
-  __device__ __constant__ int cGoodHel[ncomb];
+  __device__ __constant__ int dcNGoodHel;
+  __device__ __constant__ int dcGoodHel[ncomb];
 #else
   static short cHel[ncomb][npar];
+#endif
   static int cNGoodHel;
   static int cGoodHel[ncomb];
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  class DeviceAccessJamp2
+  {
+  public:
+    static __device__ inline fptype&
+    kernelAccessIcol( fptype* buffer, const int icol )
+    {
+      const int nevt = gridDim.x * blockDim.x;
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      return buffer[icol * nevt + ievt];
+    }
+    static __device__ inline const fptype&
+    kernelAccessIcolConst( const fptype* buffer, const int icol )
+    {
+      const int nevt = gridDim.x * blockDim.x;
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      return buffer[icol * nevt + ievt];
+    }
+  };
 #endif
 
   //--------------------------------------------------------------------------
 
-  // Evaluate |M|^2 for each subprocess
-  // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s)
-  // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities)
-  // In CUDA, this device function computes the ME for a single event
-  // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2)
-  // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 ***
-  __device__ INLINE void /* clang-format off */
-  calculate_wavefunctions( int ihel,
-                           const fptype* allmomenta,   // input: momenta[nevt*npar*4]
-                           const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
-                           fptype* allMEs,             // output: allMEs[nevt], |M|^2 running_sum_over_helicities
+#ifdef MGONGPUCPP_GPUIMPL
+  __device__ INLINE unsigned int
+  gpu_channelId( const unsigned int* allChannelIds )
+  {
+    unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                           const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector
-                           fptype* allNumerators,        // output: multichannel numerators[nevt], running_sum_over_helicities
-                           fptype* allDenominators,      // output: multichannel denominators[nevt], running_sum_over_helicities
+    using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the current event (CUDA)
+    if( allChannelIds != nullptr )
+    {
+      const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds)
+      const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+      // NB: channelIds_sv is a scalar in CUDA
+      channelId = channelIds_sv;
+      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+    }
 #endif
-                           fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled)
-#ifndef MGONGPUCPP_GPUIMPL
-                           , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid)
+    return channelId;
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+#include "diagrams.h"
+
+  //--------------------------------------------------------------------------
+
+  // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams
+  // Also compute running sums over helicities adding jamp2, numerator, denominator
+  // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel)
+  // In CUDA, this function processes a single event
+  // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function)
+  // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input)
+  // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2)
+  // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+  INLINE void
+  calculate_jamps( int ihel,
+                   const fptype* allmomenta,   // input: momenta[nevt*npar*4]
+                   const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
+                   fptype* allJamps,           // output: jamp[ncolor*2*nevt] for this helicity
+                   fptype* allWfs,             // output: wf[nwf*nw6*2*nevt]
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+                   const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911)
+                   fptype* allNumerators,      // input/output: multichannel numerators[nevt], add helicity ihel
+                   fptype* allDenominators,    // input/output: multichannel denominators[nevt], add helicity ihel
+#endif
+                   gpuStream_t gpustream,      // input: cuda stream for this helicity
+                   const int gpublocks,        // input: cuda gpublocks
+                   const int gputhreads )      // input: cuda gputhreads
+#else
+  INLINE void
+  calculate_jamps( int ihel,
+                   const fptype* allmomenta,   // input: momenta[nevt*npar*4]
+                   const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
+                   cxtype_sv* jamp_sv,         // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+                   const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00
+                   fptype* allNumerators,      // input/output: multichannel numerators[nevt], add helicity ihel
+                   fptype* allDenominators,    // input/output: multichannel denominators[nevt], add helicity ihel
+#endif
+                   const int ievt00 )          // input: first event number in current C++ event page (for CUDA, ievt depends on threadid)
 #endif
-                           ) //ALWAYS_INLINE // attributes are not permitted in a function definition
   {
 #ifdef MGONGPUCPP_GPUIMPL
-    using namespace mg5amcGpu;
-    using M_ACCESS = DeviceAccessMomenta;        // non-trivial access: buffer includes all events
-    using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events
-    using W_ACCESS = DeviceAccessWavefunctions;  // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event
-    using A_ACCESS = DeviceAccessAmplitudes;     // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event
     using CD_ACCESS = DeviceAccessCouplings;     // non-trivial access (dependent couplings): buffer includes all events
-    using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
    using NUM_ACCESS = DeviceAccessNumerators;   // non-trivial access: buffer includes all events
    using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events
 #endif
 #else
-    using namespace mg5amcCpu;
    using M_ACCESS = HostAccessMomenta;          // non-trivial access: buffer includes all events
-    using E_ACCESS = HostAccessMatrixElements;   // non-trivial access: buffer includes all events
-    using W_ACCESS = HostAccessWavefunctions;    // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event
-    using A_ACCESS = HostAccessAmplitudes;       // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event
    using CD_ACCESS = HostAccessCouplings;       // non-trivial access (dependent couplings): buffer includes all events
    using CI_ACCESS = HostAccessCouplingsFixed;  // TRIVIAL access (independent couplings): buffer for one event
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
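A note on the memory layout behind these accessor classes: device buffers in this patch are structures-of-arrays, so that consecutive GPU threads (consecutive events) touch consecutive addresses. A minimal sketch of the "new1" jamp striding used by DeviceAccessJamp2, convertD2F_Jamps and the cuBLAS calls above (illustrative only, hypothetical helper, not part of the patch):

```cpp
// Illustrative sketch: index into a jamp buffer holding ncolor complex partial
// amplitudes per event over nevt events, split into real/imaginary planes:
//   jamp(icol,ievt).real -> buffer[0 * ncolor * nevt + icol * nevt + ievt]
//   jamp(icol,ievt).imag -> buffer[1 * ncolor * nevt + icol * nevt + ievt]
// Consecutive ievt (i.e. consecutive threads) map to consecutive addresses,
// which gives coalesced loads/stores on the GPU.
inline int jampIndex( int ix2, int icol, int ievt, int ncolor, int nevt )
{
  return ix2 * ncolor * nevt + icol * nevt + ievt; // ix2 = 0 (real) or 1 (imag)
}
```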
@@ -227,436 +280,157 @@ namespace mg5amcCpu
    using DEN_ACCESS = HostAccessDenominators;   // non-trivial access: buffer includes all events
 #endif
 #endif /* clang-format on */
-    mgDebug( 0, __FUNCTION__ );
-    //bool debug = true;
-#ifndef MGONGPUCPP_GPUIMPL
-    //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831
-    //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 );
-#endif
-    //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel );
-
-    // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here
-    // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result...
-    static const int nwf = 11; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z)
+    // ----------------------------
+    // --- WAVEFUNCTION BUFFERS ---
+    // ----------------------------
+#ifndef MGONGPUCPP_GPUIMPL
    // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV)
    // [NB these variables are reused several times (and re-initialised each time) within the same event or event page]
-    // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need
-    // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)!
-    //MemoryBufferWavefunctions w_buffer[nwf]{ neppV };
+    // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code
    cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles)
-    cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram
-
-    // Proof of concept for using fptype* in the interface
-    fptype* w_fp[nwf];
-    for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] );
-    fptype* amp_fp;
-    amp_fp = reinterpret_cast<fptype*>( amp_sv );
-
-    // Local variables for the given CUDA event (ievt) or C++ event page (ipagV)
-    // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination]
-    cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!)
+    fptype* wfs = reinterpret_cast<fptype*>( w_sv );
+#else
+    // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt)
+    // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting
+    fptype* wfs = allWfs;
+#endif
 
    // === Calculate wavefunctions and amplitudes for all diagrams in all processes ===
    // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ ===
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-    // Mixed fptypes #537: float for color algebra and double elsewhere
-    // Delay color algebra and ME updates (only on even pages)
-    cxtype_sv jamp_sv_previous[ncolor] = {};
-    fptype* MEs_previous = 0;
-#endif
+
+    // *****************************
+    // *** START LOOP ON IPARITY ***
+    // *****************************
    for( int iParity = 0; iParity < nParity; ++iParity )
-    { // START LOOP ON IPARITY
+    {
 #ifndef MGONGPUCPP_GPUIMPL
      const int ievt0 = ievt00 + iParity * neppV;
 #endif
-      //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823)
-      constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823)
-      const fptype* allCOUPs[nxcoup];
-#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
-#pragma nv_diagnostic push
-#pragma nv_diag_suppress 186 // e.g. <<pointless comparison of unsigned integer with zero>>
-#endif
-      for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
-        allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event
-      //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823
-      for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823
-        allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events
+
+      // -----------------
+      // --- COUPLINGS ---
+      // -----------------
 #ifdef MGONGPUCPP_GPUIMPL
-#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
-#pragma nv_diagnostic pop
-#endif
-      // CUDA kernels take input/output buffers with momenta/MEs for all events
-      const fptype* momenta = allmomenta;
-      const fptype* COUPs[nxcoup];
-      for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
-      fptype* MEs = allMEs;
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* numerators = allNumerators;
-      fptype* denominators = allDenominators;
-#endif
+      // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events
+      const fptype* couplings = allcouplings;
 #else
-      // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page)
-      const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 );
+      // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector
+      constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup)
+      const fptype* allCOUPs[nxcoup];
      const fptype* COUPs[nxcoup];
+      // Dependent couplings, vary event-by-event
      for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
-        COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event
-      //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823
-      for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823
-        COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events
-      fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
-      fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-#endif
-#endif
-
-      // Reset color flows (reset jamp_sv) at the beginning of a new event or event page
-      for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); }
-
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
-      fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
-      fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-#endif
-
-      // *** DIAGRAM 1 OF 12 ***
-
-      // Wavefunction(s) for diagram number 1
-      ixxxxx( momenta, 0., cHel[ihel][0], +1, w_fp[0], 0 );
-
-      oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 );
-
-      oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 );
-
-      ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 );
-
-      vxxxxx( momenta, cIPD[1], cHel[ihel][4], +1, w_fp[4], 4 );
-
-      vxxxxx( momenta, 0., cHel[ihel][5], +1, w_fp[5], 5 );
-
-      FFV1_2( w_fp[0], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[6] );
-      FFV1P0_3( w_fp[3], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[7] );
-      FFV2_2( w_fp[6], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[8] );
-
-      // Amplitude(s) for diagram number 1
-      FFV1_0( w_fp[8], w_fp[1], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[1] -= 1. / 2. * amp_sv[0];
-      jamp_sv[3] += 1. / 6. * amp_sv[0];
-
-      // *** DIAGRAM 2 OF 12 ***
-
-      // Wavefunction(s) for diagram number 2
-      FFV2_1( w_fp[1], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[8] );
-
-      // Amplitude(s) for diagram number 2
-      FFV1_0( w_fp[6], w_fp[8], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[1] -= 1. / 2. * amp_sv[0];
-      jamp_sv[3] += 1. / 6. * amp_sv[0];
-
-      // *** DIAGRAM 3 OF 12 ***
-
-      // Wavefunction(s) for diagram number 3
-      FFV1_1( w_fp[2], w_fp[5], COUPs[0], 1.0, cIPD[0], cIPD[2], w_fp[6] );
-      FFV2_2( w_fp[0], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[9] );
-      FFV1P0_3( w_fp[3], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[10] );
-
-      // Amplitude(s) for diagram number 3
-      FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[0] += 1. / 6. * amp_sv[0];
-      jamp_sv[1] -= 1. / 2. * amp_sv[0];
-
-      // *** DIAGRAM 4 OF 12 ***
-
-      // Wavefunction(s) for diagram number 4
-      // (none)
-
-      // Amplitude(s) for diagram number 4
-      FFV1_0( w_fp[0], w_fp[8], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[0] += 1. / 6. * amp_sv[0];
-      jamp_sv[1] -= 1. / 2. * amp_sv[0];
-
-      // *** DIAGRAM 5 OF 12 ***
-
-      // Wavefunction(s) for diagram number 5
-      FFV1_2( w_fp[3], w_fp[5], COUPs[0], 1.0, cIPD[0], cIPD[2], w_fp[10] );
-      FFV1P0_3( w_fp[10], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[3] );
-
-      // Amplitude(s) for diagram number 5
-      FFV1_0( w_fp[9], w_fp[1], w_fp[3], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[0] += 1. / 6. * amp_sv[0];
-      jamp_sv[2] -= 1. / 2. * amp_sv[0];
-
-      // *** DIAGRAM 6 OF 12 ***
-
-      // Wavefunction(s) for diagram number 6
-      // (none)
-
-      // Amplitude(s) for diagram number 6
-      FFV1_0( w_fp[0], w_fp[8], w_fp[3], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[0] += 1. / 6. * amp_sv[0];
-      jamp_sv[2] -= 1. / 2. * amp_sv[0];
-
-      // *** DIAGRAM 7 OF 12 ***
-
-      // Wavefunction(s) for diagram number 7
-      FFV1_1( w_fp[1], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[3] );
-
-      // Amplitude(s) for diagram number 7
-      FFV1_0( w_fp[9], w_fp[3], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[2] -= 1. / 2. * amp_sv[0];
-      jamp_sv[3] += 1. / 6. * amp_sv[0];
-
-      // *** DIAGRAM 8 OF 12 ***
-
-      // Wavefunction(s) for diagram number 8
-      FFV2_1( w_fp[3], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[10] );
-
-      // Amplitude(s) for diagram number 8
-      FFV1_0( w_fp[0], w_fp[10], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[2] -= 1. / 2. * amp_sv[0];
-      jamp_sv[3] += 1. / 6. * amp_sv[0];
-
-      // *** DIAGRAM 9 OF 12 ***
-
-      // Wavefunction(s) for diagram number 9
-      FFV1_2( w_fp[9], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[10] );
-
-      // Amplitude(s) for diagram number 9
-      FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+        allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup );
+      // Independent couplings, fixed for all events
+      for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup)
+        allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup );
+      // Dependent couplings, vary event-by-event
+      for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
+        COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 );
+      // Independent couplings, fixed for all events
+      for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup)
+        COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup];
 #endif
-      jamp_sv[1] -= 1. / 2. * amp_sv[0];
-      jamp_sv[3] += 1. / 6. * amp_sv[0];
-
-      // *** DIAGRAM 10 OF 12 ***
-
-      // Wavefunction(s) for diagram number 10
-      VVV1P0_1( w_fp[5], w_fp[7], COUPs[1], 1.0, 0., 0., w_fp[10] );
-      // Amplitude(s) for diagram number 10
-      FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      // ---------------
+      // --- MOMENTA ---
+      // ---------------
+#ifdef MGONGPUCPP_GPUIMPL
+      // CUDA diagram kernels take input/output buffers with momenta for all events
+      const fptype* momenta = allmomenta;
+#else
+      // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector
+      const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 );
 #endif
-      jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 11 OF 12 ***
-
-      // Wavefunction(s) for diagram number 11
-      // (none)
-
-      // Amplitude(s) for diagram number 11
-      FFV1_0( w_fp[0], w_fp[8], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      // -------------
+      // --- JAMPS ---
+      // -------------
+      // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel)
+#ifdef MGONGPUCPP_GPUIMPL
+      // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument
+      // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol])
+      fptype* jamps = allJamps;
+#else
+      // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument
+      // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol])
+      fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) );
 #endif
-      jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** DIAGRAM 12 OF 12 ***
-
-      // Wavefunction(s) for diagram number 12
-      FFV1_1( w_fp[8], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[10] );
-      // Amplitude(s) for diagram number 12
-      FFV1_0( w_fp[0], w_fp[10], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
+      // ------------------
+      // --- CHANNELIDS ---
+      // ------------------
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[2] -= 1. / 2. * amp_sv[0];
-      jamp_sv[3] += 1. / 6. * amp_sv[0];
-
-      // *** COLOR CHOICE BELOW ***
-      // Store the leading color flows for choice of color
-      if( jamp2_sv ) // disable color choice if nullptr
-        for( int icol = 0; icol < ncolor; icol++ )
-          jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831
-
-      // *** COLOR MATRIX BELOW ***
-      // (This method used to be called CPPProcess::matrix_1_dux_ttxwmg()?)
-
-      // The color denominators (initialize all array elements, with ncolor=4)
-      // [NB do keep 'static' for these constexpr arrays, see issue #283]
-      static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4]
-
-      // The color matrix (initialize all array elements, with ncolor=4)
-      // [NB do keep 'static' for these constexpr arrays, see issue #283]
-      static constexpr fptype2 cf[ncolor][ncolor] = {
-        { 12, 4, 4, 0 },
-        { 4, 12, 0, 4 },
-        { 4, 0, 12, 4 },
-        { 0, 4, 4, 12 } }; // 2-D array[4][4]
-
-#ifndef MGONGPUCPP_GPUIMPL
-      // Pre-compute a constexpr triangular color matrix properly normalized #475
-      struct TriangularNormalizedColorMatrix
-      {
-        // See https://stackoverflow.com/a/34465458
-        __host__ __device__ constexpr TriangularNormalizedColorMatrix()
-          : value()
-        {
-          for( int icol = 0; icol < ncolor; icol++ )
-          {
-            // Diagonal terms
-            value[icol][icol] = cf[icol][icol] / denom[icol];
-            // Off-diagonal terms
-            for( int jcol = icol + 1; jcol < ncolor; jcol++ )
-              value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol];
-          }
-        }
-        fptype2 value[ncolor][ncolor];
-      };
-      static constexpr auto cf2 = TriangularNormalizedColorMatrix();
-#endif
-
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-      if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages
-      {
-        // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV
-        for( int icol = 0; icol < ncolor; icol++ )
-          jamp_sv_previous[icol] = jamp_sv[icol];
-        MEs_previous = MEs;
-        continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages
-      }
-      fptype_sv deltaMEs_previous = { 0 };
-#endif
-
-      // Sum and square the color flows to get the matrix element
-      // (compute |M|^2 by squaring |M|, taking into account colours)
-      fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes
-
-      // Use the property that M is a real matrix (see #475):
-      // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB
-      // In addition, on C++ use the property that M is symmetric (see #475),
-      // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time:
-      // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix.
-      // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-      fptype2_sv jampR_sv[ncolor] = { 0 };
-      fptype2_sv jampI_sv[ncolor] = { 0 };
-      for( int icol = 0; icol < ncolor; icol++ )
-      {
-        jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) );
-        jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) );
-      }
-#endif
-      for( int icol = 0; icol < ncolor; icol++ )
-      {
-        //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol );
-#ifndef MGONGPUCPP_GPUIMPL
-        // === C++ START ===
-        // Diagonal terms
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-        fptype2_sv& jampRi_sv = jampR_sv[icol];
-        fptype2_sv& jampIi_sv = jampI_sv[icol];
+#ifdef MGONGPUCPP_GPUIMPL
+      // CUDA diagram kernels take input/output buffers with channelIDs for all events
+      const unsigned int* channelIds = allChannelIds;
 #else
-        fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) );
-        fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) );
+      // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector
+      const unsigned int* channelIds = &channelId;
 #endif
-        fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv;
-        fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv;
-        // Off-diagonal terms
-        for( int jcol = icol + 1; jcol < ncolor; jcol++ )
-        {
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-          fptype2_sv& jampRj_sv = jampR_sv[jcol];
-          fptype2_sv& jampIj_sv = jampI_sv[jcol];
 #else
-          fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) );
-          fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) );
+      // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+      // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+      const unsigned int* channelIds = nullptr;
 #endif
-          ztempR_sv += cf2.value[icol][jcol] * jampRj_sv;
-          ztempI_sv += cf2.value[icol][jcol] * jampIj_sv;
-        }
-        fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-        deltaMEs_previous += fpvsplit0( deltaMEs2 );
-        deltaMEs += fpvsplit1( deltaMEs2 );
+
+      // -------------------------------
+      // --- NUMERATORS/DENOMINATORS ---
+      // -------------------------------
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+#ifdef MGONGPUCPP_GPUIMPL
+      // CUDA diagram kernels take input/output buffers with numerators/denominators for all events
+      fptype* numerators = allNumerators;
+      fptype* denominators = allDenominators;
 #else
-        deltaMEs += deltaMEs2;
+      // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector
+      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+      fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
 #endif
-        // === C++ END ===
 #else
-        // === CUDA START ===
-        fptype2_sv ztempR_sv = { 0 };
-        fptype2_sv ztempI_sv = { 0 };
-        for( int jcol = 0; jcol < ncolor; jcol++ )
-        {
-          fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] );
-          fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] );
-          ztempR_sv += cf[icol][jcol] * jampRj_sv;
-          ztempI_sv += cf[icol][jcol] * jampIj_sv;
-        }
-        deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol];
-        // === CUDA END ===
+      // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+      // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+      fptype* numerators = nullptr;
+      fptype* denominators = nullptr;
 #endif
-      }
-      // *** STORE THE RESULTS ***
+      // ------------------------
+      // --- FEYNMAN DIAGRAMS ---
+      // ------------------------
 
-      // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s)
-      fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
-      MEs_sv += deltaMEs; // fix #435
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-      fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous );
-      MEs_sv_previous += deltaMEs_previous;
-#endif
-      /*
+      // *** DIAGRAMS 1 TO 12 ***
 #ifdef MGONGPUCPP_GPUIMPL
-      if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv );
+      gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel );
+      gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+      gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+      gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+      gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+      gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+      gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+      gpuLaunchKernelStream( diagram8, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+      gpuLaunchKernelStream( diagram9, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+      gpuLaunchKernelStream( diagram10, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+      gpuLaunchKernelStream( diagram11, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+      gpuLaunchKernelStream( diagram12, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
 #else
-#ifdef MGONGPU_CPPSIMD
-      if( cNGoodHel > 0 )
-        for( int ieppV = 0; ieppV < neppV; ieppV++ )
-          printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] );
-#else
-      if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv );
-#endif
+      diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel );
+      diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators );
+      diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators );
+      diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators );
+      diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators );
+      diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators );
+      diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators );
+      diagram8( wfs, jamps, channelIds, COUPs, numerators, denominators );
+      diagram9( wfs, jamps, channelIds, COUPs, numerators, denominators );
+      diagram10( wfs, jamps, channelIds, COUPs, numerators, denominators );
+      diagram11( wfs, jamps, channelIds, COUPs, numerators, denominators );
+      diagram12( wfs, jamps, channelIds, COUPs, numerators, denominators );
 #endif
-      */
-    } // END LOOP ON IPARITY
-    mgDebug( 1, __FUNCTION__ );
+    }
+    // *****************************
+    // ***  END LOOP ON IPARITY  ***
+    // *****************************
+    return;
   }
@@ -775,7 +549,11 @@ namespace mg5amcCpu
 #else
    memcpy( cHel, tHel, ncomb * npar * sizeof( short ) );
 #endif
-    fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions
+
+    // Enable SIGFPE traps for Floating Point Exceptions
+#ifdef MGONGPUCPP_DEBUG
+    fpeEnable();
+#endif
  }
 
  //--------------------------------------------------------------------------
@@ -810,6 +588,10 @@ namespace mg5amcCpu
    m_masses.push_back( m_pars->mdl_MT );
    m_masses.push_back( m_pars->mdl_MW );
    m_masses.push_back( m_pars->ZERO );
+#ifdef MGONGPUCPP_GPUIMPL
+    // Create the normalized color matrix in device memory
+    createNormalizedColorMatrix();
+#endif
    // Read physics parameters like masses and couplings from user configuration files (static: initialize once)
    // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory
    const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_MW, (fptype)m_pars->mdl_WT };
@@ -853,6 +635,10 @@ namespace mg5amcCpu
    m_masses.push_back( Parameters_sm_no_b_mass::mdl_MT );
    m_masses.push_back( Parameters_sm_no_b_mass::mdl_MW );
    m_masses.push_back( Parameters_sm_no_b_mass::ZERO );
+#ifdef MGONGPUCPP_GPUIMPL
+    // Create the normalized color matrix in device memory
+    createNormalizedColorMatrix();
+#endif
  }
 #endif
@@ -955,26 +741,26 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
    using namespace mg5amcGpu;
    using G_ACCESS = DeviceAccessGs;
-    using C_ACCESS = DeviceAccessCouplings;
-    G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam );
+    using CD_ACCESS = DeviceAccessCouplings;
+    G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam );
 #else
    using namespace mg5amcCpu;
    using G_ACCESS = HostAccessGs;
-    using C_ACCESS = HostAccessCouplings;
+    using CD_ACCESS = HostAccessCouplings;
    for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV )
    {
      const int ievt0 = ipagV * neppV;
      const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 );
      fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 );
-      G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam );
+      G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam );
    }
 #endif
  }
 
  //--------------------------------------------------------------------------
 
-#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
-  __global__ void
+#ifdef MGONGPUCPP_GPUIMPL
+  void /* clang-format off */
  sigmaKin_getGoodHel( const fptype* allmomenta,   // input: momenta[nevt*npar*4]
                       const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
                       fptype* allMEs,             // output: allMEs[nevt], |M|^2 final_avg_over_helicities
@@ -982,25 +768,40 @@ namespace mg5amcCpu
                       fptype* allNumerators,   // output: multichannel numerators[nevt], running_sum_over_helicities
                       fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
 #endif
-                       bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation)
-  { /* clang-format on */
-    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
+                       fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop)
+                       fptype* allWfs,   // tmp: wf[nwf*nw6*2*nevt]
+                       bool* isGoodHel,  // output: isGoodHel[ncomb] - host array
+                       const int nevt )  // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  { /* clang-format on */
+    const int maxtry0 = 16;
+    fptype hstMEs[maxtry0];
+    assert( nevt >= neppV );
+    const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt < maxtry0)
+    for( int ighel = 0; ighel < dcNGoodHel; ighel++ )
+    {
+      allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt];
+      ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection
+    }
+    // Event-by-event random choice of helicity #403
+    //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] );
+    for( int ighel = 0; ighel < dcNGoodHel; ighel++ )
+    {
+      if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) )
+      {
+        const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1]
+        allselhel[ievt] = ihelF;
+        //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF );
+        break;
+      }
+    }
+    return;
+  }
+#endif
+
+  //--------------------------------------------------------------------------
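Background for the update_jamp2s kernel that follows: with one CUDA/HIP stream per good helicity, several instances of the kernel may run concurrently, each adding |jamp(icol)|^2 for its own helicity into the same colAllJamp2s[icol][ievt] slot. A plain `+=` would be a cross-stream read-modify-write race, hence the atomicAdd. A self-contained sketch of this accumulation pattern (illustrative only, hypothetical kernel name, not part of the patch):

```cpp
#include <cuda_runtime.h>

// Illustrative sketch: per-helicity-stream accumulation of |jamp|^2 into a
// per-color buffer shared by all helicity streams. One instance of this kernel
// is launched per good helicity, each on its own stream; atomicAdd makes the
// concurrent += into jamp2[icol*nevt+ievt] race-free across streams.
__global__ void accumulateJamp2( float* jamp2,        // in/out: jamp2[ncolor*nevt], summed over helicities
                                 const float* jampRI, // input: jamps for THIS helicity, "new1" striding
                                 int ncolor )
{
  const int nevt = gridDim.x * blockDim.x;
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    const float jR = jampRI[0 * ncolor * nevt + icol * nevt + ievt]; // real part
    const float jI = jampRI[1 * ncolor * nevt + icol * nevt + ievt]; // imaginary part
    atomicAdd( &jamp2[icol * nevt + ievt], jR * jR + jI * jI );      // |jamp|^2, race-free
  }
}
```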
+ atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= 
// Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -1136,22 +1126,16 @@ namespace mg5amcCpu // These variable are not used anywhere else in the code and their scope is limited to this sanity check { // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) - constexpr int nprocesses = 2; + constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1163,17 +1147,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV )
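The gpuMemset calls in this initialisation step zero the per-helicity "super-buffers" before the running sums over helicities are accumulated: each super-buffer concatenates one slice per good helicity, and the slice for helicity index ighel starts at ighel times the per-helicity stride (nevt for MEs/numerators/denominators, ncolor*2*nevt for jamps, where the factor 2 is mgOnGpu::nx2 for real and imaginary parts). A sketch of the sizing arithmetic implied by these calls, with double standing in for fptype (illustrative only):

    constexpr int nx2 = 2; // real and imaginary parts (mgOnGpu::nx2)
    inline size_t jampsSuperBufferBytes( int nGoodHel, int ncolor, int nevt )
    {
      return (size_t)nGoodHel * ncolor * nx2 * nevt * sizeof( double ); // cf. the ghelAllJamps gpuMemset
    }
    inline double* jampsSlice( double* ghelAllJamps, int ighel, int ncolor, int nevt )
    {
      return ghelAllJamps + (size_t)ighel * nevt * ncolor * nx2; // one helicity's jamps, cf. hAllJamps below
    }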
@@ -1199,93 +1186,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamps for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators and denominators for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) 
Then, in multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? 
ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1327,7 +1284,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1350,7 +1307,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1359,25 +1316,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1387,8 +1350,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1404,11 +1369,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1510,14 +1476,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.h index ecb184f729..4dd1c0e001 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The 
MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm_no_b_mass.h" #include <vector> @@ -76,17 +77,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 96; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 12; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 4; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 11; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 6; // CPPProcess::npar (nexternal was nioparticles) //static const int nwavefuncs = 6; // (?!?! this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 12; //static const int ncomb = 96; // CPPProcess::ncomb
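The hardcoded ncomb = 96 above is consistent with the six external legs of this d u~ > t t~ w- g subprocess: 2 helicities for each of the four fermions, 3 polarizations for the massive W boson and 2 for the gluon. A one-line compile-time check of that counting (illustrative only, not part of the generated code):

    static_assert( 2 * 2 * 2 * 2 * 3 * 2 == 96, "ncomb = 2^4 fermion helicities x 3 W polarisations x 2 gluon polarisations" );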
@@ -123,23 +124,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -153,34 +157,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators 
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/auto_dsig.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/auto_dsig.f index 26d6979a1d..8840068613 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/auto_dsig.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/auto_dsig1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/auto_dsig1.f index 330b566ed8..110863be58 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/auto_dsig1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -138,7 +138,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF D1=PDG2PDF(LPP(IB(1)),1, IB(1),XBK(IB(1)), QSCALE) S1=PDG2PDF(LPP(IB(1)),3, IB(1),XBK(IB(1)), QSCALE) @@ -146,7 +146,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) UX2=PDG2PDF(LPP(IB(2)),-2, IB(2),XBK(IB(2)), QSCALE) @@ -225,7 +225,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -297,6 +297,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -380,16 +384,16 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) D1(IVEC)=PDG2PDF(LPP(IB(1)),1, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) S1(IVEC)=PDG2PDF(LPP(IB(1)),3, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) UX2(IVEC)=PDG2PDF(LPP(IB(2)),-2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -501,6 +505,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/color_sum.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/color_sum.cc new file mode 100644 index 0000000000..389d8f6535 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/color_sum.cc @@ -0,0 +1,385 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). 
+// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] + + // The color matrix (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 12, 4, 4, 0 }, + { 4, 12, 0, 4 }, + { 4, 0, 12, 4 }, + { 0, 4, 4, 12 } }; // 2-D array[4][4] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template<typename T> + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //--------------------------------------------------------------------------
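All of the color-sum implementations below evaluate the same real quadratic form: for the jamp vector J of one helicity, the contribution to |M|^2 is Re(J)^T Mn Re(J) + Im(J)^T Mn Im(J), where Mn is colorMatrix normalized by colorDenom; the imaginary cross terms cancel because the color matrix is real (see #475). A naive per-event reference for ncolor=4, useful as a cross-check of the optimized kernels, is sketched below (hypothetical helper, not part of the plugin):

    // Naive reference color sum for one event (illustrative cross-check only)
    #include <complex>
    double colorSumRef( const std::complex<double> jamp[4], const double colorMatrix[4][4], const double colorDenom[4] )
    {
      double me = 0;
      for( int i = 0; i < 4; i++ )
        for( int j = 0; j < 4; j++ )
          me += ( jamp[i].real() * colorMatrix[i][j] * jamp[j].real()
                  + jamp[i].imag() * colorMatrix[i][j] * jamp[j].imag() ) / colorDenom[i];
      return me;
    }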
+ +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the upper diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over 
jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // 
Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/color_sum.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/diagram_boilerplate.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/diagrams.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/diagrams.h new file mode 100644 index 0000000000..81fbede8ee --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/diagrams.h @@ -0,0 +1,388 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
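A note on the BLAS color sum in color_sum.cc above, before the generated per-diagram kernels: it factorizes the same quadratic form into two GEMM steps, first Ztemp[ncolor][nevt] = NormColMat * Jamps^T for the real and imaginary components separately, then nevt batched 1x1 GEMMs that take the per-event dot product and accumulate into the MEs (beta=1). A loop-level sketch of what the two calls compute for one component (shapes and strides only, not the BLAS API; names hypothetical):

    // What the gpuBlasTgemm + gpuBlasTgemmStridedBatched pair computes, written as plain loops
    void colorSumGemmRef( double* mes,              // in/out: mes[nevt], accumulated with beta = 1
                          const double* jamps,      // input: jamps[icol * nevt + ievt] ("new1" striding)
                          const double* normColMat, // input: normalized color matrix [ncolor * ncolor]
                          double* ztemp,            // tmp: ztemp[ievt * ncolor + icol]
                          int ncolor, int nevt )
    {
      for( int ievt = 0; ievt < nevt; ievt++ ) // Step 1: Ztemp = NormColMat * Jamps^T (one GEMM)
        for( int icol = 0; icol < ncolor; icol++ )
        {
          double z = 0;
          for( int jcol = 0; jcol < ncolor; jcol++ ) z += normColMat[icol * ncolor + jcol] * jamps[jcol * nevt + ievt];
          ztemp[ievt * ncolor + icol] = z;
        }
      for( int ievt = 0; ievt < nevt; ievt++ ) // Step 2: per-event dot product (batched 1x1 GEMMs)
        for( int icol = 0; icol < ncolor; icol++ )
          mes[ievt] += jamps[icol * nevt + ievt] * ztemp[ievt * ncolor + icol];
    }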
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 12 *** + // Wavefunction(s) for diagram number 1 + ixxxxx( momenta, 0., cHel[ihel][0], +1, w_fp[0], 0 ); + oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + vxxxxx( momenta, cIPD[1], cHel[ihel][4], +1, w_fp[4], 4 ); + vxxxxx( momenta, 0., cHel[ihel][5], +1, w_fp[5], 5 ); + FFV1_2( w_fp[0], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[6] ); + FFV1P0_3( w_fp[3], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[7] ); + FFV2_2( w_fp[6], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 1 + FFV1_0( w_fp[8], w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 12 *** + // Wavefunction(s) for diagram number 2 + FFV2_1( w_fp[1], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 2 + FFV1_0( w_fp[6], w_fp[8], w_fp[7], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 12 *** + // Wavefunction(s) for diagram number 3 + FFV1_1( w_fp[2], w_fp[5], COUPs[0], 1.0, cIPD[0], cIPD[2], w_fp[6] ); + FFV2_2( w_fp[0], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[9] ); + FFV1P0_3( w_fp[3], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 3 + FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 12 *** + // Wavefunction(s) for diagram number 4 + // (none) + // Amplitude(s) for diagram number 4 + FFV1_0( w_fp[0], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 12 *** + // Wavefunction(s) for diagram number 5 + FFV1_2( w_fp[3], w_fp[5], COUPs[0], 1.0, cIPD[0], cIPD[2], w_fp[10] ); + FFV1P0_3( w_fp[10], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[3] ); + // Amplitude(s) for diagram number 5 + FFV1_0( w_fp[9], w_fp[1], w_fp[3], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. 
+ + //-------------------------------------------------------------------------- + + __global__ void + diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 6 OF 12 *** + // Wavefunction(s) for diagram number 6 + // (none) + // Amplitude(s) for diagram number 6 + FFV1_0( w_fp[0], w_fp[8], w_fp[3], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram7( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 7 OF 12 *** + // Wavefunction(s) for diagram number 7 + FFV1_1( w_fp[1], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[3] ); + // Amplitude(s) for diagram number 7 + FFV1_0( w_fp[9], w_fp[3], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 6. * amp_sv[0]; + }
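Each amplitude is folded into exactly two of the ncolor = 4 leading-color flows with fixed rational coefficients (the -1/2 and +1/6 above; +-i/2 for the gluon-exchange diagrams 10 and 11 further down), and on the GPU the jamps buffer is structure-of-arrays with the color index as the slow dimension, as in the DeviceAccessJamp2 accessor added elsewhere in this diff. A stripped-down sketch of that accumulation, assuming a real-only buffer for brevity (the real layout interleaves real and imaginary planes):

```cuda
#include <cstdio>

// SoA accessor sketch: color index is the slow dimension, event the fast one,
// mirroring kernelAccessIcol's buffer[icol * nevt + ievt] in this diff
inline double& accessIcol( double* buffer, int icol, int ievt, int nevt )
{
  return buffer[icol * nevt + ievt];
}

int main()
{
  const int ncolor = 4, nevt = 8;
  double jamps[ncolor * nevt] = {};
  double amp = 0.3; // toy amplitude of one diagram for event 5
  // one diagram updates two color flows with fixed rational coefficients
  accessIcol( jamps, 2, 5, nevt ) -= 1. / 2. * amp;
  accessIcol( jamps, 3, 5, nevt ) += 1. / 6. * amp;
  printf( "jamp[2]=%f jamp[3]=%f\n", accessIcol( jamps, 2, 5, nevt ), accessIcol( jamps, 3, 5, nevt ) );
  return 0;
}
```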
+ + //-------------------------------------------------------------------------- + + __global__ void + diagram8( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 8 OF 12 *** + // Wavefunction(s) for diagram number 8 + FFV2_1( w_fp[3], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 8 + FFV1_0( w_fp[0], w_fp[10], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram9( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 9 OF 12 *** + // Wavefunction(s) for diagram number 9 + FFV1_2( w_fp[9], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 9 + FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 6. * amp_sv[0]; + }
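The wfs argument is documented as wavefunctions[nwf*2*nw6*nevtORneppV]: all nwf wavefunction slots for all events, each with nw6 complex components, promoted to a shared buffer because the split kernels must hand intermediates to each other (note also how slots such as w_fp[10] are recycled between diagrams 8 and 9 above). The flat-index sketch below assumes a hypothetical [iwf][ripart][iw6][ievt] ordering; the real ordering is owned by the plugin's memory accessor classes, which are not shown in this diff.

```cuda
#include <cstdio>

// Hypothetical flat index into wavefunctions[nwf][2][nw6][nevt]:
// iwf = wavefunction slot, ripart = 0 (real) / 1 (imag), iw6 = spinor or
// vector component, ievt = event. Only the total size matches the comment in
// the diff; the actual component ordering lives in the plugin's accessors.
inline int wfIndex( int iwf, int ripart, int iw6, int ievt, int nw6, int nevt )
{
  return ( ( iwf * 2 + ripart ) * nw6 + iw6 ) * nevt + ievt;
}

int main()
{
  const int nwf = 11, nw6 = 6, nevt = 4;
  static double wfs[nwf * 2 * nw6 * nevt] = {};
  // diagram 8 writes slot 10; diagram 9 later overwrites the same slot
  wfs[wfIndex( 10, 0, 0, 2, nw6, nevt )] = 1.5;
  printf( "wf slot 10, re, comp 0, event 2 -> flat %d, value %f\n",
          wfIndex( 10, 0, 0, 2, nw6, nevt ), wfs[wfIndex( 10, 0, 0, 2, nw6, nevt )] );
  return 0;
}
```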
+ + //-------------------------------------------------------------------------- + + __global__ void + diagram10( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 10 OF 12 *** + // Wavefunction(s) for diagram number 10 + VVV1P0_1( w_fp[5], w_fp[7], COUPs[1], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 10 + FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram11( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 11 OF 12 *** + // Wavefunction(s) for diagram number 11 + // (none) + // Amplitude(s) for diagram number 11 + FFV1_0( w_fp[0], w_fp[8], w_fp[10], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + }
+ + //-------------------------------------------------------------------------- + + __global__ void + diagram12( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 12 OF 12 *** + // Wavefunction(s) for diagram number 12 + FFV1_1( w_fp[8], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 12 + FFV1_0( w_fp[0], w_fp[10], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/driver.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/driver.f index d8518f17f7..21de29dc55 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/driver.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/matrix1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/matrix1.f index 4b8ccfcacb..1d1624ea30 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/matrix1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_dux_ttxwmg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -403,7 +403,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -447,7 +447,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(7) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -492,33 +493,35 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 4) /1.200000000000000D+01 - $ ,4.000000000000000D+00,4.000000000000000D+00,0.000000000000000D - $ +00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 4) /12,8,8,0/ C 1 T(2,1) T(6,3,4) - DATA (CF(I, 2),I= 1, 4) /4.000000000000000D+00 - $ ,1.200000000000000D+01,0.000000000000000D+00,4.000000000000000D - $ +00/ + DATA (CF(I),I= 5, 7) /12,0,8/ C 1 T(2,4) T(6,3,1) - DATA (CF(I, 3),I= 1, 4) /4.000000000000000D+00 - $ ,0.000000000000000D+00,1.200000000000000D+01,4.000000000000000D - $ +00/ + DATA (CF(I),I= 8, 9) /12,8/ C 1 T(3,1) T(6,2,4) - DATA (CF(I, 4),I= 1, 4) /0.000000000000000D+00 - $ ,4.000000000000000D+00,4.000000000000000D+00,1.200000000000000D - $ +01/ + DATA (CF(I),I= 10, 10) /12/ C 1 T(3,4) T(6,2,1) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) - IF(MDL_WW.NE.0D0) FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW - $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + + IF(MDL_WW.NE.0D0) THEN + FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW + $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + ELSE + FK_MDL_WW = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -601,10 +604,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -613,6 +618,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.cc index c933a8f276..64b881af2b 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. 
Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm_no_b_mass.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -97,20 +99,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_no_b_mass_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_no_b_mass_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 4; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -169,57 +167,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR 
channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef 
MGONGPUCPP_GPUIMPL - using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -227,436 +280,157 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 11; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g.
<> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 12 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - ixxxxx( momenta, 0., cHel[ihel][1], +1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - vxxxxx( momenta, cIPD[1], cHel[ihel][4], +1, w_fp[4], 4 ); - - oxxxxx( momenta, 0., cHel[ihel][5], +1, w_fp[5], 5 ); - - FFV1_2( w_fp[1], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[6] ); - FFV1P0_3( w_fp[3], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[7] ); - FFV2_2( w_fp[6], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[8], 
w_fp[5], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[2] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 2 OF 12 *** - - // Wavefunction(s) for diagram number 2 - FFV2_1( w_fp[5], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[6], w_fp[8], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[2] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 3 OF 12 *** - - // Wavefunction(s) for diagram number 3 - FFV1_1( w_fp[2], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[2], w_fp[6] ); - FFV2_2( w_fp[1], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[9] ); - FFV1P0_3( w_fp[3], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 3 - FFV1_0( w_fp[9], w_fp[5], w_fp[10], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[1] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 4 OF 12 *** - - // Wavefunction(s) for diagram number 4 - // (none) - - // Amplitude(s) for diagram number 4 - FFV1_0( w_fp[1], w_fp[8], w_fp[10], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[1] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 5 OF 12 *** - - // Wavefunction(s) for diagram number 5 - FFV1_2( w_fp[3], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[2], w_fp[10] ); - FFV1P0_3( w_fp[10], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[3] ); - - // Amplitude(s) for diagram number 5 - FFV1_0( w_fp[9], w_fp[5], w_fp[3], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += 1. / 6. * amp_sv[0]; - jamp_sv[3] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 6 OF 12 *** - - // Wavefunction(s) for diagram number 6 - // (none) - - // Amplitude(s) for diagram number 6 - FFV1_0( w_fp[1], w_fp[8], w_fp[3], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += 1. / 6. * amp_sv[0]; - jamp_sv[3] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 7 OF 12 *** - - // Wavefunction(s) for diagram number 7 - FFV1_1( w_fp[5], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[3] ); - - // Amplitude(s) for diagram number 7 - FFV1_0( w_fp[9], w_fp[3], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] += 1. / 6. * amp_sv[0]; - jamp_sv[3] -= 1. / 2.
* amp_sv[0]; - - // *** DIAGRAM 8 OF 12 *** - - // Wavefunction(s) for diagram number 8 - FFV2_1( w_fp[3], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 8 - FFV1_0( w_fp[1], w_fp[10], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] += 1. / 6. * amp_sv[0]; - jamp_sv[3] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 9 OF 12 *** - - // Wavefunction(s) for diagram number 9 - FFV1_2( w_fp[9], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 9 - FFV1_0( w_fp[10], w_fp[5], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[2] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 10 OF 12 *** - - // Wavefunction(s) for diagram number 10 - VVV1P0_1( w_fp[0], w_fp[7], COUPs[1], 1.0, 0., 0., w_fp[10] ); - // Amplitude(s) for diagram number 10 - FFV1_0( w_fp[9], w_fp[5], w_fp[10], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 11 OF 12 *** - - // Wavefunction(s) for diagram number 11 - - // (none) - - // Amplitude(s) for diagram number 11 - FFV1_0( w_fp[1], w_fp[8], w_fp[10], COUPs[0], 1.0, &amp_fp[0] ); + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e.
zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif
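In the C++ branch above, jamp_sv holds two pages of ncolor color flows in mixed-precision mode, one per iParity value, which is why the cast selects either jamp_sv or &jamp_sv[ncolor]. A toy sketch of that two-page layout, using plain doubles instead of cxtype_sv:

```cuda
#include <cstdio>

int main()
{
  const int ncolor = 4;
  // toy stand-in for cxtype_sv jamp_sv[2*ncolor]: one page of color flows per
  // parity (iParity = 0, 1), as selected by the cast in the diff above
  double jamp_sv[2 * ncolor] = {};
  for( int iParity = 0; iParity < 2; iParity++ )
  {
    double* jamps = ( iParity == 0 ? jamp_sv : &jamp_sv[ncolor] ); // page base
    jamps[2] += 0.5 * ( iParity + 1 ); // update color flow 2 of this page
  }
  printf( "page0 jamp[2]=%f page1 jamp[2]=%f\n", jamp_sv[2], jamp_sv[ncolor + 2] );
  return 0;
}
```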
skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined 
MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 12 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram8, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram9, gpublocks, gputhreads, gpustream, wfs, 
jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram10, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram11, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram12, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram8( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram9( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram10( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram11( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram12( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; }
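In the CUDA branch above, the old monolithic device function becomes twelve small kernels launched back-to-back on one stream; stream ordering guarantees that diagram2 sees the wavefunctions written by diagram1, while different helicities can overlap on different streams (see the ghelStreams argument of sigmaKin further down). A minimal standalone demonstration of that ordering guarantee, using raw CUDA runtime calls rather than the plugin's gpuLaunchKernelStream wrapper:

```cuda
#include <cstdio>
#include <cuda_runtime.h>

__global__ void stage1( float* buf ) { buf[threadIdx.x] = 1.0f; }
__global__ void stage2( float* buf ) { buf[threadIdx.x] += 2.0f; } // sees stage1's writes

int main()
{
  float* buf;
  cudaMalloc( &buf, 32 * sizeof( float ) );
  cudaStream_t stream;
  cudaStreamCreate( &stream );
  // same stream => stage2 starts only after stage1 completes (like diagram1..12)
  stage1<<<1, 32, 0, stream>>>( buf );
  stage2<<<1, 32, 0, stream>>>( buf );
  cudaStreamSynchronize( stream );
  float host[32];
  cudaMemcpy( host, buf, sizeof( host ), cudaMemcpyDeviceToHost );
  printf( "buf[0]=%f\n", host[0] ); // 3.0
  cudaStreamDestroy( stream );
  cudaFree( buf );
  return 0;
}
```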
@@ -775,7 +549,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -810,6 +588,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MW ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_MW, (fptype)m_pars->mdl_WT }; @@ -853,6 +635,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm_no_b_mass::mdl_MT ); m_masses.push_back( Parameters_sm_no_b_mass::mdl_MW ); m_masses.push_back( Parameters_sm_no_b_mass::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -955,26 +741,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -982,25 +768,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) + assert( nevt >= neppV ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //--------------------------------------------------------------------------
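The helicity choice above is an inverse-CDF draw: the per-helicity |M|^2 values are converted in place into a running sum (reusing ghelAllMEs as scratch), and the first good helicity whose cumulative fraction exceeds allrndhel[ievt] is selected; select_col below applies the same rule to the cumulative jamp2 per color flow. A host-side sketch of the selection rule with toy weights:

```cuda
#include <cstdio>

// pick the first index whose cumulative fraction exceeds rnd in [0,1)
int selectFromWeights( const double* w, int n, double rnd )
{
  double total = 0;
  for( int i = 0; i < n; i++ ) total += w[i];
  double running = 0;
  for( int i = 0; i < n; i++ )
  {
    running += w[i];
    if( rnd < running / total ) return i;
  }
  return n - 1; // guard against rounding; rnd < 1 normally selects earlier
}

int main()
{
  const double me[3] = { 0.5, 0.25, 0.25 }; // toy |M|^2 per good helicity
  printf( "rnd=0.40 -> ighel=%d\n", selectFromWeights( me, 3, 0.40 ) ); // 0
  printf( "rnd=0.60 -> ighel=%d\n", selectFromWeights( me, 3, 0.60 ) ); // 1
  printf( "rnd=0.90 -> ighel=%d\n", selectFromWeights( me, 3, 0.90 ) ); // 2
  return 0;
}
```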
+ atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= 
ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -1136,22 +1126,16 @@ namespace mg5amcCpu // These variables are not used anywhere else in the code and their scope is limited to this sanity check { // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) - constexpr int nprocesses = 2; + constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1163,17 +1147,20 @@ #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 
0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1199,93 +1186,63 @@ #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamps for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } - } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) 
Then, in multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? 
ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1327,7 +1284,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1350,7 +1307,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1359,25 +1316,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1387,8 +1350,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1404,11 +1369,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1510,14 +1476,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.h index a5c44d3213..4d52728816 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The 
MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm_no_b_mass.h" #include <vector> @@ -76,17 +77,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 96; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 12; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 4; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 11; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 6; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 12; //static const int ncomb = 96; // CPPProcess::ncomb @@ -123,23 +124,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -153,34 +157,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators 
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/auto_dsig.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/auto_dsig.f index 3779397ce4..307fe1f6cf 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/auto_dsig.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/auto_dsig1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/auto_dsig1.f index 1dae307565..ac0a02a97d 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/auto_dsig1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -138,14 +138,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF D2=PDG2PDF(LPP(IB(2)),1, IB(2),XBK(IB(2)), QSCALE) S2=PDG2PDF(LPP(IB(2)),3, IB(2),XBK(IB(2)), QSCALE) @@ -224,7 +224,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -296,6 +296,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -379,14 +383,14 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) D2(IVEC)=PDG2PDF(LPP(IB(2)),1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) S2(IVEC)=PDG2PDF(LPP(IB(2)),3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -498,6 +502,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/color_sum.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/color_sum.cc new file mode 100644 index 0000000000..389d8f6535 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/color_sum.cc @@ -0,0 +1,385 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
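For orientation before the new file body (an illustrative aside, not part of the generated patch; all values and names below are hypothetical): color_sum.cc evaluates the color-summed |M|^2 for one helicity as the quadratic form sum_ij conj(jamp_i) * ( colorMatrix[i][j] / colorDenom[i] ) * jamp_j, which is purely real because the color matrix is real and symmetric. A minimal standalone sketch of that reduction:

#include <complex>
#include <cstdio>
// Toy ncolor=2 sketch of the normalized color-matrix quadratic form (hypothetical values).
int main()
{
  const double C[2][2] = { { 12, 4 }, { 4, 12 } };                    // toy color matrix
  const double d[2] = { 1, 1 };                                       // toy color denominators
  const std::complex<double> J[2] = { { 0.3, -0.1 }, { 0.2, 0.4 } };  // toy jamps for one helicity
  double me2 = 0;
  for( int i = 0; i < 2; i++ )
    for( int j = 0; j < 2; j++ ) // real and imaginary parts only: the imaginary cross terms cancel
      me2 += ( C[i][j] / d[i] ) * ( J[i].real() * J[j].real() + J[i].imag() * J[j].imag() );
  printf( "color-summed |M|^2 contribution = %f\n", me2 );
  return 0;
}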
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] + + // The color matrix (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 12, 4, 4, 0 }, + { 4, 12, 0, 4 }, + { 4, 0, 12, 4 }, + { 0, 4, 4, 12 } }; // 2-D array[4][4] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template<typename T> + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the upper diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
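+ // (In detail: with jamp = A + iB for real A and B, (A-iB)M(A+iB) = AMA + BMB + i( AMB - BMA ), and the imaginary cross terms cancel because xMy = yMx for a symmetric real M.)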
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 
s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = 
allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/color_sum.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/diagram_boilerplate.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/diagrams.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/diagrams.h new file mode 100644 index 0000000000..24865a9858 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/diagrams.h @@ -0,0 +1,388 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
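Before the per-diagram kernels that follow, a compact illustration of the pattern they share (a hypothetical CPU-side sketch with simplified types, not generated code): each diagramN computes one amplitude and folds it into the shared jamps buffer with fixed color-flow coefficients, so the diagrams can be evaluated independently, one kernel per diagram:

#include <complex>
#include <vector>
using cxtype = std::complex<double>;
// Hypothetical stand-in for one generated diagram kernel: add one amplitude
// into the shared color-flow partial sums with fixed coefficients.
static void diagramN( std::vector<cxtype>& jamps, const cxtype& amp )
{
  jamps[0] -= 1. / 2. * amp; // coefficient of color flow 0 (illustrative values)
  jamps[2] += 1. / 6. * amp; // coefficient of color flow 2
}
int main()
{
  std::vector<cxtype> jamps( 4, cxtype( 0, 0 ) ); // ncolor=4 partial amplitudes, zeroed per event
  diagramN( jamps, cxtype( 0.1, 0.2 ) );          // hypothetical amplitude from one diagram
  return 0;
}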
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 12 *** + // Wavefunction(s) for diagram number 1 + vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); + ixxxxx( momenta, 0., cHel[ihel][1], +1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + vxxxxx( momenta, cIPD[1], cHel[ihel][4], +1, w_fp[4], 4 ); + oxxxxx( momenta, 0., cHel[ihel][5], +1, w_fp[5], 5 ); + FFV1_2( w_fp[1], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[6] ); + FFV1P0_3( w_fp[3], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[7] ); + FFV2_2( w_fp[6], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 1 + FFV1_0( w_fp[8], w_fp[5], w_fp[7], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 6. 
+ + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 12 *** + // Wavefunction(s) for diagram number 2 + FFV2_1( w_fp[5], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 2 + FFV1_0( w_fp[6], w_fp[8], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 12 *** + // Wavefunction(s) for diagram number 3 + FFV1_1( w_fp[2], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[2], w_fp[6] ); + FFV2_2( w_fp[1], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[9] ); + FFV1P0_3( w_fp[3], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 3 + FFV1_0( w_fp[9], w_fp[5], w_fp[10], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 12 *** + // Wavefunction(s) for diagram number 4 + // (none) + // Amplitude(s) for diagram number 4 + FFV1_0( w_fp[1], w_fp[8], w_fp[10], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 12 *** + // Wavefunction(s) for diagram number 5 + FFV1_2( w_fp[3], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[2], w_fp[10] ); + FFV1P0_3( w_fp[10], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[3] ); + // Amplitude(s) for diagram number 5 + FFV1_0( w_fp[9], w_fp[5], w_fp[3], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 6 OF 12 *** + // Wavefunction(s) for diagram number 6 + // (none) + // Amplitude(s) for diagram number 6 + FFV1_0( w_fp[1], w_fp[8], w_fp[3], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram7( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 7 OF 12 *** + // Wavefunction(s) for diagram number 7 + FFV1_1( w_fp[5], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[3] ); + // Amplitude(s) for diagram number 7 + FFV1_0( w_fp[9], w_fp[3], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram8( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 8 OF 12 *** + // Wavefunction(s) for diagram number 8 + FFV2_1( w_fp[3], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 8 + FFV1_0( w_fp[1], w_fp[10], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram9( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 9 OF 12 *** + // Wavefunction(s) for diagram number 9 + FFV1_2( w_fp[9], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 9 + FFV1_0( w_fp[10], w_fp[5], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram10( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 10 OF 12 *** + // Wavefunction(s) for diagram number 10 + VVV1P0_1( w_fp[0], w_fp[7], COUPs[1], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 10 + FFV1_0( w_fp[9], w_fp[5], w_fp[10], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram11( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 11 OF 12 *** + // Wavefunction(s) for diagram number 11 + // (none) + // Amplitude(s) for diagram number 11 + FFV1_0( w_fp[1], w_fp[8], w_fp[10], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram12( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 12 OF 12 *** + // Wavefunction(s) for diagram number 12 + FFV1_1( w_fp[8], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 12 + FFV1_0( w_fp[1], w_fp[10], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/driver.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/driver.f index d8518f17f7..21de29dc55 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/driver.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/matrix1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/matrix1.f index a3a57cd8b8..af49c9f60d 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/matrix1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gd_ttxwmu/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -403,7 +403,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -447,7 +447,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(7) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -492,33 +493,35 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 4) /1.200000000000000D+01 - $ ,4.000000000000000D+00,4.000000000000000D+00,0.000000000000000D - $ +00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 4) /12,8,8,0/ C 1 T(1,3,2) T(6,4) - DATA (CF(I, 2),I= 1, 4) /4.000000000000000D+00 - $ ,1.200000000000000D+01,0.000000000000000D+00,4.000000000000000D - $ +00/ + DATA (CF(I),I= 5, 7) /12,0,8/ C 1 T(1,3,4) T(6,2) - DATA (CF(I, 3),I= 1, 4) /4.000000000000000D+00 - $ ,0.000000000000000D+00,1.200000000000000D+01,4.000000000000000D - $ +00/ + DATA (CF(I),I= 8, 9) /12,8/ C 1 T(1,6,2) T(3,4) - DATA (CF(I, 4),I= 1, 4) /0.000000000000000D+00 - $ ,4.000000000000000D+00,4.000000000000000D+00,1.200000000000000D - $ +01/ + DATA (CF(I),I= 10, 10) /12/ C 1 T(1,6,4) T(3,2) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) - IF(MDL_WW.NE.0D0) FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW - $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + + IF(MDL_WW.NE.0D0) THEN + FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW + $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + ELSE + FK_MDL_WW = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -601,10 +604,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -613,6 +618,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.cc index 6f1f37d1eb..b7d2cbe189 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. 
Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm_no_b_mass.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -97,20 +99,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_no_b_mass_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_no_b_mass_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 4; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -169,57 +167,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR 
channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef 
MGONGPUCPP_GPUIMPL - using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -227,436 +280,157 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 11; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g. 
<> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 12 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - vxxxxx( momenta, cIPD[1], cHel[ihel][4], +1, w_fp[4], 4 ); - - ixxxxx( momenta, 0., cHel[ihel][5], -1, w_fp[5], 5 ); - - FFV1_2( w_fp[5], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[6] ); - FFV1P0_3( w_fp[3], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[7] ); - FFV2_2( w_fp[6], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[8], 
w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= 1. / 6. * amp_sv[0]; - jamp_sv[3] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 2 OF 12 *** - - // Wavefunction(s) for diagram number 2 - FFV2_1( w_fp[1], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[6], w_fp[8], w_fp[7], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= 1. / 6. * amp_sv[0]; - jamp_sv[3] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 3 OF 12 *** - - // Wavefunction(s) for diagram number 3 - FFV1_1( w_fp[2], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[2], w_fp[6] ); - FFV2_2( w_fp[5], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[9] ); - FFV1P0_3( w_fp[3], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 3 - FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= 1. / 6. * amp_sv[0]; - jamp_sv[3] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 4 OF 12 *** - - // Wavefunction(s) for diagram number 4 - // (none) - - // Amplitude(s) for diagram number 4 - FFV1_0( w_fp[5], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= 1. / 6. * amp_sv[0]; - jamp_sv[3] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 5 OF 12 *** - - // Wavefunction(s) for diagram number 5 - FFV1_2( w_fp[3], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[2], w_fp[10] ); - FFV1P0_3( w_fp[10], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[3] ); - - // Amplitude(s) for diagram number 5 - FFV1_0( w_fp[9], w_fp[1], w_fp[3], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 2. * amp_sv[0]; - jamp_sv[2] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 6 OF 12 *** - - // Wavefunction(s) for diagram number 6 - // (none) - - // Amplitude(s) for diagram number 6 - FFV1_0( w_fp[5], w_fp[8], w_fp[3], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 2. * amp_sv[0]; - jamp_sv[2] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 7 OF 12 *** - - // Wavefunction(s) for diagram number 7 - FFV1_1( w_fp[1], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[3] ); - - // Amplitude(s) for diagram number 7 - FFV1_0( w_fp[9], w_fp[3], w_fp[7], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 2. * amp_sv[0]; - jamp_sv[1] -= 1. / 6. 
* amp_sv[0]; - - // *** DIAGRAM 8 OF 12 *** - - // Wavefunction(s) for diagram number 8 - FFV2_1( w_fp[3], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 8 - FFV1_0( w_fp[5], w_fp[10], w_fp[7], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 2. * amp_sv[0]; - jamp_sv[1] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 9 OF 12 *** - - // Wavefunction(s) for diagram number 9 - FFV1_2( w_fp[9], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 9 - FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - jamp_sv[1] -= 1. / 6. * amp_sv[0]; - jamp_sv[3] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 10 OF 12 *** - - // Wavefunction(s) for diagram number 10 - VVV1P0_1( w_fp[0], w_fp[7], COUPs[1], 1.0, 0., 0., w_fp[10] ); - // Amplitude(s) for diagram number 10 - FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 11 OF 12 *** - - // Wavefunction(s) for diagram number 11 - // (none) - - // Amplitude(s) for diagram number 11 - FFV1_0( w_fp[5], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. 
zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 12 OF 12 *** - - // Wavefunction(s) for diagram number 12 - FFV1_1( w_fp[8], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[10] ); - // Amplitude(s) for diagram number 12 - FFV1_0( w_fp[5], w_fp[10], w_fp[7], COUPs[0], 1.0, &_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 2. * amp_sv[0]; - jamp_sv[1] -= 1. / 6. * amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gdx_ttxwpux()?) - - // The color denominators (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] - - // The color matrix (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 12, 4, 4, 0 }, - { 4, 12, 0, 4 }, - { 4, 0, 12, 4 }, - { 0, 4, 4, 12 } }; // 2-D array[4][4] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! 
skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined 
MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 12 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram8, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram9, gpublocks, gputhreads, gpustream, wfs, 
jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram10, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram11, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram12, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram8( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram9( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram10( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram11( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram12( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -775,7 +549,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -810,6 +588,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MW ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_MW, (fptype)m_pars->mdl_WT }; @@ -853,6 +635,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm_no_b_mass::mdl_MT ); m_masses.push_back( Parameters_sm_no_b_mass::mdl_MW ); m_masses.push_back( Parameters_sm_no_b_mass::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -955,26 +741,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = 
HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings, bsmIndepParam ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -982,25 +768,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt < maxtry0) + assert( nevt >= neppV ); + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt < maxtry0) + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream!
+ atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <=
ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -1136,22 +1126,16 @@ namespace mg5amcCpu // These variable are not used anywhere else in the code and their scope is limited to this sanity check { // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) - constexpr int nprocesses = 2; + constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1163,17 +1147,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] =
0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1199,93 +1186,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes (jamps) for each helicity + // In multichannel mode, also compute the running sums over helicities of the numerators and denominators for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) Then, in multichannel mode, compute the running sums over helicities of the squared jamp2s within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of the QCD partial amplitudes (jamps) + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ?
ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this until after the helicity loop to avoid breaking stream parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1327,7 +1284,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1350,7 +1307,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId!
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1359,25 +1316,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1387,8 +1350,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1404,11 +1369,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1510,14 +1476,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.h index d0dd16c512..47a8d7c2ca 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The 
MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm_no_b_mass.h" #include @@ -76,17 +77,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 96; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 12; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 4; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 11; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 6; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 12; //static const int ncomb = 96; // CPPProcess::ncomb @@ -123,23 +124,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -153,34 +157,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/auto_dsig.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/auto_dsig.f index 7c1bbde100..807ce0e6c9 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/auto_dsig.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/auto_dsig1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/auto_dsig1.f index ece4509a8c..bf57b49c26 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/auto_dsig1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -138,14 +138,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) DX2=PDG2PDF(LPP(IB(2)),-1, IB(2),XBK(IB(2)), QSCALE) @@ -224,7 +224,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -296,6 +296,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -379,14 +383,14 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -498,6 +502,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/color_sum.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/color_sum.cc new file mode 100644 index 0000000000..389d8f6535 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/color_sum.cc @@ -0,0 +1,385 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
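For orientation, the new color_sum.cc below implements the leading-color sum |M|^2 = sum_ij jamp_i^* (colorMatrix_ij / colorDenom_i) jamp_j over the ncolor=4 partial amplitudes ("jamps"). A minimal scalar sketch of that reduction for one event and one helicity follows; this is an illustration only, with a hypothetical function name and buffer layout (the real kernels use the strided J_ACCESS/E_ACCESS accessors), but the cf/denom values are exactly the constexpr arrays hardcoded in the file:

    #include <complex>
    // Minimal scalar sketch of the leading-color sum in color_sum.cc (ncolor=4).
    double colorSumOneEvent( const std::complex<double> jamp[4] )
    {
      static constexpr double denom[4] = { 1, 1, 1, 1 };
      static constexpr double cf[4][4] = { { 12, 4, 4, 0 },
                                           { 4, 12, 0, 4 },
                                           { 4, 0, 12, 4 },
                                           { 0, 4, 4, 12 } };
      double me2 = 0;
      for( int i = 0; i < 4; i++ )
      {
        std::complex<double> ztemp = 0;
        for( int j = 0; j < 4; j++ ) ztemp += cf[i][j] * jamp[j]; // row of M times jamp vector
        me2 += ( ztemp * std::conj( jamp[i] ) ).real() / denom[i]; // Re( ztemp * jamp_i^* )
      }
      return me2; // |M|^2 summed over leading colors for one helicity
    }
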
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] + + // The color matrix (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 12, 4, 4, 0 }, + { 4, 12, 0, 4 }, + { 4, 0, 12, 4 }, + { 0, 4, 4, 12 } }; // 2-D array[4][4] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template<typename T> + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA + iAMB - iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the upper-diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
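To spell out the comment above: for a real matrix M and jamp = A + iB with A, B real vectors, (A-iB)^T M (A+iB) = A^T M A + B^T M B plus an imaginary part i( A^T M B - B^T M A ) that vanishes because M is symmetric; folding the symmetric off-diagonal entries into a doubled upper triangle then halves the inner loop. A scalar sketch of the folded loop (illustrative names, no SIMD, assuming the "2*" factor is already baked into T exactly as in cf2 above):

    // Scalar illustration: folding a real symmetric quadratic form onto the
    // upper triangle. With T[i][i] = M[i][i] and T[i][j] = 2*M[i][j] for j > i,
    // the folded loop reproduces sum_ij ( A[i]*M[i][j]*A[j] + B[i]*M[i][j]*B[j] ).
    double quadFormFolded( const double T[4][4], const double A[4], const double B[4] )
    {
      double me2 = 0;
      for( int i = 0; i < 4; i++ )
      {
        double ztR = T[i][i] * A[i], ztI = T[i][i] * B[i]; // diagonal terms
        for( int j = i + 1; j < 4; j++ )
        {
          ztR += T[i][j] * A[j]; // off-diagonal terms already carry the factor 2
          ztI += T[i][j] * B[j];
        }
        me2 += A[i] * ztR + B[i] * ztI;
      }
      return me2;
    }
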
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 
s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = 
allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e.
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/color_sum.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/diagram_boilerplate.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/diagrams.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/diagrams.h new file mode 100644 index 0000000000..a4f3be5d0a --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/diagrams.h @@ -0,0 +1,388 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 12 *** + // Wavefunction(s) for diagram number 1 + vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); + oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + vxxxxx( momenta, cIPD[1], cHel[ihel][4], +1, w_fp[4], 4 ); + ixxxxx( momenta, 0., cHel[ihel][5], -1, w_fp[5], 5 ); + FFV1_2( w_fp[5], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[6] ); + FFV1P0_3( w_fp[3], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[7] ); + FFV2_2( w_fp[6], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 1 + FFV1_0( w_fp[8], w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 12 *** + // Wavefunction(s) for diagram number 2 + FFV2_1( w_fp[1], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 2 + FFV1_0( w_fp[6], w_fp[8], w_fp[7], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 12 *** + // Wavefunction(s) for diagram number 3 + FFV1_1( w_fp[2], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[2], w_fp[6] ); + FFV2_2( w_fp[5], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[9] ); + FFV1P0_3( w_fp[3], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 3 + FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 12 *** + // Wavefunction(s) for diagram number 4 + // (none) + // Amplitude(s) for diagram number 4 + FFV1_0( w_fp[5], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 12 *** + // Wavefunction(s) for diagram number 5 + FFV1_2( w_fp[3], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[2], w_fp[10] ); + FFV1P0_3( w_fp[10], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[3] ); + // Amplitude(s) for diagram number 5 + FFV1_0( w_fp[9], w_fp[1], w_fp[3], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 6 OF 12 *** + // Wavefunction(s) for diagram number 6 + // (none) + // Amplitude(s) for diagram number 6 + FFV1_0( w_fp[5], w_fp[8], w_fp[3], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram7( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 7 OF 12 *** + // Wavefunction(s) for diagram number 7 + FFV1_1( w_fp[1], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[3] ); + // Amplitude(s) for diagram number 7 + FFV1_0( w_fp[9], w_fp[3], w_fp[7], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram8( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 8 OF 12 *** + // Wavefunction(s) for diagram number 8 + FFV2_1( w_fp[3], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 8 + FFV1_0( w_fp[5], w_fp[10], w_fp[7], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram9( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 9 OF 12 *** + // Wavefunction(s) for diagram number 9 + FFV1_2( w_fp[9], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 9 + FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram10( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 10 OF 12 *** + // Wavefunction(s) for diagram number 10 + VVV1P0_1( w_fp[0], w_fp[7], COUPs[1], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 10 + FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram11( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 11 OF 12 *** + // Wavefunction(s) for diagram number 11 + // (none) + // Amplitude(s) for diagram number 11 + FFV1_0( w_fp[5], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram12( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 12 OF 12 *** + // Wavefunction(s) for diagram number 12 + FFV1_1( w_fp[8], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 12 + FFV1_0( w_fp[5], w_fp[10], w_fp[7], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/driver.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/driver.f index d8518f17f7..21de29dc55 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/driver.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/matrix1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/matrix1.f index e550640e16..7233e3c74f 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/matrix1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gdx_ttxwpux/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -403,7 +403,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -447,7 +447,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(7) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -492,33 +493,35 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 4) /1.200000000000000D+01 - $ ,4.000000000000000D+00,4.000000000000000D+00,0.000000000000000D - $ +00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 4) /12,8,8,0/ C 1 T(1,2,4) T(3,6) - DATA (CF(I, 2),I= 1, 4) /4.000000000000000D+00 - $ ,1.200000000000000D+01,0.000000000000000D+00,4.000000000000000D - $ +00/ + DATA (CF(I),I= 5, 7) /12,0,8/ C 1 T(1,2,6) T(3,4) - DATA (CF(I, 3),I= 1, 4) /4.000000000000000D+00 - $ ,0.000000000000000D+00,1.200000000000000D+01,4.000000000000000D - $ +00/ + DATA (CF(I),I= 8, 9) /12,8/ C 1 T(1,3,4) T(2,6) - DATA (CF(I, 4),I= 1, 4) /0.000000000000000D+00 - $ ,4.000000000000000D+00,4.000000000000000D+00,1.200000000000000D - $ +01/ + DATA (CF(I),I= 10, 10) /12/ C 1 T(1,3,6) T(2,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) - IF(MDL_WW.NE.0D0) FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW - $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + + IF(MDL_WW.NE.0D0) THEN + FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW + $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + ELSE + FK_MDL_WW = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -601,10 +604,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -613,6 +618,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.cc index 16d1e89a53..5cf114c48b 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. 
Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm_no_b_mass.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -97,20 +99,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_no_b_mass_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_no_b_mass_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 4; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -169,57 +167,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR 
channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef 
MGONGPUCPP_GPUIMPL - using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -227,436 +280,157 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 11; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g. 
<> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 12 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - ixxxxx( momenta, 0., cHel[ihel][1], +1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - vxxxxx( momenta, cIPD[1], cHel[ihel][4], +1, w_fp[4], 4 ); - - oxxxxx( momenta, 0., cHel[ihel][5], +1, w_fp[5], 5 ); - - FFV1_2( w_fp[1], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[6] ); - FFV1P0_3( w_fp[3], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[7] ); - FFV2_2( w_fp[6], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[8], 
w_fp[5], w_fp[7], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[2] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 2 OF 12 *** - - // Wavefunction(s) for diagram number 2 - FFV2_1( w_fp[5], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[6], w_fp[8], w_fp[7], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[2] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 3 OF 12 *** - - // Wavefunction(s) for diagram number 3 - FFV1_1( w_fp[2], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[2], w_fp[6] ); - FFV2_2( w_fp[1], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[9] ); - FFV1P0_3( w_fp[3], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 3 - FFV1_0( w_fp[9], w_fp[5], w_fp[10], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[1] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 4 OF 12 *** - - // Wavefunction(s) for diagram number 4 - // (none) - - // Amplitude(s) for diagram number 4 - FFV1_0( w_fp[1], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[1] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 5 OF 12 *** - - // Wavefunction(s) for diagram number 5 - FFV1_2( w_fp[3], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[2], w_fp[10] ); - FFV1P0_3( w_fp[10], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[3] ); - - // Amplitude(s) for diagram number 5 - FFV1_0( w_fp[9], w_fp[5], w_fp[3], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += 1. / 6. * amp_sv[0]; - jamp_sv[3] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 6 OF 12 *** - - // Wavefunction(s) for diagram number 6 - // (none) - - // Amplitude(s) for diagram number 6 - FFV1_0( w_fp[1], w_fp[8], w_fp[3], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += 1. / 6. * amp_sv[0]; - jamp_sv[3] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 7 OF 12 *** - - // Wavefunction(s) for diagram number 7 - FFV1_1( w_fp[5], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[3] ); - - // Amplitude(s) for diagram number 7 - FFV1_0( w_fp[9], w_fp[3], w_fp[7], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] += 1. / 6. * amp_sv[0]; - jamp_sv[3] -= 1. / 2. 
* amp_sv[0]; - - // *** DIAGRAM 8 OF 12 *** - - // Wavefunction(s) for diagram number 8 - FFV2_1( w_fp[3], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 8 - FFV1_0( w_fp[1], w_fp[10], w_fp[7], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] += 1. / 6. * amp_sv[0]; - jamp_sv[3] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 9 OF 12 *** - - // Wavefunction(s) for diagram number 9 - FFV1_2( w_fp[9], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 9 - FFV1_0( w_fp[10], w_fp[5], w_fp[7], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[2] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 10 OF 12 *** - - // Wavefunction(s) for diagram number 10 - VVV1P0_1( w_fp[0], w_fp[7], COUPs[1], 1.0, 0., 0., w_fp[10] ); - // Amplitude(s) for diagram number 10 - FFV1_0( w_fp[9], w_fp[5], w_fp[10], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 11 OF 12 *** - - // Wavefunction(s) for diagram number 11 - // (none) - - // Amplitude(s) for diagram number 11 - FFV1_0( w_fp[1], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. 
zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 12 OF 12 *** - - // Wavefunction(s) for diagram number 12 - FFV1_1( w_fp[8], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[10] ); - // Amplitude(s) for diagram number 12 - FFV1_0( w_fp[1], w_fp[10], w_fp[7], COUPs[0], 1.0, &_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] += 1. / 6. * amp_sv[0]; - jamp_sv[3] -= 1. / 2. * amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gu_ttxwpd()?) - - // The color denominators (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] - - // The color matrix (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 12, 4, 4, 0 }, - { 4, 12, 0, 4 }, - { 4, 0, 12, 4 }, - { 0, 4, 4, 12 } }; // 2-D array[4][4] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! 
skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined 
MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 12 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram8, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram9, gpublocks, gputhreads, gpustream, wfs,
jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram10, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram11, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram12, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram8( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram9( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram10( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram11( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram12( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -775,7 +549,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -810,6 +588,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MW ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_MW, (fptype)m_pars->mdl_WT }; @@ -853,6 +635,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm_no_b_mass::mdl_MT ); m_masses.push_back( Parameters_sm_no_b_mass::mdl_MW ); m_masses.push_back( Parameters_sm_no_b_mass::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -955,26 +741,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS =
HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -982,25 +768,40 @@ fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) (assume nevt >= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream!
+ atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <=
ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -1136,22 +1126,16 @@ namespace mg5amcCpu // These variables are not used anywhere else in the code and their scope is limited to this sanity check { // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) - constexpr int nprocesses = 2; + constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1163,17 +1147,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] =
0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1199,93 +1186,63 @@ #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes (jamps) for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators and denominators for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b)
Then, in multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ?
ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1327,7 +1284,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1350,7 +1307,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1359,25 +1316,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1387,8 +1350,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1404,11 +1369,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1510,14 +1476,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.h index f799f32129..15fdb6df3c 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The 
MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm_no_b_mass.h" #include <vector> @@ -76,17 +77,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 96; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 12; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 4; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 11; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 6; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?!
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 12; //static const int ncomb = 96; // CPPProcess::ncomb @@ -123,23 +124,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -153,34 +157,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/auto_dsig.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/auto_dsig.f index e5ddbf348a..180917495b 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/auto_dsig.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/auto_dsig1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/auto_dsig1.f index 4ebece2e78..e068973bd6 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/auto_dsig1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -138,14 +138,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF U2=PDG2PDF(LPP(IB(2)),2, IB(2),XBK(IB(2)), QSCALE) C2=PDG2PDF(LPP(IB(2)),4, IB(2),XBK(IB(2)), QSCALE) @@ -224,7 +224,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -296,6 +296,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -379,14 +383,14 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) U2(IVEC)=PDG2PDF(LPP(IB(2)),2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) C2(IVEC)=PDG2PDF(LPP(IB(2)),4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -498,6 +502,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/color_sum.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/color_sum.cc new file mode 100644 index 0000000000..389d8f6535 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/color_sum.cc @@ -0,0 +1,385 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
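For orientation: the new color_sum.cc (below) factors the color algebra out of CPPProcess.cc, evaluating the quadratic form ME += sum_ij jamp_i^* ( colorMatrix[i][j] / colorDenom[i] ) jamp_j over the ncolor leading-color amplitudes. A minimal standalone sketch of that computation, using plain std::complex instead of the plugin's fptype2/cxtype_sv types (the helper name naiveColorSum and the hardcoded NCOL are illustrative assumptions, not part of the generated code):

#include <array>
#include <complex>

constexpr int NCOL = 4; // ncolor for this P1_gu_ttxwpd process
constexpr double denom[NCOL] = { 1, 1, 1, 1 };
constexpr double cf[NCOL][NCOL] = { { 12, 4, 4, 0 }, { 4, 12, 0, 4 }, { 4, 0, 12, 4 }, { 0, 4, 4, 12 } };

// Return the |M|^2 contribution for one helicity: jamp^dagger * (cf/denom) * jamp
double naiveColorSum( const std::array<std::complex<double>, NCOL>& jamp )
{
  double me = 0;
  for( int i = 0; i < NCOL; i++ )
  {
    std::complex<double> ztemp = 0;
    for( int j = 0; j < NCOL; j++ ) ztemp += cf[i][j] * jamp[j];
    // cf is real, so only Re*Re + Im*Im survives in jamp_i^* * ztemp_i
    me += ( ztemp.real() * jamp[i].real() + ztemp.imag() * jamp[i].imag() ) / denom[i];
  }
  return me;
}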
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] + + // The color matrix (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 12, 4, 4, 0 }, + { 4, 12, 0, 4 }, + { 4, 0, 12, 4 }, + { 0, 4, 4, 12 } }; // 2-D array[4][4] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 
s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = 
allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/color_sum.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/diagram_boilerplate.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/diagrams.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/diagrams.h new file mode 100644 index 0000000000..24865a9858 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/diagrams.h @@ -0,0 +1,388 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 12 *** + // Wavefunction(s) for diagram number 1 + vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); + ixxxxx( momenta, 0., cHel[ihel][1], +1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + vxxxxx( momenta, cIPD[1], cHel[ihel][4], +1, w_fp[4], 4 ); + oxxxxx( momenta, 0., cHel[ihel][5], +1, w_fp[5], 5 ); + FFV1_2( w_fp[1], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[6] ); + FFV1P0_3( w_fp[3], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[7] ); + FFV2_2( w_fp[6], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 1 + FFV1_0( w_fp[8], w_fp[5], w_fp[7], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 12 *** + // Wavefunction(s) for diagram number 2 + FFV2_1( w_fp[5], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 2 + FFV1_0( w_fp[6], w_fp[8], w_fp[7], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 12 *** + // Wavefunction(s) for diagram number 3 + FFV1_1( w_fp[2], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[2], w_fp[6] ); + FFV2_2( w_fp[1], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[9] ); + FFV1P0_3( w_fp[3], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 3 + FFV1_0( w_fp[9], w_fp[5], w_fp[10], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 12 *** + // Wavefunction(s) for diagram number 4 + // (none) + // Amplitude(s) for diagram number 4 + FFV1_0( w_fp[1], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 12 *** + // Wavefunction(s) for diagram number 5 + FFV1_2( w_fp[3], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[2], w_fp[10] ); + FFV1P0_3( w_fp[10], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[3] ); + // Amplitude(s) for diagram number 5 + FFV1_0( w_fp[9], w_fp[5], w_fp[3], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 6 OF 12 *** + // Wavefunction(s) for diagram number 6 + // (none) + // Amplitude(s) for diagram number 6 + FFV1_0( w_fp[1], w_fp[8], w_fp[3], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram7( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 7 OF 12 *** + // Wavefunction(s) for diagram number 7 + FFV1_1( w_fp[5], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[3] ); + // Amplitude(s) for diagram number 7 + FFV1_0( w_fp[9], w_fp[3], w_fp[7], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram8( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 8 OF 12 *** + // Wavefunction(s) for diagram number 8 + FFV2_1( w_fp[3], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 8 + FFV1_0( w_fp[1], w_fp[10], w_fp[7], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram9( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 9 OF 12 *** + // Wavefunction(s) for diagram number 9 + FFV1_2( w_fp[9], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 9 + FFV1_0( w_fp[10], w_fp[5], w_fp[7], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram10( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 10 OF 12 *** + // Wavefunction(s) for diagram number 10 + VVV1P0_1( w_fp[0], w_fp[7], COUPs[1], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 10 + FFV1_0( w_fp[9], w_fp[5], w_fp[10], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram11( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 11 OF 12 *** + // Wavefunction(s) for diagram number 11 + // (none) + // Amplitude(s) for diagram number 11 + FFV1_0( w_fp[1], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram12( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 12 OF 12 *** + // Wavefunction(s) for diagram number 12 + FFV1_1( w_fp[8], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 12 + FFV1_0( w_fp[1], w_fp[10], w_fp[7], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/driver.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/driver.f index d8518f17f7..21de29dc55 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/driver.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/matrix1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/matrix1.f index 738301d049..765b11c693 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/matrix1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gu_ttxwpd/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -403,7 +403,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -447,7 +447,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(7) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -492,33 +493,35 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 4) /1.200000000000000D+01 - $ ,4.000000000000000D+00,4.000000000000000D+00,0.000000000000000D - $ +00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 4) /12,8,8,0/ C 1 T(1,3,2) T(6,4) - DATA (CF(I, 2),I= 1, 4) /4.000000000000000D+00 - $ ,1.200000000000000D+01,0.000000000000000D+00,4.000000000000000D - $ +00/ + DATA (CF(I),I= 5, 7) /12,0,8/ C 1 T(1,3,4) T(6,2) - DATA (CF(I, 3),I= 1, 4) /4.000000000000000D+00 - $ ,0.000000000000000D+00,1.200000000000000D+01,4.000000000000000D - $ +00/ + DATA (CF(I),I= 8, 9) /12,8/ C 1 T(1,6,2) T(3,4) - DATA (CF(I, 4),I= 1, 4) /0.000000000000000D+00 - $ ,4.000000000000000D+00,4.000000000000000D+00,1.200000000000000D - $ +01/ + DATA (CF(I),I= 10, 10) /12/ C 1 T(1,6,4) T(3,2) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) - IF(MDL_WW.NE.0D0) FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW - $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + + IF(MDL_WW.NE.0D0) THEN + FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW + $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + ELSE + FK_MDL_WW = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -601,10 +604,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -613,6 +618,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.cc index 41a6e0002f..29d5e3e1ef 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. 
Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm_no_b_mass.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -97,20 +99,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_no_b_mass_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_no_b_mass_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 4; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -169,57 +167,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR 
channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef 
MGONGPUCPP_GPUIMPL - using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -227,436 +280,157 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 11; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g. 
<> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 12 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - vxxxxx( momenta, cIPD[1], cHel[ihel][4], +1, w_fp[4], 4 ); - - ixxxxx( momenta, 0., cHel[ihel][5], -1, w_fp[5], 5 ); - - FFV1_2( w_fp[5], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[6] ); - FFV1P0_3( w_fp[3], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[7] ); - FFV2_2( w_fp[6], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[8], 
w_fp[1], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= 1. / 6. * amp_sv[0]; - jamp_sv[3] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 2 OF 12 *** - - // Wavefunction(s) for diagram number 2 - FFV2_1( w_fp[1], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[6], w_fp[8], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= 1. / 6. * amp_sv[0]; - jamp_sv[3] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 3 OF 12 *** - - // Wavefunction(s) for diagram number 3 - FFV1_1( w_fp[2], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[2], w_fp[6] ); - FFV2_2( w_fp[5], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[9] ); - FFV1P0_3( w_fp[3], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 3 - FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= 1. / 6. * amp_sv[0]; - jamp_sv[3] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 4 OF 12 *** - - // Wavefunction(s) for diagram number 4 - // (none) - - // Amplitude(s) for diagram number 4 - FFV1_0( w_fp[5], w_fp[8], w_fp[10], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= 1. / 6. * amp_sv[0]; - jamp_sv[3] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 5 OF 12 *** - - // Wavefunction(s) for diagram number 5 - FFV1_2( w_fp[3], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[2], w_fp[10] ); - FFV1P0_3( w_fp[10], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[3] ); - - // Amplitude(s) for diagram number 5 - FFV1_0( w_fp[9], w_fp[1], w_fp[3], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 2. * amp_sv[0]; - jamp_sv[2] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 6 OF 12 *** - - // Wavefunction(s) for diagram number 6 - // (none) - - // Amplitude(s) for diagram number 6 - FFV1_0( w_fp[5], w_fp[8], w_fp[3], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 2. * amp_sv[0]; - jamp_sv[2] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 7 OF 12 *** - - // Wavefunction(s) for diagram number 7 - FFV1_1( w_fp[1], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[3] ); - - // Amplitude(s) for diagram number 7 - FFV1_0( w_fp[9], w_fp[3], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 2. * amp_sv[0]; - jamp_sv[1] -= 1. / 6.
* amp_sv[0]; - - // *** DIAGRAM 8 OF 12 *** - - // Wavefunction(s) for diagram number 8 - FFV2_1( w_fp[3], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 8 - FFV1_0( w_fp[5], w_fp[10], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 2. * amp_sv[0]; - jamp_sv[1] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 9 OF 12 *** - - // Wavefunction(s) for diagram number 9 - FFV1_2( w_fp[9], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 9 - FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - jamp_sv[1] -= 1. / 6. * amp_sv[0]; - jamp_sv[3] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 10 OF 12 *** - - // Wavefunction(s) for diagram number 10 - VVV1P0_1( w_fp[0], w_fp[7], COUPs[1], 1.0, 0., 0., w_fp[10] ); - // Amplitude(s) for diagram number 10 - FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 11 OF 12 *** - - // Wavefunction(s) for diagram number 11 - // (none) - - // Amplitude(s) for diagram number 11 - FFV1_0( w_fp[5], w_fp[8], w_fp[10], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e.
zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 12 OF 12 *** - - // Wavefunction(s) for diagram number 12 - FFV1_1( w_fp[8], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[10] ); - // Amplitude(s) for diagram number 12 - FFV1_0( w_fp[5], w_fp[10], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 2. * amp_sv[0]; - jamp_sv[1] -= 1. / 6. * amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gux_ttxwmdx()?) - - // The color denominators (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] - - // The color matrix (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 12, 4, 4, 0 }, - { 4, 12, 0, 4 }, - { 4, 0, 12, 4 }, - { 0, 4, 4, 12 } }; // 2-D array[4][4] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0!
skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined
MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 12 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram8, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram9, gpublocks, gputhreads, gpustream, wfs,
jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram10, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram11, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram12, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram8( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram9( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram10( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram11( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram12( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -775,7 +549,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -810,6 +588,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MW ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_MW, (fptype)m_pars->mdl_WT }; @@ -853,6 +635,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm_no_b_mass::mdl_MT ); m_masses.push_back( Parameters_sm_no_b_mass::mdl_MW ); m_masses.push_back( Parameters_sm_no_b_mass::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -955,26 +741,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS =
HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -982,25 +768,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<16!) ... + assert( nevt >= neppV ); + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<16!) ... +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + add_and_select_hel( int* allselhel, const fptype* allrndhel, fptype* ghelAllMEs, fptype* allMEs, const int nevt ) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream!
+ atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <=
ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -1136,22 +1126,16 @@ namespace mg5amcCpu // These variables are not used anywhere else in the code and their scope is limited to this sanity check { // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) - constexpr int nprocesses = 2; + constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1163,17 +1147,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_jamps) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] =
0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1199,93 +1186,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) 
Then, in multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ?
ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1327,7 +1284,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1350,7 +1307,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1359,25 +1316,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1387,8 +1350,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1404,11 +1369,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1510,14 +1476,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.h index b6253b6715..68dd39bcf2 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The 
MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm_no_b_mass.h" #include @@ -76,17 +77,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 96; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 12; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 4; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 11; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 6; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 12; //static const int ncomb = 96; // CPPProcess::ncomb @@ -123,23 +124,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -153,34 +157,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/auto_dsig.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/auto_dsig.f index 8e03eed7eb..5be1675ba2 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/auto_dsig.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/auto_dsig1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/auto_dsig1.f index 9d0ddcecfc..f2f893278c 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/auto_dsig1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -138,14 +138,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) UX2=PDG2PDF(LPP(IB(2)),-2, IB(2),XBK(IB(2)), QSCALE) @@ -224,7 +224,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -296,6 +296,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -379,14 +383,14 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) UX2(IVEC)=PDG2PDF(LPP(IB(2)),-2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -498,6 +502,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/color_sum.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/color_sum.cc new file mode 100644 index 0000000000..389d8f6535 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/color_sum.cc @@ -0,0 +1,385 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
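As background for this new file: the color sum it implements reduces the |M|^2 contribution of one helicity to a real quadratic form over the ncolor=4 leading-color flows, deltaME = sum_{i,j} Re( conj(jamp[i]) * cf[i][j] * jamp[j] ) / denom[i]. A minimal standalone C++ sketch of that reduction follows; the 4x4 colorMatrix and colorDenom values are the ones defined below in this file, while the jamp values are purely hypothetical, for illustration only.

  // Standalone sketch of the leading-color sum for this process (ncolor=4).
  #include <complex>
  #include <cstdio>
  int main()
  {
    constexpr int ncolor = 4;
    constexpr double denom[ncolor] = { 1, 1, 1, 1 };
    constexpr double cf[ncolor][ncolor] = {
      { 12, 4, 4, 0 }, { 4, 12, 0, 4 }, { 4, 0, 12, 4 }, { 0, 4, 4, 12 } };
    // Hypothetical color flow amplitudes for one event and one helicity
    const std::complex<double> jamp[ncolor] = { { 0.1, 0.2 }, { -0.3, 0.05 }, { 0.0, -0.1 }, { 0.25, 0.0 } };
    double deltaME = 0; // contribution of this helicity to |M|^2
    for( int icol = 0; icol < ncolor; icol++ )
    {
      std::complex<double> ztemp = 0;
      for( int jcol = 0; jcol < ncolor; jcol++ ) ztemp += cf[icol][jcol] * jamp[jcol];
      deltaME += ( ztemp * std::conj( jamp[icol] ) ).real() / denom[icol];
    }
    printf( "deltaME = %f\n", deltaME );
    return 0;
  }

This is the same plain double loop used by color_sum_kernel further down in this file; color_sum_cpu rearranges it into a triangular form (#475), and color_sum_blas evidently offloads the same algebra to cuBLAS/hipBLAS.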
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] + + // The color matrix (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 12, 4, 4, 0 }, + { 4, 12, 0, 4 }, + { 4, 0, 12, 4 }, + { 0, 4, 4, 12 } }; // 2-D array[4][4] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
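The triangular rearrangement described in the comment above can be checked in isolation: since the color matrix is real and symmetric (and colorDenom is uniform here), summing each diagonal term once and doubling the upper-diagonal terms reproduces the full double loop. A small self-contained check, again with hypothetical jamp values:

  // Check: triangular normalized color sum == full symmetric color sum (see #475).
  #include <complex>
  #include <cstdio>
  int main()
  {
    constexpr int ncolor = 4;
    constexpr double denom[ncolor] = { 1, 1, 1, 1 };
    constexpr double cf[ncolor][ncolor] = {
      { 12, 4, 4, 0 }, { 4, 12, 0, 4 }, { 4, 0, 12, 4 }, { 0, 4, 4, 12 } };
    const std::complex<double> jamp[ncolor] = { { 0.4, -0.1 }, { 0.2, 0.3 }, { -0.5, 0.0 }, { 0.1, 0.1 } };
    double full = 0; // full quadratic form, all ncolor*ncolor terms
    for( int i = 0; i < ncolor; i++ )
      for( int j = 0; j < ncolor; j++ )
        full += ( std::conj( jamp[i] ) * cf[i][j] * jamp[j] ).real() / denom[i];
    double tri = 0; // triangular form: diagonal once, off-diagonal terms doubled
    for( int i = 0; i < ncolor; i++ )
    {
      double ztR = cf[i][i] / denom[i] * jamp[i].real();
      double ztI = cf[i][i] / denom[i] * jamp[i].imag();
      for( int j = i + 1; j < ncolor; j++ )
      {
        ztR += 2 * cf[i][j] / denom[i] * jamp[j].real();
        ztI += 2 * cf[i][j] / denom[i] * jamp[j].imag();
      }
      tri += ztR * jamp[i].real() + ztI * jamp[i].imag();
    }
    printf( "full = %f, triangular = %f\n", full, tri ); // identical results
    return 0;
  }

Precomputing 2 * colorMatrix[icol][jcol] / colorDenom[icol] at compile time is exactly what the constexpr TriangularNormalizedColorMatrix in color_sum_cpu below provides.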
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 
s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = 
allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer
+    fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer
+    fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer
+    // Convert jamps from double to float
+    gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps );
+    // Real and imaginary components
+    const fptype2* allJampsReal = allJampsFpt2;
+    const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt;
+#else
+    static_assert( std::is_same<fptype, fptype2>::value ); // sanity check
+    // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer
+    fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer
+    fptype2* allMEsFpt2 = allMEs;
+    // Real and imaginary components
+    const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical)
+    const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical)
+#endif
+    // Real and imaginary components
+    fptype2* allZtempReal = allZtempBoth;
+    fptype2* allZtempImag = allZtempBoth + ncolor * nevt;
+
+    // Note, new striding for cuBLAS from DeviceAccessJamp:
+    // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1"
+    // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1"
+
+    // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag
+    // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp
+    fptype2 alpha1 = 1;
+    fptype2 beta1 = 0;
+    const int ncolorM = ncolor;
+    const int nevtN = nevt;
+    const int ncolorK = ncolor;
+    checkGpuBlas( gpuBlasTgemm( *pBlasHandle,
+                                GPUBLAS_OP_N, // do not transpose ColMat
+                                GPUBLAS_OP_T, // transpose JampsV (new1)
+                                ncolorM, nevtN, ncolorK,
+                                &alpha1,
+                                devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK
+                                allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1)
+                                &beta1,
+                                allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN
+    checkGpuBlas( gpuBlasTgemm( *pBlasHandle,
+                                GPUBLAS_OP_N, // do not transpose ColMat
+                                GPUBLAS_OP_T, // transpose JampsV (new1)
+                                ncolorM, nevtN, ncolorK,
+                                &alpha1,
+                                devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK
+                                allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1)
+                                &beta1,
+                                allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN
+
+    // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt]
+    // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME
+    // Use cublasSgemmStridedBatched to perform these batched dot products in one call
+    fptype2 alpha2 = 1;
+    fptype2 beta2 = 1;
+    checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle,
+                                              GPUBLAS_OP_N, // do not transpose JampsV (new1)
+                                              GPUBLAS_OP_N, // do not transpose Tmp
+                                              1, 1, ncolor, // result is 1x1 (dot product)
+                                              &alpha2,
+                                              allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1)
+                                              allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column
+                                              &beta2,
+                                              allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e.
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/color_sum.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/diagram_boilerplate.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/diagrams.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/diagrams.h new file mode 100644 index 0000000000..a4f3be5d0a --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/diagrams.h @@ -0,0 +1,388 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
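The generated diagrams.h that follows defines one kernel per Feynman diagram, all with a near-uniform signature so that the generated caller can launch them in sequence (diagram1 additionally receives the momenta and the helicity, since it also computes the external wavefunctions reused by the later diagrams). Each kernel repeats the same multichannel bookkeeping: |amp|^2 is added to the single-diagram-enhancement (SDE) denominator for every diagram, but to the numerator only when the event's channelId matches the diagram number. A small self-contained sketch of that bookkeeping, with a hypothetical name (addSdeWeights) and a scalar complex type in place of the plugin's SIMD/GPU types:

#include <complex>

using cxtype = std::complex<double>;

inline double cxabs2( const cxtype& c ) { return std::norm( c ); } // |amp|^2

// Schematic version of the #ifdef MGONGPU_SUPPORTS_MULTICHANNEL block in every diagramN below
void addSdeWeights( unsigned int ndiag, unsigned int channelId, const cxtype& amp, double& numerator, double& denominator )
{
  if( channelId == ndiag ) numerator += cxabs2( amp ); // only the diagram selected by this event's channel
  if( channelId != 0 ) denominator += cxabs2( amp );   // every diagram (channelId == 0 disables SDE)
}

The accumulated numerator/denominator ratio is the fraction of the squared amplitude carried by the selected single diagram, which the multichannel phase-space sampling uses as its enhancement weight.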
+
+/* clang-format off */
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+            const fptype* momenta, // input: momenta[npar*4*nevtORneppV]
+            const int ihel ) // input: helicity (0 to ncomb-1)
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+#ifdef MGONGPUCPP_GPUIMPL
+    using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events
+#else
+    using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events
+#endif
+    // *** DIAGRAM 1 OF 12 ***
+    // Wavefunction(s) for diagram number 1
+    vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 );
+    oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 );
+    oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 );
+    ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 );
+    vxxxxx( momenta, cIPD[1], cHel[ihel][4], +1, w_fp[4], 4 );
+    ixxxxx( momenta, 0., cHel[ihel][5], -1, w_fp[5], 5 );
+    FFV1_2( w_fp[5], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[6] );
+    FFV1P0_3( w_fp[3], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[7] );
+    FFV2_2( w_fp[6], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[8] );
+    // Amplitude(s) for diagram number 1
+    FFV1_0( w_fp[8], w_fp[1], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 6. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 2 OF 12 ***
+    // Wavefunction(s) for diagram number 2
+    FFV2_1( w_fp[1], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[8] );
+    // Amplitude(s) for diagram number 2
+    FFV1_0( w_fp[6], w_fp[8], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 6. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 3 OF 12 ***
+    // Wavefunction(s) for diagram number 3
+    FFV1_1( w_fp[2], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[2], w_fp[6] );
+    FFV2_2( w_fp[5], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[9] );
+    FFV1P0_3( w_fp[3], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[10] );
+    // Amplitude(s) for diagram number 3
+    FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 6. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 4 OF 12 ***
+    // Wavefunction(s) for diagram number 4
+    // (none)
+    // Amplitude(s) for diagram number 4
+    FFV1_0( w_fp[5], w_fp[8], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 6. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 5 OF 12 ***
+    // Wavefunction(s) for diagram number 5
+    FFV1_2( w_fp[3], w_fp[0], COUPs[0], 1.0, cIPD[0], cIPD[2], w_fp[10] );
+    FFV1P0_3( w_fp[10], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[3] );
+    // Amplitude(s) for diagram number 5
+    FFV1_0( w_fp[9], w_fp[1], w_fp[3], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 6. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 6 OF 12 ***
+    // Wavefunction(s) for diagram number 6
+    // (none)
+    // Amplitude(s) for diagram number 6
+    FFV1_0( w_fp[5], w_fp[8], w_fp[3], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 6. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram7( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 7 OF 12 ***
+    // Wavefunction(s) for diagram number 7
+    FFV1_1( w_fp[1], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[3] );
+    // Amplitude(s) for diagram number 7
+    FFV1_0( w_fp[9], w_fp[3], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 6. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram8( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 8 OF 12 ***
+    // Wavefunction(s) for diagram number 8
+    FFV2_1( w_fp[3], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[10] );
+    // Amplitude(s) for diagram number 8
+    FFV1_0( w_fp[5], w_fp[10], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 6. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram9( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 9 OF 12 ***
+    // Wavefunction(s) for diagram number 9
+    FFV1_2( w_fp[9], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[10] );
+    // Amplitude(s) for diagram number 9
+    FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 6. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram10( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 10 OF 12 ***
+    // Wavefunction(s) for diagram number 10
+    VVV1P0_1( w_fp[0], w_fp[7], COUPs[1], 1.0, 0., 0., w_fp[10] );
+    // Amplitude(s) for diagram number 10
+    FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram11( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 11 OF 12 ***
+    // Wavefunction(s) for diagram number 11
+    // (none)
+    // Amplitude(s) for diagram number 11
+    FFV1_0( w_fp[5], w_fp[8], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram12( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 12 OF 12 ***
+    // Wavefunction(s) for diagram number 12
+    FFV1_1( w_fp[8], w_fp[0], COUPs[0], 1.0, 0., 0., w_fp[10] );
+    // Amplitude(s) for diagram number 12
+    FFV1_0( w_fp[5], w_fp[10], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 6. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+/* clang-format on */
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/driver.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/driver.f
index d8518f17f7..21de29dc55 100644
--- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/driver.f
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/driver.f
@@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened)
       fopened=.false.
       tempname=filename
      fine=index(tempname,' ')
-c     fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)"
       if(fine.eq.0) fine=len(tempname)
       open(unit=lun,file=tempname,status='old',ERR=20)
       fopened=.true.
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/matrix1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/matrix1.f
index 6b3ff14d2d..81f884204a 100644
--- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/matrix1.f
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_gux_ttxwmdx/matrix1.f
@@ -1,7 +1,7 @@
       SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
      $ ICOL)
 C
-C Generated by MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -447,7 +447,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(7) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -492,33 +493,35 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 4) /1.200000000000000D+01 - $ ,4.000000000000000D+00,4.000000000000000D+00,0.000000000000000D - $ +00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 4) /12,8,8,0/ C 1 T(1,2,4) T(3,6) - DATA (CF(I, 2),I= 1, 4) /4.000000000000000D+00 - $ ,1.200000000000000D+01,0.000000000000000D+00,4.000000000000000D - $ +00/ + DATA (CF(I),I= 5, 7) /12,0,8/ C 1 T(1,2,6) T(3,4) - DATA (CF(I, 3),I= 1, 4) /4.000000000000000D+00 - $ ,0.000000000000000D+00,1.200000000000000D+01,4.000000000000000D - $ +00/ + DATA (CF(I),I= 8, 9) /12,8/ C 1 T(1,3,4) T(2,6) - DATA (CF(I, 4),I= 1, 4) /0.000000000000000D+00 - $ ,4.000000000000000D+00,4.000000000000000D+00,1.200000000000000D - $ +01/ + DATA (CF(I),I= 10, 10) /12/ C 1 T(1,3,6) T(2,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) - IF(MDL_WW.NE.0D0) FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW - $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + + IF(MDL_WW.NE.0D0) THEN + FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW + $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + ELSE + FK_MDL_WW = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -601,10 +604,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -613,6 +618,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.cc index f90db593a9..b57648e9f6 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. 
Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm_no_b_mass.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -97,20 +99,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_no_b_mass_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_no_b_mass_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 4; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -169,57 +167,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR 
channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef 
MGONGPUCPP_GPUIMPL - using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -227,436 +280,157 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 11; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
-    //MemoryBufferWavefunctions w_buffer[nwf]{ neppV };
+    // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code
     cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles)
-    cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram
-
-    // Proof of concept for using fptype* in the interface
-    fptype* w_fp[nwf];
-    for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] );
-    fptype* amp_fp;
-    amp_fp = reinterpret_cast<fptype*>( amp_sv );
-
-    // Local variables for the given CUDA event (ievt) or C++ event page (ipagV)
-    // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination]
-    cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!)
+    fptype* wfs = reinterpret_cast<fptype*>( w_sv );
+#else
+    // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt)
+    // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting
+    fptype* wfs = allWfs;
+#endif
 
     // === Calculate wavefunctions and amplitudes for all diagrams in all processes ===
     // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ ===
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-    // Mixed fptypes #537: float for color algebra and double elsewhere
-    // Delay color algebra and ME updates (only on even pages)
-    cxtype_sv jamp_sv_previous[ncolor] = {};
-    fptype* MEs_previous = 0;
-#endif
+
+    // *****************************
+    // *** START LOOP ON IPARITY ***
+    // *****************************
     for( int iParity = 0; iParity < nParity; ++iParity )
-    { // START LOOP ON IPARITY
+    {
 #ifndef MGONGPUCPP_GPUIMPL
       const int ievt0 = ievt00 + iParity * neppV;
 #endif
-      //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823)
-      constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823)
-      const fptype* allCOUPs[nxcoup];
-#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
-#pragma nv_diagnostic push
-#pragma nv_diag_suppress 186 // e.g.
<> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 12 *** - - // Wavefunction(s) for diagram number 1 - ixxxxx( momenta, 0., cHel[ihel][0], +1, w_fp[0], 0 ); - - oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - vxxxxx( momenta, cIPD[1], cHel[ihel][4], +1, w_fp[4], 4 ); - - vxxxxx( momenta, 0., cHel[ihel][5], +1, w_fp[5], 5 ); - - FFV1_2( w_fp[0], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[6] ); - FFV1P0_3( w_fp[3], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[7] ); - FFV2_2( w_fp[6], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[8], 
w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= 1. / 2. * amp_sv[0]; - jamp_sv[3] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 2 OF 12 *** - - // Wavefunction(s) for diagram number 2 - FFV2_1( w_fp[1], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[6], w_fp[8], w_fp[7], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= 1. / 2. * amp_sv[0]; - jamp_sv[3] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 3 OF 12 *** - - // Wavefunction(s) for diagram number 3 - FFV1_1( w_fp[2], w_fp[5], COUPs[0], 1.0, cIPD[0], cIPD[2], w_fp[6] ); - FFV2_2( w_fp[0], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[9] ); - FFV1P0_3( w_fp[3], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 3 - FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 6. * amp_sv[0]; - jamp_sv[1] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 4 OF 12 *** - - // Wavefunction(s) for diagram number 4 - // (none) - - // Amplitude(s) for diagram number 4 - FFV1_0( w_fp[0], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 6. * amp_sv[0]; - jamp_sv[1] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 5 OF 12 *** - - // Wavefunction(s) for diagram number 5 - FFV1_2( w_fp[3], w_fp[5], COUPs[0], 1.0, cIPD[0], cIPD[2], w_fp[10] ); - FFV1P0_3( w_fp[10], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[3] ); - - // Amplitude(s) for diagram number 5 - FFV1_0( w_fp[9], w_fp[1], w_fp[3], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 6. * amp_sv[0]; - jamp_sv[2] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 6 OF 12 *** - - // Wavefunction(s) for diagram number 6 - // (none) - - // Amplitude(s) for diagram number 6 - FFV1_0( w_fp[0], w_fp[8], w_fp[3], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 6. * amp_sv[0]; - jamp_sv[2] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 7 OF 12 *** - - // Wavefunction(s) for diagram number 7 - FFV1_1( w_fp[1], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[3] ); - - // Amplitude(s) for diagram number 7 - FFV1_0( w_fp[9], w_fp[3], w_fp[7], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= 1. / 2. * amp_sv[0]; - jamp_sv[3] += 1. / 6. 
* amp_sv[0]; - - // *** DIAGRAM 8 OF 12 *** - - // Wavefunction(s) for diagram number 8 - FFV2_1( w_fp[3], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 8 - FFV1_0( w_fp[0], w_fp[10], w_fp[7], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= 1. / 2. * amp_sv[0]; - jamp_sv[3] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 9 OF 12 *** - - // Wavefunction(s) for diagram number 9 - FFV1_2( w_fp[9], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 9 - FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - jamp_sv[1] -= 1. / 2. * amp_sv[0]; - jamp_sv[3] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 10 OF 12 *** - - // Wavefunction(s) for diagram number 10 - VVV1P0_1( w_fp[5], w_fp[7], COUPs[1], 1.0, 0., 0., w_fp[10] ); - // Amplitude(s) for diagram number 10 - FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 11 OF 12 *** - - // Wavefunction(s) for diagram number 11 - // (none) - - // Amplitude(s) for diagram number 11 - FFV1_0( w_fp[0], w_fp[8], w_fp[10], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. 
zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 12 OF 12 *** - - // Wavefunction(s) for diagram number 12 - FFV1_1( w_fp[8], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[10] ); - // Amplitude(s) for diagram number 12 - FFV1_0( w_fp[0], w_fp[10], w_fp[7], COUPs[0], 1.0, &_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= 1. / 2. * amp_sv[0]; - jamp_sv[3] += 1. / 6. * amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_udx_ttxwpg()?) - - // The color denominators (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] - - // The color matrix (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 12, 4, 4, 0 }, - { 4, 12, 0, 4 }, - { 4, 0, 12, 4 }, - { 0, 4, 4, 12 } }; // 2-D array[4][4] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! 
skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined 
MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 12 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram8, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram9, gpublocks, gputhreads, gpustream, wfs, 
jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram10, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram11, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram12, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); -#else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram8( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram9( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram10( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram11( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram12( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -775,7 +549,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -810,6 +588,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MW ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_MW, (fptype)m_pars->mdl_WT }; @@ -853,6 +635,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm_no_b_mass::mdl_MT ); m_masses.push_back( Parameters_sm_no_b_mass::mdl_MW ); m_masses.push_back( Parameters_sm_no_b_mass::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -955,26 +741,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = 
HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings, bsmIndepParam ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -982,25 +768,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! 
+ atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= 
ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -1136,22 +1126,16 @@ namespace mg5amcCpu // These variable are not used anywhere else in the code and their scope is limited to this sanity check { // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) - constexpr int nprocesses = 2; + constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1163,17 +1147,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 
0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1199,93 +1186,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) 
Then, In multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? 
ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1327,7 +1284,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1350,7 +1307,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1359,25 +1316,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1387,8 +1350,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1404,11 +1369,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1510,14 +1476,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.h index b4a0ccb74d..5e49bb346c 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The 
MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm_no_b_mass.h" #include @@ -76,17 +77,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 96; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 12; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 4; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 11; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 6; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 12; //static const int ncomb = 96; // CPPProcess::ncomb @@ -123,23 +124,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -153,34 +157,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators 
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/auto_dsig.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/auto_dsig.f index 7e750641c8..4d39b68db0 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/auto_dsig.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/auto_dsig1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/auto_dsig1.f index 28ad0eed08..7cbb2180db 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/auto_dsig1.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -138,7 +138,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF U1=PDG2PDF(LPP(IB(1)),2, IB(1),XBK(IB(1)), QSCALE) C1=PDG2PDF(LPP(IB(1)),4, IB(1),XBK(IB(1)), QSCALE) @@ -146,7 +146,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) DX2=PDG2PDF(LPP(IB(2)),-1, IB(2),XBK(IB(2)), QSCALE) @@ -225,7 +225,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -297,6 +297,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -380,16 +384,16 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) U1(IVEC)=PDG2PDF(LPP(IB(1)),2, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) C1(IVEC)=PDG2PDF(LPP(IB(1)),4, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -501,6 +505,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/color_sum.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/color_sum.cc new file mode 100644 index 0000000000..389d8f6535 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/color_sum.cc @@ -0,0 +1,385 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). 
+// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin.
+
+#include "color_sum.h"
+
+#include "mgOnGpuConfig.h"
+
+#include "MemoryAccessMatrixElements.h"
+
+#ifdef MGONGPUCPP_GPUIMPL
+namespace mg5amcGpu
+#else
+namespace mg5amcCpu
+#endif
+{
+  constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors
+
+  //--------------------------------------------------------------------------
+
+  // *** COLOR MATRIX BELOW ***
+
+  // The color denominators (initialize all array elements, with ncolor=4)
+  // [NB do keep 'static' for these constexpr arrays, see issue #283]
+  static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4]
+
+  // The color matrix (initialize all array elements, with ncolor=4)
+  // [NB do keep 'static' for these constexpr arrays, see issue #283]
+  static constexpr fptype2 colorMatrix[ncolor][ncolor] = {
+    { 12, 4, 4, 0 },
+    { 4, 12, 0, 4 },
+    { 4, 0, 12, 4 },
+    { 0, 4, 4, 12 } }; // 2-D array[4][4]
+
+#ifdef MGONGPUCPP_GPUIMPL
+  // The normalized color matrix (divide each column by denom)
+  template<typename T>
+  struct NormalizedColorMatrix
+  {
+    constexpr __host__ __device__ NormalizedColorMatrix()
+      : value()
+    {
+      for( int icol = 0; icol < ncolor; icol++ )
+        for( int jcol = 0; jcol < ncolor; jcol++ )
+          value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol];
+    }
+    T value[ncolor * ncolor];
+  };
+  // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas)
+  static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor];
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  void createNormalizedColorMatrix()
+  {
+    static bool first = true;
+    if( first )
+    {
+      first = false;
+      constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2;
+      gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) );
+    }
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifndef MGONGPUCPP_GPUIMPL
+  void
+  color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity
+                 const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity
+                 const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid)
+  {
+    // Pre-compute a constexpr triangular color matrix properly normalized #475
+    struct TriangularNormalizedColorMatrix
+    {
+      // See https://stackoverflow.com/a/34465458
+      __host__ __device__ constexpr TriangularNormalizedColorMatrix()
+        : value()
+      {
+        for( int icol = 0; icol < ncolor; icol++ )
+        {
+          // Diagonal terms
+          value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol];
+          // Off-diagonal terms
+          for( int jcol = icol + 1; jcol < ncolor; jcol++ )
+            value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol];
+        }
+      }
+      fptype2 value[ncolor][ncolor];
+    };
+    static constexpr auto cf2 = TriangularNormalizedColorMatrix();
+    // Use the property that M is a real matrix (see #475):
+    // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB
+    // In addition, in C++ use the property that M is symmetric (see #475),
+    // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time:
+    // we gain (not a factor 2...) in speed here as we only loop over the upper triangular part of the matrix.
+    // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
+    fptype_sv deltaMEs = { 0 };
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+    fptype_sv deltaMEs_next = { 0 };
+    // Mixed mode: merge two neppV vectors into one neppV2 vector
+    fptype2_sv jampR_sv[ncolor];
+    fptype2_sv jampI_sv[ncolor];
+    for( int icol = 0; icol < ncolor; icol++ )
+    {
+      jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) );
+      jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) );
+    }
+#else
+    const cxtype_sv* jamp_sv = allJamp_sv;
+#endif
+    // Loop over icol
+    for( int icol = 0; icol < ncolor; icol++ )
+    {
+      // Diagonal terms
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+      fptype2_sv& jampRi_sv = jampR_sv[icol];
+      fptype2_sv& jampIi_sv = jampI_sv[icol];
+#else
+      fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) );
+      fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) );
+#endif
+      fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv;
+      fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv;
+      // Loop over jcol
+      for( int jcol = icol + 1; jcol < ncolor; jcol++ )
+      {
+        // Off-diagonal terms
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+        fptype2_sv& jampRj_sv = jampR_sv[jcol];
+        fptype2_sv& jampIj_sv = jampI_sv[jcol];
+#else
+        fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) );
+        fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) );
+#endif
+        ztempR_sv += cf2.value[icol][jcol] * jampRj_sv;
+        ztempI_sv += cf2.value[icol][jcol] * jampIj_sv;
+      }
+      fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+      deltaMEs += fpvsplit0( deltaMEs2 );
+      deltaMEs_next += fpvsplit1( deltaMEs2 );
+#else
+      deltaMEs += deltaMEs2;
+#endif
+    }
+    // *** STORE THE RESULTS ***
+    using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events
+    fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 );
+    // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
+    fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
+    MEs_sv += deltaMEs; // fix #435
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+    fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV );
+    fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next );
+    MEs_sv_next += deltaMEs_next;
+#endif
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  __global__ void
+  color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity
+                    const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity
+  {
+    using J_ACCESS = DeviceAccessJamp;
+    fptype jampR[ncolor];
+    fptype jampI[ncolor];
+    for( int icol = 0; icol < ncolor; icol++ )
+    {
+      cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol );
+      jampR[icol] = jamp.real();
+      jampI[icol] = jamp.imag();
+    }
+    // Loop over icol
+    fptype deltaMEs = { 0 };
+    for( int icol = 0; icol < ncolor; icol++ )
+    {
+      fptype2 ztempR = { 0 };
+      fptype2 ztempI = { 0 };
+      // Loop over
jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // 
Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity
+    fptype2* allZtempBoth = allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer
+    fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer
+    fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer
+    // Convert jamps from double to float
+    gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps );
+    // Real and imaginary components
+    const fptype2* allJampsReal = allJampsFpt2;
+    const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt;
+#else
+    static_assert( std::is_same<fptype, fptype2>::value ); // sanity check
+    // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer
+    fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer
+    fptype2* allMEsFpt2 = allMEs;
+    // Real and imaginary components
+    const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical)
+    const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical)
+#endif
+    // Real and imaginary components
+    fptype2* allZtempReal = allZtempBoth;
+    fptype2* allZtempImag = allZtempBoth + ncolor * nevt;
+
+    // Note, new striding for cuBLAS from DeviceAccessJamp:
+    // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1"
+    // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1"
+
+    // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag
+    // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp
+    fptype2 alpha1 = 1;
+    fptype2 beta1 = 0;
+    const int ncolorM = ncolor;
+    const int nevtN = nevt;
+    const int ncolorK = ncolor;
+    checkGpuBlas( gpuBlasTgemm( *pBlasHandle,
+                                GPUBLAS_OP_N, // do not transpose ColMat
+                                GPUBLAS_OP_T, // transpose JampsV (new1)
+                                ncolorM, nevtN, ncolorK,
+                                &alpha1,
+                                devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK
+                                allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1)
+                                &beta1,
+                                allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN
+    checkGpuBlas( gpuBlasTgemm( *pBlasHandle,
+                                GPUBLAS_OP_N, // do not transpose ColMat
+                                GPUBLAS_OP_T, // transpose JampsV (new1)
+                                ncolorM, nevtN, ncolorK,
+                                &alpha1,
+                                devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK
+                                allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1)
+                                &beta1,
+                                allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN
+
+    // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt]
+    // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME
+    // Use cublasSgemmStridedBatched to perform these batched dot products in one call
+    fptype2 alpha2 = 1;
+    fptype2 beta2 = 1;
+    checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle,
+                                              GPUBLAS_OP_N, // do not transpose JampsV (new1)
+                                              GPUBLAS_OP_N, // do not transpose Tmp
+                                              1, 1, ncolor, // result is 1x1 (dot product)
+                                              &alpha2,
+                                              allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1)
+                                              allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column
+                                              &beta2,
+                                              allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e.
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/color_sum.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/diagram_boilerplate.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/diagrams.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/diagrams.h new file mode 100644 index 0000000000..81fbede8ee --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/diagrams.h @@ -0,0 +1,388 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
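As a worked illustration of the color sum that color_sum.cc implements above, the following minimal standalone C++ sketch (not part of the patch; fptype widths, kernel splitting, streams and SIMD vectorization are all stripped out, and the function name colorSum is hypothetical) evaluates |M|^2 = sum_ij conj(J_i) * (CF_ij / denom_i) * J_j for one event and one helicity, using the same ncolor=4 color matrix quoted in the diff and the same upper-triangular optimization with doubled off-diagonal terms:

  #include <complex>

  constexpr int ncolor = 4;
  constexpr double denom[ncolor] = { 1, 1, 1, 1 };
  constexpr double cf[ncolor][ncolor] = { { 12, 4, 4, 0 },
                                          { 4, 12, 0, 4 },
                                          { 4, 0, 12, 4 },
                                          { 0, 4, 4, 12 } };

  // Since cf is real and symmetric, conj(J_i) cf_ij J_j reduces to
  // Re(J)_i cf_ij Re(J)_j + Im(J)_i cf_ij Im(J)_j ("AMA + BMB"), and only the
  // upper triangle is needed if off-diagonal terms are counted twice.
  double colorSum( const std::complex<double> jamp[ncolor] )
  {
    double me2 = 0;
    for( int i = 0; i < ncolor; i++ )
    {
      double ztR = cf[i][i] / denom[i] * jamp[i].real();
      double ztI = cf[i][i] / denom[i] * jamp[i].imag();
      for( int j = i + 1; j < ncolor; j++ )
      {
        ztR += 2 * cf[i][j] / denom[i] * jamp[j].real();
        ztI += 2 * cf[i][j] / denom[i] * jamp[j].imag();
      }
      me2 += jamp[i].real() * ztR + jamp[i].imag() * ztI;
    }
    return me2;
  }

The jamp[] inputs are the per-color amplitude sums that the diagram1..diagram12 kernels below accumulate via J_ACCESS::kernelAccessIcol( jamps, icol ), e.g. jamp[1] -= 1/2 * amp and jamp[3] += 1/6 * amp for diagram 1.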
+
+/* clang-format off */
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+            const fptype* momenta, // input: momenta[npar*4*nevtORneppV]
+            const int ihel ) // input: helicity (0 to ncomb)
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+#ifdef MGONGPUCPP_GPUIMPL
+    using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events
+#else
+    using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events
+#endif
+    // *** DIAGRAM 1 OF 12 ***
+    // Wavefunction(s) for diagram number 1
+    ixxxxx( momenta, 0., cHel[ihel][0], +1, w_fp[0], 0 );
+    oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 );
+    oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 );
+    ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 );
+    vxxxxx( momenta, cIPD[1], cHel[ihel][4], +1, w_fp[4], 4 );
+    vxxxxx( momenta, 0., cHel[ihel][5], +1, w_fp[5], 5 );
+    FFV1_2( w_fp[0], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[6] );
+    FFV1P0_3( w_fp[3], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[7] );
+    FFV2_2( w_fp[6], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[8] );
+    // Amplitude(s) for diagram number 1
+    FFV1_0( w_fp[8], w_fp[1], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 6. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 2 OF 12 ***
+    // Wavefunction(s) for diagram number 2
+    FFV2_1( w_fp[1], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[8] );
+    // Amplitude(s) for diagram number 2
+    FFV1_0( w_fp[6], w_fp[8], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 6. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 3 OF 12 ***
+    // Wavefunction(s) for diagram number 3
+    FFV1_1( w_fp[2], w_fp[5], COUPs[0], 1.0, cIPD[0], cIPD[2], w_fp[6] );
+    FFV2_2( w_fp[0], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[9] );
+    FFV1P0_3( w_fp[3], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[10] );
+    // Amplitude(s) for diagram number 3
+    FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 6. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 4 OF 12 ***
+    // Wavefunction(s) for diagram number 4
+    // (none)
+    // Amplitude(s) for diagram number 4
+    FFV1_0( w_fp[0], w_fp[8], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 6. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 5 OF 12 ***
+    // Wavefunction(s) for diagram number 5
+    FFV1_2( w_fp[3], w_fp[5], COUPs[0], 1.0, cIPD[0], cIPD[2], w_fp[10] );
+    FFV1P0_3( w_fp[10], w_fp[2], COUPs[0], 1.0, 0., 0., w_fp[3] );
+    // Amplitude(s) for diagram number 5
+    FFV1_0( w_fp[9], w_fp[1], w_fp[3], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 6. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 6 OF 12 ***
+    // Wavefunction(s) for diagram number 6
+    // (none)
+    // Amplitude(s) for diagram number 6
+    FFV1_0( w_fp[0], w_fp[8], w_fp[3], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 6. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram7( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 7 OF 12 ***
+    // Wavefunction(s) for diagram number 7
+    FFV1_1( w_fp[1], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[3] );
+    // Amplitude(s) for diagram number 7
+    FFV1_0( w_fp[9], w_fp[3], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 6. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram8( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 8 OF 12 ***
+    // Wavefunction(s) for diagram number 8
+    FFV2_1( w_fp[3], w_fp[4], COUPs[ndcoup + 0], 1.0, 0., 0., w_fp[10] );
+    // Amplitude(s) for diagram number 8
+    FFV1_0( w_fp[0], w_fp[10], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 6. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram9( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 9 OF 12 ***
+    // Wavefunction(s) for diagram number 9
+    FFV1_2( w_fp[9], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[10] );
+    // Amplitude(s) for diagram number 9
+    FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 6. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram10( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 10 OF 12 ***
+    // Wavefunction(s) for diagram number 10
+    VVV1P0_1( w_fp[5], w_fp[7], COUPs[1], 1.0, 0., 0., w_fp[10] );
+    // Amplitude(s) for diagram number 10
+    FFV1_0( w_fp[9], w_fp[1], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram11( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 11 OF 12 ***
+    // Wavefunction(s) for diagram number 11
+    // (none)
+    // Amplitude(s) for diagram number 11
+    FFV1_0( w_fp[0], w_fp[8], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram12( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 12 OF 12 ***
+    // Wavefunction(s) for diagram number 12
+    FFV1_1( w_fp[8], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[10] );
+    // Amplitude(s) for diagram number 12
+    FFV1_0( w_fp[0], w_fp[10], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 6. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+/* clang-format on */
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/driver.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/driver.f
index d8518f17f7..21de29dc55 100644
--- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/driver.f
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/driver.f
@@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened)
       fopened=.false.
       tempname=filename
       fine=index(tempname,' ')
-c     fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)"
       if(fine.eq.0) fine=len(tempname)
       open(unit=lun,file=tempname,status='old',ERR=20)
       fopened=.true.
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/matrix1.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/matrix1.f
index 536bec2827..fecf2b47e6 100644
--- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/matrix1.f
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/P1_udx_ttxwpg/matrix1.f
@@ -1,7 +1,7 @@
       SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
     $ ICOL)
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+C     Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
@@ -403,7 +403,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
       REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
 C
-C     Generated by MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -447,7 +447,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(7) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -492,33 +493,35 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 4) /1.200000000000000D+01 - $ ,4.000000000000000D+00,4.000000000000000D+00,0.000000000000000D - $ +00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 4) /12,8,8,0/ C 1 T(2,1) T(6,3,4) - DATA (CF(I, 2),I= 1, 4) /4.000000000000000D+00 - $ ,1.200000000000000D+01,0.000000000000000D+00,4.000000000000000D - $ +00/ + DATA (CF(I),I= 5, 7) /12,0,8/ C 1 T(2,4) T(6,3,1) - DATA (CF(I, 3),I= 1, 4) /4.000000000000000D+00 - $ ,0.000000000000000D+00,1.200000000000000D+01,4.000000000000000D - $ +00/ + DATA (CF(I),I= 8, 9) /12,8/ C 1 T(3,1) T(6,2,4) - DATA (CF(I, 4),I= 1, 4) /0.000000000000000D+00 - $ ,4.000000000000000D+00,4.000000000000000D+00,1.200000000000000D - $ +01/ + DATA (CF(I),I= 10, 10) /12/ C 1 T(3,4) T(6,2,1) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) - IF(MDL_WW.NE.0D0) FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW - $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + + IF(MDL_WW.NE.0D0) THEN + FK_MDL_WW = SIGN(MAX(ABS(MDL_WW), ABS(MDL_MW + $ *SMALL_WIDTH_TREATMENT)), MDL_WW) + ELSE + FK_MDL_WW = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -601,10 +604,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -613,6 +618,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/addmothers.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/addmothers.f index 9a31ed201d..6f32477b9e 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/addmothers.f @@ -21,7 +21,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, integer icol ! color selected integer isym(nexternal,99), jsym - integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,nc,ic + integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,ic integer mo_color,da_color(2),itmp integer ito(-nexternal+3:nexternal),iseed,maxcolor,maxorg integer icolalt(2,-nexternal+2:2*nexternal-3) @@ -113,14 +113,15 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif lconfig = vec_igraph1(ivec) endif - + is_LC=.true. 
+ maxcolor=0 c c Choose a color flow which is certain to work with the propagator c structure of the chosen diagram and use that as an alternative c if (icol.eq.0) then do i=1,nexternal - icolalt(1,i)=0 + icolalt(1,i)=0 icolalt(2,i)=0 enddo else diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cluster.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cluster.f index b8995283ed..907894ea89 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cluster.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cluster.f @@ -556,6 +556,8 @@ logical function cluster(p, ivec) jwin = 0 cluster=.false. clustered=.false. + iwin =0 + jwin =0 do i=0,3 pcmsp(i)=0 enddo @@ -665,8 +667,11 @@ logical function cluster(p, ivec) c initialize graph storage igraphs(0)=0 nleft=nexternal -c cluster - if (iwin.eq.0.or.jwin.eq.0) stop 21 + if(iwin.eq.0.or.jwin.eq.0)then + cluster=.false. + return + endif +c cluster do n=1,nexternal-2 c combine winner imocl(n)=imap(iwin,2)+imap(jwin,2) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/color_sum.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/color_sum.h new file mode 100644 index 0000000000..71b5b1eaad --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/color_sum.h @@ -0,0 +1,105 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype_ref( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + static __device__ inline const cxtype + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt 
matrices for each of real and imag (ievt last)
+      // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS
+      return cxtype( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1"
+      // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last)
+      //return cxtype( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2"
+    }
+  };
+#else
+  class HostAccessJamp
+  {
+  public:
+    static inline cxtype_sv&
+    kernelAccessIcol( cxtype_sv* buffer, const int icol )
+    {
+      return buffer[icol];
+    }
+    static inline cxtype_sv&
+    kernelAccessIcol( fptype* buffer, const int icol )
+    {
+      return reinterpret_cast<cxtype_sv*>( buffer )[icol];
+    }
+  };
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  void createNormalizedColorMatrix();
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifndef MGONGPUCPP_GPUIMPL
+  void
+  color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity
+                 const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity
+                 const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid)
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  void
+  color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity
+                 const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity
+                 fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity
+                 gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream)
+                 gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+                 const int gpublocks, // input: cuda gpublocks
+                 const int gputhreads ); // input: cuda gputhreads
+#endif
+
+  //--------------------------------------------------------------------------
+}
+
+#endif // COLOR_SUM_H
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk
index 20d8ded718..e7360b29e2 100644
--- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk
@@ -1,7 +1,7 @@
-# Copyright (C) 2020-2024 CERN and UCLouvain.
+# Copyright (C) 2020-2025 CERN and UCLouvain.
 # Licensed under the GNU Lesser General Public License (version 3 or later).
 # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin.
-# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.
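To make the "new1" memory layout adopted in color_sum.h above concrete, here is a small standalone sketch (the two helper functions are hypothetical, not the plugin's API): a jamp buffer of ncolor*2*nevt fptypes holds all real parts first as an ncolor x nevt block, then all imaginary parts, with the event index running fastest:

  // "new1" striding: buffer[ipart * ncolor * nevt + icol * nevt + ievt],
  // where ipart = 0 selects the real block and ipart = 1 the imaginary block.
  inline int jampRealIndex( int icol, int ievt, int ncolor, int nevt )
  {
    return 0 * ncolor * nevt + icol * nevt + ievt;
  }
  inline int jampImagIndex( int icol, int ievt, int ncolor, int nevt )
  {
    return 1 * ncolor * nevt + icol * nevt + ievt;
  }

Because each color row is contiguous in ievt, cuBLAS/hipBLAS can treat the real and imaginary blocks as two plain matrices, which is what allows the two Tgemm calls and the stride-1 batched dot products in color_sum_blas above.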
#=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html)
#=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts
@@ -114,7 +114,7 @@ export CXXFLAGS
 override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null))

 # Set HIP_HOME from the path to hipcc, if it exists
-override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null))
+override HIP_HOME = $(shell hipconfig --rocmpath)

 # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965)
 ifeq ($(CUDA_HOME),)
@@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda)

 else ifeq ($(BACKEND),hip)
+  # example architecture values MI200:gfx90a, MI300X:gfx942
+  MADGRAPH_HIP_ARCHITECTURE ?= gfx942

   # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists)
   GPUCC = $(HIP_HOME)/bin/hipcc
   XCOMPILERFLAG =
@@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip)
   ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY

   # AMD HIP architecture flags
-  GPUARCHFLAGS = --offload-arch=gfx90a
+  GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE}
   GPUFLAGS += $(GPUARCHFLAGS)

   # Other AMD-specific flags
@@ -477,6 +479,34 @@ endif

 #-------------------------------------------------------------------------------

+#=== Configure defaults and check if user-defined choices exist for HASBLAS
+
+# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS
+
+ifeq ($(HASBLAS),)
+  ifeq ($(GPUCC),) # CPU-only build
+    override HASBLAS = hasNoBlas
+  else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+    ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),)
+      # cuBLAS headers do not exist??
+      override HASBLAS = hasNoBlas
+    else
+      override HASBLAS = hasBlas
+    endif
+  else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build
+    ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),)
+      # hipBLAS headers do not exist??
+      override HASBLAS = hasNoBlas
+    else
+      override HASBLAS = hasBlas
+    endif
+  else
+    override HASBLAS = hasNoBlas
+  endif
+endif
+
+#-------------------------------------------------------------------------------
+
 #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD

 # Set the build flags appropriate to OMPFLAGS
@@ -597,6 +627,30 @@ endif
 #$(info RNDCXXFLAGS=$(RNDCXXFLAGS))
 #$(info RNDLIBFLAGS=$(RNDLIBFLAGS))

+#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS
+
+$(info HASBLAS=$(HASBLAS))
+override BLASCXXFLAGS=
+override BLASLIBFLAGS=
+
+# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas")
+ifeq ($(HASBLAS),hasNoBlas)
+  override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS
+else ifeq ($(HASBLAS),hasBlas)
+  ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+    override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas
+  else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build
+    override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas
+  endif
+else
+  $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported)
+endif
+CXXFLAGS += $(BLASCXXFLAGS)
+GPUFLAGS += $(BLASCXXFLAGS)
+
+#$(info BLASCXXFLAGS=$(BLASCXXFLAGS))
+#$(info BLASLIBFLAGS=$(BLASLIBFLAGS))
+
 #-------------------------------------------------------------------------------

 #=== Configure Position-Independent Code
@@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))

 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o

 ifneq ($(GPUCC),)
 MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX)
-gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o
+gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o
 gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o
 endif
@@ -799,7 +853,7 @@ ifneq ($(GPUCC),)
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib)
-	$(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB)
+	$(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS)
 # Bypass std::filesystem completely to ease portability on LUMI #803
 #ifneq ($(findstring hipcc,$(GPUCC)),)
 #	$(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs
@@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cuts.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cuts.f index 7898714201..bd50ab1357 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cuts.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cuts.f @@ -307,12 +307,18 @@ LOGICAL FUNCTION PASSCUTS(P, VECSIZE_USED) c c Limit S_hat c - if (dsqrt_shat.ne.0d0)then - if (nincoming.eq.2.and.sumdot(p(0,1),p(0,2),1d0) .lt. dsqrt_shat**2) then - passcuts=.false. - return - endif - endif + if(nincoming.eq.2) then + if (dsqrt_shat.ne.0d0.or.dsqrt_shatmax.ne.-1d0)then + xvar = sumdot(p(0,1),p(0,2),1d0) + if (xvar .lt. 
dsqrt_shat**2)then + passcuts=.false. + return + else if (dsqrt_shatmax.ne.-1d0 .and. xvar .gt. dsqrt_shatmax**2)then + passcuts = .false. + return + endif + endif + endif C $B$ DESACTIVATE_CUT $E$ !This is a tag for MadWeight if(debug) write (*,*) '=============================' diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/diagram_boilerplate.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/diagram_boilerplate.h new file mode 100644 index 0000000000..96a34fb1bf --- /dev/null +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/diagram_boilerplate.h @@ -0,0 +1,103 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin. + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-local-typedefs" // for CI_ACCESS and CD_ACCESS + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + + //------------- + // GPU only + //------------- + + //using namespace mg5amcGpu; + using W_ACCESS = DeviceAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = DeviceAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( channelIds ); +#endif + + // Wavefunctions + // Buffer wfs for one helicity and nevt events is a DeviceBufferSimple with ( nwf * nevt * nw6 * nx2 ) fptypes + // The striding between the nwf wavefunction buffers is ( nevt * nw6 * nx2 ) fptypes + // Internally diagramXXX methods pass a w_fp[iwf] to ixx/FFV methods (as argument 'fptype wavefunctions[]') + // Internally ixx/FFV methods call 'cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions )' and then use fi[iw6] + // This means that the fi pointer must point to a [RIRIRIRIRIRI] contiguous buffer of size nw6*nx2=12 + // The striding between events is nw6*nx2=12 and this is what W_ACCESS::kernelAccess must respect + // (En passant, note that this means that events cannot be contiguous in the present code, memory is not coalesced) + const int nevt = gridDim.x * blockDim.x; + fptype* w_fp[nwf]; + for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = wfs + iwf * nevt * nw6 * mgOnGpu::nx2; + + // Couplings + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic push +#pragma nv_diag_suppress 186 // e.g. 
<<warning #186-D: pointless comparison of unsigned integer with zero>>
+#endif
+  // Dependent couplings, vary event-by-event
+  for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
+    allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup );
+  // Independent couplings, fixed for all events
+  for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup)
+    allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup );
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
+#pragma nv_diagnostic pop
+#endif
+  const fptype* COUPs[nxcoup];
+  for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
+
+#else
+
+  //-------------
+  // C++ only
+  //-------------
+
+  //using namespace mg5amcCpu;
+  using W_ACCESS = HostAccessWavefunctions;   // non-trivial access (with kernel splitting): buffer includes all events
+  using A_ACCESS = HostAccessAmplitudes;      // TRIVIAL ACCESS (local variable for one event): buffer for one event
+  using CD_ACCESS = HostAccessCouplings;      // non-trivial access (dependent couplings): buffer includes all events
+  using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event
+  using J_ACCESS = HostAccessJamp;
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  using NUM_ACCESS = HostAccessNumerators;    // non-trivial access: buffer includes all events
+  using DEN_ACCESS = HostAccessDenominators;  // non-trivial access: buffer includes all events
+  // SCALAR channelId for the current SIMD event page (C++)
+  unsigned int channelId = *channelIds;
+#endif
+
+  // Wavefunctions
+  // Reinterpret wfs as "cxtype_sv w_sv[nwf][nw6]" and build "fptype* w_fp[nwf]" where "w_fp[iwf] = (fptype*)( w_sv[iwf] )"
+  fptype (*w_fp)[nw6 * neppV * mgOnGpu::nx2] = (fptype (*)[nw6 * neppV * mgOnGpu::nx2])(wfs);
+
+#endif
+
+  //-------------
+  // GPU or C++
+  //-------------
+
+  // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV)
+  cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram
+  fptype* amp_fp;      // proof of concept for using fptype* in the interface
+  amp_fp = reinterpret_cast<fptype*>( amp_sv );
+
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
+  fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+  fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
+#else
+  assert( channelIds == nullptr );
+  assert( numerators == nullptr );
+  assert( denominators == nullptr );
+#endif /* clang-format on */
+
+#pragma GCC diagnostic pop
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/genps.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/genps.f
index 1c32e93f5d..8503bdbec8 100644
--- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/genps.f
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/genps.f
@@ -1373,6 +1373,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass)
       double precision smin,smax,spole,swidth,s,jac
       double precision x
       logical pass
+      include 'maxparticles.inc'
+      include '../../Source/vector.inc'
+      include 'run.inc'
+      include 'cuts.inc'
 c
 c     Local
 c
@@ -1384,6 +1388,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass)
 c-----
 c  Begin Code
 c-----
+      if (dsqrt_shatmax.ne.-1d0)then
+         smax = min(smax, dsqrt_shatmax**2)
+      endif
+
       pass=.true.
       if (jac .eq. 0 .and. .not.
warned0) then print*,'Input jacobian 0 in genps' @@ -1628,7 +1636,10 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) DOUBLE PRECISION ETA,ETAMIN,ETAMAX logical warned data warned/.false./ - + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' C------------ C BEGIN CODE C------------ @@ -1645,7 +1656,11 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) C IF THERE IS NO S CHANNEL POLE USE BELOW: TAUMIN = 0d0 !SMIN/S !keep scale fix - TAUMAX = 1D0 + if (dsqrt_shatmax.ne.-1d0)then + TAUMAX=dsqrt_shatmax**2/S + else + TAUMAX = 1D0 + endif TAU = (TAUMAX-TAUMIN)*X(1)+TAUMIN SJACOBI= sjacobi*(TAUMAX-TAUMIN) @@ -1915,7 +1930,7 @@ double precision function get_channel_cut(p, config) if(sde_strat.eq.2)then t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) - get_channel_cut = get_channel_cut / ((t-Mass)*(t+Mass)+stot*1d-10)**2 + get_channel_cut = get_channel_cut / (t-Mass**2+stot*1d-10)**2 endif c write(*,*) i, "t, Mass, fact", t, Mass, ((t-Mass)*(t+Mass))**2,get_channel_cut t = t/stot @@ -1930,9 +1945,9 @@ double precision function get_channel_cut(p, config) t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) Width = prwidth(-i, config) - tmp = (t-Mass)*(t+Mass) + tmp = (t-Mass**2) tmp2 = Mass*Width - get_channel_cut = get_channel_cut* (tmp**2 - tmp2**2)/(tmp**2 + tmp2**2)**2 + get_channel_cut = get_channel_cut/(tmp**2 + tmp2**2) endif c write(*,*) i, "s, Mass, Width, fact", t, Mass, Width, (((t-Mass)*(t+Mass) )**2 + Width**2*Mass**2), get_channel_cut endif diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/myamp.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/myamp.f index 9e5f8d44dd..5360566ef4 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/myamp.f @@ -231,6 +231,7 @@ subroutine set_peaks double precision x1,x2,xk(nexternal) double precision dr,mtot,etot,xqfact double precision spmass + double precision stot ! technically the min with dsqrt_shatmax**2 with the physical one integer i, iconfig, l1, l2, j, nt, nbw, iproc, k integer iden_part(-nexternal+1:nexternal) @@ -285,8 +286,8 @@ subroutine set_peaks integer lbw(0:nexternal) !Use of B.W. common /to_BW/ lbw - double precision stot,m1,m2 - common/to_stot/stot,m1,m2 + double precision real_stot,m1,m2 + common/to_stot/real_stot,m1,m2 include 'coupl.inc' ! 
needs VECSIZE_MEMMAX (defined in vector.inc) include 'cuts.inc' @@ -309,6 +310,12 @@ subroutine set_peaks c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1)then + stot = min(real_stot, dsqrt_shatmax**2) + else + stot = real_stot + endif + iconfig = this_config c needs to be initialise to avoid segfault do i = -nexternal,-1 diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/reweight.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/reweight.f index 0a0bafa7c1..189ba41449 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/reweight.f @@ -1186,7 +1186,7 @@ logical function setclscales(p, keepq2bck, ivec) if(nexternal.gt.3) pt2ijcl(nexternal-3)=q2fact(2) else if(.not.fixed_fac_scale1) q2fact(1)=scalefact**2*pt2ijcl(nexternal-2) - if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*q2fact(1) + if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*pt2ijcl(nexternal-2) endif elseif(jcentral(1).eq.0)then if(.not.fixed_fac_scale1) q2fact(1) = scalefact**2*pt2ijcl(jfirst(1)) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/runTest.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/runTest.cc index 4eec5db13c..216a90a302 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/symmetry.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/symmetry.f index 309540a0a2..2afd9e9f75 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/symmetry.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/symmetry.f @@ -232,7 +232,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, c write(*,*) 'mapping',ic,mapconfig(i),icode if (icode .eq. 
0) then c Create format string based on number of digits - write(formstr,'(a,i1,a)') '(I',nconf,'$)' + write(formstr,'(a,i1,a)') '(I',nconf,',$)' write(*,formstr) mapconfig(i) c Write symmetry factors write(formstr2,'(a,i2,a)') '(2i',nsym,')' @@ -242,10 +242,10 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode if(nconf+ncode+1.lt.10) then write(formstr,'(a,i1,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' else write(formstr,'(a,i2,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' endif write(*,formstr) dconfig c Write symmetry factors @@ -260,7 +260,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode write(27,formstr2) dconfig,use_config(i) endif - write(*,'(a$)') ' ' + write(*,'(a,$)') ' ' 100 call bw_increment_array(iarray,imax,ibase,done) enddo else diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/unwgt.f b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/unwgt.f index f602511c94..d1247f1849 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/unwgt.f +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/unwgt.f @@ -497,6 +497,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer ip, np, ic, nc integer ida(2),ito(-nexternal+3:nexternal),ns,nres,ires,icloop integer iseed + double precision beam_mass double precision pboost(0:3) double precision beta, get_betaz double precision ebi(0:3), ebo(0:3) @@ -506,7 +507,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer idup(nexternal,maxproc,maxsproc) integer mothup(2,nexternal) integer icolup(2,nexternal,maxflow,maxsproc) - + double precision eta integer nsym integer ievent @@ -638,21 +639,20 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) if (nincoming.eq.2) then if (xbk(1) .gt. 0d0 .and. xbk(1) .le. 1d0 .and. $ xbk(2) .gt. 0d0 .and. xbk(2) .le. 1d0) then - if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0).and.xbk(2).ne.1d0) then - ! construct the beam momenta in each frame and compute the related (z)boost - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4).and.ebeam(1).gt.10d0*m1)then - local_mass = 0d0 - else - local_mass = m1 - endif + if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0)) then + if((abs(lpp(1)).gt.2.and.abs(lpp(1)).ne.9).or.xbk(1).eq.1d0)then + beam_mass = pmass(1) + else + beam_mass = m1 + endif ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(1) ebo(1) = 0 ebo(2) = 0 - ebo(3) = DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(1).eq.1d0) then pb(0,isym(1,jsym)) = ebo(0) @@ -668,20 +668,19 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo else - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4.and.ebeam(2).gt.10d0*m2))then - local_mass = 0d0 - else - local_mass = m2 - endif - ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam + if((abs(lpp(2)).gt.2.and.abs(lpp(2)).ne.9).or.xbk(2).eq.1d0)then + beam_mass = pmass(2) + else + beam_mass = m2 + endif ebi(0) = p(0,2)/xbk(2) ! 
this assumes that particle 2 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = -1d0*DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = -1d0*DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(2) ebo(1) = 0 ebo(2) = 0 - ebo(3) = -1d0*DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = -1d0*DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(2).eq.1d0) then pb(0,isym(2,jsym)) = ebo(0) @@ -701,6 +700,21 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) write(*,*) 'Warning bad x1 or x2 in write_leshouche', $ xbk(1),xbk(2) endif + do j=1,nexternal + call zboost_with_beta(p(0,j),beta,pb(0,isym(j,jsym))) + pb(4,isym(j,jsym))=pmass(j) + enddo + + ! check for numerical_accuracy + if (pb(0,1).gt.ebeam(1).or.pb(0,2).gt.ebeam(2))then + ! go back to old method --more accurate when boosting with xbk close to one-- + eta = sqrt(xbk(1)*ebeam(1)/(xbk(2)*ebeam(2))) + pboost(0)=p(0,1)*(eta + 1d0/eta) + pboost(3)=p(0,1)*(eta - 1d0/eta) + do j=1,nexternal + call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) + enddo + endif else do j=1,nexternal call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) @@ -709,6 +723,8 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo endif + + if (IMIRROR.eq.2.and.pmass(1).ne.pmass(2)) then c Note that in this context isym(1,jsym) should never be "2" since the mass differ pb(4,isym(1,jsym))=pmass(2) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/Gridpack/gridrun b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/Gridpack/gridrun index 8c8f7d3940..01d4ab53f5 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/Gridpack/gridrun +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/Gridpack/gridrun @@ -91,7 +91,7 @@ import internal.madevent_interface as cmd_interface try: - cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2]) + cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2], nprocs=args[3], maxevts=args[4]) except KeyboardInterrupt: print('Quit on KeyboardInterrupt') diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/Gridpack/run.sh b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/Gridpack/run.sh index 20adf572c2..2d149f96be 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/Gridpack/run.sh +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/Gridpack/run.sh @@ -14,6 +14,18 @@ # USAGE : run [num_events] [iseed] ## ############################################################################# +function usage() { + local retcode="${1:-1}" # default return code is 1 + echo "Usage:" + echo " run.sh [options] [num events] [seed]" + echo " run.sh [options] [num events] [seed] [granularity]" + echo "Options:" + echo " -h, --help print this message and exit" + echo " -p, --parallel [num procs] number of processes to run in parallel" + echo " -m, --maxevts [num events] maximum number of unweighted events per job" + exit $retcode +} + if [[ -d ./madevent ]]; then DIR='./madevent' else @@ -32,23 +44,46 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib # For Mac OS X export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib +pos_args=() +nprocs=1 +maxevts=2500 -if [[ ($1 != "") && ("$2" != "") && ("$3" == "") ]]; then - num_events=$1 - seed=$2 - gran=1 -elif [[ ($1 != "") && ("$2" != "") && ("$3" != "") ]]; then - num_events=$1 - seed=$2 - gran=$3 -else - echo "Warning: input is not correct. 
script requires two arguments: NB_EVENT SEED" -fi +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage 0 ;; + -p|--parallel) + nprocs="$2" && shift && shift ;; + -m|--maxevts) + maxevts="$2" && shift && shift ;; + -*) + echo "Error: Unknown option $1" && usage ;; + *) + pos_args+=("$1") && shift ;; + esac +done + +case `echo "${pos_args[@]}" | wc -w | tr -d " "` in + "2") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=1 + ;; + "3") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=${pos_args[2]} + ;; + *) + echo "Error: number of arguments is not correct" + usage + ;; +esac -echo "Now generating $num_events events with random seed $seed and granularity $gran" +echo "Now generating $num_events events with random seed $seed and granularity $gran using $nprocs processes" ############ RUN THE PYTHON CODE ##################### -${DIR}/bin/gridrun $num_events $seed $gran +${DIR}/bin/gridrun $num_events $seed $gran $nprocs $maxevts ######################################################## ########### POSTPROCESSING ##################### diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/banner.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/banner.py index 42d82818d0..2efb5954a6 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/banner.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/banner.py @@ -353,7 +353,7 @@ def modify_init_cross(self, cross, allow_zero=False): assert "init" in self cross = dict(cross) - for key in cross.keys(): + for key in list(cross.keys()): if isinstance(key, str) and key.isdigit() and int(key) not in cross: cross[int(key)] = cross[key] @@ -1991,6 +1991,11 @@ def default_setup(self): self.add_param("PartonLevel:FSRinResonances", True, hidden=True, always_write_to_card=False, comment="Do not allow shower to run from decay product of unstable particle") self.add_param("ProcessLevel:resonanceDecays", True, hidden=True, always_write_to_card=False, comment="Do not allow unstable particle to decay.") + # Parameters only needed for main164 type of run (not pythia8/MG5 interface) + self.add_param("Main:HepMC", True, hidden=True, always_write_to_card=False, + comment="""Specify the type of output to be used by the main164 run. """) + self.add_param("HepMC:output", 'hepmc.gz', hidden=True, always_write_to_card=False, + comment="Specify the HepMC output file to be used by the main164 run.") # Add parameters controlling the subruns execution flow. # These parameters should not be part of PY8SubRun daughter. self.add_default_subruns('parameters') @@ -2087,8 +2092,10 @@ def MadGraphSet(self, name, value, **opts): force = False if name.lower() not in self or (force or name.lower() not in self.user_set): self.__setitem__(name, value, change_userdefine=False, **opts) - self.system_set.add(name.lower()) - + self.system_set.add(name.lower()) + else: + raise Exception("The parameter %s is already set to %s. You can not change it." 
% (name, self[name]))
+
     def defaultSet(self, name, value, **opts):
         self.__setitem__(name, value, change_userdefine=False, **opts)
@@ -2144,9 +2151,19 @@ def pythia8_formatting(value, formatv=None):
             else:
                 return ','.join([PY8Card.pythia8_formatting(arg) for arg in value])

+    # change of name convention between the old MG5 interface and Pythia8's main164
+    interface_to_164 = {'HEPMCoutput:file': 'HepMC:output',
+                        'SysCalc:fullCutVariation': '!SysCalc:fullCutVariation (not supported with 164)',
+                        'SysCalc:qCutList': '!SysCalc:qCutList (not supported with 164)',
+                        'SysCalc:qWeed': '!SysCalc:qWeed (not supported with 164)',
+                        'SysCalc:tmsList': '!SysCalc:tmsList (not supported with 164)',
+                        'HEPMCoutput:scaling' : '!HEPMCoutput:scaling (not supported with 164)',
+                        'LHEFInputs:nSubruns' : 'Main:numberOfSubruns'}
+
     def write(self, output_file, template, read_subrun=False,
-                    print_only_visible=False, direct_pythia_input=False, add_missing=True):
+                    print_only_visible=False, direct_pythia_input=False, add_missing=True,
+                    use_mg5amc_py8_interface=False):
         """ Write the card to output_file using a specific template.
             > 'print_only_visible' specifies whether or not the hidden parameters
               should be written out if they are in the hidden_params_to_always_write
@@ -2155,7 +2172,12 @@ def write(self, output_file, template, read_subrun=False,
               in the self.visible_params_to_always_write list and are not user_set
               or system_set are commented.
             > If 'add_missing' is False then parameters that should be written_out but are absent
-            from the template will not be written out."""
+            from the template will not be written out.
+            > use_mg5amc_py8_interface indicates whether the MG5aMC-PY8 interface is used;
+              if it is not, some parameters need to be translated from the old convention to the new one
+        """
+
+        self.use_mg5amc_py8_interface = use_mg5amc_py8_interface

         # First list the visible parameters
         visible_param = [p for p in self if p.lower() not in self.hidden_param
@@ -2297,7 +2319,16 @@ def group_params(params):
                 else:
                     # Just copy parameters which don't need to be specified
                     if param.lower() not in self.params_to_never_write:
-                        output.write(line)
+
+                        if not use_mg5amc_py8_interface and direct_pythia_input and \
+                           param in self.interface_to_164:
+                            param_entry = self.interface_to_164[param.strip()]
+                            # special case for HepMC needs two flags
+                            if 'HepMC:output' == param_entry:
+                                output.write(' %s=%s\n'%('Main:HepMC', 'on'))
+                            output.write('%s=%s\n'%(param_entry,new_value))
+                        else:
+                            output.write(line)
                     else:
                         output.write('! The following parameter was forced to be commented out by MG5aMC.\n')
                         output.write('! 
%s'%line) @@ -2313,6 +2344,7 @@ def group_params(params): if ((not direct_pythia_input) or (param.lower() in self.visible_params_to_always_write) or (param.lower() in self.user_set) or + (param.lower() in self.hidden_params_to_always_write) or (param.lower() in self.system_set)): template = '%s=%s' else: @@ -2321,6 +2353,19 @@ def group_params(params): # then they shouldn't be passed to Pythia template = '!%s=%s' + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + if 'Main:InternalAnalysis'.lower() in self.user_set and \ + self['Main:InternalAnalysis'].lower() == 'on': + output.write('InternalAnalysis:output = ./djrs.dat\n') + + #elif param in self.interface_to_164.values() and not direct_pythia_input: + # misc.sprint(use_mg5amc_py8_interface, direct_pythia_input,param) + # raise Exception('The parameter %s is not supported in the MG5aMC-PY8 interface. Please use the new interface.'%param_entry output.write(template%(param_entry, value_entry.replace(value,new_value))) @@ -2365,6 +2410,8 @@ def group_params(params): comment = '\n'.join('! %s'%c for c in self.comments[param.lower()].split('\n')) output.write(comment+'\n') + if not use_mg5amc_py8_interface and param in self.interface_to_164: + continue output.write('%s=%s\n'%(param,PY8Card.pythia8_formatting(self[param]))) # Don't close the file if we were reading a subrun, but simply write @@ -3318,7 +3365,6 @@ def retro_compatible_custom_fct(lines, mode=None): for i,line in enumerate(lines[:]): if search and re.search(include_pat, line): name = re.findall(include_pat, line)[0] - misc.sprint('DETECTED INCLUDE', name) if 'vector.inc' in name: search = False if 'run.inc' in name: @@ -3326,7 +3372,6 @@ def retro_compatible_custom_fct(lines, mode=None): search = False sol.append(line) if re.search(function_pat, line): - misc.sprint("DETECTED FCT") search = True return sol @@ -4050,8 +4095,8 @@ def post_set_fixed_fac_scale(card, value, change_userdefine, raiseerror, **opt): if 'fixed_fac_scale2' in card.user_set: card.user_set.remove('fixed_fac_scale2') - # #card['pdlabel1'] = value - # #card['pdlabel2'] = value + dict.__setitem__(card, 'fixed_fac_scale1', card['fixed_fac_scale']) + dict.__setitem__(card, 'fixed_fac_scale2', card['fixed_fac_scale']) @staticmethod def post_set(card, value, change_userdefine, raiseerror, name='unknown', **opt): @@ -4201,6 +4246,7 @@ def default_setup(self): self.add_param("bwcutoff", 15.0) self.add_param("cut_decays", False, cut='d') self.add_param('dsqrt_shat',0., cut=True) + self.add_param('dsqrt_shatmax', -1, cut=True) self.add_param("nhel", 0, include=False) self.add_param("limhel", 1e-8, hidden=True, comment="threshold to determine if an helicity contributes when not MC over helicity.") #pt cut @@ -4451,11 +4497,11 @@ def check_validity(self): time.sleep(5) if self['drjj'] != 0: if 'drjj' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjj\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjj\' to 0') self['drjj'] = 0 if self['drjl'] != 0: if 'drjl' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjl\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjl\' to 0') self['drjl'] = 0 if not self['auto_ptj_mjj']: if self['mmjj'] > self['xqcut']: @@ -4753,7 +4799,6 @@ def create_default_for_process(self, 
proc_characteristic, history, proc_def): self['fixed_fac_scale1'] = True self['nhel'] = 1 for i in beam_id_split[1]: - exit if abs(i) == 11: self['lpp1'] = -math.copysign(3,i) self['lpp2'] = math.copysign(3,i) @@ -5577,6 +5622,9 @@ def default_setup(self): #technical self.add_param('folding', [1,1,1], include=False) + + #bias + self.add_param('flavour_bias',[5,1], hidden=True, comment="Example: '5,100' means that the probability to generate an event with a bottom (or anti-bottom) quark is increased by a factor 100, but the weight of those events is reduced by a factor 100. Requires that the 'event_norm' is set to 'bias'.") #merging self.add_param('ickkw', 0, allowed=[-1,0,3,4], comment=" - 0: No merging\n - 3: FxFx Merging : http://amcatnlo.cern.ch/FxFx_merging.htm\n - 4: UNLOPS merging (No interface within MG5aMC)\n - -1: NNLL+NLO jet-veto computation. See arxiv:1412.8408 [hep-ph]") @@ -5790,6 +5838,17 @@ def check_validity(self): if self['mcatnlo_delta'] and not self['parton_shower'].lower() == 'pythia8': raise InvalidRunCard("MC@NLO-DELTA only possible with matching to Pythia8") + # check that the flavour_bias is consistent + if len(self['flavour_bias']) != 2: + raise InvalidRunCard("'flavour_bias' should contain exactly two numbers: the abs(PDG) of the flavour to enhance, and the enhancement multiplication factor.") + for i in self['flavour_bias']: + if i < 0: + raise InvalidRunCard("flavour and multiplication factor should be positive in the flavour_bias parameter") + if self['flavour_bias'][1] != 1 and self['event_norm'] != 'bias': + logger.warning('Non-trivial flavour enhancement factor: setting event normalisation to "bias"') + self['event_norm']='bias' + + # check that ebeam is bigger than the proton mass. for i in [1,2]: # do not for proton mass if not proton PDF (or when scan initialization) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/check_param_card.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/check_param_card.py index bc785b5de6..a34705f6bc 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/check_param_card.py @@ -1092,11 +1092,11 @@ def write_summary(self, path, order=None, lastline=False, nbcol=20): to_print = self.cross[-1:] for info in to_print: name = info['run_name'] - bench = info['bench'] + bench = [float(x) for x in info['bench']] data = [] for k in keys: if k in info: - data.append(info[k]) + data.append(float(info[k])) else: data.append(0.) 
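                # note: the float() casts above are defensive: the 'bench' and
                # cross-section entries may have been read back as strings, which
                # would otherwise break the numeric formatting in the write below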
ff.write(formatting % tuple([name] + bench + data)) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/cluster.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/cluster.py index 95ef45b5f3..66069293d2 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/cluster.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/cluster.py @@ -602,6 +602,7 @@ def __init__(self, *args, **opt): self.submitted = six.moves.queue.Queue() # one entry by job submitted self.stoprequest = threading.Event() #flag to ensure everything to close self.demons = [] + self.gpus_list = [] self.nb_done =0 if 'nb_core' in opt: self.nb_core = opt['nb_core'] @@ -623,23 +624,46 @@ def __init__(self, *args, **opt): self.done_pid_queue = six.moves.queue.Queue() self.fail_msg = None + mg5_gpu_env_str = 'MG5_GPU_VISIBLE_DEVICES' + gpu_variables = [['NVIDIA_VISIBLE_DEVICES', 'CUDA_VISIBLE_DEVICES'], + ['ROCR_VISIBLE_DEVICES', 'HIP_VISIBLE_DEVICES'],] + if mg5_gpu_env_str in os.environ: + new_var = os.environ[mg5_gpu_env_str].split(',') + if len(new_var) == 2: + gpu_variables.insert(0, new_var) + else: + logger.error('Invalid format for %s=%s, it should be a comma-separated list of two elements' % (mg5_gpu_env_str, os.environ[mg5_gpu_env_str])) + for get_var,set_var in gpu_variables: + if get_var in os.environ: + self.gpus_list = os.environ.get(get_var).split(',') + self.gpu_set_var = set_var + self.gpus_count = len(self.gpus_list) + logger.info('Found %s GPUs: %s' % (self.gpus_count, self.gpus_list)) def start_demon(self): import threading - t = threading.Thread(target=self.worker) + env2 = None + if len(self.gpus_list): + env2 = os.environ.copy() + this_gpu_idx = len(self.demons) % self.gpus_count + env2[self.gpu_set_var] = self.gpus_list[this_gpu_idx] + t = threading.Thread(target=self.worker, kwargs={'env2': env2}) + else: + t = threading.Thread(target=self.worker) t.daemon = True t.start() self.demons.append(t) - def worker(self): + def worker(self, env2=None): import six.moves.queue import six.moves._thread while not self.stoprequest.isSet(): try: args = self.queue.get(timeout=10) tag, exe, arg, opt = args + opt['env'] = env2 try: # check for executable case if isinstance(exe,str): diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/common_run_interface.py index 9ff7390cf5..69291df0d4 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/common_run_interface.py @@ -750,8 +750,8 @@ def __init__(self, me_dir, options, *args, **opts): else: self.ninitial = self.proc_characteristics['ninitial'] - def make_make_all_html_results(self, folder_names = [], jobs=[]): - return sum_html.make_all_html_results(self, folder_names, jobs) + def make_make_all_html_results(self, folder_names = [], jobs=[], get_attr=None): + return sum_html.make_all_html_results(self, folder_names, jobs, get_attr) def write_RunWeb(self, me_dir): @@ -1463,11 +1463,15 @@ def create_plot(self, mode='parton', event_path=None, output=None, tag=None): self.run_name, '%s_pts.dat' % tag) for observable_name, data_path in [('djr',djr_path), ('pt',pt_path)]: - if not self.generate_Pythia8_HwU_plots( + try: + if not self.generate_Pythia8_HwU_plots( PY8_plots_root_path, merging_scale_name, observable_name,data_path): - return False - + return False + except Exception as error: + if os.path.exists(data_path): + logger.info('plot information present in %s' % data_path) + return True if mode == 'Pythia8': plot_files = 
glob.glob(pjoin(PY8_plots_root_path,'*.gnuplot')) if not misc.which('gnuplot'): @@ -1964,12 +1968,16 @@ def do_systematics(self, line): self.cluster.wait(os.path.dirname(output), update_status, update_first=update_status) except Exception: self.cluster.remove() + for i in range(nb_submit): + os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) old_run_mode = self.options['run_mode'] self.options['run_mode'] =0 + out =False try: out = self.do_systematics(line) finally: self.options['run_mode'] = old_run_mode + return out #collect the data all_cross = [] for i in range(nb_submit): @@ -1995,18 +2003,21 @@ def do_systematics(self, line): self.run_card['event_norm'] in ['unity']: all_cross= [cross/nb_event for cross in all_cross] - sys_obj = systematics.call_systematics([input, None] + opts, - log=lambda x: logger.info(str(x)), - result=result_file, - running=False - ) + + sys_obj = systematics.call_systematics([input, None] + opts, + log=lambda x: logger.info(str(x)), + result=result_file, + running=False + ) + sys_obj.print_cross_sections(all_cross, nb_event, result_file) - + #concatenate the output file subprocess.call(['cat']+\ ['./tmp_%s_%s' % (i, os.path.basename(output)) for i in range(nb_submit)], stdout=open(output,'w'), cwd=os.path.dirname(output)) + for i in range(nb_submit): os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) # os.remove('%s/log_sys_%s.txt' % (os.path.dirname(output),i)) @@ -3831,7 +3842,7 @@ def store_scan_result(self): """return the information that need to be kept for the scan summary. Auto-width are automatically added.""" - return {'cross': self.results.current['cross']} + return {'cross': self.results.current['cross'], 'error': self.results.current['error']} def add_error_log_in_html(self, errortype=None): @@ -5135,10 +5146,10 @@ def init_run(self, cards): self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), - 'lhc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), - 'lcc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), @@ -6740,7 +6751,15 @@ def postcmd(self, stop, line): return ending_question - + def help_update(self): + logger.info(""" syntax: update dependent: Change the mass/width of particles which are not free parameter for the model. + update missing: add to the current param_card missing blocks/parameters. + update to_slha1: pass SLHA2 card to SLHA1 convention. (beta) + update to_slha2: pass SLHA1 card to SLHA2 convention. 
(beta)
+    update to_full [run_card]
+    update XXX [where XXX corresponds to a hidden block of the run_card]:
+           supported blocks are %s
+    """, ', '.join(self.update_block))

     def do_update(self, line, timer=0):
@@ -6756,6 +6775,8 @@ def do_update(self, line, timer=0):
             logger.warning('miss an argument (dependent or missing). Please retry')
             return

+        args[0] = args[0].lower()
+
         if args[0] == 'dependent':
             if not self.mother_interface:
                 logger.warning('Failed to update dependent parameter. This might create trouble for external program (like MadSpin/shower/...)')
@@ -6805,10 +6826,11 @@ def do_update(self, line, timer=0):
                 self.modified_card.add('run') # delay writting of the run_card
                 logger.info('add optional block %s to the run_card', args[0])
             else:
-                self.help_update()
+                self.do_help('update')
                 logger.warning('unvalid options for update command. Please retry')

+
     def update_to_full(self, line):
         """ trigger via update to_full LINE"""

diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/extended_cmd.py
index 789976beee..c321fd88e5 100755
--- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/extended_cmd.py
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/extended_cmd.py
@@ -1317,6 +1317,8 @@ def nice_error_handling(self, error, line):
         debug_file = open(self.debug_output, 'a')
         traceback.print_exc(file=debug_file)
+        if __debug__:
+            traceback.print_exc()
         if hasattr(error, 'filename'):
             debug_file.write("Related File: %s\n" % error.filename)
         # Create a nice error output
@@ -1928,7 +1930,8 @@ def do_display(self, line, output=sys.stdout):
                 for i, name in enumerate(split):
                     try:
                         __import__('.'.join(split[:i+1]))
-                        exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1])))
+                        tmp = {}
+                        exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1])), globals(),tmp)
                     except ImportError:
                         try:
                             var = eval(args[1])
@@ -1939,7 +1942,7 @@ def do_display(self, line, output=sys.stdout):
                             outstr += 'EXTERNAL:\n'
                             outstr += misc.nice_representation(var, nb_space=4)
                 else:
-                    var = eval(args[1])
+                    var = eval(args[1], globals(), tmp)
                     outstr += 'EXTERNAL:\n'
                     outstr += misc.nice_representation(var, nb_space=4)

diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/file_writers.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/file_writers.py
index 526756129f..74ba0d195c 100755
--- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/file_writers.py
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/file_writers.py
@@ -140,10 +140,6 @@ def preprocess_template(self, input_lines, context={}):
         else:
             raise self.FileWriterError("%s not string" % repr(input_lines))

-        # Setup the contextual environment
-        for contextual_variable, value in context.items():
-            exec('%s=%s'%(str(contextual_variable),repr(value)))
-
         res = []
         # The variable below tracks the conditional statements structure
         if_stack = []
@@ -166,7 +162,7 @@ def preprocess_template(self, input_lines, context={}):
             # Treat an if statement
             elif preproc_command.group('command')=='if':
                 try:
-                    if_stack.append(eval(preproc_command.group('body'))==True)
+                    if_stack.append(eval(preproc_command.group('body'), globals(), context)==True)
                 except Exception as e:
                     raise self.FilePreProcessingError('Could not evaluate'+\
                         "python expression '%s' given the context %s provided."%\

diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/files.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/files.py
index 551b71ddb6..3061b007e7 100755
--- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/files.py
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/files.py
@@ -147,9 +147,14 @@ def cp(path1, path2, log=True, error=False):
     path2 = format_path(path2)
     try:
         shutil.copy(path1, path2)
+    except shutil.Error as why:
+        logger.debug('no cp since identical: %s', why)
+        return
     except IOError as why:
         import madgraph.various.misc as misc
         try:
+            if 'same file' in str(why):
+                return
             if os.path.exists(path2):
                 path2 = os.path.join(path2, os.path.split(path1)[1])
             misc.copytree(path1, path2)
@@ -157,12 +162,10 @@ def cp(path1, path2, log=True, error=False):
             if error:
                 raise
             if log:
-                logger.warning(why)
+                logger.warning("fail to cp %s %s %s", path1, path2, why)
             else:
-                misc.sprint("fail to cp", why)
-        except shutil.Error:
-            # idetical file
-            pass
+                misc.sprint("fail to cp",path1,path2, why)
+

 def rm(path, log=True):
     """removes path, that can be a single element or a list"""

diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_cardhtml-pl b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_cardhtml-pl
index 1810c6c082..6e0e06533d 100755
--- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_cardhtml-pl
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_cardhtml-pl
@@ -137,7 +137,7 @@ until($listpos>$#incard){
 print PAGE " Model: $model \n";
 print PAGE " \n \n
\n"; print PAGE " \n"; - print PAGE "\"\" \n"; + print PAGE "\"\" \n"; print PAGE "
\n"; print PAGE " \n \n \n"; print PAGE " \n"; diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_crossxhtml.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_crossxhtml.py index 681bf9d09b..3114a4350c 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_crossxhtml.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_crossxhtml.py @@ -133,7 +133,7 @@ class AllResults(dict): web = False - _run_entries = ['cross', 'error','nb_event_pythia','run_mode','run_statistics', + _run_entries = ['cross', 'error','axsec','nb_event_pythia','run_mode','run_statistics', 'nb_event','cross_pythia','error_pythia', 'nb_event_pythia8','cross_pythia8','error_pythia8', 'shower_dir'] diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_jpeg-pl b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_jpeg-pl index 87d03da394..31b7e9fe55 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_jpeg-pl +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_jpeg-pl @@ -1,16 +1,16 @@ #!/usr/bin/perl -w #--------------------------------------------------------------------- -# Run GS to create jpeg files defined as $gs +# Run GS to create PNG files defined as $gs #--------------------------------------------------------------------- -system("/bin/bash -c \"rm -f matrix*.jpg\" "); +system("/bin/bash -c \"rm -f matrix*.png\" "); $imatrix = ""; if (! -e "matrix.ps") {$imatrix = 1;} -$max_jpg = 2; -if ($imatrix eq "") {$max_jpg = 5;} -# add 1 to max_jpg, to get max_jpg pages -$max_jpg += 1; +$max_png = 2; +if ($imatrix eq "") {$max_png = 5;} +# add 1 to max_png, to get max_png pages +$max_png += 1; open(PAGE,"> diagrams.html") || die "Error creating diagrams.html"; print PAGE "\ \n"; print PAGE "\ \n"; @@ -21,22 +21,22 @@ while ( -e "matrix$imatrix.ps"){ open(IN, "< matrix$imatrix.ps") || die "No file matrix$imatrix.ps"; open(OUT, "> matrix-1.ps") || die "Could not open file matrix-1.ps"; while () { - if ($_ =~ m/^%%Page: $max_jpg $max_jpg/) {last;} + if ($_ =~ m/^%%Page: $max_png $max_png/) {last;} else {print OUT $_, "\n";} } close(OUT); close(IN); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=matrix$imatrix\%00d.jpg \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-r150 \-sOutputFile\=matrix$imatrix\%00d.png \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; system "rm -f matrix-1.ps"; -# Determine how many jpg files we have +# Determine how many png files we have $pages=1; - while(-e "matrix$imatrix$pages.jpg"){ + while(-e "matrix$imatrix$pages.png"){ $pages++; }#end of while #reduce it by one - if ($pages > $max_jpg){ + if ($pages > $max_png){ $pages -= 1; } # Find name of process @@ -45,24 +45,24 @@ while ( -e "matrix$imatrix.ps"){ if ($proc =~ /Process: (.+?)(\s\w+=\d+)*$/) { $proc = $1; } print PAGE "

To save bandwidth not all diagrams were converted to jpeg."; + if (-e "matrix$imatrix$max_png.png" ) { + print PAGE "

To save bandwidth not all diagrams were converted to PNG."; print PAGE "

To view all diagrams click on "; print PAGE "\ postscript. \<\/A\> \ \n"; # # Delete files which aren't included in diagrams.html # - system ("/bin/bash -c \"rm -f matrix$max_jpg.jpg\" "); + system ("/bin/bash -c \"rm -f matrix$max_png.png\" "); } # -# Now create jpeg file for card +# Now create PNG file for card # - if (! -e "../../HTML/card.jpg") { + if (! -e "../../HTML/card.png") { system ("/bin/bash -c \"head -352 matrix$imatrix.ps >& junk.ps\" "); open(JUNK,">> junk.ps") || die "Error opening junk.ps"; @@ -72,7 +72,7 @@ while ( -e "matrix$imatrix.ps"){ system ("/bin/bash -c \"cat matrix$imatrix.ps | sed 1,352d >> junk.ps\" "); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=card.jpg \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.jpg ../../HTML/card.jpg > /dev/null\" "; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-sOutputFile\=card.png \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.png ../../HTML/card.png > /dev/null\" "; } if ($imatrix eq "") {$imatrix = 0;} $imatrix = $imatrix + 1; @@ -82,3 +82,4 @@ print PAGE "\n"; print PAGE "\<\/BODY\> \n"; print PAGE "\<\/HTML\> \n"; close(PAGE); + diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_ximprove.py index 415ecc9de0..d5d7fc8faf 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/gen_ximprove.py @@ -30,6 +30,7 @@ import stat import sys import six +import time from six.moves import range from six.moves import zip @@ -304,6 +305,7 @@ def get_helicity(self, to_submit=True, clean=True): logger.debug('(%s) nb_hel: %s zero amp: %s bad_amps_hel: %s/%s', split_file[-1], len(good_hels),len(bad_amps),len(bad_amps_perhel), len(good_hels)*nb_amp ) if len(good_hels) == 1: files.cp(matrix_file, matrix_file.replace('orig','optim')) + files.cp(matrix_file.replace('.f','.o'), matrix_file.replace('orig','optim').replace('.f','.o')) continue # avoid optimization if onlye one helicity gauge = self.cmd.proc_characteristics['gauge'] @@ -1059,6 +1061,7 @@ def __init__(self, cmd, opt=None): # parameter for the gridpack run self.nreq = 2000 self.iseed = 4321 + self.maxevts = 2500 # placeholder for information self.results = 0 #updated in launch/update_html @@ -1200,6 +1203,10 @@ def reset_multijob(self): def write_multijob(self, Channel, nb_split): """ """ if nb_split <=1: + try: + os.remove(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat')) + except OSError: + pass return f = open(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat'), 'w') f.write('%i\n' % nb_split) @@ -1828,17 +1835,17 @@ class gen_ximprove_gridpack(gen_ximprove_v4): max_request_event = 1e12 # split jobs if a channel if it needs more than that max_event_in_iter = 4000 min_event_in_iter = 500 - combining_job = sys.maxsize gen_events_security = 1.00 - def __new__(cls, *args, **opts): + def __new__(cls, cmd, opts): cls.force_class = 'gridpack' - return super(gen_ximprove_gridpack, cls).__new__(cls, *args, **opts) + return super(gen_ximprove_gridpack, cls).__new__(cls, cmd, opts) - def __init__(self, *args, **opts): + def __init__(self, cmd, opts): self.ngran = -1 + self.nprocs = 1 self.gscalefact = {} self.readonly = False if 'ngran' in opts: @@ -1846,9 +1853,18 @@ def __init__(self, *args, **opts): # del opts['ngran'] if 'readonly' in opts: self.readonly = opts['readonly'] - super(gen_ximprove_gridpack,self).__init__(*args, **opts) + if 
'nprocs' in opts: + self.nprocs = int(opts['nprocs']) + if 'maxevts' in opts and self.nprocs > 1: + self.max_request_event = int(opts['maxevts']) + super(gen_ximprove_gridpack,self).__init__(cmd, opts) if self.ngran == -1: self.ngran = 1 + + if self.nprocs > 1: + self.combining_job = 0 + else: + self.combining_job = sys.maxsize def find_job_for_event(self): """return the list of channel that need to be improved""" @@ -1876,8 +1892,8 @@ def find_job_for_event(self): continue # no event to generate events self.gscalefact[tag] = max(1, 1/(goal_lum * C.get('axsec')/ self.ngran)) #need to generate events - logger.debug('request events for ', C.get('name'), 'cross=', - C.get('axsec'), 'needed events = ', goal_lum * C.get('axsec')) + logger.debug('request events for %s cross=%d needed events = %d', + C.get('name'), C.get('axsec'), goal_lum * C.get('axsec')) to_refine.append(C) logger.info('need to improve %s channels' % len(to_refine)) @@ -1897,8 +1913,13 @@ def get_job_for_event(self): for C in to_refine: #1. Compute the number of points are needed to reach target needed_event = max(goal_lum*C.get('axsec'), self.ngran) - nb_split = 1 - + nb_split = int(max(1,((needed_event-1)// self.max_request_event) +1)) + if not self.split_channels: + nb_split = 1 + if nb_split > self.max_splitting: + nb_split = self.max_splitting + nb_split=max(1, nb_split) + #2. estimate how many points we need in each iteration if C.get('nunwgt') > 0: nevents = needed_event / nb_split * (C.get('nevents') / C.get('nunwgt')) @@ -1908,13 +1929,16 @@ def get_job_for_event(self): nevents = self.max_event_in_iter if nevents < self.min_event_in_iter: + nb_split = int(nb_split * nevents / self.min_event_in_iter) + 1 # sr dangerous? nevents = self.min_event_in_iter # # forbid too low/too large value nevents = max(self.min_event_in_iter, min(self.max_event_in_iter, nevents)) logger.debug("%s : need %s event. Need %s split job of %s points", C.name, needed_event, nb_split, nevents) - + # write the multi-job information + self.write_multijob(C, nb_split) + #create the info dict assume no splitting for the default info = {'name': self.cmd.results.current['run_name'], 'script_name': 'unknown', @@ -1925,7 +1949,7 @@ def get_job_for_event(self): 'nevents': nevents, #int(nevents*self.gen_events_security)+1, 'maxiter': self.max_iter, 'miniter': self.min_iter, - 'precision': -1*int(needed_event)/C.get('axsec'), + 'precision': -goal_lum/nb_split, # -1*int(needed_event)/C.get('axsec'), 'requested_event': needed_event, 'nhel': self.run_card['nhel'], 'channel': C.name.replace('G',''), @@ -1938,27 +1962,59 @@ def get_job_for_event(self): basedir = pjoin(os.path.dirname(__file__), '..','..','SubProcesses', info['P_dir'], info['directory']) info['base_directory'] = basedir - jobs.append(info) - + if nb_split == 1: + jobs.append(info) + else: + for i in range(nb_split): + new_info = dict(info) + new_info['offset'] = i+1 + new_info['directory'] += self.alphabet[i % 26] + str((i+1)//26) + new_info['base_directory'] = info['directory'] + jobs.append(new_info) write_dir = '.' 
if self.readonly else None self.create_ajob(pjoin(self.me_dir, 'SubProcesses', 'refine.sh'), jobs, write_dir) + if self.nprocs > 1: + nprocs_cluster = cluster.MultiCore(nb_core=self.nprocs) + gridpack_start = time.time() + def gridpack_wait_monitoring(Idle, Running, Done): + if Idle+Running+Done == 0: + return + logger.info("Gridpack event generation: %s Idle, %s Running, %s Done [%s]" + % (Idle, Running, Done, misc.format_time(time.time()-gridpack_start))) + done = [] for j in jobs: - if j['P_dir'] in done: - continue - done.append(j['P_dir']) + if self.nprocs == 1: + if j['P_dir'] in done: + continue + done.append(j['P_dir']) + # Give a little status. Sometimes these jobs run very long, and having hours without any + # console output can be a bit frightening and make users think we are looping. + if len(done)%5==0: + logger.info(f"Working on job {len(done)} of {len(jobs)}") + # set the working directory path. pwd = pjoin(os.getcwd(),j['P_dir']) if self.readonly else pjoin(self.me_dir, 'SubProcesses', j['P_dir']) - exe = pjoin(pwd, 'ajob1') + exe = pjoin(pwd, j['script_name']) st = os.stat(exe) os.chmod(exe, st.st_mode | stat.S_IEXEC) # run the code\ - cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + if self.nprocs == 1: + cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + else: + nprocs_cluster.cluster_submit(exe, cwd=pwd, packet_member=j['packet']) write_dir = '.' if self.readonly else pjoin(self.me_dir, 'SubProcesses') + if self.nprocs > 1: + nprocs_cluster.wait(self.me_dir, gridpack_wait_monitoring) + + if self.readonly: + combine_runs.CombineRuns(write_dir) + else: + combine_runs.CombineRuns(self.me_dir) self.check_events(goal_lum, to_refine, jobs, write_dir) def check_events(self, goal_lum, to_refine, jobs, Sdir): diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/hel_recycle.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/hel_recycle.py index 1471de4bcb..978ba6575e 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/hel_recycle.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/hel_recycle.py @@ -550,7 +550,7 @@ def get_jamp_lines(self, line): def get_amp2_lines(self, line): if line.startswith(' DO I = 1, NCOLOR'): self.in_amp2 = False - elif not line.isspace(): + elif not line.isspace() and 'DENOM' not in line: self.template_dict['amp2_lines'] += f'{line[0:6]} {self.add_indices(line[6:])}' def prepare_bools(self): diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/histograms.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/histograms.py index 51ae2914fc..6fbdd98100 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/histograms.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/histograms.py @@ -1149,11 +1149,8 @@ def parse_one_histo_from_stream(self, stream, all_weight_header, boundaries = [0.0,0.0] for j, weight in \ enumerate(HwU.histo_bin_weight_re.finditer(line_bin)): - if (j == len(weight_header)): - continue - if j == len(all_weight_header): - raise HwU.ParseError("There is more bin weights"+\ - " specified than expected (%i)"%len(weight_header)) + #if (j == len(weight_header)): + # continue if selected_central_weight == all_weight_header[j]: bin_weights['central'] = float(weight.group('weight')) if all_weight_header[j] == 'boundary_xmin': @@ -1858,6 +1855,8 @@ def parse_histos_from_PY8_XML_stream(self, stream, run_id=None, # If merging cut is negative, then pick only the one of the central scale # If not specified, then take them all but use the PDF and scale weight # of the central merging_scale 
for the variation. + if not all_weights: + raise MadGraph5Error('No weights were found in the HwU XML source.') if merging_scale is None or merging_scale < 0.0: merging_scale_chosen = all_weights[2]['MERGING'] else: @@ -2480,14 +2479,14 @@ def get_main_central_plot_lines(HwU_name, block_position, color_index, # return [template_no_stat%rep_dic]+\ # ([template%rep_dic] if show_mc_uncertainties else []) - # The use of sqrt(-1) is just a trick to prevent the line to display + # The use of 1/0 is just a trick to prevent the line to display res = [] - rep_dic['data'] = '($3 < 0 ? sqrt(-1) : $3)' + rep_dic['data'] = '($3 < 0 ? 1/0 : $3)' res.append(template_no_stat%rep_dic) rep_dic['title'] = " title ''" if show_mc_uncertainties: res.append(template%rep_dic) - rep_dic['data'] = '($3 >= 0 ? sqrt(-1) : abs($3))' + rep_dic['data'] = '($3 >= 0 ? 1/0 : abs($3))' rep_dic['ls'] = ' ls %d'%(100+color_index) res.append(template_no_stat%rep_dic) if show_mc_uncertainties: @@ -2739,13 +2738,13 @@ def ratio_no_correlations(wgtsA, wgtsB): """#-- rendering subhistograms '%(subhistogram_type)s' %(unset label)s %(set_format_y)s +%(set_yscale)s set yrange [%(ymin).4e:%(ymax).4e] set origin %(origin_x).4e, %(origin_y).4e set size %(size_x).4e, %(size_y).4e set mytics %(mytics)d %(set_ytics)s %(set_format_x)s -%(set_yscale)s %(set_ylabel)s %(set_histo_label)s plot \\""" @@ -2878,7 +2877,7 @@ def ratio_no_correlations(wgtsA, wgtsB): # We decide to show uncertainties in the main plot only if they # are part of a monocolor band. Otherwise, they will only be - # shown in the first subplot. Notice that plotting 'sqrt(-1)' + # shown in the first subplot. Notice that plotting '1/0' # is just a trick so as to have only the key printed with no # line @@ -2890,7 +2889,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, scale variation'%title, band='scale' in use_band) else: uncertainty_plot_lines[-1]['scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] # And now PDF_variation if available if not PDF_var_pos is None and len(PDF_var_pos)>0: if 'pdf' in use_band: @@ -2899,7 +2898,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, PDF variation'%title, band='pdf' in use_band) else: uncertainty_plot_lines[-1]['pdf'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] # And now merging variation if available if not merging_var_pos is None and len(merging_var_pos)>0: if 'merging_scale' in use_band: @@ -2908,7 +2907,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, merging scale variation'%title, band='merging_scale' in use_band) else: uncertainty_plot_lines[-1]['merging_scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] # And now alpsfact variation if available if not alpsfact_var_pos is None and len(alpsfact_var_pos)>0: if 'alpsfact' in use_band: @@ -2917,7 +2916,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, alpsfact variation'%title, band='alpsfact' in use_band) else: uncertainty_plot_lines[-1]['alpsfact'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] # plot_lines.append( # "'%s' index %d using (($1+$2)/2):3 ls %d title '%s'"\ diff --git 
a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/launch_plugin.py index 0924927785..262d39a736 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/launch_plugin.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Aug 2023) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. import logging import os @@ -33,7 +33,7 @@ def compile(self, *args, **opts): if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') common_run_interface.CommonRunCmd.update_make_opts_full(path, - {'FPTYPE': self.run_card['floating_type'] }) + {'override FPTYPE': self.run_card['floating_type'] }) misc.sprint('FPTYPE checked') cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): @@ -76,7 +76,7 @@ def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + common_run_interface.CommonRunCmd.update_make_opts_full({'override FPTYPE': new_value}) else: raise Exception Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') @@ -133,7 +133,8 @@ def default_setup(self): super().default_setup() # change default value: self['cudacpp_backend'] = 'cuda' - self['vector_size'] = 16384 # already setup in default class (just change value) + self['vector_size'] = 32 # ZW: default to 32, might want to change to 64 to utilise AMD GPUs better as well # 16384 # already setup in default class (just change value) + self['nb_warp'] = 512 # number of warps per kernel call, for now setting to 16 384 / vector_size MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/lhe_parser.py index f6e47956cd..81d17f7cb1 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/lhe_parser.py @@ -1035,12 +1035,12 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): from_init = True if not from_init: - if group in grouped_cross: - grouped_cross[group] += self.allcross[i] - grouped_error[group] += self.error[i]**2 + if int(group) in grouped_cross: + grouped_cross[int(group)] += self.allcross[i] + grouped_error[int(group)] += self.error[i]**2 else: - grouped_cross[group] = self.allcross[i] - grouped_error[group] = self.error[i]**2 + grouped_cross[int(group)] = self.allcross[i] + grouped_error[int(group)] = self.error[i]**2 else: ban = banner_mod.Banner(ff.banner) for line in ban['init'].split('\n'): @@ -1048,11 +1048,11 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): if len(splitline)==4: cross, error, _, group = splitline if int(group) in grouped_cross: - grouped_cross[group] += float(cross) - grouped_error[group] += float(error)**2 + grouped_cross[int(group)] += float(cross) + grouped_error[int(group)] += float(error)**2 else: - 
grouped_cross[group] = float(cross) - grouped_error[group] = float(error)**2 + grouped_cross[int(group)] = float(cross) + grouped_error[int(group)] = float(error)**2 nb_group = len(grouped_cross) # compute the information for the first line @@ -1086,6 +1086,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): self.seek(0) if init_information["idbmup2"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup2"] = event[1].pdg self.seek(0) @@ -2166,10 +2168,13 @@ def check(self): abspz += abs(particle.pz) # check mass fourmass = FourMomentum(particle).mass - - if particle.mass and (abs(particle.mass) - fourmass)/ abs(particle.mass) > threshold: - raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) - + if particle.mass: + expected = (particle.E - math.sqrt(particle.E**2 -particle.mass**2))/particle.E + if expected > 1e-8: + mass_threshold = particle.E**2 - (particle.E-threshold)**2 + if (abs(particle.mass) - fourmass)/ mass_threshold > 5: + raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) + if E/absE > threshold: logger.critical(self) @@ -2953,8 +2958,8 @@ def pt(self): @property def pseudorapidity(self): - norm = math.sqrt(self.px**2 + self.py**2+self.pz**2) - return 0.5* math.log((norm - self.pz) / (norm + self.pz)) + norm = math.sqrt(self.px**2 + self.py**2 + self.pz**2) + return 0.5* math.log((norm + self.pz) / (norm - self.pz)) @property def rapidity(self): diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/madevent_interface.py index 85e5bcf5e3..4d5597c722 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/madevent_interface.py @@ -1171,10 +1171,10 @@ def check_survey(self, args, cmd='survey'): for opt,value in self._survey_options.items(): if arg.startswith('--%s=' % opt): exec('self.opts[\'%s\'] = %s(arg.split(\'=\')[-1])' % \ - (opt, value[0])) + (opt, value[0]), globals(), {'self':self, 'arg':arg}) arg = "" if arg != "": raise Exception - except Exception: + except Exception as error: self.help_survey() raise self.InvalidCmd('invalid %s argument'% arg) @@ -2827,10 +2827,10 @@ def print_results_in_shell(self, data): logger.info(" Nb of events after matching/merging : %d" % int(data['nb_event_pythia'])) if self.run_card['use_syst'] in self.true and \ (int(self.run_card['ickkw'])==1 or self.run_card['ktdurham']>0.0 - or self.run_card['ptlund']>0.0): + or self.run_card['ptlund']>0.0) and data['cross_pythia'] == -1: logger.info(" Notice that because Systematics computation is turned on, the merging did not veto events but modified their weights instead.\n"+\ " The resulting hepmc/stdhep file should therefore be use with those weights.") - else: + elif data['cross_pythia'] == -1: logger.info(" Nb of events after merging : %s" % data['nb_event_pythia']) logger.info(" " ) @@ -3055,6 +3055,7 @@ def do_multi_run(self, line): crossoversig = 0 inv_sq_err = 0 nb_event = 0 + madspin = False for i in range(nb_run): self.nb_refine = 0 self.exec_cmd('generate_events %s_%s -f' % (main_name, i), postcmd=False) @@ -3067,6 +3068,8 @@ def do_multi_run(self, line): inv_sq_err+=1.0/error**2 self.results[main_name][-1]['cross'] = crossoversig/inv_sq_err 
self.results[main_name][-1]['error'] = math.sqrt(1.0/inv_sq_err) + if 'decayed' in self.run_name: + madspin = True self.results.def_current(main_name) self.run_name = main_name self.update_status("Merging LHE files", level='parton') @@ -3074,9 +3077,12 @@ def do_multi_run(self, line): os.mkdir(pjoin(self.me_dir,'Events', self.run_name)) except Exception: pass - os.system('%(bin)s/merge.pl %(event)s/%(name)s_*/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' + + os.system('%(bin)s/merge.pl %(event)s/%(name)s_*%(madspin)s/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' % {'bin': self.dirbin, 'event': pjoin(self.me_dir,'Events'), - 'name': self.run_name}) + 'name': self.run_name, + 'madspin': '_decayed_*' if madspin else '' + }) eradir = self.options['exrootanalysis_path'] if eradir and misc.is_executable(pjoin(eradir,'ExRootLHEFConverter')): @@ -3656,9 +3662,11 @@ def do_refine(self, line): else: self.refine_mode = "new" - cross, error = self.make_make_all_html_results() + cross, error, across = self.make_make_all_html_results(get_attr=('xsec','xerru','axsec')) + self.results.add_detail('cross', cross) self.results.add_detail('error', error) + self.results.add_detail('axsec', across) self.results.add_detail('run_statistics', dict(self.results.get_detail('run_statistics'))) @@ -3757,6 +3765,8 @@ def split(a, n): k, m = divmod(len(a), n) return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + Gdirs = self.remove_empty_events(Gdirs) + partials_info = [] if len(Gdirs) >= max_G: start_unweight= time.perf_counter() @@ -3786,7 +3796,7 @@ def split(a, n): for i, local_G in enumerate(split(Gdirs, nb_chunk)): line = [pjoin(self.me_dir, "Events", self.run_name, "partials%d.lhe.gz" % i)] line.append(pjoin(self.me_dir, 'Events', self.run_name, '%s_%s_banner.txt' % (self.run_name, tag))) - line.append(str(self.results.current['cross'])) + line.append(str(self.results.current.get('axsec'))) line += local_G partials_info.append(self.do_combine_events_partial(' '.join(line), preprocess_only=True)) mycluster.submit(sys.executable, @@ -4223,7 +4233,7 @@ def mg5amc_py8_interface_consistency_warning(options): return None - def setup_Pythia8RunAndCard(self, PY8_Card, run_type): + def setup_Pythia8RunAndCard(self, PY8_Card, run_type, use_mg5amc_py8_interface): """ Setup the Pythia8 Run environment and card. In particular all the process and run specific parameters of the card are automatically set here. This function returns the path where HEPMC events will be output, if any.""" @@ -4338,10 +4348,10 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.systemSet('Beams:setProductionScalesFromLHEF',True) # Automatically set qWeed to xqcut if not defined by the user. 
- if PY8_Card['SysCalc:qWeed']==-1.0: + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qWeed']==-1.0: PY8_Card.MadGraphSet('SysCalc:qWeed',self.run_card['xqcut'], force=True) - if PY8_Card['SysCalc:qCutList']=='auto': + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qCutList']=='auto': if self.run_card['use_syst']: if self.run_card['sys_matchscale']=='auto': qcut = PY8_Card['JetMatching:qCut'] @@ -4368,7 +4378,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): # Specific MLM settings # PY8 should not implement the MLM veto since the driver should do it # if merging scale variation is turned on - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('JetMatching:doVeto',False) @@ -4444,7 +4454,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.MadGraphSet('SpaceShower:pTmaxMatch',1) PY8_Card.MadGraphSet('SpaceShower:rapidityOrder',False) # PY8 should not implement the CKKW veto since the driver should do it. - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('Merging:applyVeto',False) @@ -4516,6 +4526,12 @@ def do_pythia8(self, line): else: no_default = False + if '--old_interface' in args: + use_mg5amc_py8_interface = True + args.remove('--old_interface') + else: + use_mg5amc_py8_interface = False + if not self.run_name: self.check_pythia8(args) self.configure_directory(html_opening =False) @@ -4545,20 +4561,27 @@ def do_pythia8(self, line): #"Please use 'event_norm = average' in the run_card to avoid this problem.") - - if not self.options['mg5amc_py8_interface_path'] or not \ - os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface')): - raise self.InvalidCmd( -"""The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. -Please install this tool with the following MG5_aMC command: - MG5_aMC> install mg5amc_py8_interface_path""") + if use_mg5amc_py8_interface: + if not self.options['mg5amc_py8_interface_path'] or not \ + os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface')): + raise self.InvalidCmd( + """The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. + Please install this tool with the following MG5_aMC command: + MG5_aMC> install mg5amc_py8_interface_path""") + else: + pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface') + warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) + if warnings: + logger.warning(warnings) else: - pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface') - warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) - if warnings: - logger.warning(warnings) + pythia_main = pjoin(self.options['pythia8_path'], 'share', 'Pythia8', 'examples', 'main164') + if not os.path.exists(pythia_main): + pythia_main = pjoin(self.options['pythia8_path'], 'examples', 'main164') + if not os.path.exists(pythia_main): + logger.warning('main164 not found (or not compiled). 
Will try the old interface instead.') + return self.do_pythia8(line + ' --old_interface') self.results.add_detail('run_mode', 'madevent') @@ -4583,14 +4606,19 @@ def do_pythia8(self, line): run_type = 'CKKW' # Edit the card and run environment according to the run specification - HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type) + HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type, use_mg5amc_py8_interface=use_mg5amc_py8_interface) + + if not use_mg5amc_py8_interface and self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + PY8_Card['Main:numberOfEvents']= self.run_card['nevents'] + # Now write the card. pythia_cmd_card = pjoin(self.me_dir, 'Events', self.run_name , '%s_pythia8.cmd' % tag) cmd_card = StringIO.StringIO() PY8_Card.write(cmd_card,pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Now setup the preamble to make sure that everything will use the locally # installed tools (if present) even if the user did not add it to its @@ -4632,7 +4660,7 @@ def do_pythia8(self, line): " command '/usr/bin/env %s' exists and returns a valid path."%shell) exe_cmd = "#!%s\n%s"%(shell_exe,' '.join( - [preamble+pythia_main, + [preamble+pythia_main, '' if use_mg5amc_py8_interface else '-c', os.path.basename(pythia_cmd_card)])) wrapper.write(exe_cmd) @@ -4699,6 +4727,7 @@ def do_pythia8(self, line): n_cores = max(min(min_n_core,n_cores),1) if self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + # No need for parallelization anymore self.cluster = None logger.info('Follow Pythia8 shower by running the '+ @@ -4744,20 +4773,22 @@ def do_pythia8(self, line): ParallelPY8Card.subruns[0].systemSet('Beams:LHEF','events.lhe.gz') ParallelPY8Card.write(pjoin(parallelization_dir,'PY8Card.dat'), pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Write the wrapper wrapper_path = pjoin(parallelization_dir,'run_PY8.sh') wrapper = open(wrapper_path,'w') if self.options['cluster_temp_path'] is None: exe_cmd = \ -"""#!%s -./%s PY8Card.dat >& PY8_log.txt -""" +"""#!%%s +./%%s %s PY8Card.dat >& PY8_log.txt +""" % ('' if use_mg5amc_py8_interface else '-c') + else: exe_cmd = \ -"""#!%s +"""#!%%s ln -s ./events_$1.lhe.gz ./events.lhe.gz -./%s PY8Card_$1.dat >& PY8_log.txt +./%%s %s PY8Card_$1.dat >& PY8_log.txt mkdir split_$1 if [ -f ./events.hepmc ]; then @@ -4776,7 +4807,7 @@ def do_pythia8(self, line): mv ./PY8_log.txt ./split_$1/ fi tar -czf split_$1.tar.gz split_$1 -""" +""" % ('' if use_mg5amc_py8_interface else '-c') exe_cmd = exe_cmd%(shell_exe,os.path.basename(pythia_main)) wrapper.write(exe_cmd) wrapper.close() @@ -4812,19 +4843,27 @@ def do_pythia8(self, line): pjoin(parallelization_dir,split_files[-1])) logger.info('Submitting Pythia8 jobs...') + for i, split_file in enumerate(split_files): # We must write a PY8Card tailored for each split so as to correct the normalization # HEPMCoutput:scaling of each weight since the lhe showered will not longer contain the # same original number of events - split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat')) + split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat'), setter='user') + assert split_PY8_Card['JetMatching:nJetMax'] == PY8_Card['JetMatching:nJetMax'] + + + # Make sure to 
sure the number of split_events determined during the splitting. - split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i]) + split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i], force=True) + assert split_PY8_Card['Main:numberOfEvents'] == partition_for_PY8[i] split_PY8_Card.systemSet('HEPMCoutput:scaling',split_PY8_Card['HEPMCoutput:scaling']* - (float(partition_for_PY8[i]))) + (float(partition_for_PY8[i])), force=True) # Add_missing set to False so as to be sure not to add any additional parameter w.r.t # the ones in the original PY8 param_card copied. split_PY8_Card.write(pjoin(parallelization_dir,'PY8Card_%d.dat'%i), - pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False) + pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False, + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) in_files = [pjoin(parallelization_dir,os.path.basename(pythia_main)), pjoin(parallelization_dir,'PY8Card_%d.dat'%i), pjoin(parallelization_dir,split_file)] @@ -5073,7 +5112,7 @@ def wait_monitoring(Idle, Running, Done): # works both for fixed number of generated events and fixed accepted events self.results.add_detail('error_pythia', error_m) - if self.run_card['use_syst']: + if self.run_card['use_syst'] and use_mg5amc_py8_interface: self.results.add_detail('cross_pythia', -1) self.results.add_detail('error_pythia', 0) @@ -6132,7 +6171,102 @@ def get_Gdir(self, Pdir=None, symfact=None): mfactors[pjoin(P, "G%s" % tag)] = mfactor self.Gdirs = (Gdirs, mfactors) return self.get_Gdir(Pdir, symfact=symfact) + + ############################################################################ + def remove_empty_events(self, Gdir): + """return Gdir strip from the one providing empty events.lhe files.""" + + reasons = collections.defaultdict(list) + Gdirs = Gdir[:] + for G in Gdirs[:]: + try: + size = os.path.getsize(pjoin(G, 'events.lhe')) + except Exception as error: + size = 0 + if size <10: + Gdirs.remove(G) + try: + log = misc.BackRead(pjoin(G, 'log.txt')) + except Exception as error: + log = misc.BackRead(pjoin(G, 'run1_app.log')) + found = -1 + for line in log: + if 'Deleting file events.lhe' in line: + found = 0 + elif "Impossible BW configuration" in line: + reasons['bwconfig'].append(G) + break + elif found < -150: + reasons['not found'].append(G) + Gdirs.append(G) + break + elif found < 0: + found -= 1 + elif 'Loosen cuts or increase max_events' in line: + reasons['cuts'].append(G) + break + elif 'all returned zero' in line: + reasons['zero'].append(G) + break + elif found > 5: + reasons['unknown'].append(G) + break + else: + found += 1 + + if len(reasons): + logger.debug('Reasons for empty events.lhe:') + if len(reasons['unknown']): + logger.debug(' - unknown: %s' % len(reasons['unknown'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['unknown'][:10]])) + if len(reasons['not found']): + logger.debug(' - not found in log: %s' % len(reasons['not found'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['not found'][:10]])) + if len(reasons['zero']): + logger.debug(' - zero amplitudes: %s' % len(reasons['zero'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit( os.sep)[-2:]) for G in reasons['zero'][:10]])) + if len(reasons['bwconfig']): + critical_bwconfig = set() + for G in reasons['bwconfig']: + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_bwconfig.add(os.sep.join(base.rsplit(os.sep)[-2:])) + 
for G in critical_bwconfig: + logger.warning('Gdirectory %s has no events.lhe file. (no BW config found %s times)' % G) + + logger.debug(' - impossible BW configuration: %s' % len(reasons['bwconfig'])) + logger.debug(' - channel with no possible BW configuration: %s' % len(critical_bwconfig)) + + if len(reasons['cuts']): + critical_nb_cuts = collections.defaultdict(int) + for G in reasons['cuts']: + if '.' in os.path.basename(G): + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_nb_cuts[os.sep.join(base.rsplit(os.sep)[-2:])] += 1 + else: + critical_nb_cuts[''] += 1 + logger.warning('Gdirectory %s has no events.lhe file. (no points passed cuts found)' % G) + for G, nb in critical_nb_cuts.items(): + if not G: + continue + else: + logger.warning('%s channel %s.XXX has no events.lhe file. (no points passed cuts). No %s with events detected' % (nb, G, G)) + logger.debug(' - no points passed cuts: %s' % len(reasons['cuts'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['cuts'][:10]])) + logger.debug(' - without any BW handling (critical): %s' % critical_nb_cuts['']) + logger.debug(' - with BW but all zero (critical): %s' % sum([nb for v, nb in critical_nb_cuts.items() if v!=''], 0)) + #logger.debug(' - cuts (with BW conflict where other channel contributes): %s' % (len(reasons['cuts'])- critical_nb_cuts)) + + + return Gdirs + + ############################################################################ def set_run_name(self, name, tag=None, level='parton', reload_card=False, allow_new_tag=True): @@ -6749,7 +6883,7 @@ def get_subP_ids(path): class GridPackCmd(MadEventCmd): """The command for the gridpack --Those are not suppose to be use interactively--""" - def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **stdin): + def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, nprocs=1, maxevts=2500, *completekey, **stdin): """Initialize the command and directly run""" # Initialize properly @@ -6759,6 +6893,8 @@ def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **s self.random = seed self.random_orig = self.random self.granularity = gran + self.nprocs = nprocs + self.maxevts = maxevts self.options['automatic_html_opening'] = False #write the grid_card.dat on disk @@ -6874,7 +7010,7 @@ def launch(self, nb_event, seed): #misc.call([pjoin(self.me_dir,'bin','refine4grid'), # str(nb_event), '0', 'Madevent','1','GridRun_%s' % seed], # cwd=self.me_dir) - self.refine4grid(nb_event) + self.gridpack_cross = self.refine4grid(nb_event) # 3) Combine the events/pythia/... self.exec_cmd('combine_events') @@ -6902,6 +7038,8 @@ def refine4grid(self, nb_event): precision = nb_event + across= self.make_make_all_html_results(get_attr='axsec') + self.opts = dict([(key,value[1]) for (key,value) in \ self._survey_options.items()]) @@ -6915,8 +7053,9 @@ def refine4grid(self, nb_event): self.update_status('Refine results to %s' % precision, level=None) logger.info("Using random number seed offset = %s" % self.random) - refine_opt = {'err_goal': nb_event, 'split_channels': False, - 'ngran':self.granularity, 'readonly': self.readonly} + refine_opt = {'err_goal': nb_event, 'split_channels': True, + 'ngran':self.granularity, 'readonly': self.readonly, + 'nprocs': self.nprocs, 'maxevts': self.maxevts} x_improve = gen_ximprove.gen_ximprove_gridpack(self, refine_opt) x_improve.launch() # create the ajob for the refinment and run those! 
self.gscalefact = x_improve.gscalefact #store jacobian associate to the gridpack @@ -6926,7 +7065,7 @@ def refine4grid(self, nb_event): #print 'run combine!!!' #combine_runs.CombineRuns(self.me_dir) - return + return across #update html output Presults = sum_html.collect_result(self) cross, error = Presults.xsec, Presults.xerru @@ -7051,10 +7190,13 @@ def do_combine_events(self, line): sum_axsec += result.get('axsec')*gscalefact[Gdir] if len(AllEvent) >= 80: #perform a partial unweighting - if self.results.current['cross'] == 0 and self.run_card['gridpack']: - nb_event= self.nb_event + if not self.results.current.get('axsec'): + if self.run_card['gridpack'] and self.gridpack_cross: + nb_event = min(abs(1.05*self.nb_event*sum_axsec/self.gridpack_cross),self.nb_event) + else: + nb_event= self.nb_event else: - nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current['cross']),self.run_card['nevents']) + nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current.get('axsec')),self.run_card['nevents'], self.nb_event, self.gridpack_cross, sum_axsec) AllEvent.unweight(pjoin(outdir, self.run_name, "partials%s.lhe.gz" % partials), get_wgt, log_level=5, trunc_error=1e-2, event_target=nb_event) AllEvent = lhe_parser.MultiEventFile() @@ -7068,6 +7210,7 @@ def do_combine_events(self, line): for data in partials_info: AllEvent.add(*data) + sum_xsec += data[1] if not hasattr(self,'proc_characteristic'): self.proc_characteristic = self.get_characteristics() diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/restore_data b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/restore_data index 6205bb9567..407ed7aa91 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/restore_data +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/restore_data @@ -48,8 +48,17 @@ for i in `cat subproc.mg` ; do cd ../ done +# check if we are on a Mac, otherwise assume Linux +if [[ "$OSTYPE" == "darwin"* ]]; then + # no nproc on Mac, so use sysctl instead + # use -S1024 because there is a limit on the length of the command + xargs_opts="-P $(sysctl -n hw.ncpu) -S1024" +else + xargs_opts="-P $(nproc --all)" +fi + find . -mindepth 2 -maxdepth 2 -type d -name 'G*' -print0 \ - | xargs --null -P "$(nproc --all)" -I{} bash -c " + | xargs --null ${xargs_opts} -I{} bash -c " cd {} for j in $1_results.dat ; do if [[ -e \$j ]] ; then diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/sum_html.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/sum_html.py index 9dd5826f71..fb8dd3a74a 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/sum_html.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/sum_html.py @@ -770,7 +770,7 @@ def collect_result(cmd, folder_names=[], jobs=None, main_dir=None): return all -def make_all_html_results(cmd, folder_names = [], jobs=[]): +def make_all_html_results(cmd, folder_names = [], jobs=[], get_attr=None): """ folder_names and jobs have been added for the amcatnlo runs """ run = cmd.results.current['run_name'] if not os.path.exists(pjoin(cmd.me_dir, 'HTML', run)): @@ -794,7 +794,12 @@ def make_all_html_results(cmd, folder_names = [], jobs=[]): fsock.write('%s
<br>' % Presults.get_html(run, unit, cmd.me_dir)) fsock.write('%s <br>
' % P_text) - return Presults.xsec, Presults.xerru + if not get_attr: + return Presults.xsec, Presults.xerru + else: + if isinstance(get_attr, tuple): + return [getattr(Presults, _) for _ in get_attr] + return getattr(Presults, get_attr) diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/ufomodel/write_param_card.py b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/ufomodel/write_param_card.py index 57a85b0614..33a89259f8 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/ufomodel/write_param_card.py +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/internal/ufomodel/write_param_card.py @@ -116,9 +116,10 @@ def write_param(self, param, lhablock): def write_dep_param_block(self, lhablock): import cmath from parameters import all_parameters + param_values = {'cmath':cmath} for parameter in all_parameters: try: - exec("%s = %s" % (parameter.name, parameter.value)) + exec("%s = %s" % (parameter.name, parameter.value), globals(), param_values) except Exception: pass text = "## Not dependent paramater.\n" @@ -134,7 +135,7 @@ def write_dep_param_block(self, lhablock): prefix = "DECAY " for part, param in data: if isinstance(param.value, str): - value = complex(eval(param.value)).real + value = complex(eval(param.value, globals(), param_values)).real else: value = param.value diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/bin/madevent b/epochX/cudacpp/nobm_pp_ttW.mad/bin/madevent index dff9711b73..9c5363e682 100755 --- a/epochX/cudacpp/nobm_pp_ttW.mad/bin/madevent +++ b/epochX/cudacpp/nobm_pp_ttW.mad/bin/madevent @@ -178,6 +178,17 @@ force_run = False if (args and args[0] == 'treatcards'): force_run=True + +# check that madgraph is not in PYTHONPATH +try: + import madgraph +except ImportError: + pass +else: + logger.getLogger('madgraph').error('Looks like you do have madgraph in your PYTHONPATH (or you run this executable from the main MG5aMC directory). This executable will likely not work in such case.') + + + # Call the cmd interface main loop try: if '-h' in args or '--help' in args: diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/src/HelAmps_sm_no_b_mass.h b/epochX/cudacpp/nobm_pp_ttW.mad/src/HelAmps_sm_no_b_mass.h index 850b86e0e6..12a9428e02 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/src/HelAmps_sm_no_b_mass.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/src/HelAmps_sm_no_b_mass.h @@ -2,13 +2,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Sep 2010) for the MG5aMC backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -860,7 +860,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ INLINE void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -872,7 +872,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ INLINE void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -885,7 +885,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ INLINE void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -898,7 +898,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ INLINE void FFV1P0_3( const fptype allF1[], const fptype allF2[], @@ -911,7 +911,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ INLINE void FFV2_1( const fptype allF2[], const fptype allV3[], @@ -924,7 +924,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ INLINE void FFV2_2( const fptype allF1[], const fptype allV3[], @@ -937,7 +937,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ INLINE void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -950,7 +950,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -963,7 +963,7 @@ namespace mg5amcCpu const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. 
); const cxtype_sv TMP0 = ( F1[2] * ( F2[4] * ( V3[2] + V3[5] ) + F2[5] * ( V3[3] + cI * V3[4] ) ) + ( F1[3] * ( F2[4] * ( V3[3] - cI * V3[4] ) + F2[5] * ( V3[2] - V3[5] ) ) + ( F1[4] * ( F2[2] * ( V3[2] - V3[5] ) - F2[3] * ( V3[3] + cI * V3[4] ) ) + F1[5] * ( F2[2] * ( -V3[3] + cI * V3[4] ) + F2[3] * ( V3[2] + V3[5] ) ) ) ) ); @@ -975,7 +975,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -988,7 +988,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F1 = W_ACCESS::kernelAccess( allF1 ); const cxtype cI = cxmake( 0., 1. ); F1[0] = +F2[0] + V3[0]; @@ -1007,7 +1007,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -1020,7 +1020,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F2 = W_ACCESS::kernelAccess( allF2 ); const cxtype cI = cxmake( 0., 1. ); F2[0] = +F1[0] + V3[0]; @@ -1039,7 +1039,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ void FFV1P0_3( const fptype allF1[], const fptype allF2[], @@ -1052,7 +1052,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V3 = W_ACCESS::kernelAccess( allV3 ); const cxtype cI = cxmake( 0., 1. ); V3[0] = +F1[0] + F2[0]; @@ -1070,7 +1070,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ void FFV2_1( const fptype allF2[], const fptype allV3[], @@ -1083,7 +1083,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F1 = W_ACCESS::kernelAccess( allF1 ); const cxtype cI = cxmake( 0., 1. 
); F1[0] = +F2[0] + V3[0]; @@ -1102,7 +1102,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ void FFV2_2( const fptype allF1[], const fptype allV3[], @@ -1115,7 +1115,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F2 = W_ACCESS::kernelAccess( allF2 ); const cxtype cI = cxmake( 0., 1. ); F2[0] = +F1[0] + V3[0]; @@ -1134,7 +1134,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -1147,7 +1147,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P2[4] = { +cxreal( V2[0] ), +cxreal( V2[1] ), +cximag( V2[1] ), +cximag( V2[0] ) }; diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/src/Parameters_sm_no_b_mass.cc b/epochX/cudacpp/nobm_pp_ttW.mad/src/Parameters_sm_no_b_mass.cc index d799b19eeb..39a3bb9a4b 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/src/Parameters_sm_no_b_mass.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/src/Parameters_sm_no_b_mass.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/src/Parameters_sm_no_b_mass.h b/epochX/cudacpp/nobm_pp_ttW.mad/src/Parameters_sm_no_b_mass.h index e448052141..3a15c28ac3 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/src/Parameters_sm_no_b_mass.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/src/Parameters_sm_no_b_mass.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. 
+// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -295,7 +295,7 @@ namespace mg5amcCpu #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #endif // Compute the output couplings (e.g. gc10 and gc11) from the input gs - template + template __device__ inline void G2COUP( const fptype gs[], fptype couplings[], @@ -305,10 +305,10 @@ namespace mg5amcCpu using namespace Parameters_sm_no_b_mass_dependentCouplings; const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr ); - fptype* GC_11s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); - fptype* GC_10s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); - cxtype_sv_ref GC_11s_sv = C_ACCESS::kernelAccess( GC_11s ); - cxtype_sv_ref GC_10s_sv = C_ACCESS::kernelAccess( GC_10s ); + fptype* GC_11s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); + fptype* GC_10s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); + cxtype_sv_ref GC_11s_sv = CD_ACCESS::kernelAccess( GC_11s ); + cxtype_sv_ref GC_10s_sv = CD_ACCESS::kernelAccess( GC_10s ); GC_11s_sv = couplings_sv.GC_11; GC_10s_sv = couplings_sv.GC_10; mgDebug( 1, __FUNCTION__ ); diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuConfig.h index 7c6a082392..ca859a602e 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for generating random numbers +// For both CUDA and HIP, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. 
using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuCxtypes.h index 92d74fd6db..e98e925f2a 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -717,12 +717,24 @@ namespace mg5amcCpu : m_preal( &r ), m_pimag( &i ) {} // copy (create from) const refs cxtype_ref& operator=( const cxtype_ref& ) = delete; //__host__ __device__ cxtype_ref& operator=( cxtype_ref&& c ) {...} // REMOVED! 
Should copy refs or copy values? No longer needed in cxternary - __host__ __device__ cxtype_ref& operator=( const cxtype& c ) + __host__ __device__ cxtype_ref& operator=( const cxtype& c ) // copy (assign) const values { *m_preal = cxreal( c ); *m_pimag = cximag( c ); return *this; - } // copy (assign) non-const values + } + __host__ __device__ cxtype_ref& operator+=( const cxtype& c ) + { + *m_preal += cxreal( c ); + *m_pimag += cximag( c ); + return *this; + } + __host__ __device__ cxtype_ref& operator-=( const cxtype& c ) + { + *m_preal -= cxreal( c ); + *m_pimag -= cximag( c ); + return *this; + } __host__ __device__ operator cxtype() const { return cxmake( *m_preal, *m_pimag ); } private: fptype* const m_preal; // const pointer to non-const fptype R diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/test/cudacpp_test.mk b/epochX/cudacpp/nobm_pp_ttW.mad/test/cudacpp_test.mk index f703a1ae7c..1f9f8bbc46 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/nobm_pp_ttW.mad/test/cudacpp_test.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) @@ -19,7 +19,7 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt index 0a0d056033..5c42a0cf8e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt +++ b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.3 2025-06-12 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -49,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -57,7 +57,7 @@ set zerowidth_tchannel F define j = p INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0064830780029296875  +DEBUG: model prefixing takes 0.005420684814453125  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -208,7 +208,7 @@ INFO: Process d~ g > t t~ d~ added to mirror process g d~ > t t~ d~ INFO: Process d~ d > t t~ g added to mirror process d d~ > t t~ g INFO: Process s~ g > t t~ s~ added to mirror process g s~ > t t~ s~ INFO: Process s~ s > t t~ g added to mirror process s s~ > t t~ g -13 processes with 76 diagrams generated in 0.135 s +13 processes with 76 diagrams generated in 0.138 s Total: 18 processes with 83 diagrams add process p p > t t~ j j @2 INFO: Checking for minimal orders which gives processes. @@ -374,21 +374,21 @@ INFO: Process s~ u~ > t t~ u~ s~ added to mirror process u~ s~ > t t~ u~ s~ INFO: Process s~ c~ > t t~ c~ s~ added to mirror process c~ s~ > t t~ c~ s~ INFO: Process s~ d~ > t t~ d~ s~ added to mirror process d~ s~ > t t~ d~ s~ INFO: Crossed process found for s~ s~ > t t~ s~ s~, reuse diagrams. -65 processes with 1119 diagrams generated in 1.933 s +65 processes with 1119 diagrams generated in 1.838 s Total: 83 processes with 1202 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4334]  +DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4166]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  INFO: initialize a new directory: CODEGEN_mad_pp_tt012j INFO: remove old information in CODEGEN_mad_pp_tt012j -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @2 INFO: Processing color information for process: g g > t t~ g g @2 @@ -499,9 +499,9 @@ FileWriter t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -DEBUG: len(subproc_diagrams_for_config) =  105 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 18: 19, 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 26, 26: 27, 27: 28, 28: 29, 29: 30, 30: 31, 31: 33, 32: 34, 33: 35, 34: 36, 35: 37, 
36: 38, 37: 39, 38: 40, 39: 41, 40: 42, 41: 43, 42: 44, 43: 45, 44: 46, 45: 47, 46: 49, 47: 50, 48: 51, 49: 52, 50: 53, 51: 54, 52: 55, 53: 56, 54: 57, 55: 59, 56: 60, 57: 61, 58: 62, 59: 63, 60: 64, 61: 65, 62: 66, 63: 67, 64: 68, 65: 69, 66: 70, 67: 71, 68: 72, 69: 73, 70: 75, 71: 76, 72: 77, 73: 78, 74: 79, 75: 80, 76: 81, 77: 82, 78: 83, 79: 84, 80: 85, 81: 86, 82: 87, 83: 88, 84: 89, 85: 90, 86: 91, 87: 92, 88: 94, 89: 95, 90: 96, 91: 97, 92: 98, 93: 99, 94: 101, 95: 102, 96: 103, 97: 104, 98: 105, 99: 106, 100: 108, 101: 109, 102: 110, 103: 111, 104: 112, 105: 113} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 33: 31, 34: 32, 35: 33, 36: 34, 37: 35, 38: 36, 39: 37, 40: 38, 41: 39, 42: 40, 43: 41, 44: 42, 45: 43, 46: 44, 47: 45, 49: 46, 50: 47, 51: 48, 52: 49, 53: 50, 54: 51, 55: 52, 56: 53, 57: 54, 59: 55, 60: 56, 61: 57, 62: 58, 63: 59, 64: 60, 65: 61, 66: 62, 67: 63, 68: 64, 69: 65, 70: 66, 71: 67, 72: 68, 73: 69, 75: 70, 76: 71, 77: 72, 78: 73, 79: 74, 80: 75, 81: 76, 82: 77, 83: 78, 84: 79, 85: 80, 86: 81, 87: 82, 88: 83, 89: 84, 90: 85, 91: 86, 92: 87, 94: 88, 95: 89, 96: 90, 97: 91, 98: 92, 99: 93, 101: 94, 102: 95, 103: 96, 104: 97, 105: 98, 106: 99, 108: 100, 109: 101, 110: 102, 111: 103, 112: 104, 113: 105} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  105 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 18: 19, 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 26, 26: 27, 27: 28, 28: 29, 29: 30, 30: 31, 31: 33, 32: 34, 33: 35, 34: 36, 35: 37, 36: 38, 37: 39, 38: 40, 39: 41, 40: 42, 41: 43, 42: 44, 43: 45, 44: 46, 45: 47, 46: 49, 47: 50, 48: 51, 49: 52, 50: 53, 51: 54, 52: 55, 53: 56, 54: 57, 55: 59, 56: 60, 57: 61, 58: 62, 59: 63, 60: 64, 61: 65, 62: 66, 63: 67, 64: 68, 65: 69, 66: 70, 67: 71, 68: 72, 69: 73, 70: 75, 71: 76, 72: 77, 73: 78, 74: 79, 75: 80, 76: 81, 77: 82, 78: 83, 79: 84, 80: 85, 81: 86, 82: 87, 83: 88, 84: 89, 85: 90, 86: 91, 87: 92, 88: 94, 89: 95, 90: 96, 91: 97, 92: 98, 93: 99, 94: 101, 95: 102, 96: 103, 97: 104, 98: 105, 99: 106, 100: 108, 101: 109, 102: 110, 103: 111, 104: 112, 105: 113} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 33: 31, 34: 32, 35: 33, 36: 34, 37: 35, 38: 36, 39: 37, 40: 38, 41: 39, 42: 40, 43: 41, 44: 42, 45: 43, 46: 44, 47: 45, 49: 46, 50: 47, 51: 48, 52: 49, 53: 50, 54: 51, 55: 52, 56: 53, 57: 54, 59: 55, 60: 56, 61: 57, 62: 58, 63: 59, 64: 60, 65: 61, 66: 62, 67: 63, 68: 64, 69: 65, 70: 66, 71: 67, 72: 68, 73: 69, 75: 70, 76: 71, 77: 72, 78: 73, 79: 74, 80: 75, 81: 76, 82: 77, 83: 78, 84: 79, 85: 80, 86: 81, 87: 82, 88: 83, 89: 84, 90: 85, 91: 86, 92: 87, 94: 88, 95: 89, 96: 90, 97: 91, 98: 92, 99: 93, 101: 94, 102: 95, 103: 96, 104: 97, 105: 98, 106: 99, 108: 100, 109: 101, 110: 102, 111: 103, 112: 104, 113: 105} [model_handling.py at line 1665]  INFO: Creating files in directory P2_gg_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
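(An aside on the DEBUG dumps above and in the following hunks: iconfig_to_diag and diag_to_iconfig are mutual inverses. Integration channels (iconfig) are numbered consecutively, while diagrams that do not map onto a single s/t-channel configuration are skipped, which is why the diagram numbers have gaps, e.g. 31: 33. A minimal sketch of this invariant, in C++ for illustration only; the excerpt values are copied from the gg_ttxgg dump above and this is not the plugin's actual code:)

#include <cassert>
#include <map>

int main()
{
  // Hypothetical excerpt of the gg_ttxgg iconfig_to_diag mapping dumped above
  const std::map<int, int> iconfig_to_diag = { { 1, 2 }, { 2, 3 }, { 30, 31 }, { 31, 33 } };
  // Invert it, presumably as model_handling.py does to build diag_to_iconfig
  std::map<int, int> diag_to_iconfig;
  for( const auto& [iconfig, diag] : iconfig_to_diag ) diag_to_iconfig[diag] = iconfig;
  // The two dumped maps must be bijective inverses of each other
  for( const auto& [iconfig, diag] : iconfig_to_diag ) assert( diag_to_iconfig.at( diag ) == iconfig );
  return 0;
}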
@@ -510,9 +510,9 @@ FileWriter t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxuux -DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1665]  INFO: Creating files in directory P2_gu_ttxgu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -521,9 +521,9 @@ FileWriter t t~ g u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gu_ttxgu -DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1665]  INFO: Creating files in directory P2_gux_ttxgux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -532,9 +532,9 @@ FileWriter t t~ g u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gux_ttxgux -DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1665]  INFO: Creating files in directory P2_uux_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -543,9 +543,9 @@ FileWriter t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxgg -DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  35 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 35, 35: 36} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 35: 34, 36: 35} [model_handling.py at line 1665]  INFO: Creating files in directory P1_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -554,9 +554,9 @@ FileWriter t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg -DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  15 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15} [model_handling.py at line 1665]  INFO: Creating files in directory P2_uu_ttxuu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -565,9 +565,9 @@ FileWriter t t~ u u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uu_ttxuu -DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1665]  INFO: Creating files in directory P2_uux_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -576,9 +576,9 @@ FileWriter t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxuux -DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1665]  INFO: Creating files in directory P2_uxux_ttxuxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -587,9 +587,9 @@ FileWriter t t~ u~ u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxux_ttxuxux -DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  14 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14} [model_handling.py at line 1665]  INFO: Creating files in directory P2_uc_ttxuc DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -598,9 +598,9 @@ FileWriter t t~ u c WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uc_ttxuc -DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1665]  INFO: Creating files in directory P2_uux_ttxccx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -609,9 +609,9 @@ FileWriter t t~ c c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxccx -DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1665]  INFO: Creating files in directory P2_ucx_ttxucx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -620,9 +620,9 @@ FileWriter t t~ u c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group ucx_ttxucx -DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1665]  INFO: Creating files in directory P2_uxcx_ttxuxcx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -631,9 +631,9 @@ FileWriter t t~ u~ c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxcx_ttxuxcx -DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  7 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7} [model_handling.py at line 1665]  INFO: Creating files in directory P1_gu_ttxu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -642,9 +642,9 @@ FileWriter t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1665]  INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -653,9 +653,9 @@ FileWriter t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1665]  INFO: Creating files in directory P1_uux_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . 
@@ -664,9 +664,9 @@ FileWriter t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group uux_ttxg -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5} [model_handling.py at line 1665]  INFO: Creating files in directory P0_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -675,9 +675,9 @@ FileWriter t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group gg_ttx -DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1665]  INFO: Creating files in directory P0_uux_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1156]  INFO: Creating files in directory . @@ -686,25 +686,25 @@ FileWriter t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group uux_ttx -DEBUG: len(subproc_diagrams_for_config) =  1 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1} [model_handling.py at line 1552]  -Generated helas calls for 18 subprocesses (372 diagrams) in 1.286 s -Wrote files for 810 helas calls in 2.762 s +DEBUG: len(subproc_diagrams_for_config) =  1 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1} [model_handling.py at line 1665]  +Generated helas calls for 18 subprocesses (372 diagrams) in 1.292 s +Wrote files for 810 helas calls in 2.897 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.340 s +ALOHA: aloha creates 5 routines in 0.334 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.315 s +ALOHA: aloha creates 10 routines in 0.308 s VVV1 VVV1 FFV1 @@ -717,120 +717,120 @@ ALOHA: aloha creates 10 routines in 0.315 s VVVV3 VVVV4 VVVV4 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./HelAmps_sm.h -INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./HelAmps_sm.h +INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j; patch -p4 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file SubProcesses/makefile -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P0_gg_ttx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P0_gg_ttx; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P0_uux_ttx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P0_uux_ttx; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #1 succeeded at 74 (offset 3 lines). Hunk #2 succeeded at 230 (offset 3 lines). 
-DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P1_gg_ttxg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P1_gg_ttxg; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #2 succeeded at 243 (offset 16 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P1_gu_ttxu; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P1_gu_ttxu; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #1 succeeded at 74 (offset 3 lines). Hunk #2 succeeded at 246 (offset 19 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P1_gux_ttxux; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P1_gux_ttxux; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #1 succeeded at 74 (offset 3 lines). Hunk #2 succeeded at 246 (offset 19 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P1_uux_ttxg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P1_uux_ttxg; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #1 succeeded at 74 (offset 3 lines). Hunk #2 succeeded at 246 (offset 19 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_gg_ttxgg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_gg_ttxgg; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #2 succeeded at 275 (offset 48 lines). 
-DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_gg_ttxuux; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_gg_ttxuux; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #1 succeeded at 74 (offset 3 lines). Hunk #2 succeeded at 278 (offset 51 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_gu_ttxgu; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_gu_ttxgu; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #1 succeeded at 74 (offset 3 lines). Hunk #2 succeeded at 278 (offset 51 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_gux_ttxgux; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_gux_ttxgux; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #1 succeeded at 74 (offset 3 lines). Hunk #2 succeeded at 278 (offset 51 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uc_ttxuc; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uc_ttxuc; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #1 succeeded at 76 (offset 5 lines). Hunk #2 succeeded at 280 (offset 53 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_ucx_ttxucx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_ucx_ttxucx; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #1 succeeded at 82 (offset 11 lines). Hunk #2 succeeded at 286 (offset 59 lines). 
-DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uu_ttxuu; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uu_ttxuu; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #1 succeeded at 74 (offset 3 lines). Hunk #2 succeeded at 278 (offset 51 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uux_ttxccx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uux_ttxccx; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #1 succeeded at 82 (offset 11 lines). Hunk #2 succeeded at 286 (offset 59 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uux_ttxgg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uux_ttxgg; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #1 succeeded at 74 (offset 3 lines). Hunk #2 succeeded at 278 (offset 51 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uux_ttxuux; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uux_ttxuux; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #1 succeeded at 74 (offset 3 lines). Hunk #2 succeeded at 278 (offset 51 lines). -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uxcx_ttxuxcx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uxcx_ttxuxcx; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #1 succeeded at 76 (offset 5 lines). Hunk #2 succeeded at 280 (offset 53 lines). 
-DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uxux_ttxuxux; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uxux_ttxuxux; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #1 succeeded at 74 (offset 3 lines). Hunk #2 succeeded at 278 (offset 51 lines). -DEBUG: p.returncode =  0 [output.py at line 263]  -Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j done. +DEBUG: p.returncode =  0 [output.py at line 268]  +Output to directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j done. Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/README +/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/README Run "open index.html" to see more information about this process. quit -real 0m11.258s -user 0m9.633s -sys 0m0.984s -Code generation completed in 12 seconds +real 0m10.785s +user 0m9.659s +sys 0m0.978s +Code generation completed in 11 seconds ************************************************************ * * * W E L C O M E to * @@ -843,7 +843,7 @@ Code generation completed in 12 seconds * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.3 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -851,9 +851,9 @@ Code generation completed in 12 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt @@ -873,7 +873,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.3 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -881,9 +881,9 @@ launch in debug mode * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt diff --git a/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt b/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt index 68b4c46295..07d8d59d1b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat index 33311e49bc..de218516de 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.6.0 2024-09-30 * +#* VERSION 3.6.3 2025-06-12 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/pp_tt012j.mad/Cards/run_card.dat b/epochX/cudacpp/pp_tt012j.mad/Cards/run_card.dat index 5eb60f35df..fe9c38d826 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Cards/run_card.dat +++ b/epochX/cudacpp/pp_tt012j.mad/Cards/run_card.dat @@ -125,6 +125,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/pp_tt012j.mad/Cards/run_card_default.dat b/epochX/cudacpp/pp_tt012j.mad/Cards/run_card_default.dat index 38810a6b83..0185201786 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/pp_tt012j.mad/Cards/run_card_default.dat @@ -125,6 +125,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! 
maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/pp_tt012j.mad/MGMEVersion.txt b/epochX/cudacpp/pp_tt012j.mad/MGMEVersion.txt index 084e244cea..1ac53bb4bd 100644 --- a/epochX/cudacpp/pp_tt012j.mad/MGMEVersion.txt +++ b/epochX/cudacpp/pp_tt012j.mad/MGMEVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.3 \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/Source/.make_opts b/epochX/cudacpp/pp_tt012j.mad/Source/.make_opts index de3864242b..56ba259c56 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Source/.make_opts +++ b/epochX/cudacpp/pp_tt012j.mad/Source/.make_opts @@ -102,6 +102,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf + alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -113,10 +114,11 @@ ifneq ($(lhapdf),) endif else alfas_functions=alfas_functions + alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif # Helper function to check MG5 version define CHECK_MG5AMC_VERSION python -c 'import re; from distutils.version import StrictVersion; print StrictVersion("$(MG5AMC_VERSION)") >= StrictVersion("$(1)") if re.match("^[\d\.]+$$","$(MG5AMC_VERSION)") else True;' -endef \ No newline at end of file +endef diff --git a/epochX/cudacpp/pp_tt012j.mad/Source/alfas_functions.f b/epochX/cudacpp/pp_tt012j.mad/Source/alfas_functions.f index bb69a6384e..84aeff369c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Source/alfas_functions.f +++ b/epochX/cudacpp/pp_tt012j.mad/Source/alfas_functions.f @@ -188,6 +188,10 @@ SUBROUTINE NEWTON1(T,A_IN,A_OUT,NLOOP,NF) A_OUT=A_IN/(1D0+A_IN*B0(NF)*T) IF (NLOOP .EQ. 1) RETURN + if (1D0+A_IN*B0(NF)*T.le.0d0)THEN + A_OUT = 9d98 + RETURN + ENDIF A_OUT=A_IN/(1D0+B0(NF)*A_IN*T+C1(NF)*A_IN*LOG(1D0+A_IN*B0(NF)*T)) IF (A_OUT .LT. 
0D0) AS=0.3D0 30 AS=A_OUT diff --git a/epochX/cudacpp/pp_tt012j.mad/Source/cuts.inc b/epochX/cudacpp/pp_tt012j.mad/Source/cuts.inc index 23d099e5f7..a8ccc7420d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Source/cuts.inc +++ b/epochX/cudacpp/pp_tt012j.mad/Source/cuts.inc @@ -37,7 +37,7 @@ C REAL*8 misset,missetmax,ptheavy REAL*8 ptllmin,ptllmax integer maxjetflavor - REAl*8 dsqrt_shat + REAl*8 dsqrt_shat,dsqrt_shatmax COMMON /to_min_max_cuts/ & PTJmax,PTBmax,PTAmax,PTLmax, @@ -60,7 +60,7 @@ C & ht2max,ht3max,ht4max, & htjmin,htjmax,ihtmin,ihtmax, & misset,missetmax,ptheavy, - & ptllmin,ptllmax,dsqrt_shat, + & ptllmin,ptllmax,dsqrt_shat,dsqrt_shatmax, & maxjetflavor C diff --git a/epochX/cudacpp/pp_tt012j.mad/Source/make_opts b/epochX/cudacpp/pp_tt012j.mad/Source/make_opts index e4b87ee6ad..f10336e42e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Source/make_opts +++ b/epochX/cudacpp/pp_tt012j.mad/Source/make_opts @@ -103,6 +103,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf +alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -114,6 +115,7 @@ endif endif else alfas_functions=alfas_functions +alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif diff --git a/epochX/cudacpp/pp_tt012j.mad/Source/makefile b/epochX/cudacpp/pp_tt012j.mad/Source/makefile index 291ca907ee..87a9e61723 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Source/makefile +++ b/epochX/cudacpp/pp_tt012j.mad/Source/makefile @@ -37,10 +37,12 @@ all: $(LIBRARIES) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDI $(LIBDIR)libdsample.$(libext): $(DSAMPLE) $(call CREATELIB, $@, $^) $(LIBDIR)libgeneric.$(libext): $(GENERIC) + rm -f $@ 2>/dev/null $(call CREATELIB, $@, $^) + rm -f $(alfas_to_clean) 2>/dev/null $(LIBDIR)libdhelas.$(libext): DHELAS cd DHELAS; make; cd .. -$(LIBDIR)libpdf.$(libext): PDF make_opts +$(LIBDIR)libpdf.$(libext): PDF $(alfas_functions).o cd PDF; make; cd .. ifneq (,$(filter edff chff, $(pdlabel1) $(pdlabel2))) $(LIBDIR)libgammaUPC.$(libext): PDF/gammaUPC @@ -73,6 +75,7 @@ $(BINDIR)gensudgrid: $(GENSUDGRID) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUP # Dependencies dsample.o: DiscreteSampler.o dsample.f genps.inc StringCast.o vector.inc +pawgraph.o: vector.inc DiscreteSampler.o: StringCast.o invarients.o: invarients.f genps.inc gen_ximprove.o: gen_ximprove.f run_config.inc run_card.inc diff --git a/epochX/cudacpp/pp_tt012j.mad/Source/run_card.inc b/epochX/cudacpp/pp_tt012j.mad/Source/run_card.inc index 2588190439..e169c1f193 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Source/run_card.inc +++ b/epochX/cudacpp/pp_tt012j.mad/Source/run_card.inc @@ -88,6 +88,8 @@ DSQRT_SHAT = 0.000000000000000D+00 + DSQRT_SHATMAX = -1 + LIMHEL = 0.000000000000000D+00 PTJ = 2.000000000000000D+01 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h index 87aa648dd2..a45024704a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. 
Teig, A. Valassi, Z. Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -255,11 +255,15 @@ namespace mg5amcCpu throw std::logic_error( "Bridge constructor: FIXME! cannot choose gputhreads" ); // this should never happen! m_gpublocks = m_nevt / m_gputhreads; } +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters @@ -290,8 +294,10 @@ throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -347,7 +353,9 @@ if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> ) @@ -400,7 +408,9 @@ } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.
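(The Bridge.h hunks above wrap the "WARNING! Instantiate ..." printouts and the flagAbnormalMEs check in #ifdef MGONGPUCPP_VERBOSE, so default builds stay quiet. A standalone sketch of the same compile-time pattern; the function and variable names here are invented for illustration, only the MGONGPUCPP_VERBOSE macro comes from the patch:)

#include <iostream>

// Build with -DMGONGPUCPP_VERBOSE to re-enable the informational printout
void instantiateHostBridge( int nevt )
{
#ifdef MGONGPUCPP_VERBOSE
  std::cout << "WARNING! Instantiate host Bridge (nevt=" << nevt << ")" << std::endl;
#endif
  // ... create the matrix element kernel as in Bridge.h ...
}

(When the macro is undefined the preprocessor removes the statement entirely, so the guard has zero runtime cost.)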
#ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include <cassert> //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1,
type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin.
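(The new gpuBlas* macros above give the cuBLAS and hipBLAS entry points a single spelling, and the gpuBlasT* aliases pick the float (S) or double (D) variants at compile time from MGONGPU_FPTYPE2_FLOAT. A hedged sketch of how a caller might drive a strided-batched GEMM through this layer, assuming a CUDA or HIP build with BLAS enabled; the helper name, leading dimensions and buffer layout are assumptions, not code from this patch, and checkGpuBlas is the wrapper added to GpuRuntime.h just below:)

#include "GpuAbstraction.h"
#include "GpuRuntime.h"

#ifdef MGONGPU_FPTYPE2_FLOAT
typedef float fptype2; // resolves gpuBlasT* to the cublasS*/hipblasS* branch
#else
typedef double fptype2; // resolves gpuBlasT* to the cublasD*/hipblasD* branch
#endif

#if defined MGONGPUCPP_GPUIMPL && !defined MGONGPU_HAS_NO_BLAS
// Hypothetical helper: C[i] = A[i] * B[i] for nbatch column-major (m x k) times (k x n) products
void batchedGemm( gpuBlasHandle_t handle, int m, int n, int k,
                  const fptype2* dA, const fptype2* dB, fptype2* dC, int nbatch )
{
  const fptype2 alpha = 1, beta = 0;
  checkGpuBlas( gpuBlasTgemmStridedBatched( handle, GPUBLAS_OP_N, GPUBLAS_OP_N, m, n, k,
                                            &alpha, dA, m, (long long)m * k, dB, k, (long long)k * n,
                                            &beta, dC, m, (long long)m * n, nbatch ) );
}
#endif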
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuRuntime.h
index 860c7fde16..086aa6a616 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuRuntime.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuRuntime.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin.
-// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin.
 
 #ifndef MG5AMC_GPURUNTIME_H
 #define MG5AMC_GPURUNTIME_H 1
@@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort =
 
 //--------------------------------------------------------------------------
 
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#ifndef MGONGPU_HAS_NO_BLAS
+#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); }
+inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true )
+{
+  if ( code != GPUBLAS_STATUS_SUCCESS )
+  {
+    printf( "ERROR! assertGpuBlas: '%d' in %s:%d\n", code, file, line );
+    if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS );
+  }
+}
+#endif
+#endif /* clang-format on */
+
+//--------------------------------------------------------------------------
+
 #ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 {
@@ -50,7 +66,7 @@ namespace mg5amcGpu
     // Set up CUDA application
     // ** NB: strictly speaking this is not needed when using the CUDA runtime API **
     // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization
-    static void setUp( const bool debug = true )
+    static void setUp( const bool debug = false ) // ZW: changed debug default to false
    {
       // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization
       // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer!
@@ -71,7 +87,7 @@ namespace mg5amcGpu
     // ** NB: strictly speaking this is not needed when using the CUDA runtime API **
     // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck
     // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking
-    static void tearDown( const bool debug = true )
+    static void tearDown( const bool debug = false ) // ZW: changed debug default to false
     {
       if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl;
       checkGpu( gpuDeviceReset() );
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MGVersion.txt
index 084e244cea..1ac53bb4bd 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MGVersion.txt
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MGVersion.txt
@@ -1 +1 @@
-3.6.0
\ No newline at end of file
+3.6.3
\ No newline at end of file
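
Two usage notes on the GpuRuntime.h changes above: assertGpuBlas gives BLAS calls the same fail-fast checking that checkGpu gives runtime calls, and the setUp/tearDown debug defaults are now false. A minimal sketch (the GpuRuntime wrapper name for these statics is an assumption based on the file and its debug messages):

  // Illustrative sketch: quiet runtime bring-up plus a checked BLAS call.
  void runtimeSketch()
  {
    mg5amcGpu::GpuRuntime::setUp(); // now silent by default (debug = false)
  #ifndef MGONGPU_HAS_NO_BLAS
    gpuBlasHandle_t handle;
    checkGpuBlas( gpuBlasCreate( &handle ) ); // asserts unless GPUBLAS_STATUS_SUCCESS
    checkGpuBlas( gpuBlasDestroy( handle ) );
  #endif
    mg5amcGpu::GpuRuntime::tearDown( true ); // pass true to restore the old verbose trace
  }
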
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,28 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() + , m_pHelWfs() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_helBlasHandles() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +353,82 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); + // Create the "one-helicity" wavefunction buffer that will be used for helicity filtering + m_pHelWfs.reset( new DeviceBufferSimple( CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? 
+ std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { +#ifndef MGONGPU_HAS_NO_BLAS + if( m_helBlasHandles[ihel] ) gpuBlasDestroy( m_helBlasHandles[ihel] ); +#endif + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +445,61 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. 
     gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
+    const int nevt = m_gpublocks * m_gputhreads;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
+    sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt );
 #else
-    gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
+    sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt );
 #endif
-    checkGpu( gpuPeekAtLastError() );
-    // ... 0d2. Copy back good helicity mask to the host
-    copyHostFromDevice( hstIsGoodHel, devIsGoodHel );
-    // ... 0d3. Copy back good helicity list to constant memory on the device
-    return sigmaKin_setGoodHel( hstIsGoodHel.data() );
+    // ... 0d3. Set good helicity list in host static memory
+    int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() );
+    assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity
+    // Create one GPU stream for each good helicity
+    for( int ighel = 0; ighel < nGoodHel; ighel++ )
+      gpuStreamCreate( &m_helStreams[ighel] );
+#ifndef MGONGPU_HAS_NO_BLAS
+    // Create one cuBLAS/hipBLAS handle for each good helicity
+    // Attach a different stream to each cuBLAS/hipBLAS handle
+    if( m_blasColorSum )
+      for( int ighel = 0; ighel < nGoodHel; ighel++ )
+      {
+        checkGpuBlas( gpuBlasCreate( &m_helBlasHandles[ighel] ) );
+        checkGpuBlas( gpuBlasSetStream( m_helBlasHandles[ighel], m_helStreams[ighel] ) );
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
+        if( m_blasTf32Tensor )
+          checkGpuBlas( cublasSetMathMode( m_helBlasHandles[ighel], CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores
+#endif
+      }
+#endif
+    // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime)
+    m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) );
+    // ... Create the "many-helicity" super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime)
+    // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering)
+    m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) );
+    // ... Create the "many-helicity" super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime)
+    // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering)
+    m_pHelWfs.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * nevt ) );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated)
+    // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering)
+    m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) );
+    m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) );
+#endif
+#ifndef MGONGPU_HAS_NO_BLAS
+    // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+    // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity
+    if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) );
+#else
+    // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity
+    if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) );
+#endif
+#endif
+    // Return the number of good helicities
+    return nGoodHel;
   }
 
   //--------------------------------------------------------------------------
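
The super-buffers created above are sized as nGoodHel contiguous per-helicity slices; assuming that layout (which the allocation sizes strongly suggest), the slice arithmetic looks as follows. This is an illustrative sketch only, with invented helper names:

  // Sketch: locating one good helicity's slice inside the super-buffers
  // allocated in computeGoodHelicities (contiguous per-helicity layout assumed).
  inline fptype* helSliceMEs( fptype* ghelMEs, int ighel, int nevt )
  {
    return ghelMEs + ighel * nevt; // m_pHelMEs holds nGoodHel slices of nevt MEs
  }
  inline fptype* helSliceJamps( fptype* ghelJamps, int ighel, int ncolor, int nevt )
  {
    return ghelJamps + ighel * ncolor * mgOnGpu::nx2 * nevt; // ncolor complex jamps per event
  }
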
@@ -383,17 +507,19 @@ namespace mg5amcGpu
 
   void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds )
   {
     gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
-#ifndef MGONGPU_NSIGHT_DEBUG
-    constexpr unsigned int sharedMemSize = 0;
+#ifndef MGONGPU_HAS_NO_BLAS
+    fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr );
+    gpuBlasHandle_t* ghelBlasHandles = ( m_blasColorSum ? m_helBlasHandles : nullptr );
 #else
-    constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float );
+    fptype2* ghelAllBlasTmp = nullptr;
+    gpuBlasHandle_t* ghelBlasHandles = nullptr;
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr );
-    gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
+    sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads );
 #else
     assert( useChannelIds == false );
-    gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
+    sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads );
 #endif
 #ifdef MGONGPU_CHANNELID_DEBUG
     //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl;
@@ -401,8 +527,8 @@ namespace mg5amcGpu
     const unsigned int* pHstChannelIds = ( useChannelIds ? m_hstChannelIds.data() : nullptr );
     MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() );
 #endif
-    checkGpu( gpuPeekAtLastError() );
-    checkGpu( gpuDeviceSynchronize() );
+    checkGpu( gpuPeekAtLastError() ); // is this needed?
+    checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places...
   }
 
   //--------------------------------------------------------------------------
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h
index 7acff4b308..c901874333 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h
@@ -1,16 +1,19 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin.
 
 #ifndef MATRIXELEMENTKERNELS_H
 #define MATRIXELEMENTKERNELS_H 1
 
 #include "mgOnGpuConfig.h"
 
+#include "CPPProcess.h"
+#include "GpuAbstraction.h"
 #include "MemoryBuffers.h"
 
 #include <map>
+#include <memory>
 
 #ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
@@ -134,7 +137,7 @@ namespace mg5amcCpu
 
     // Does this host system support the SIMD used in the matrix element calculation?
     // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!]
-    static bool hostSupportsSIMD( const bool verbose = true );
+    static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false
 
   private:
 
@@ -191,12 +194,24 @@ namespace mg5amcCpu
     // The buffer for the event-by-event couplings that depends on alphas QCD
     DeviceBufferCouplings m_couplings;
 
+    // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime)
+    std::unique_ptr<DeviceBufferSimple> m_pHelMEs;
+
+    // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime)
+    std::unique_ptr<DeviceBufferSimple> m_pHelJamps;
+
+    // The super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime)
+    std::unique_ptr<DeviceBufferSimple> m_pHelWfs;
+
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // The buffer for the event-by-event numerators of multichannel factors
-    DeviceBufferNumerators m_numerators;
+    // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime)
+    std::unique_ptr<DeviceBufferSimple> m_pHelNumerators;
 
-    // The buffer for the event-by-event denominators of multichannel factors
-    DeviceBufferDenominators m_denominators;
+    // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime)
+    std::unique_ptr<DeviceBufferSimple> m_pHelDenominators;
+
+    // The super-buffer of ncolor jamp2 buffers
+    DeviceBufferSimple m_colJamp2s;
 #endif
 
 #ifdef MGONGPU_CHANNELID_DEBUG
@@ -205,6 +220,23 @@ namespace mg5amcCpu
     PinnedHostBufferChannelIds m_hstChannelIds;
 #endif
 
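
The unique_ptr members above exist because nGoodHel is only known after helicity filtering, so the device buffers cannot be sized in the constructor. A minimal sketch of the deferred-allocation pattern (the class name is invented for illustration):

  // Sketch: a buffer that stays empty until the runtime value nGoodHel is known.
  class DeferredBufferSketch
  {
    std::unique_ptr<DeviceBufferSimple> m_pBuf; // empty until sized
  public:
    void sizeForGoodHelicities( int nGoodHel, int nevt )
    {
      m_pBuf.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); // replaces any earlier buffer
    }
  };
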
+#ifndef MGONGPU_HAS_NO_BLAS
+    // Decide at runtime whether to use BLAS for color sums
+    bool m_blasColorSum;
+
+    // Decide at runtime whether TF32TENSOR math should be used in cuBLAS
+    bool m_blasTf32Tensor;
+
+    // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers
+    std::unique_ptr<DeviceBufferSimple2> m_pHelBlasTmp;
+
+    // The array of cuBLAS/hipBLAS handles (one for each good helicity)
+    gpuBlasHandle_t m_helBlasHandles[CPPProcess::ncomb]; // reserve ncomb handles (but only nGoodHel <= ncomb will be used)
+#endif
+
+    // The array of GPU streams (one for each good helicity)
+    gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used)
+
     // The number of blocks in the GPU grid
     size_t m_gpublocks;
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessAmplitudes.h
index 0d92f69c43..a49f041e05 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessAmplitudes.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessAmplitudes.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin.
 
 #ifndef MemoryAccessAmplitudes_H
 #define MemoryAccessAmplitudes_H 1
@@ -10,10 +10,6 @@
 
 #include "mgOnGpuCxtypes.h"
 
-#include "MemoryAccessHelpers.h"
-
-#define MGONGPU_TRIVIAL_AMPLITUDES 1
-
 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725)
 #ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
@@ -23,120 +19,11 @@ namespace mg5amcCpu
 {
   //----------------------------------------------------------------------------
 
-#ifndef MGONGPU_TRIVIAL_AMPLITUDES
-
-  // A class describing the internal layout of memory buffers for amplitudes
-  // This implementation uses an AOSOA[npagA][nx2][neppA] where nevt=npagA*neppA
-  // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name]
-  class MemoryAccessAmplitudesBase //_AOSOAv1
-  {
-  public:
-
-    // Number of Events Per Page in the amplitude AOSOA memory buffer layout
-    static constexpr int neppA = 1; // AOS (just a test...)
-
-  private:
-
-    friend class MemoryAccessHelper<MemoryAccessAmplitudesBase>;
-    friend class KernelAccessHelper<MemoryAccessAmplitudesBase, true>;
-    friend class KernelAccessHelper<MemoryAccessAmplitudesBase, false>;
-
-    // The number of floating point components of a complex number
-    static constexpr int nx2 = mgOnGpu::nx2;
-
-    //--------------------------------------------------------------------------
-    // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )"
-    // (in other words: first locate the event record for a given event, then locate an element in that record)
-    //--------------------------------------------------------------------------
-
-    // Locate an event record (output) in a memory buffer (input) from the given event number (input)
-    // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===]
-    static __host__ __device__ inline fptype*
-    ieventAccessRecord( fptype* buffer,
-                        const int ievt )
-    {
-      const int ipagA = ievt / neppA; // #event "A-page"
-      const int ieppA = ievt % neppA; // #event in the current event A-page
-      constexpr int ix2 = 0;
-      return &( buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA] ); // AOSOA[ipagA][ix2][ieppA]
-    }
-
-    //--------------------------------------------------------------------------
-
-    // Locate a field (output) of an event record (input) from the given field indexes (input)
-    // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===]
-    // [NB: expand variadic template "Ts... args" to "const int ix2" and rename "Field" as "Ix2"]
-    static __host__ __device__ inline fptype&
-    decodeRecord( fptype* buffer,
-                  const int ix2 )
-    {
-      constexpr int ipagA = 0;
-      constexpr int ieppA = 0;
-      return buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA]; // AOSOA[ipagA][ix2][ieppA]
-    }
-  };
-
-  //----------------------------------------------------------------------------
-
-  // A class providing access to memory buffers for a given event, based on explicit event numbers
-  // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations
-  class MemoryAccessAmplitudes : public MemoryAccessAmplitudesBase
-  {
-  public:
-
-    // Locate an event record (output) in a memory buffer (input) from the given event number (input)
-    // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===]
-    static constexpr auto ieventAccessRecord = MemoryAccessHelper<MemoryAccessAmplitudesBase>::ieventAccessRecord;
-
-    // Locate an event record (output) in a memory buffer (input) from the given event number (input)
-    // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===]
-    static constexpr auto ieventAccessRecordConst = MemoryAccessHelper<MemoryAccessAmplitudesBase>::ieventAccessRecordConst;
-
-    // Locate a field (output) of an event record (input) from the given field indexes (input)
-    // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===]
-    static constexpr auto decodeRecordIx2 = MemoryAccessHelper<MemoryAccessAmplitudesBase>::decodeRecord;
-
-    // Locate a field (output) of an event record (input) from the given field indexes (input)
-    // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===]
-    static constexpr auto decodeRecordIx2Const =
-      MemoryAccessHelper<MemoryAccessAmplitudesBase>::template decodeRecordConst<int>;
-
-    // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input)
-    // [Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt, const int ix2 ) <===]
-    static constexpr auto ieventAccessIx2 =
-      MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessField<int>;
-
-    // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input)
-    // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===]
-    static constexpr auto ieventAccessIx2Const =
-      MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessFieldConst<int>;
-  };
-
-#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES
-
-  //----------------------------------------------------------------------------
-
-  // A class providing access to memory buffers for a given event, based on implicit kernel rules
-  // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations
+  // A class providing trivial access to amplitude memory buffers
   template<bool onDevice>
   class KernelAccessAmplitudes
   {
   public:
-
-#ifndef MGONGPU_TRIVIAL_AMPLITUDES
-
-    // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
-    // [Signature (non-const) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===]
-    static constexpr auto kernelAccessIx2 =
-      KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessField<int>;
-
-    // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
-    // [Signature (const) ===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===]
-    static constexpr auto kernelAccessIx2Const =
-      KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessFieldConst<int>;
-
-#else
-
     static __host__ __device__ inline cxtype_sv*
     kernelAccess( fptype* buffer )
     {
@@ -148,8 +35,6 @@ namespace mg5amcCpu
     {
       return reinterpret_cast<const cxtype_sv*>( buffer );
     }
-
-#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES
   };
 
   //----------------------------------------------------------------------------
 
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessCouplings.h
index 55504a2b90..caee99a7fd 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessCouplings.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessCouplings.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
 
 #ifndef MemoryAccessCouplings_H
 #define MemoryAccessCouplings_H 1
@@ -235,7 +235,7 @@ namespace mg5amcCpu
       /*
       fptype_sv& real = kernelAccessIx2( buffer, 0 );
       fptype_sv& imag = kernelAccessIx2( buffer, 1 );
-      printf( "C_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag );
+      printf( "CD_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag );
       return cxtype_sv_ref( real, imag );
       */
       return cxtype_sv_ref( kernelAccessIx2( buffer, 0 ),
@@ -250,7 +250,7 @@ namespace mg5amcCpu
       /*
       const fptype_sv& real = kernelAccessIx2Const( buffer, 0 );
       const fptype_sv& imag = kernelAccessIx2Const( buffer, 1 );
-      printf( "C_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag );
+      printf( "CD_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag );
       return cxtype_sv( real, imag );
       */
       return cxtype_sv( kernelAccessIx2Const( buffer, 0 ),
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessWavefunctions.h
index 9f4c620bc7..bbffc1fb36 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessWavefunctions.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessWavefunctions.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin.
 
 #ifndef MemoryAccessWavefunctions_H
 #define MemoryAccessWavefunctions_H 1
@@ -10,9 +10,7 @@
 
 #include "mgOnGpuCxtypes.h"
 
-#include "MemoryAccessHelpers.h"
-
-#define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1
+#include "CPPProcess.h"
 
 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725)
 #ifdef MGONGPUCPP_GPUIMPL
@@ -23,147 +21,44 @@ namespace mg5amcCpu
 {
   //----------------------------------------------------------------------------
 
-#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS
-
-  // A class describing the internal layout of memory buffers for wavefunctions
-  // This implementation uses an AOSOA[npagW][nw6][nx2][neppW] where nevt=npagW*neppW
-  // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name]
-  class MemoryAccessWavefunctionsBase //_AOSOAv1
+#ifdef MGONGPUCPP_GPUIMPL
+  class DeviceAccessWavefunctions
   {
   public:
-
-    // Number of Events Per Page in the wavefunction AOSOA memory buffer layout
-    static constexpr int neppW = 1; // AOS (just a test...)
-
-  private:
-
-    friend class MemoryAccessHelper<MemoryAccessWavefunctionsBase>;
-    friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, true>;
-    friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, false>;
-
-    // The number of components of a (fermion or vector) wavefunction
-    static constexpr int nw6 = mgOnGpu::nw6;
-
-    // The number of floating point components of a complex number
-    static constexpr int nx2 = mgOnGpu::nx2;
-
-    //--------------------------------------------------------------------------
-    // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )"
-    // (in other words: first locate the event record for a given event, then locate an element in that record)
-    //--------------------------------------------------------------------------
-
-    // Locate an event record (output) in a memory buffer (input) from the given event number (input)
-    // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===]
-    static __host__ __device__ inline fptype*
-    ieventAccessRecord( fptype* buffer,
-                        const int ievt )
+    static __host__ __device__ inline cxtype_sv*
+    kernelAccess( fptype* buffer )
     {
-      const int ipagW = ievt / neppW; // #event "W-page"
-      const int ieppW = ievt % neppW; // #event in the current event W-page
-      constexpr int iw6 = 0;
-      constexpr int ix2 = 0;
-      return &( buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW] ); // AOSOA[ipagW][iw6][ix2][ieppW]
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      return reinterpret_cast<cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 );
     }
-
-    //--------------------------------------------------------------------------
-
-    // Locate a field (output) of an event record (input) from the given field indexes (input)
-    // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===]
-    // [NB: expand variadic template "Ts... args" to "const int iw6, const int ix2" and rename "Field" as "Iw6Ix2"]
-    static __host__ __device__ inline fptype&
-    decodeRecord( fptype* buffer,
-                  const int iw6,
-                  const int ix2 )
+    static __host__ __device__ inline const cxtype_sv*
+    kernelAccessConst( const fptype* buffer )
     {
-      constexpr int ipagW = 0;
-      constexpr int ieppW = 0;
-      return buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW]; // AOSOA[ipagW][iw6][ix2][ieppW]
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      return reinterpret_cast<const cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 );
     }
   };
+#endif
 
   //----------------------------------------------------------------------------
 
-  // A class providing access to memory buffers for a given event, based on explicit event numbers
-  // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations
-  class MemoryAccessWavefunctions : public MemoryAccessWavefunctionsBase
-  {
-  public:
-
-    // Locate an event record (output) in a memory buffer (input) from the given event number (input)
-    // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===]
-    static constexpr auto ieventAccessRecord = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecord;
-
-    // Locate an event record (output) in a memory buffer (input) from the given event number (input)
-    // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===]
-    static constexpr auto ieventAccessRecordConst = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecordConst;
-
-    // Locate a field (output) of an event record (input) from the given field indexes (input)
-    // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int iw6, const int ix2 ) <===]
-    static constexpr auto decodeRecordIw6Ix2 = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::decodeRecord;
-
-    // Locate a field (output) of an event record (input) from the given field indexes (input)
-    // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int iw6, const int ix2 ) <===]
-    static constexpr auto decodeRecordIw6Ix2Const =
-      MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template decodeRecordConst<int, int>;
-
-    // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input)
-    // [Signature (non-const) ===> fptype& ieventAccessIw6Ix2( fptype* buffer, const ievt, const int iw6, const int ix2 ) <===]
-    static constexpr auto ieventAccessIw6Ix2 =
-      MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessField<int, int>;
-
-    // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input)
-    // [Signature (const) ===> const fptype& ieventAccessIw6Ix2Const( const fptype* buffer, const ievt, const int iw6, const int ix2 ) <===]
-    static constexpr auto ieventAccessIw6Ix2Const =
-      MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessFieldConst<int, int>;
-  };
-
-#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS
-
-  //----------------------------------------------------------------------------
-
-  // A class providing access to memory buffers for a given event, based on implicit kernel rules
-  // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations
-  template<bool onDevice>
-  class KernelAccessWavefunctions
+  class HostAccessWavefunctions
  {
   public:
-
-#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS
-
-    // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
-    // [Signature (non-const) ===> fptype& kernelAccessIw6Ix2( fptype* buffer, const int iw6, const int ix2 ) <===]
-    static constexpr auto kernelAccessIw6Ix2 =
-      KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessField<int, int>;
-
-    // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
-    // [Signature (const) ===> const fptype& kernelAccessIw6Ix2Const( const fptype* buffer, const int iw6, const int ix2 ) <===]
-    static constexpr auto kernelAccessIw6Ix2Const =
-      KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessFieldConst<int, int>;
-
-#else
-
     static __host__ __device__ inline cxtype_sv*
     kernelAccess( fptype* buffer )
     {
       return reinterpret_cast<cxtype_sv*>( buffer );
     }
-
     static __host__ __device__ inline const cxtype_sv*
     kernelAccessConst( const fptype* buffer )
     {
       return reinterpret_cast<const cxtype_sv*>( buffer );
     }
-
-#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS
   };
 
   //----------------------------------------------------------------------------
 
-  typedef KernelAccessWavefunctions<false> HostAccessWavefunctions;
-  typedef KernelAccessWavefunctions<true> DeviceAccessWavefunctions;
-
-  //----------------------------------------------------------------------------
-
 } // end namespace mg5amcGpu/mg5amcCpu
 
 #endif // MemoryAccessWavefunctions_H
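
The new DeviceAccessWavefunctions above replaces the AOSOA machinery with a plain per-event offset into a global buffer: each event owns nw6 * nx2 fptypes per wavefunction, laid out event after event. A worked sketch of the arithmetic (values invented for illustration): with nw6 = 6 components and nx2 = 2 (real+imag), event ievt = 10 starts at fptype index 10 * 6 * 2 = 120, i.e. complex index 60.

  // Sketch: the per-thread wavefunction slot used by DeviceAccessWavefunctions.
  __device__ cxtype_sv* eventWfSlot( fptype* buffer )
  {
    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one event per thread
    return reinterpret_cast<cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 );
  }
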
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryBuffers.h
index 65a101888d..c8db607db6 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryBuffers.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryBuffers.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
 
 #ifndef MemoryBuffers_H
 #define MemoryBuffers_H 1
@@ -34,6 +34,7 @@ namespace mg5amcCpu
     static constexpr size_t nparf = CPPProcess::nparf;
     static constexpr size_t npar = CPPProcess::npar;
     static constexpr size_t ndcoup = Parameters_sm_dependentCouplings::ndcoup;
+    static constexpr size_t ncolor = CPPProcess::ncolor;
   }
 
   //--------------------------------------------------------------------------
@@ -69,8 +70,8 @@ namespace mg5amcCpu
   protected:
     BufferBase( const size_t size, const bool onDevice )
       : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {}
-    virtual ~BufferBase() {}
   public:
+    virtual ~BufferBase() {}
     T* data() { return m_data; }
     const T* data() const { return m_data; }
     T& operator[]( const size_t index ) { return m_data[index]; }
@@ -167,8 +168,14 @@ namespace mg5amcCpu
   public:
     HostBuffer( const size_t nevt )
       : NumberOfEvents( nevt )
-      , HostBufferBase<T>( sizePerEvent * nevt ) {}
-    virtual ~HostBuffer() {}
+      , HostBufferBase<T>( sizePerEvent * nevt )
+    {
+      //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl;
+    }
+    virtual ~HostBuffer()
+    {
+      //std::cout << "HostBuffer::dtor " << this << std::endl;
+    }
     virtual size_t nevt() const override final { return NumberOfEvents::nevt(); }
   };
 #endif
@@ -194,19 +201,33 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating a CUDA device buffer for a given number of events
   template<typename T, size_t sizePerEvent>
-  class DeviceBuffer : public DeviceBufferBase<T>, virtual private NumberOfEvents
+  class DeviceBuffer : public DeviceBufferBase<T>, virtual protected NumberOfEvents
   {
   public:
     DeviceBuffer( const size_t nevt )
       : NumberOfEvents( nevt )
-      , DeviceBufferBase<T>( sizePerEvent * nevt ) {}
-    virtual ~DeviceBuffer() {}
+      , DeviceBufferBase<T>( sizePerEvent * nevt )
+    {
+      //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl;
+    }
+    virtual ~DeviceBuffer()
+    {
+      //std::cout << "DeviceBuffer::dtor " << this << std::endl;
+    }
     virtual size_t nevt() const override final { return NumberOfEvents::nevt(); }
   };
 #endif
 
   //--------------------------------------------------------------------------
 
+#ifdef MGONGPUCPP_GPUIMPL
+  // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis
+  typedef DeviceBuffer<fptype, 1> DeviceBufferSimple;
+  typedef DeviceBuffer<fptype2, 1> DeviceBufferSimple2;
+#endif
+
+  //--------------------------------------------------------------------------
+
   // A base class encapsulating a memory buffer for momenta random numbers
   typedef BufferBase<fptype> BufferRndNumMomenta;
@@ -277,12 +298,12 @@ namespace mg5amcCpu
   constexpr size_t sizePerEventNumerators = 1;
 #ifndef MGONGPUCPP_GPUIMPL
-  // A class encapsulating a C++ host buffer for gs
+  // A class encapsulating a C++ host buffer for numerators
   typedef HostBuffer<fptype, sizePerEventNumerators> HostBufferNumerators;
 #else
-  // A class encapsulating a CUDA pinned host buffer for gs
+  // A class encapsulating a CUDA pinned host buffer for numerators
   typedef PinnedHostBuffer<fptype, sizePerEventNumerators> PinnedHostBufferNumerators;
-  // A class encapsulating a CUDA device buffer for gs
+  // A class encapsulating a CUDA device buffer for numerators
   typedef DeviceBuffer<fptype, sizePerEventNumerators> DeviceBufferNumerators;
 #endif
 #endif
@@ -297,12 +318,12 @@ namespace mg5amcCpu
   constexpr size_t sizePerEventDenominators = 1;
 #ifndef MGONGPUCPP_GPUIMPL
-  // A class encapsulating a C++ host buffer for gs
+  // A class encapsulating a C++ host buffer for denominators
   typedef HostBuffer<fptype, sizePerEventDenominators> HostBufferDenominators;
 #else
-  // A class encapsulating a CUDA pinned host buffer for gs
+  // A class encapsulating a CUDA pinned host buffer for denominators
   typedef PinnedHostBuffer<fptype, sizePerEventDenominators> PinnedHostBufferDenominators;
-  // A class encapsulating a CUDA device buffer for gs
+  // A class encapsulating a CUDA device buffer for denominators
   typedef DeviceBuffer<fptype, sizePerEventDenominators> DeviceBufferDenominators;
 #endif
 #endif
@@ -316,12 +337,12 @@ namespace mg5amcCpu
   constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2;
 #ifndef MGONGPUCPP_GPUIMPL
-  // A class encapsulating a C++ host buffer for gs
+  // A class encapsulating a C++ host buffer for couplings
   typedef HostBuffer<fptype, sizePerEventCouplings> HostBufferCouplings;
 #else
-  // A class encapsulating a CUDA pinned host buffer for gs
+  // A class encapsulating a CUDA pinned host buffer for couplings
   typedef PinnedHostBuffer<fptype, sizePerEventCouplings> PinnedHostBufferCouplings;
-  // A class encapsulating a CUDA device buffer for gs
+  // A class encapsulating a CUDA device buffer for couplings
   typedef DeviceBuffer<fptype, sizePerEventCouplings> DeviceBufferCouplings;
 #endif
@@ -505,6 +526,16 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
+#ifdef MGONGPUCPP_GPUIMPL
+  // The size (number of elements) per event in a memory buffer for jamps
+  constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2;
+
+  // A class encapsulating a CUDA device buffer for jamps
+  typedef DeviceBuffer<fptype, sizePerEventJamps> DeviceBufferJamps;
+#endif
+
+  //--------------------------------------------------------------------------
+
 #ifdef MGONGPUCPP_GPUIMPL
   template<class Tdst, class Tsrc>
   void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy
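
A usage sketch for the ad-hoc DeviceBufferSimple typedef added above (the DeviceBuffer<fptype, 1> template arguments were lost in extraction and are reconstructed; the example itself is illustrative only):

  // Sketch: mirror a pinned host buffer into an ad-hoc device buffer.
  // DeviceBufferSimple is sized directly in elements (sizePerEvent = 1 assumed).
  void copySketch( const size_t nevt )
  {
    PinnedHostBufferNumerators hstNum( nevt ); // 1 fptype per event
    DeviceBufferSimple devNum( nevt );         // ad-hoc device buffer of nevt fptypes
    for( size_t ievt = 0; ievt < nevt; ievt++ ) hstNum[ievt] = 0;
    copyDeviceFromHost( devNum, hstNum );      // same argument order as memcpy( dst, src )
  }
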
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc
index a17c5f1eef..4dfd5786fe 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc
@@ -1,13 +1,13 @@
 // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors.
 // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend.
 //==========================================================================
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
@@ -16,6 +16,7 @@
 
 #include "mgOnGpuConfig.h"
 
+#include "GpuRuntime.h"
 #include "HelAmps_sm.h"
 #include "MemoryAccessAmplitudes.h"
 #include "MemoryAccessChannelIds.h"
@@ -25,6 +26,7 @@
 #include "MemoryAccessMatrixElements.h"
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
+#include "color_sum.h"
 
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -96,20 +98,16 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
-  constexpr int nw6 = CPPProcess::nw6;     // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors)
-  constexpr int npar = CPPProcess::npar;   // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu-
-  constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar)
-
-  // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)]
-  //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z)
+  constexpr int nw6 = CPPProcess::nw6;       // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors)
+  constexpr int npar = CPPProcess::npar;     // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu-
+  constexpr int ncomb = CPPProcess::ncomb;   // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar)
+  constexpr int nwf = CPPProcess::nwf;       // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z)
+  constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors
 
   using Parameters_sm_dependentCouplings::ndcoup;   // #couplings that vary event by event (depend on running alphas QCD)
   using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD)
 
-  // The number of colors
-  constexpr int ncolor = 2;
-
-  // The number of SIMD vectors of events processed by calculate_wavefunction
+  // The number of SIMD vectors of events processed by calculate_jamps
 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
   constexpr int nParity = 2;
 #else
@@ -168,57 +166,112 @@ namespace mg5amcCpu
 
   // Helicity combinations (and filtering of "good" helicity combinations)
 #ifdef MGONGPUCPP_GPUIMPL
   __device__ __constant__ short cHel[ncomb][npar];
-  __device__ __constant__ int cNGoodHel;
-  __device__ __constant__ int cGoodHel[ncomb];
+  __device__ __constant__ int dcNGoodHel;
+  __device__ __constant__ int dcGoodHel[ncomb];
 #else
   static short cHel[ncomb][npar];
+#endif
   static int cNGoodHel;
   static int cGoodHel[ncomb];
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  class DeviceAccessJamp2
+  {
+  public:
+    static __device__ inline fptype&
+    kernelAccessIcol( fptype* buffer, const int icol )
+    {
+      const int nevt = gridDim.x * blockDim.x;
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      return buffer[icol * nevt + ievt];
+    }
+    static __device__ inline const fptype&
+    kernelAccessIcolConst( const fptype* buffer, const int icol )
+    {
+      const int nevt = gridDim.x * blockDim.x;
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      return buffer[icol * nevt + ievt];
+    }
+  };
 #endif
 
   //--------------------------------------------------------------------------
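
DeviceAccessJamp2 above adopts a structure-of-arrays layout, buffer[icol * nevt + ievt], so that the threads of a warp (consecutive ievt, fixed icol) touch consecutive memory. A hedged accumulation sketch (only the accessor comes from the patch; the kernel body and jamp layout are assumptions for illustration):

  // Sketch: accumulate |jamp|^2 per color into the jamp2 SoA buffer.
  __global__ void addJamp2Sketch( const fptype* allJamps, fptype* allJamp2, int ncolor )
  {
    const int nevt = gridDim.x * blockDim.x;
    const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
    for( int icol = 0; icol < ncolor; icol++ )
    {
      const fptype re = allJamps[( icol * 2 + 0 ) * nevt + ievt]; // assumed jamp layout [ncolor][2][nevt]
      const fptype im = allJamps[( icol * 2 + 1 ) * nevt + ievt];
      DeviceAccessJamp2::kernelAccessIcol( allJamp2, icol ) += re * re + im * im;
    }
  }
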
 
-  // Evaluate |M|^2 for each subprocess
-  // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s)
-  // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities)
-  // In CUDA, this device function computes the ME for a single event
-  // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2)
-  // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 ***
-  __device__ INLINE void /* clang-format off */
-  calculate_wavefunctions( int ihel,
-                           const fptype* allmomenta,      // input: momenta[nevt*npar*4]
-                           const fptype* allcouplings,    // input: couplings[nevt*ndcoup*2]
-                           fptype* allMEs,                // output: allMEs[nevt], |M|^2 running_sum_over_helicities
+#ifdef MGONGPUCPP_GPUIMPL
+  __device__ INLINE unsigned int
+  gpu_channelId( const unsigned int* allChannelIds )
+  {
+    unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                           const unsigned int channelId,  // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector
-                           fptype* allNumerators,         // output: multichannel numerators[nevt], running_sum_over_helicities
-                           fptype* allDenominators,       // output: multichannel denominators[nevt], running_sum_over_helicities
+    using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the current event (CUDA)
+    if( allChannelIds != nullptr )
+    {
+      const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds)
+      const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+      // NB: channelIds_sv is a scalar in CUDA
+      channelId = channelIds_sv;
+      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+    }
 #endif
-                           fptype_sv* jamp2_sv            // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled)
-#ifndef MGONGPUCPP_GPUIMPL
-                           , const int ievt00             // input: first event number in current C++ event page (for CUDA, ievt depends on threadid)
+    return channelId;
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+#include "diagrams.h"
+
+  //--------------------------------------------------------------------------
+
+  // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams
+  // Also compute running sums over helicities adding jamp2, numerator, denominator
+  // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel)
+  // In CUDA, this function processes a single event
+  // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function)
+  // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input)
+  // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2)
+  // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+  INLINE void
+  calculate_jamps( int ihel,
+                   const fptype* allmomenta,          // input: momenta[nevt*npar*4]
+                   const fptype* allcouplings,        // input: couplings[nevt*ndcoup*2]
+                   fptype* allJamps,                  // output: jamp[ncolor*2*nevt] for this helicity
+                   fptype* allWfs,                    // output: wf[nwf*nw6*2*nevt]
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+                   const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911)
+                   fptype* allNumerators,             // input/output: multichannel numerators[nevt], add helicity ihel
+                   fptype* allDenominators,           // input/output: multichannel denominators[nevt], add helicity ihel
+#endif
+                   gpuStream_t gpustream,             // input: cuda stream for this helicity
+                   const int gpublocks,               // input: cuda gpublocks
+                   const int gputhreads )             // input: cuda gputhreads
+#else
+  INLINE void
+  calculate_jamps( int ihel,
+                   const fptype* allmomenta,        // input: momenta[nevt*npar*4]
+                   const fptype* allcouplings,      // input: couplings[nevt*ndcoup*2]
+                   cxtype_sv* jamp_sv,              // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+                   const unsigned int channelId,    // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00
+                   fptype* allNumerators,           // input/output: multichannel numerators[nevt], add helicity ihel
+                   fptype* allDenominators,         // input/output: multichannel denominators[nevt], add helicity ihel
+#endif
+                   const int ievt00 )               // input: first event number in current C++ event page (for CUDA, ievt depends on threadid)
 #endif
-                           ) //ALWAYS_INLINE // attributes are not permitted in a function definition
   {
 #ifdef MGONGPUCPP_GPUIMPL
-
using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -226,297 +279,139 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 5; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g. 
<<warning #186-D: pointless comparison of unsigned integer with zero>> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - // *** DIAGRAM 1 OF 
3 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[4] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 2 OF 3 *** - - // Wavefunction(s) for diagram number 2 - FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[4] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[3], w_fp[4], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[0] -= amp_sv[0]; - // *** DIAGRAM 3 OF 3 *** - - // Wavefunction(s) for diagram number 3 - FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[4] ); - - // Amplitude(s) for diagram number 3 - FFV1_0( w_fp[4], w_fp[2], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_gg_ttx()?) 
- - // The color denominators (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3 }; // 1-D array[2] - - // The color matrix (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 16, -2 }, - { -2, 16 } }; // 2-D array[2][2] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... 
icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX (including channelIDs, numerators and denominators) is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX (including channelIDs, numerators and denominators) is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - 
// NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 3 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif -#endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** return; } @@ -555,7 +450,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -588,6 +487,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -628,6 +531,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -730,26 +637,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const 
int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -757,25 +664,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) + __global__ void + add_and_select_hel( int* allselhel, const fptype* allrndhel, fptype* ghelAllMEs, fptype* allMEs, const int nevt ) // (signature reconstructed from its gpuLaunchKernel call site in sigmaKin) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! 
+ atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection 
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -913,20 +1024,14 @@ namespace mg5amcCpu // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -938,17 +1043,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) 
); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -974,93 +1082,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators and denominators for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // (1b) Then, in multichannel mode, also compute the running sums over helicities of the squared jamp2s within each helicity stream for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } - } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 
(fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) 
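+      // (Note: an interpretation of the buffer sizes above rather than documented behaviour. In mixed precision the
+      // fptype2 tmp buffer appears to hold two ncolor*nx2 blocks per event - e.g. the jamps cast to fptype2 and the
+      // matrix*jamp product - plus one fptype2 slot per event for the ME accumulator, hence the '+ 1'; in the
+      // non-mixed case a single ncolor*nx2 block per event suffices.)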
+#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1102,7 +1180,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1125,7 +1203,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1134,25 +1212,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1162,8 +1246,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1179,11 +1265,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1285,14 +1372,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h index 2d89e0e244..69c201bb45 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team 
and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include <vector> @@ -75,17 +76,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 16; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 3; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 2; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 5; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 4; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 3; //static const int ncomb = 16; // CPPProcess::ncomb @@ -122,23 +123,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array (GPU implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -152,34 +156,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators 
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig.f index 19278bca59..2db2eb3c0c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig1.f index 42cc7c9d61..25e5bf68ee 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -137,14 +137,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -219,7 +219,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -290,6 +290,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -373,12 +377,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -484,6 +488,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/color_sum.cc new file mode 100644 index 0000000000..b96d73fb5c --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/color_sum.cc @@ -0,0 +1,383 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
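+// A usage sketch (illustrative only; these are the call sites added in CPPProcess.cc in this same patch):
+//   createNormalizedColorMatrix();            // GPU builds: copy the normalized color matrix to device memory (called in the CPPProcess ctor)
+//   color_sum_cpu( allMEs, jamp_sv, ievt00 ); // C++: add the color-summed |M|^2 for one helicity to the running sum over helicities
+//   color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); // CUDA/HIP, optionally via cuBLAS/hipBLAS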
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3 }; // 1-D array[2] + + // The color matrix (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 16, -2 }, + { -2, 16 } }; // 2-D array[2][2] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 
s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = 
allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e.
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/diagram_boilerplate.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/diagrams.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/diagrams.h new file mode 100644 index 0000000000..962978409f --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/diagrams.h @@ -0,0 +1,109 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
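For reference, the two BLAS steps in color_sum_blas above amount, per event and separately for the real and imaginary jamp components, to Z(:,ievt) = C_norm * J(:,ievt) (one gemm over all events) followed by ME[ievt] += J(:,ievt) . Z(:,ievt) (one strided-batched gemm of 1x1 results, with beta=1 so the sum over helicities accumulates). A plain C++ sketch of the equivalent computation for one component, assuming the "new1" striding allJ[icol * nevt + ievt] and a hypothetical flattened colMat:

void colorSumReference( double* allMEs,       // output: allMEs[nevt], accumulated (beta=1)
                        const double* allJ,   // input: one jamp component, allJ[icol * nevt + ievt]
                        const double* colMat, // input: normalized color matrix, colMat[i * ncolor + j]
                        const int ncolor,
                        const int nevt )
{
  for( int ievt = 0; ievt < nevt; ievt++ )
  {
    double me = 0;
    for( int i = 0; i < ncolor; i++ )
    {
      double z = 0; // step 1: Z[i][ievt] = sum_j colMat[i][j] * J[j][ievt]
      for( int j = 0; j < ncolor; j++ )
        z += colMat[i * ncolor + j] * allJ[j * nevt + ievt];
      me += allJ[i * nevt + ievt] * z; // step 2: batched dot product J(:,ievt) . Z(:,ievt)
    }
    allMEs[ievt] += me; // beta=1: add this helicity's contribution to the running sum
  }
}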
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 3 *** + // Wavefunction(s) for diagram number 1 + vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); + vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[4] ); + // Amplitude(s) for diagram number 1 + FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 3 *** + // Wavefunction(s) for diagram number 2 + FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[4] ); + // Amplitude(s) for diagram number 2 + FFV1_0( w_fp[3], w_fp[4], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef
MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 3 *** + // Wavefunction(s) for diagram number 3 + FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[4] ); + // Amplitude(s) for diagram number 3 + FFV1_0( w_fp[4], w_fp[2], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/driver.f index ec5722702a..c08048ad0e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/matrix1.f index ca1785b808..7ccccfd4a5 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -307,7 +307,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -350,7 +350,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(0) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -393,21 +394,24 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 2) /5.333333333333333D+00, - $ -6.666666666666666D-01/ + DATA DENOM/3/ + DATA (CF(I),I= 1, 2) /16,-4/ C 1 T(1,2,3,4) - DATA (CF(I, 2),I= 1, 2) /-6.666666666666666D-01 - $ ,5.333333333333333D+00/ + DATA (CF(I),I= 3, 3) /16/ C 1 T(2,1,3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -446,10 +450,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -458,6 +464,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc index 0979455d7a..38c869f74d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,20 +101,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 2; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,57 +169,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a 
SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - 
using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -229,271 +282,135 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 5; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) + + // ----------------- + // --- COUPLINGS --- + // ----------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; +#else + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g. 
<> -#endif + const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events -#ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - // CUDA kernels take input/output buffers with momenta/MEs for all events + + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); - const fptype* COUPs[nxcoup]; - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. 
zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 1 *** - - // Wavefunction(s) for diagram number 1 - ixxxxx( momenta, 0., cHel[ihel][0], +1, w_fp[0], 0 ); - - oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - FFV1P0_3( w_fp[0], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[4] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[1], 1.0, &_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 6. * amp_sv[0]; - jamp_sv[1] -= 1. / 2. * amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_uux_ttx()?) - - // The color denominators (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1 }; // 1-D array[2] - - // The color matrix (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 9, 3 }, - { 3, 9 } }; // 2-D array[2][2] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! 
skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined 
MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 1 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); #endif -#endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -532,7 +449,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -565,6 +486,10 @@ 
namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -605,6 +530,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -707,26 +636,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings, bsmIndepParam ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -734,25 +663,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( 
allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection
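  // NB on the "ghel*" super-buffer arguments above: each super-buffer concatenates one slice per
  // good helicity, and the slice for good helicity ighel starts at a fixed stride from the base
  // pointer (this is what "index is ighel" means in the parameter comments), e.g. as used later
  // in the body of sigmaKin:
  //   fptype* hAllMEs   = ghelAllMEs   + ighel * nevt;                          // MEs[nevt] slice
  //   fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2;  // jamp[ncolor*2*nevt] slice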
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -888,22 +1021,16 @@ namespace mg5amcCpu // These variables are not used anywhere else in the code and their scope is limited to this sanity check { // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) - constexpr int nprocesses = 2; + constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -915,17 +1042,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] =
0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -951,93 +1081,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // (1b) Then, In multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } - } -#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? 
ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1079,7 +1179,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1102,7 +1202,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1111,25 +1211,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1139,8 +1245,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1156,11 +1264,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1262,14 +1371,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h index d6fa3205c0..8c4c55deaa 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development 
team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include <vector> @@ -78,17 +79,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 16; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 1; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 2; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 5; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 4; // CPPProcess::npar (nexternal was nioparticles) //static const int nwavefuncs = 6; // (?!?!
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 1; //static const int ncomb = 16; // CPPProcess::ncomb @@ -125,23 +126,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -155,34 +159,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators 
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig.f index 6558c40922..c954e28a21 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig1.f index 86f844defe..f2f0bfb2b5 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,7 +140,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF D1=PDG2PDF(LPP(IB(1)),1, IB(1),XBK(IB(1)), QSCALE) U1=PDG2PDF(LPP(IB(1)),2, IB(1),XBK(IB(1)), QSCALE) @@ -150,7 +150,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) @@ -237,7 +237,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -313,6 +313,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -398,24 +402,24 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) D1(IVEC)=PDG2PDF(LPP(IB(1)),1, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) U1(IVEC)=PDG2PDF(LPP(IB(1)),2, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) S1(IVEC)=PDG2PDF(LPP(IB(1)),3, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) C1(IVEC)=PDG2PDF(LPP(IB(1)),4, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) UX2(IVEC)=PDG2PDF(LPP(IB(2)),-2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -539,6 +543,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. 
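C     NB: in the vectorized interface below, ICONFIG is refreshed from
C     CHANNELS(IVEC) before each UNWGT call, so the unweighted event is
C     stored with the single-diagram channel actually sampled for IVEC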
IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/color_sum.cc new file mode 100644 index 0000000000..bbee00495a --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/color_sum.cc @@ -0,0 +1,383 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1 }; // 1-D array[2] + + // The color matrix (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 9, 3 }, + { 3, 9 } }; // 2-D array[2][2] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol 
< ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: 
allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from
DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/diagram_boilerplate.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/diagrams.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/diagrams.h new file mode 100644 index 0000000000..f7b6636999 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/diagrams.h @@ -0,0 +1,51 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb-1) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 1 *** + // Wavefunction(s) for diagram number 1 + ixxxxx( momenta, 0., cHel[ihel][0], +1, w_fp[0], 0 ); + oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + FFV1P0_3( w_fp[0], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[4] ); + // Amplitude(s) for diagram number 1 + FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/driver.f index ec5722702a..c08048ad0e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/matrix1.f index ec88a303fa..59b33b94c6 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -310,7 +310,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -356,7 +356,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(0) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -399,21 +400,24 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 2) /9.000000000000000D+00 - $ ,3.000000000000000D+00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 2) /9,6/ C 1 T(2,1) T(3,4) - DATA (CF(I, 2),I= 1, 2) /3.000000000000000D+00 - $ ,9.000000000000000D+00/ + DATA (CF(I),I= 3, 3) /9/ C 1 T(2,4) T(3,1) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -444,10 +448,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -456,6 +462,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc index 5de1c626c8..1db10f1e09 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,20 +98,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 6; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,57 +166,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the 
current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also add this helicity's contribution to the running sums over helicities of jamp2, numerator and denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! In CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL -
using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -226,500 +279,165 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 12; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
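The comments above contrast the two buffer strategies: in C++ the wavefunctions stay in small stack-local SIMD arrays with trivial access, while in CUDA they now live in one global-memory buffer shared by the split diagram kernels, so every access needs an explicit event index. The standalone sketch below illustrates such an index map; the [iwf][iw6][re/im][ievt] ordering and the helper name wfIndex are illustrative assumptions, not the plugin's exact layout.

#include <cstddef>
#include <vector>

constexpr int nwf = 12; // #wavefunctions for this process (value quoted in the code above)
constexpr int nw6 = 6;  // components per wavefunction
constexpr int nx2 = 2;  // real and imaginary parts

// Index into a single global buffer wfs[nwf*nw6*2*nevt] shared by all diagram kernels
inline std::size_t wfIndex( int iwf, int iw6, int reim, int ievt, int nevt )
{
  return ( ( static_cast<std::size_t>( iwf ) * nw6 + iw6 ) * nx2 + reim ) * nevt + ievt;
}

int main()
{
  const int nevt = 1024;
  std::vector<double> wfs( static_cast<std::size_t>( nwf ) * nw6 * nx2 * nevt, 0. );
  wfs[wfIndex( 4, 0, 0, 17, nevt )] = 1.; // e.g. Re of component 0 of wavefunction 4, event 17
  return 0;
}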
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g.
<> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 16 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - vxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); - - VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[5] ); - FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[6] ); - - // Amplitude(s) for diagram number 1 - VVV1_0( w_fp[5], w_fp[6], w_fp[4], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2(
amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - - // *** DIAGRAM 2 OF 16 *** - - // Wavefunction(s) for diagram number 2 - FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[7] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[3], w_fp[7], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 3 OF 16 *** - - // Wavefunction(s) for diagram number 3 - FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); - - // Amplitude(s) for diagram number 3 - FFV1_0( w_fp[8], w_fp[2], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 4 OF 16 *** - - // Wavefunction(s) for diagram number 4 - FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] ); - FFV1_2( w_fp[3], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); - - // Amplitude(s) for diagram number 4 - FFV1_0( w_fp[9], w_fp[5], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= amp_sv[0]; - - // *** DIAGRAM 5 OF 16 *** - - // Wavefunction(s) for diagram number 5 - VVV1P0_1( w_fp[1], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 5 - FFV1_0( w_fp[3], w_fp[5], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 6 OF 16 *** - - // Wavefunction(s) for diagram number 6 - // (none) - - // Amplitude(s) for diagram number 6 - FFV1_0( w_fp[8], w_fp[5], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= amp_sv[0]; - - // *** DIAGRAM 7 OF 16 *** - - // Wavefunction(s) for diagram number 7 - FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] ); - FFV1_1( w_fp[2], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); - - // Amplitude(s) for diagram number 7 - FFV1_0( w_fp[5], w_fp[11], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] -= amp_sv[0]; - - // *** DIAGRAM 8 OF 16 *** - - // Wavefunction(s) for diagram number 8 - // (none) - - // Amplitude(s) for diagram number 8 - FFV1_0( w_fp[5], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0]; -
jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 9 OF 16 *** - - // Wavefunction(s) for diagram number 9 - // (none) - - // Amplitude(s) for diagram number 9 - FFV1_0( w_fp[5], w_fp[7], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[5] -= amp_sv[0]; - - // *** DIAGRAM 10 OF 16 *** - - // Wavefunction(s) for diagram number 10 - VVV1P0_1( w_fp[0], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[5] ); - - // Amplitude(s) for diagram number 10 - FFV1_0( w_fp[3], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 11 OF 16 *** - - // Wavefunction(s) for diagram number 11 - // (none) - - // Amplitude(s) for diagram number 11 - FFV1_0( w_fp[9], w_fp[2], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 12 OF 16 *** - - // Wavefunction(s) for diagram number 12 - // (none) - - // Amplitude(s) for diagram number 12 - VVV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - jamp_sv[1] += amp_sv[0]; - jamp_sv[2] -= amp_sv[0]; - jamp_sv[3] += amp_sv[0]; - jamp_sv[4] -= amp_sv[0]; - - // *** DIAGRAM 13 OF 16 *** - // Wavefunction(s) for diagram number 13 - // (none) - - // Amplitude(s) for diagram number 13 - FFV1_0( w_fp[8], w_fp[11], w_fp[0], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - jamp_sv[2] -= amp_sv[0]; - - // *** DIAGRAM 14 OF 16 *** - - // Wavefunction(s) for diagram number 14 - // (none) - - // Amplitude(s) for diagram number 14 - FFV1_0( w_fp[9], w_fp[7], w_fp[0], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[4] -= amp_sv[0]; - - // *** DIAGRAM 15 OF 16 *** - // Wavefunction(s) for diagram number 15 - // (none) - - // Amplitude(s) for diagram number 15 - VVV1_0( w_fp[0], w_fp[10], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - - // *** DIAGRAM 16 OF 16 *** - - // Wavefunction(s) for diagram number 16 - VVVV1P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[10] ); - VVVV3P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[6] ); - VVVV4P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[9] ); - - // Amplitude(s) for diagram number 16 - FFV1_0( w_fp[3], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); - jamp_sv[0] += amp_sv[0]; - jamp_sv[1] -= amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[5] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); - jamp_sv[1] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[3] -= amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - FFV1_0( w_fp[3], w_fp[2], w_fp[9], COUPs[1], 1.0, &amp_fp[0] ); - jamp_sv[0] -= amp_sv[0]; - jamp_sv[2] += amp_sv[0]; - jamp_sv[4] += amp_sv[0]; - jamp_sv[5] -= amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_ttxg()?)
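The color algebra that this removed block (and the new color_sum kernels) implements is the quadratic form |M|^2 += sum_{i,j} conj(jamp[i]) * CF[i][j] * jamp[j] / denom[i], which is real because CF is a real symmetric matrix. Below is a self-contained numerical sketch of that sum, reusing the 2-color CF/DENOM data of P0_uux_ttx from the matrix1.f hunk earlier in this diff; the jamp values are made up.

#include <complex>
#include <cstdio>

int main()
{
  constexpr int ncolor = 2;
  const double denom[ncolor] = { 1., 1. };                                  // DENOM=1 in matrix1.f
  const double cf[ncolor][ncolor] = { { 9., 3. }, { 3., 9. } };             // dense CF for P0_uux_ttx
  const std::complex<double> jamp[ncolor] = { { 0.2, -0.1 }, { -0.3, 0.4 } }; // made-up partial amplitudes
  double deltaME = 0.;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztempR = 0., ztempI = 0.;
    for( int j = 0; j < ncolor; j++ )
    {
      ztempR += cf[i][j] * jamp[j].real();
      ztempI += cf[i][j] * jamp[j].imag();
    }
    // Re( conj(jamp[i]) * ztemp ) = Re(jamp[i])*Re(ztemp) + Im(jamp[i])*Im(ztemp)
    deltaME += ( jamp[i].real() * ztempR + jamp[i].imag() * ztempI ) / denom[i];
  }
  std::printf( "deltaME = %f\n", deltaME ); // prints 2.100000 for these toy values
  return 0;
}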
- - // The color denominators (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 9, 9, 9, 9, 9, 9 }; // 1-D array[6] - - // The color matrix (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 64, -8, -8, 1, 1, 10 }, - { -8, 64, 1, 10, -8, 1 }, - { -8, 1, 64, -8, 10, 1 }, - { 1, 10, -8, 64, 1, -8 }, - { 1, -8, 10, 1, 64, -8 }, - { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... 
icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** -
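The TriangularNormalizedColorMatrix removed just above encodes the issue #475 optimization: for a real symmetric CF with a common denominator, the double loop over colors can be folded into the upper triangle, doubling the off-diagonal entries and pre-applying 1/denom at compile time. The standalone sketch below (same toy values as the previous example) checks that the folded form reproduces the full quadratic form.

#include <cassert>
#include <cmath>
#include <complex>

int main()
{
  constexpr int ncolor = 2;
  const double denom[ncolor] = { 1., 1. }; // the fold assumes a common denominator, as in the generated matrices
  const double cf[ncolor][ncolor] = { { 9., 3. }, { 3., 9. } };
  const std::complex<double> jamp[ncolor] = { { 0.2, -0.1 }, { -0.3, 0.4 } };
  // Full quadratic form over all (i,j) pairs
  double me1 = 0.;
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ )
      me1 += ( jamp[i].real() * cf[i][j] * jamp[j].real() + jamp[i].imag() * cf[i][j] * jamp[j].imag() ) / denom[i];
  // Folded upper-triangular form: diagonal kept once, off-diagonal doubled, 1/denom pre-applied
  double cf2[ncolor][ncolor] = {};
  for( int i = 0; i < ncolor; i++ )
  {
    cf2[i][i] = cf[i][i] / denom[i];
    for( int j = i + 1; j < ncolor; j++ ) cf2[i][j] = 2 * cf[i][j] / denom[i];
  }
  double me2 = 0.;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztempR = cf2[i][i] * jamp[i].real();
    double ztempI = cf2[i][i] * jamp[i].imag();
    for( int j = i + 1; j < ncolor; j++ )
    {
      ztempR += cf2[i][j] * jamp[j].real();
      ztempI += cf2[i][j] * jamp[j].imag();
    }
    me2 += jamp[i].real() * ztempR + jamp[i].imag() * ztempI;
  }
  assert( std::abs( me1 - me2 ) < 1e-12 ); // both forms give the same |M|^2 contribution
  return 0;
}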
// NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 16 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram8, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram9, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram10, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram11, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram12, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram13, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram14, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram15, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram16, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators ); + 
diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram8( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram9( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram10( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram11( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram12( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram13( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram14( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram15( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram16( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -774,7 +492,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -808,6 +530,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -849,6 +575,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -951,26 +681,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings, bsmIndepParam ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -978,25 +708,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel 
denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output:
multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -1134,20 +1068,14 @@ namespace mg5amcCpu // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS =
DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1159,17 +1087,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1195,93 +1126,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamps for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) Then, in multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! 
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? 
&( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1323,7 +1224,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1346,7 +1247,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1355,25 +1256,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1383,8 +1290,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1400,11 +1309,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1506,14 +1416,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h index 2acfa000a7..8e87baf8e2 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development 
team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -75,17 +76,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 32; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 16; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 6; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 12; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 5; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 18; //static const int ncomb = 32; // CPPProcess::ncomb @@ -122,23 +123,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -152,34 +156,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f index 10496aa04d..163076da52 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f index 7c8695090c..bc9333bb5d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -137,14 +137,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -219,7 +219,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -290,6 +290,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -373,12 +377,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -484,6 +488,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/color_sum.cc new file mode 100644 index 0000000000..b76aa16029 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/color_sum.cc @@ -0,0 +1,387 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
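+// NB: schematically, for one helicity, the three implementations below all evaluate the same color sum (a sketch, not generated code): +// ME += sum_{icol,jcol} ( jampR[icol] * colorMatrix[icol][jcol] * jampR[jcol] + jampI[icol] * colorMatrix[icol][jcol] * jampI[jcol] ) / colorDenom[icol] +// in SIMD C++ (color_sum_cpu), in a plain GPU kernel (color_sum_kernel), or via cuBLAS/hipBLAS gemms (color_sum_blas).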
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 9, 9, 9, 9, 9, 9 }; // 1-D array[6] + + // The color matrix (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 64, -8, -8, 1, 1, 10 }, + { -8, 64, 1, 10, -8, 1 }, + { -8, 1, 64, -8, 10, 1 }, + { 1, 10, -8, 64, 1, -8 }, + { 1, -8, 10, 1, 64, -8 }, + { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. 
+ // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = 
jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one 
fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use gpuBlasTgemmStridedBatched (cublasSgemmStridedBatched or equivalent) to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/diagram_boilerplate.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/diagrams.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/diagrams.h new file mode 100644 index 0000000000..8ea15aedfa --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/diagrams.h @@ -0,0 +1,515 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
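+// NB: a minimal usage sketch (the assumed call site is calculate_jamps, with one launch per good helicity on that helicity's stream): +// gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, stream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); +// gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, stream, wfs, jamps, channelIds, couplings, numerators, denominators ); +// Only diagram1 takes momenta and ihel: it also computes the external wavefunctions, which later diagrams reuse through the shared wfs buffer.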
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb-1) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 16 *** + // Wavefunction(s) for diagram number 1 + vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); + vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + vxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); + VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[5] ); + FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[6] ); + // Amplitude(s) for diagram number 1 + VVV1_0( w_fp[5], w_fp[6], w_fp[4], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 16 *** + // Wavefunction(s) for diagram number 2 + FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[7] ); + // Amplitude(s) for diagram number 2 + FFV1_0( w_fp[3], w_fp[7], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 16 *** + // Wavefunction(s) for diagram number 3 + FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); + // Amplitude(s) for diagram number 3 + FFV1_0( w_fp[8], w_fp[2], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 16 *** + // Wavefunction(s) for diagram number 4 + FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] ); + FFV1_2( w_fp[3], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); + // Amplitude(s) for diagram number 4 + FFV1_0( w_fp[9], w_fp[5], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 16 *** + // Wavefunction(s) for diagram number 5 + VVV1P0_1( w_fp[1], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 5 + FFV1_0( w_fp[3], w_fp[5], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 6 OF 16 *** + // Wavefunction(s) for diagram number 6 + // (none) + // Amplitude(s) for diagram number 6 + FFV1_0( w_fp[8], w_fp[5], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram7( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 7 OF 16 *** + // Wavefunction(s) for diagram number 7 + FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] ); + FFV1_1( w_fp[2], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); + // Amplitude(s) for diagram number 7 + FFV1_0( w_fp[5], w_fp[11], w_fp[4], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram8( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 8 OF 16 *** + // Wavefunction(s) for diagram number 8 + // (none) + // Amplitude(s) for diagram number 8 + FFV1_0( w_fp[5], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram9( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 9 OF 16 *** + // Wavefunction(s) for diagram number 9 + // (none) + // Amplitude(s) for diagram number 9 + FFV1_0( w_fp[5], w_fp[7], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram10( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 10 OF 16 *** + // Wavefunction(s) for diagram number 10 + VVV1P0_1( w_fp[0], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[5] ); + // Amplitude(s) for diagram number 10 + FFV1_0( w_fp[3], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram11( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 11 OF 16 *** + // Wavefunction(s) for diagram number 11 + // (none) + // Amplitude(s) for diagram number 11 + FFV1_0( w_fp[9], w_fp[2], w_fp[5], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram12( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 12 OF 16 *** + // Wavefunction(s) for diagram number 12 + // (none) + // Amplitude(s) for diagram number 12 + VVV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram13( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 13 OF 16 *** + // Wavefunction(s) for diagram number 13 + // (none) + // Amplitude(s) for diagram number 13 + FFV1_0( w_fp[8], w_fp[11], w_fp[0], COUPs[1], 1.0, &amp_fp[0] ); 
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram14( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 14 OF 16 *** + // Wavefunction(s) for diagram number 14 + // (none) + // Amplitude(s) for diagram number 14 + FFV1_0( w_fp[9], w_fp[7], w_fp[0], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram15( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 15 OF 16 *** + // Wavefunction(s) for diagram number 15 + // (none) + // Amplitude(s) for diagram number 15 + VVV1_0( w_fp[0], w_fp[10], w_fp[6], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram16( fptype* wfs, // input/output 
wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 16 OF 16 *** + // Wavefunction(s) for diagram number 16 + VVVV1P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[10] ); + VVVV3P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[6] ); + VVVV4P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[9] ); + // Amplitude(s) for diagram number 16 + FFV1_0( w_fp[3], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[2], w_fp[6], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/driver.f index c2eadb2c31..10332b6238 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/matrix1.f index 797b19405d..850d121618 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
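Every generated diagramN kernel above shares the same argument list and the same two-line preamble, with only the HELAS calls and jamp updates varying per diagram. The shared preamble lives in "diagram_boilerplate.h", which is not part of this diff; the sketch below is an assumption of what such a header could contain (the names channelId, numerators_sv, denominators_sv and the nullptr checks are taken from the comments in the kernels above, everything else is hypothetical), shown only to make the repeated "asserts that all three pointers are nullptr" comment concrete:

    // diagram_boilerplate.h (sketch, NOT the actual file in the repository)
    #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
    #ifdef MGONGPUCPP_GPUIMPL
      const unsigned int channelId = gpu_channelId( channelIds ); // per-event scalar channelId (see gpu_channelId later in this diff)
    #else
      const unsigned int channelId = ( channelIds != nullptr ? channelIds[0] : 0 ); // SCALAR channelId for the whole SIMD vector
    #endif
      fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );     // running sum over helicities
      fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); // running sum over helicities
    #else
      assert( channelIds == nullptr );   // uniform interface: must be nullptr when SDE is compiled out
      assert( numerators == nullptr );   // uniform interface: must be nullptr when SDE is compiled out
      assert( denominators == nullptr ); // uniform interface: must be nullptr when SDE is compiled out
    #endif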
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/driver.f
index c2eadb2c31..10332b6238 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/driver.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/driver.f
@@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened)
       fopened=.false.
       tempname=filename
       fine=index(tempname,' ')
-c     fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)"
       if(fine.eq.0) fine=len(tempname)
       open(unit=lun,file=tempname,status='old',ERR=20)
       fopened=.true.
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/matrix1.f
index 797b19405d..850d121618 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/matrix1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/matrix1.f
@@ -1,7 +1,7 @@
       SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
      $ ICOL)
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+C     Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
@@ -323,7 +323,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
       REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+C     Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
@@ -366,7 +366,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
 C
       INTEGER I,J,M,N
       COMPLEX*16 ZTEMP, TMP_JAMP(9)
-      REAL*8 CF(NCOLOR,NCOLOR)
+      INTEGER CF(NCOLOR*(NCOLOR+1)/2)
+      INTEGER DENOM, CF_INDEX
       COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO)
       COMPLEX*16 W(6,NWAVEFUNCS)
 C     Needed for v4 models
@@ -409,43 +410,32 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
 C
 C     COLOR DATA
 C
-      DATA (CF(I,  1),I=  1,  6) /7.111111111111111D+00,
-     $ -8.888888888888888D-01,-8.888888888888888D-01
-     $ ,1.111111111111111D-01,1.111111111111111D-01,1.111111111111111D
-     $ +00/
+      DATA DENOM/9/
+      DATA (CF(I),I=  1,  6) /64,-16,-16,2,2,20/
 C     1 T(1,2,5,3,4)
-      DATA (CF(I,  2),I=  1,  6) /-8.888888888888888D-01
-     $ ,7.111111111111111D+00,1.111111111111111D-01,1.111111111111111D
-     $ +00,-8.888888888888888D-01,1.111111111111111D-01/
+      DATA (CF(I),I=  7, 11) /64,2,20,-16,2/
 C     1 T(1,5,2,3,4)
-      DATA (CF(I,  3),I=  1,  6) /-8.888888888888888D-01
-     $ ,1.111111111111111D-01,7.111111111111111D+00,
-     $ -8.888888888888888D-01,1.111111111111111D+00,1.111111111111111D
-     $ -01/
+      DATA (CF(I),I= 12, 15) /64,-16,20,2/
 C     1 T(2,1,5,3,4)
-      DATA (CF(I,  4),I=  1,  6) /1.111111111111111D-01
-     $ ,1.111111111111111D+00,-8.888888888888888D-01
-     $ ,7.111111111111111D+00,1.111111111111111D-01,
-     $ -8.888888888888888D-01/
+      DATA (CF(I),I= 16, 18) /64,2,-16/
 C     1 T(2,5,1,3,4)
-      DATA (CF(I,  5),I=  1,  6) /1.111111111111111D-01,
-     $ -8.888888888888888D-01,1.111111111111111D+00,1.111111111111111D
-     $ -01,7.111111111111111D+00,-8.888888888888888D-01/
+      DATA (CF(I),I= 19, 20) /64,-16/
 C     1 T(5,1,2,3,4)
-      DATA (CF(I,  6),I=  1,  6) /1.111111111111111D+00
-     $ ,1.111111111111111D-01,1.111111111111111D-01,
-     $ -8.888888888888888D-01,-8.888888888888888D-01
-     $ ,7.111111111111111D+00/
+      DATA (CF(I),I= 21, 21) /64/
 C     1 T(5,2,1,3,4)
 C     ----------
 C     BEGIN CODE
 C     ----------
       IF (FIRST) THEN
         FIRST=.FALSE.
-        IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO
-     $   *SMALL_WIDTH_TREATMENT)), ZERO)
-        IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT
-     $   *SMALL_WIDTH_TREATMENT)), MDL_WT)
+        FK_ZERO = 0D0
+        IF(MDL_WT.NE.0D0) THEN
+          FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT
+     $     *SMALL_WIDTH_TREATMENT)), MDL_WT)
+        ELSE
+          FK_MDL_WT = 0D0
+        ENDIF
+
         IF(INIT_MODE) THEN
           ZEROAMP_1(:,:) = .TRUE.
@@ -549,10 +539,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
       MATRIX1 = 0.D0
       DO M = 1, NAMPSO
+        CF_INDEX = 0
         DO I = 1, NCOLOR
           ZTEMP = (0.D0,0.D0)
-          DO J = 1, NCOLOR
-            ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M)
+          DO J = I, NCOLOR
+            CF_INDEX = CF_INDEX + 1
+            ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M)
           ENDDO
           DO N = 1, NAMPSO
@@ -561,6 +553,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
           ENDDO
         ENDDO
       ENDDO
+      MATRIX1 = MATRIX1/DENOM
       IF(SDE_STRAT.EQ.1)THEN
         AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1))
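The matrix1.f change above replaces the dense NCOLOR x NCOLOR matrix of REAL*8 fractions by the packed upper triangle of integers plus a single common denominator: the old CF(1,1)=7.111...=64/9 becomes the packed entry 64 with DENOM=9, and each off-diagonal entry is pre-doubled (old CF(1,2)=-0.888...=-8/9 becomes -16), so the J>=I loop together with the implicit real part taken when accumulating into the REAL*8 MATRIX1 reproduces the full symmetric quadratic form with a single division at the end. A small standalone C++ check of this equivalence, using the CF/DENOM values from the hunk above and arbitrary (non-physics) jamp values:

    #include <complex>
    #include <cstdio>
    int main()
    {
      const int ncolor = 6;
      const int denom = 9;
      const int cf[21] = { 64, -16, -16, 2, 2, 20, // I=1, J=1..6
                           64, 2, 20, -16, 2,      // I=2, J=2..6
                           64, -16, 20, 2,         // I=3, J=3..6
                           64, 2, -16,             // I=4, J=4..6
                           64, -16,                // I=5, J=5..6
                           64 };                   // I=6, J=6
      const std::complex<double> jamp[ncolor] = { { 1, 2 }, { 0, 1 }, { -1, 1 }, { 2, 0 }, { 1, -1 }, { 0.5, 0.5 } };
      // (a) packed triangular sum, as in the new Fortran: off-diagonal cf entries are pre-doubled
      double me1 = 0;
      int idx = 0;
      for( int i = 0; i < ncolor; i++ )
      {
        std::complex<double> ztemp = 0;
        for( int j = i; j < ncolor; j++ ) ztemp += double( cf[idx++] ) * jamp[j];
        me1 += std::real( ztemp * std::conj( jamp[i] ) ); // imaginary parts cancel pairwise
      }
      me1 /= denom; // single division, as in "MATRIX1 = MATRIX1/DENOM"
      // (b) full symmetric matrix sum, as in the old Fortran (rebuild CF(J,I) from the packed triangle)
      double full[ncolor][ncolor];
      idx = 0;
      for( int i = 0; i < ncolor; i++ )
        for( int j = i; j < ncolor; j++ )
        {
          const double v = double( cf[idx++] ) / denom;
          full[i][j] = ( j == i ? v : v / 2 ); // undo the pre-doubling off the diagonal
          full[j][i] = full[i][j];
        }
      double me2 = 0;
      for( int i = 0; i < ncolor; i++ )
      {
        std::complex<double> ztemp = 0;
        for( int j = 0; j < ncolor; j++ ) ztemp += full[j][i] * jamp[j];
        me2 += std::real( ztemp * std::conj( jamp[i] ) );
      }
      std::printf( "packed %.12f full %.12f\n", me1, me2 ); // identical up to rounding
      return 0;
    }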
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc
index 4f8f49270b..1ee522dbfd 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc
@@ -1,13 +1,13 @@
 // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors.
 // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend.
 //==========================================================================
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
@@ -16,6 +16,7 @@
 
 #include "mgOnGpuConfig.h"
 
+#include "GpuRuntime.h"
 #include "HelAmps_sm.h"
 #include "MemoryAccessAmplitudes.h"
 #include "MemoryAccessChannelIds.h"
@@ -25,6 +26,7 @@
 #include "MemoryAccessMatrixElements.h"
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
+#include "color_sum.h"
 
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -99,20 +101,16 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
-  constexpr int nw6 = CPPProcess::nw6;     // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors)
-  constexpr int npar = CPPProcess::npar;   // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu-
-  constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar)
-
-  // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)]
-  //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z)
+  constexpr int nw6 = CPPProcess::nw6;       // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors)
+  constexpr int npar = CPPProcess::npar;     // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu-
+  constexpr int ncomb = CPPProcess::ncomb;   // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar)
+  constexpr int nwf = CPPProcess::nwf;       // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z)
+  constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors
 
   using Parameters_sm_dependentCouplings::ndcoup;   // #couplings that vary event by event (depend on running alphas QCD)
   using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD)
 
-  // The number of colors
-  constexpr int ncolor = 4;
-
-  // The number of SIMD vectors of events processed by calculate_wavefunction
+  // The number of SIMD vectors of events processed by calculate_jamps
 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
   constexpr int nParity = 2;
 #else
@@ -171,57 +169,112 @@ namespace mg5amcCpu
   // Helicity combinations (and filtering of "good" helicity combinations)
 #ifdef MGONGPUCPP_GPUIMPL
   __device__ __constant__ short cHel[ncomb][npar];
-  __device__ __constant__ int cNGoodHel;
-  __device__ __constant__ int cGoodHel[ncomb];
+  __device__ __constant__ int dcNGoodHel;
+  __device__ __constant__ int dcGoodHel[ncomb];
 #else
   static short cHel[ncomb][npar];
+#endif
   static int cNGoodHel;
   static int cGoodHel[ncomb];
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  class DeviceAccessJamp2
+  {
+  public:
+    static __device__ inline fptype&
+    kernelAccessIcol( fptype* buffer, const int icol )
+    {
+      const int nevt = gridDim.x * blockDim.x;
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      return buffer[icol * nevt + ievt];
+    }
+    static __device__ inline const fptype&
+    kernelAccessIcolConst( const fptype* buffer, const int icol )
+    {
+      const int nevt = gridDim.x * blockDim.x;
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      return buffer[icol * nevt + ievt];
+    }
+  };
 #endif
 
   //--------------------------------------------------------------------------
 
-  // Evaluate |M|^2 for each subprocess
-  // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s)
-  // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities)
-  // In CUDA, this device function computes the ME for a single event
-  // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2)
-  // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 ***
-  __device__ INLINE void /* clang-format off */
-  calculate_wavefunctions( int ihel,
-                           const fptype* allmomenta,   // input: momenta[nevt*npar*4]
-                           const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
-                           fptype* allMEs,             // output: allMEs[nevt], |M|^2 running_sum_over_helicities
+#ifdef MGONGPUCPP_GPUIMPL
+  __device__ INLINE unsigned int
+  gpu_channelId( const unsigned int* allChannelIds )
+  {
+    unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                           const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector
-                           fptype* allNumerators,        // output: multichannel numerators[nevt], running_sum_over_helicities
-                           fptype* allDenominators,      // output: multichannel denominators[nevt], running_sum_over_helicities
+    using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the current event (CUDA)
+    if( allChannelIds != nullptr )
+    {
+      const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds)
+      const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+      // NB: channelIds_sv is a scalar in CUDA
+      channelId = channelIds_sv;
+      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+    }
 #endif
-                           fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled)
-#ifndef MGONGPUCPP_GPUIMPL
-                           , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid)
+    return channelId;
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+#include "diagrams.h"
+
+  //--------------------------------------------------------------------------
+
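The DeviceAccessJamp2 accessor added above encodes a color-major structure-of-arrays layout, buffer[icol * nevt + ievt], so that for a fixed color index consecutive CUDA threads (consecutive ievt) touch consecutive memory words and the access coalesces. A standalone illustration of the indexing pattern (a hypothetical kernel, not part of this diff; double is used here in place of the plugin's fptype typedef):

    __global__ void scaleIcol( double* buffer, const int icol, const double factor )
    {
      const int nevt = gridDim.x * blockDim.x;                 // total number of events = total number of threads
      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;  // this thread's event index
      buffer[icol * nevt + ievt] *= factor;                    // coalesced: stride-1 in ievt for fixed icol
    }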
+  // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams
+  // Also compute running sums over helicities adding jamp2, numerator, denominator
+  // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel)
+  // In CUDA, this function processes a single event
+  // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function)
+  // ** NB2: NEW Nov2024! In CUDA this now takes a channelId array as input (it used to take a scalar channelId as input)
+  // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2)
+  // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+  INLINE void
+  calculate_jamps( int ihel,
+                   const fptype* allmomenta,   // input: momenta[nevt*npar*4]
+                   const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
+                   fptype* allJamps,           // output: jamp[ncolor*2*nevt] for this helicity
+                   fptype* allWfs,             // output: wf[nwf*nw6*2*nevt]
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+                   const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911)
+                   fptype* allNumerators,             // input/output: multichannel numerators[nevt], add helicity ihel
+                   fptype* allDenominators,           // input/output: multichannel denominators[nevt], add helicity ihel
+#endif
+                   gpuStream_t gpustream, // input: cuda stream for this helicity
+                   const int gpublocks,   // input: cuda gpublocks
+                   const int gputhreads ) // input: cuda gputhreads
+#else
+  INLINE void
+  calculate_jamps( int ihel,
+                   const fptype* allmomenta,   // input: momenta[nevt*npar*4]
+                   const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
+                   cxtype_sv* jamp_sv,         // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+                   const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00
+                   fptype* allNumerators,        // input/output: multichannel numerators[nevt], add helicity ihel
+                   fptype* allDenominators,      // input/output: multichannel denominators[nevt], add helicity ihel
+#endif
+                   const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid)
 #endif
-                           ) //ALWAYS_INLINE // attributes are not permitted in a function definition
   {
 #ifdef MGONGPUCPP_GPUIMPL
-    using namespace mg5amcGpu;
-    using M_ACCESS = DeviceAccessMomenta;        // non-trivial access: buffer includes all events
-    using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events
-    using W_ACCESS = DeviceAccessWavefunctions;  // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event
-    using A_ACCESS = DeviceAccessAmplitudes;     // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event
     using CD_ACCESS = DeviceAccessCouplings;     // non-trivial access (dependent couplings): buffer includes all events
-    using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
    using NUM_ACCESS = DeviceAccessNumerators;   // non-trivial access: buffer includes all events
    using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events
 #endif
 #else
-    using namespace mg5amcCpu;
    using M_ACCESS = HostAccessMomenta;          // non-trivial access: buffer includes all events
-    using E_ACCESS = HostAccessMatrixElements;   // non-trivial access: buffer includes all events
-    using W_ACCESS = HostAccessWavefunctions;    // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event
-    using A_ACCESS = HostAccessAmplitudes;       // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event
    using CD_ACCESS = HostAccessCouplings;       // non-trivial access (dependent couplings): buffer includes all events
    using CI_ACCESS = HostAccessCouplingsFixed;  // TRIVIAL access (independent couplings): buffer for one event
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
@@ -229,333 +282,143 @@ namespace mg5amcCpu
    using DEN_ACCESS = HostAccessDenominators;   // non-trivial access: buffer includes all events
 #endif
 #endif /* clang-format on */
-    mgDebug( 0, __FUNCTION__ );
-    //bool debug = true;
-#ifndef MGONGPUCPP_GPUIMPL
-    //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831
-    //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 );
-#endif
-    //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel );
-
-    // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here
-    // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result...
-    static const int nwf = 8; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z)
+    // ----------------------------
+    // --- WAVEFUNCTION BUFFERS ---
+    // ----------------------------
+#ifndef MGONGPUCPP_GPUIMPL
     // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV)
     // [NB these variables are reused several times (and re-initialised each time) within the same event or event page]
-    // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need
-    // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)!
-    //MemoryBufferWavefunctions w_buffer[nwf]{ neppV };
+    // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code
    cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles)
-    cxtype_sv amp_sv[1];      // invariant amplitude for one given Feynman diagram
-
-    // Proof of concept for using fptype* in the interface
-    fptype* w_fp[nwf];
-    for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] );
-    fptype* amp_fp;
-    amp_fp = reinterpret_cast<fptype*>( amp_sv );
-
-    // Local variables for the given CUDA event (ievt) or C++ event page (ipagV)
-    // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination]
-    cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!)
+    fptype* wfs = reinterpret_cast<fptype*>( w_sv );
+#else
+    // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt)
+    // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting
+    fptype* wfs = allWfs;
+#endif
 
    // === Calculate wavefunctions and amplitudes for all diagrams in all processes ===
    // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ ===
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-    // Mixed fptypes #537: float for color algebra and double elsewhere
-    // Delay color algebra and ME updates (only on even pages)
-    cxtype_sv jamp_sv_previous[ncolor] = {};
-    fptype* MEs_previous = 0;
-#endif
+
+    // *****************************
+    // *** START LOOP ON IPARITY ***
+    // *****************************
    for( int iParity = 0; iParity < nParity; ++iParity )
-    { // START LOOP ON IPARITY
+    {
 #ifndef MGONGPUCPP_GPUIMPL
      const int ievt0 = ievt00 + iParity * neppV;
 #endif
-      //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823)
-      constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823)
-      const fptype* allCOUPs[nxcoup];
-#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
-#pragma nv_diagnostic push
-#pragma nv_diag_suppress 186 // e.g. <<warning #186-D: pointless comparison of unsigned integer with zero>>
-#endif
-      for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
-        allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event
-      //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823
-      for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823
-        allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events
+
+      // -----------------
+      // --- COUPLINGS ---
+      // -----------------
 #ifdef MGONGPUCPP_GPUIMPL
-#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
-#pragma nv_diagnostic pop
-#endif
-      // CUDA kernels take input/output buffers with momenta/MEs for all events
-      const fptype* momenta = allmomenta;
-      const fptype* COUPs[nxcoup];
-      for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
-      fptype* MEs = allMEs;
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* numerators = allNumerators;
-      fptype* denominators = allDenominators;
-#endif
+      // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events
+      const fptype* couplings = allcouplings;
 #else
-      // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page)
-      const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 );
+      // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector
+      constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup)
+      const fptype* allCOUPs[nxcoup];
      const fptype* COUPs[nxcoup];
+      // Dependent couplings, vary event-by-event
      for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
-        COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event
-      //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823
-      for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823
-        COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events
-      fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
-      fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-#endif
-#endif
-
-      // Reset color flows (reset jamp_sv) at the beginning of a new event or event page
-      for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); }
-
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
-      fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
-      fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-#endif
-
-      // *** DIAGRAM 1 OF 5 ***
-
-      // Wavefunction(s) for diagram number 1
-      vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 );
-
-      ixxxxx( momenta, 0., cHel[ihel][1], +1, w_fp[1], 1 );
-
-      oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 );
-
-      ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 );
-
-      oxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 );
-
-      FFV1_2( w_fp[1], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[5] );
-      FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[6] );
-
-      // Amplitude(s) for diagram number 1
-      FFV1_0( w_fp[5], w_fp[4], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[0] -= 1. / 2. * amp_sv[0];
-      jamp_sv[2] += 1. / 6. * amp_sv[0];
-
-      // *** DIAGRAM 2 OF 5 ***
-
-      // Wavefunction(s) for diagram number 2
-      FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] );
-      FFV1P0_3( w_fp[1], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[7] );
-
-      // Amplitude(s) for diagram number 2
-      FFV1_0( w_fp[3], w_fp[5], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+        allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup );
+      // Independent couplings, fixed for all events
+      for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup)
+        allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup );
+      // Dependent couplings, vary event-by-event
+      for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
+        COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 );
+      // Independent couplings, fixed for all events
+      for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup)
+        COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup];
 #endif
-      jamp_sv[0] -= 1. / 2. * amp_sv[0];
-      jamp_sv[1] += 1. / 6. * amp_sv[0];
-
-      // *** DIAGRAM 3 OF 5 ***
-
-      // Wavefunction(s) for diagram number 3
-      FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] );
-      // Amplitude(s) for diagram number 3
-      FFV1_0( w_fp[5], w_fp[2], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      // ---------------
+      // --- MOMENTA ---
+      // ---------------
+#ifdef MGONGPUCPP_GPUIMPL
+      // CUDA diagram kernels take input/output buffers with momenta for all events
+      const fptype* momenta = allmomenta;
+#else
+      // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector
+      const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 );
 #endif
-      jamp_sv[1] += 1. / 6. * amp_sv[0];
-      jamp_sv[3] -= 1. / 2. * amp_sv[0];
-
-      // *** DIAGRAM 4 OF 5 ***
-
-      // Wavefunction(s) for diagram number 4
-      FFV1_1( w_fp[4], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[5] );
-
-      // Amplitude(s) for diagram number 4
-      FFV1_0( w_fp[1], w_fp[5], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+      // -------------
+      // --- JAMPS ---
+      // -------------
+      // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel)
+#ifdef MGONGPUCPP_GPUIMPL
+      // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument
+      // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol])
+      fptype* jamps = allJamps;
+#else
+      // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument
+      // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol])
+      fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) );
 #endif
-      jamp_sv[2] += 1. / 6. * amp_sv[0];
-      jamp_sv[3] -= 1. / 2. * amp_sv[0];
-
-      // *** DIAGRAM 5 OF 5 ***
-      // Wavefunction(s) for diagram number 5
-      // (none)
-
-      // Amplitude(s) for diagram number 5
-      VVV1_0( w_fp[0], w_fp[7], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+      // ------------------
+      // --- CHANNELIDS ---
+      // ------------------
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] );
-      if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-      jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
-      jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
-
-      // *** COLOR CHOICE BELOW ***
-      // Store the leading color flows for choice of color
-      if( jamp2_sv ) // disable color choice if nullptr
-        for( int icol = 0; icol < ncolor; icol++ )
-          jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831
-
-      // *** COLOR MATRIX BELOW ***
-      // (This method used to be called CPPProcess::matrix_1_gu_ttxu()?)
-
-      // The color denominators (initialize all array elements, with ncolor=4)
-      // [NB do keep 'static' for these constexpr arrays, see issue #283]
-      static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4]
-
-      // The color matrix (initialize all array elements, with ncolor=4)
-      // [NB do keep 'static' for these constexpr arrays, see issue #283]
-      static constexpr fptype2 cf[ncolor][ncolor] = {
-        { 12, 4, 4, 0 },
-        { 4, 12, 0, 4 },
-        { 4, 0, 12, 4 },
-        { 0, 4, 4, 12 } }; // 2-D array[4][4]
-
-#ifndef MGONGPUCPP_GPUIMPL
-      // Pre-compute a constexpr triangular color matrix properly normalized #475
-      struct TriangularNormalizedColorMatrix
-      {
-        // See https://stackoverflow.com/a/34465458
-        __host__ __device__ constexpr TriangularNormalizedColorMatrix()
-          : value()
-        {
-          for( int icol = 0; icol < ncolor; icol++ )
-          {
-            // Diagonal terms
-            value[icol][icol] = cf[icol][icol] / denom[icol];
-            // Off-diagonal terms
-            for( int jcol = icol + 1; jcol < ncolor; jcol++ )
-              value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol];
-          }
-        }
-        fptype2 value[ncolor][ncolor];
-      };
-      static constexpr auto cf2 = TriangularNormalizedColorMatrix();
-#endif
-
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-      if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages
-      {
-        // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV
-        for( int icol = 0; icol < ncolor; icol++ )
-          jamp_sv_previous[icol] = jamp_sv[icol];
-        MEs_previous = MEs;
-        continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages
-      }
-      fptype_sv deltaMEs_previous = { 0 };
-#endif
-
-      // Sum and square the color flows to get the matrix element
-      // (compute |M|^2 by squaring |M|, taking into account colours)
-      fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes
-
-      // Use the property that M is a real matrix (see #475):
-      // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB
-      // In addition, on C++ use the property that M is symmetric (see #475),
-      // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time:
-      // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix.
-      // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-      fptype2_sv jampR_sv[ncolor] = { 0 };
-      fptype2_sv jampI_sv[ncolor] = { 0 };
-      for( int icol = 0; icol < ncolor; icol++ )
-      {
-        jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) );
-        jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) );
-      }
-#endif
-      for( int icol = 0; icol < ncolor; icol++ )
-      {
-        //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol );
-#ifndef MGONGPUCPP_GPUIMPL
-        // === C++ START ===
-        // Diagonal terms
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-        fptype2_sv& jampRi_sv = jampR_sv[icol];
-        fptype2_sv& jampIi_sv = jampI_sv[icol];
+#ifdef MGONGPUCPP_GPUIMPL
+      // CUDA diagram kernels take input/output buffers with channelIDs for all events
+      const unsigned int* channelIds = allChannelIds;
 #else
-        fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) );
-        fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) );
+      // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector
+      const unsigned int* channelIds = &channelId;
 #endif
-        fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv;
-        fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv;
-        // Off-diagonal terms
-        for( int jcol = icol + 1; jcol < ncolor; jcol++ )
-        {
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-          fptype2_sv& jampRj_sv = jampR_sv[jcol];
-          fptype2_sv& jampIj_sv = jampI_sv[jcol];
 #else
-          fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) );
-          fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) );
+      // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+      // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+      const unsigned int* channelIds = nullptr;
 #endif
-          ztempR_sv += cf2.value[icol][jcol] * jampRj_sv;
-          ztempI_sv += cf2.value[icol][jcol] * jampIj_sv;
-        }
-        fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-        deltaMEs_previous += fpvsplit0( deltaMEs2 );
-        deltaMEs += fpvsplit1( deltaMEs2 );
+
+      // -------------------------------
+      // --- NUMERATORS/DENOMINATORS ---
+      // -------------------------------
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+#ifdef MGONGPUCPP_GPUIMPL
+      // CUDA diagram kernels take input/output buffers with numerators/denominators for all events
+      fptype* numerators = allNumerators;
+      fptype* denominators = allDenominators;
 #else
-        deltaMEs += deltaMEs2;
+      // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector
+      fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+      fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
 #endif
-        // === C++ END ===
 #else
-        // === CUDA START ===
-        fptype2_sv ztempR_sv = { 0 };
-        fptype2_sv ztempI_sv = { 0 };
-        for( int jcol = 0; jcol < ncolor; jcol++ )
-        {
-          fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] );
-          fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] );
-          ztempR_sv += cf[icol][jcol] * jampRj_sv;
-          ztempI_sv += cf[icol][jcol] * jampIj_sv;
-        }
-        deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol];
-        // === CUDA END ===
+      // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+      // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+      fptype* numerators = nullptr;
+      fptype* denominators = nullptr;
 #endif
-      }
 
-      // *** STORE THE RESULTS ***
+      // ------------------------
+      // --- FEYNMAN DIAGRAMS ---
+      // ------------------------
 
-      // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s)
-      fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
-      MEs_sv += deltaMEs; // fix #435
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-      fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous );
-      MEs_sv_previous += deltaMEs_previous;
-#endif
-      /*
+      // *** DIAGRAMS 1 TO 5 ***
 #ifdef MGONGPUCPP_GPUIMPL
-      if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv );
-#else
-#ifdef MGONGPU_CPPSIMD
-      if( cNGoodHel > 0 )
-        for( int ieppV = 0; ieppV < neppV; ieppV++ )
-          printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] );
+      gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel );
+      gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+      gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+      gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
+      gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators );
 #else
-      if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv );
-#endif
+      diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel );
+      diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators );
+      diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators );
+      diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators );
+      diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators );
 #endif
-      */
-    } // END LOOP ON IPARITY
-    mgDebug( 1, __FUNCTION__ );
+    }
+    // *****************************
+    // ***  END LOOP ON IPARITY  ***
+    // *****************************
+    return;
   }
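In the new CUDA path, calculate_jamps no longer computes anything on the host: it only enqueues one small kernel per Feynman diagram on the caller-supplied stream, so kernels for the same helicity serialise (preserving the wavefunction data dependencies through the global-memory allWfs buffer) while different good helicities can run concurrently on different streams. The gpuLaunchKernelStream wrapper is defined in the GPU abstraction headers, which are not part of this diff; on CUDA it plausibly reduces to a triple-chevron launch along the lines of the sketch below (an assumption, shown only to clarify the execution model, not the actual macro):

    // Hypothetical definition, assuming no dynamic shared memory is needed by the diagram kernels
    #define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) \
      kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ )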
@@ -610,7 +473,11 @@ namespace mg5amcCpu
 #else
    memcpy( cHel, tHel, ncomb * npar * sizeof( short ) );
 #endif
-    fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions
+
+    // Enable SIGFPE traps for Floating Point Exceptions
+#ifdef MGONGPUCPP_DEBUG
+    fpeEnable();
+#endif
  }
 
  //--------------------------------------------------------------------------
 
@@ -644,6 +511,10 @@ namespace mg5amcCpu
    m_masses.push_back( m_pars->mdl_MT );
    m_masses.push_back( m_pars->mdl_MT );
    m_masses.push_back( m_pars->ZERO );
+#ifdef MGONGPUCPP_GPUIMPL
+    // Create the normalized color matrix in device memory
+    createNormalizedColorMatrix();
+#endif
    // Read physics parameters like masses and couplings from user configuration files (static: initialize once)
    // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory
    const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT };
@@ -685,6 +556,10 @@ namespace mg5amcCpu
    m_masses.push_back( Parameters_sm::mdl_MT );
    m_masses.push_back( Parameters_sm::mdl_MT );
    m_masses.push_back( Parameters_sm::ZERO );
+#ifdef MGONGPUCPP_GPUIMPL
+    // Create the normalized color matrix in device memory
+    createNormalizedColorMatrix();
+#endif
  }
 #endif
 
@@ -787,26 +662,26 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
    using namespace mg5amcGpu;
    using G_ACCESS = DeviceAccessGs;
-    using C_ACCESS = DeviceAccessCouplings;
-    G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam );
+    using CD_ACCESS = DeviceAccessCouplings;
+    G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam );
 #else
    using namespace mg5amcCpu;
    using G_ACCESS = HostAccessGs;
-    using C_ACCESS = HostAccessCouplings;
+    using CD_ACCESS = HostAccessCouplings;
    for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV )
    {
      const int ievt0 = ipagV * neppV;
      const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 );
      fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 );
-      G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam );
+      G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam );
    }
 #endif
  }
 
  //--------------------------------------------------------------------------
 
-#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
-  __global__ void
+#ifdef MGONGPUCPP_GPUIMPL
+  void /* clang-format off */
  sigmaKin_getGoodHel( const fptype* allmomenta,   // input: momenta[nevt*npar*4]
                       const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
                       fptype* allMEs,             // output: allMEs[nevt], |M|^2 final_avg_over_helicities
@@ -814,25 +689,40 @@ namespace mg5amcCpu
                       fptype* allNumerators,      // output: multichannel numerators[nevt], running_sum_over_helicities
                       fptype* allDenominators,    // output: multichannel denominators[nevt], running_sum_over_helicities
 #endif
-                       bool* isGoodHel )           // output: isGoodHel[ncomb] - device array (CUDA implementation)
-  { /* clang-format on */
-    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
+                       fptype* allJamps,           // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop)
+                       fptype* allWfs,             // tmp: wf[nwf*nw6*2*nevt]
+                       bool* isGoodHel,            // output: isGoodHel[ncomb] - host array
+                       const int nevt )            // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  { /* clang-format on */
+    const int maxtry0 = 16;
+    fptype hstMEs[maxtry0];
+    const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0)
+    assert( nevt >= neppV );
     const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0)
+      for( int ighel = 0; ighel < dcNGoodHel; ighel++ )
+      {
+        allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt];
+        ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection
+      }
+      // Event-by-event random choice of helicity #403
+      //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] );
+      for( int ighel = 0; ighel < dcNGoodHel; ighel++ )
+      {
+        if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) )
+        {
+          const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1]
+          allselhel[ievt] = ihelF;
+          //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF );
+          break;
+        }
+      }
+    return;
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  __global__ void
+  update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity
+                 fptype* colAllJamp2s )     // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable)
+  {
+    using J_ACCESS = DeviceAccessJamp;
+    using J2_ACCESS = DeviceAccessJamp2;
+    for( int icol = 0; icol < ncolor; icol++ )
+      // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream!
+      atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) );
+  }
+#endif
+#endif
+
+  //--------------------------------------------------------------------------
+
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection 
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -968,22 +1047,16 @@ namespace mg5amcCpu // These variable are not used anywhere else in the code and their scope is limited to this sanity check { // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) - constexpr int nprocesses = 2; + constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -995,17 +1068,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] =
0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1031,93 +1107,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamps for each helicity + // In multichannel mode, also compute the running sums over helicities of the numerators and denominators for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) 
Then, in multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? 
ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1159,7 +1205,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1182,7 +1228,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1191,25 +1237,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1219,8 +1271,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1236,11 +1290,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1342,14 +1397,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h index b501a9772e..3956ab144a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development 
team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -78,17 +79,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 32; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 5; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 4; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 8; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 5; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 5; //static const int ncomb = 32; // CPPProcess::ncomb @@ -125,23 +126,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -155,34 +159,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators 
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f index b0cc58e89c..2d49642e74 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f index 2b281a8200..a45203b57e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,14 +140,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF D2=PDG2PDF(LPP(IB(2)),1, IB(2),XBK(IB(2)), QSCALE) U2=PDG2PDF(LPP(IB(2)),2, IB(2),XBK(IB(2)), QSCALE) @@ -234,7 +234,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -309,6 +309,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -393,18 +397,18 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) D2(IVEC)=PDG2PDF(LPP(IB(2)),1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) U2(IVEC)=PDG2PDF(LPP(IB(2)),2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) S2(IVEC)=PDG2PDF(LPP(IB(2)),3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) C2(IVEC)=PDG2PDF(LPP(IB(2)),4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -528,6 +532,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/color_sum.cc new file mode 100644 index 0000000000..389d8f6535 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/color_sum.cc @@ -0,0 +1,385 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. 
Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] + + // The color matrix (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 12, 4, 4, 0 }, + { 4, 12, 0, 4 }, + { 4, 0, 12, 4 }, + { 0, 4, 4, 12 } }; // 2-D array[4][4] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template<typename T> + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) 
in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over 
jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // 
Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/diagram_boilerplate.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/diagrams.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/diagrams.h new file mode 100644 index 0000000000..034f15c587 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/diagrams.h @@ -0,0 +1,174 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
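For orientation before the generated kernels: each diagramN below adds one Feynman amplitude into a few color flows (jamps), and the per-helicity ME is obtained afterwards by folding the jamps with the color matrix defined in color_sum.cc. A minimal standalone sketch of that dataflow, reusing the ncolor=4 color matrix above and the jamp coefficients of the five diagrams below, with placeholder amplitude values (illustration only, not generated code):

    // Sketch: accumulate color flows per diagram, then fold with the color matrix.
    // The amp[] values are placeholders; only the dataflow mirrors the kernels below.
    #include <complex>
    #include <iostream>
    int main()
    {
      constexpr int ncolor = 4; // as in this P1 subprocess
      using cx = std::complex<double>;
      const double denom[ncolor] = { 1, 1, 1, 1 }; // color denominators (see color_sum.cc)
      const double cf[ncolor][ncolor] = { { 12, 4, 4, 0 }, { 4, 12, 0, 4 }, { 4, 0, 12, 4 }, { 0, 4, 4, 12 } };
      const cx amp[5] = { { 1, 0 }, { 0, 1 }, { 0.5, 0.5 }, { -1, 0 }, { 0, -1 } }; // placeholders, not physics
      cx jamp[ncolor] = {};
      jamp[0] -= 1. / 2. * amp[0]; jamp[2] += 1. / 6. * amp[0]; // as in diagram1
      jamp[0] -= 1. / 2. * amp[1]; jamp[1] += 1. / 6. * amp[1]; // as in diagram2
      jamp[1] += 1. / 6. * amp[2]; jamp[3] -= 1. / 2. * amp[2]; // as in diagram3
      jamp[2] += 1. / 6. * amp[3]; jamp[3] -= 1. / 2. * amp[3]; // as in diagram4
      jamp[0] += 1. / 2. * cx( 0, 1 ) * amp[4]; jamp[3] -= 1. / 2. * cx( 0, 1 ) * amp[4]; // as in diagram5
      double me = 0; // |M|^2 for one helicity: sum_ij conj(jamp_i) cf_ij jamp_j / denom_i
      for( int i = 0; i < ncolor; i++ )
      {
        cx ztemp = 0;
        for( int j = 0; j < ncolor; j++ ) ztemp += cf[i][j] * jamp[j];
        me += std::real( std::conj( jamp[i] ) * ztemp ) / denom[i];
      }
      std::cout << "ME (one helicity, placeholder amps): " << me << std::endl;
      return 0;
    }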
+ + /* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb-1) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 5 *** + // Wavefunction(s) for diagram number 1 + vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); + ixxxxx( momenta, 0., cHel[ihel][1], +1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + oxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); + FFV1_2( w_fp[1], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[5] ); + FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[6] ); + // Amplitude(s) for diagram number 1 + FFV1_0( w_fp[5], w_fp[4], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 5 *** + // Wavefunction(s) for diagram number 2 + FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] ); + FFV1P0_3( w_fp[1], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[7] ); + // Amplitude(s) for diagram number 2 + FFV1_0( w_fp[3], w_fp[5], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 5 *** + // Wavefunction(s) for diagram number 3 + FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] ); + // Amplitude(s) for diagram number 3 + FFV1_0( w_fp[5], w_fp[2], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 5 *** + // Wavefunction(s) for diagram number 4 + FFV1_1( w_fp[4], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[5] ); + // Amplitude(s) for diagram number 4 + FFV1_0( w_fp[1], w_fp[5], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 5 *** + // Wavefunction(s) for diagram number 5 + // (none) + // Amplitude(s) for diagram number 5 + VVV1_0( w_fp[0], w_fp[7], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/driver.f index c2eadb2c31..10332b6238 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/matrix1.f index 9394a561b8..2efe3ea8fc 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -326,7 +326,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -372,7 +372,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(0) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -415,31 +416,28 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 4) /1.200000000000000D+01 - $ ,4.000000000000000D+00,4.000000000000000D+00,0.000000000000000D - $ +00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 4) /12,8,8,0/ C 1 T(1,3,2) T(5,4) - DATA (CF(I, 2),I= 1, 4) /4.000000000000000D+00 - $ ,1.200000000000000D+01,0.000000000000000D+00,4.000000000000000D - $ +00/ + DATA (CF(I),I= 5, 7) /12,0,8/ C 1 T(1,3,4) T(5,2) - DATA (CF(I, 3),I= 1, 4) /4.000000000000000D+00 - $ ,0.000000000000000D+00,1.200000000000000D+01,4.000000000000000D - $ +00/ + DATA (CF(I),I= 8, 9) /12,8/ C 1 T(1,5,2) T(3,4) - DATA (CF(I, 4),I= 1, 4) /0.000000000000000D+00 - $ ,4.000000000000000D+00,4.000000000000000D+00,1.200000000000000D - $ +01/ + DATA (CF(I),I= 10, 10) /12/ C 1 T(1,5,4) T(3,2) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. 
@@ -492,10 +490,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -504,6 +504,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc index e2d65a2667..50d05d273c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,20 +101,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 4; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,57 +169,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], 
running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute the running sums over helicities of jamp2s, numerators and denominators + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! In CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page
(for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -229,333 +282,143 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 8; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
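A note on the uniform diagramN interface used by the new kernels above: every diagram function takes the same channelIds/numerators/denominators pointers, and builds without multichannel support pass nullptr for all three, which the included boilerplate asserts on. A minimal sketch of that pattern in plain C++ follows (exampleDiagram and kSupportsMultichannel are hypothetical stand-ins, not the generated code nor the real diagram_boilerplate.h):

#include <cassert>

using fptype = double;

// Stand-in for one generated diagramN function: the parameter list is the same
// whether or not multichannel (SDE) support is compiled in, so call sites never change.
void exampleDiagram( fptype* wfs, fptype* jamps,
                     const unsigned int* channelIds, // nullptr when SDE is disabled or compiled out
                     fptype* numerators, fptype* denominators )
{
  constexpr bool kSupportsMultichannel = false; // stand-in for MGONGPU_SUPPORTS_MULTICHANNEL
  if( !kSupportsMultichannel )
  {
    // mirror of the boilerplate sanity check: all three pointers must be nullptr
    assert( channelIds == nullptr && numerators == nullptr && denominators == nullptr );
  }
  // ... compute wavefunctions and amplitudes here, then update jamps ...
  (void)wfs;
  (void)jamps;
}

int main()
{
  fptype wfs[8] = {};
  fptype jamps[8] = {};
  exampleDiagram( wfs, jamps, nullptr, nullptr, nullptr ); // uniform call site in all build modes
  return 0;
}

With this shape, the five diagramN call sites in calculate_jamps below stay identical in all build modes; only the pointer setup above them differs.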
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g.
<<warning #186-D: pointless comparison of unsigned integer with zero>> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 5 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx<M_ACCESS, W_ACCESS>( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - oxxxxx<M_ACCESS, W_ACCESS>( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx<M_ACCESS, W_ACCESS>( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx<M_ACCESS, W_ACCESS>( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - ixxxxx<M_ACCESS, W_ACCESS>( momenta, 0., cHel[ihel][4], -1, w_fp[4], 4 ); - - FFV1_2<W_ACCESS, CD_ACCESS>( w_fp[4], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[5] ); - FFV1P0_3<W_ACCESS, CD_ACCESS>( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[6] ); - - // Amplitude(s) for diagram number 1 - FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[5], w_fp[1], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2(
amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= 1. / 6. * amp_sv[0]; - jamp_sv[3] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 2 OF 5 *** - - // Wavefunction(s) for diagram number 2 - FFV1_1<W_ACCESS, CD_ACCESS>( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] ); - FFV1P0_3<W_ACCESS, CD_ACCESS>( w_fp[4], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[7] ); - - // Amplitude(s) for diagram number 2 - FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[3], w_fp[5], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif - jamp_sv[2] -= 1. / 6. * amp_sv[0]; - jamp_sv[3] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 3 OF 5 *** - - // Wavefunction(s) for diagram number 3 - FFV1_2<W_ACCESS, CD_ACCESS>( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] ); - // Amplitude(s) for diagram number 3 - FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[5], w_fp[2], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - jamp_sv[0] += 1. / 2. * amp_sv[0]; - jamp_sv[2] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 4 OF 5 *** - - // Wavefunction(s) for diagram number 4 - FFV1_1<W_ACCESS, CD_ACCESS>( w_fp[1], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[5] ); - - // Amplitude(s) for diagram number 4 - FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[4], w_fp[5], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[0] += 1. / 2. * amp_sv[0]; - jamp_sv[1] -= 1. / 6.
* amp_sv[0]; - - // *** DIAGRAM 5 OF 5 *** - // Wavefunction(s) for diagram number 5 - // (none) - - // Amplitude(s) for diagram number 5 - VVV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[0], w_fp[7], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gux_ttxux()?) - - // The color denominators (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] - - // The color matrix (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 12, 4, 4, 0 }, - { 4, 12, 0, 4 }, - { 4, 0, 12, 4 }, - { 0, 4, 4, 12 } }; // 2-D array[4][4] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
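The removed comments above rely on two algebraic facts worth spelling out. Because the color matrix cf is real, the quadratic form jamp† cf jamp splits into separate real and imaginary parts (AMA + BMB, with no cross terms); because cf is also symmetric, the sum can run over the upper triangle only, folding the factor 2 and the 1/denom normalization into the matrix at compile time, exactly as TriangularNormalizedColorMatrix does here and as the INTEGER CF/DENOM storage does in the matrix1.f hunk above. A standalone numerical check of that equivalence, using the ncolor=4 matrix and unit denominators quoted from the removed code (plain C++ sketch with made-up jamp values, not the generated kernel):

#include <cassert>
#include <cmath>
#include <complex>

int main()
{
  constexpr int ncolor = 4;
  constexpr double denom[ncolor] = { 1, 1, 1, 1 };
  constexpr double cf[ncolor][ncolor] = {
    { 12, 4, 4, 0 }, { 4, 12, 0, 4 }, { 4, 0, 12, 4 }, { 0, 4, 4, 12 } };
  const std::complex<double> jamp[ncolor] = { { 0.1, -0.2 }, { 0.3, 0.4 }, { -0.5, 0.6 }, { 0.7, -0.8 } }; // hypothetical values
  // Dense form: cf is real, so (A-iB) cf (A+iB) = A cf A + B cf B (no imaginary cross terms)
  double me2dense = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztempR = 0, ztempI = 0;
    for( int j = 0; j < ncolor; j++ )
    {
      ztempR += cf[i][j] * jamp[j].real();
      ztempI += cf[i][j] * jamp[j].imag();
    }
    me2dense += ( ztempR * jamp[i].real() + ztempI * jamp[i].imag() ) / denom[i];
  }
  // Triangular form: cf is also symmetric, so fold "2*" and "/denom[i]" into the matrix
  double me2tri = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztempR = cf[i][i] / denom[i] * jamp[i].real();
    double ztempI = cf[i][i] / denom[i] * jamp[i].imag();
    for( int j = i + 1; j < ncolor; j++ )
    {
      ztempR += 2 * cf[i][j] / denom[i] * jamp[j].real();
      ztempI += 2 * cf[i][j] / denom[i] * jamp[j].imag();
    }
    me2tri += ztempR * jamp[i].real() + ztempI * jamp[i].imag();
  }
  assert( std::abs( me2dense - me2tri ) < 1e-12 ); // both compute |M|^2 = sum_ij jamp_i† cf_ij jamp_j / denom_i
  return 0;
}

The triangular variant computes the same |M|^2 with roughly half the multiply-adds, which is why the C++ path uses it while CUDA keeps the dense loop.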
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv *
cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 5 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif -#endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -610,7 +473,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -644,6 +511,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in
file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -685,6 +556,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -787,26 +662,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -814,25 +689,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0)
__global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //--------------------------------------------------------------------------
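select_col above, like the helicity choice in add_and_select_hel, draws one entry from a discrete distribution by walking a cumulative sum of weights and stopping at the first bin whose cumulative fraction exceeds a uniform random number. A minimal standalone sketch of that selection pattern (plain C++; selectBin and the weight values are hypothetical, and the returned index is 1-based to mirror the Fortran convention used for allselcol):

#include <cassert>

// Pick one bin from n weights using a uniform random number rnd in [0,1):
// build the running sum, then return the first bin whose cumulative fraction exceeds rnd.
int selectBin( const double* weights, int n, double rnd )
{
  double cumulative[16]; // this sketch assumes n <= 16
  assert( n <= 16 );
  double sum = 0;
  for( int i = 0; i < n; i++ )
  {
    sum += weights[i];
    cumulative[i] = sum;
  }
  for( int i = 0; i < n; i++ )
    if( rnd < cumulative[i] / cumulative[n - 1] ) return i + 1; // 1-based, like allselcol/allselhel
  return n; // unreachable for rnd < 1 and positive total weight
}

int main()
{
  const double jamp2[4] = { 0.1, 0.4, 0.2, 0.3 }; // hypothetical per-color weights (cumulative 0.1, 0.5, 0.7, 1.0)
  assert( selectBin( jamp2, 4, 0.05 ) == 1 );
  assert( selectBin( jamp2, 4, 0.49 ) == 2 );
  assert( selectBin( jamp2, 4, 0.99 ) == 4 );
  return 0;
}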
// Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -968,22 +1047,16 @@ namespace mg5amcCpu // These variable are not used anywhere else in the code and their scope is limited to this sanity check { // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) - constexpr int nprocesses = 2; + constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -995,17 +1068,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] =
0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1031,93 +1107,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamps for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators and denominators for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b)
Then, in multichannel mode, also compute the running sums over helicities of the squared jamps (jamp2s) within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ?
ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1159,7 +1205,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1182,7 +1228,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1191,25 +1237,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1219,8 +1271,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1236,11 +1290,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1342,14 +1397,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h index d658e0394e..d3f0d16633 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO 
development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -78,17 +79,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 32; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 5; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 4; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 8; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 5; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 5; //static const int ncomb = 32; // CPPProcess::ncomb @@ -125,23 +126,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -155,34 +159,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f index e36675626f..a246f2aab0 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f index 61bb13c3e7..d50f96bb8d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,14 +140,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) @@ -234,7 +234,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -309,6 +309,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -393,18 +397,18 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) UX2(IVEC)=PDG2PDF(LPP(IB(2)),-2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -528,6 +532,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/color_sum.cc new file mode 100644 index 0000000000..389d8f6535 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/color_sum.cc @@ -0,0 +1,385 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). 
+// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] + + // The color matrix (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 12, 4, 4, 0 }, + { 4, 12, 0, 4 }, + { 4, 0, 12, 4 }, + { 0, 4, 4, 12 } }; // 2-D array[4][4] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) 
in speed here as we only loop over the upper triangular part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over
jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // 
Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/diagram_boilerplate.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/diagrams.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/diagrams.h new file mode 100644 index 0000000000..d44286c433 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/diagrams.h @@ -0,0 +1,174 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 5 *** + // Wavefunction(s) for diagram number 1 + vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); + oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + ixxxxx( momenta, 0., cHel[ihel][4], -1, w_fp[4], 4 ); + FFV1_2( w_fp[4], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[5] ); + FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[6] ); + // Amplitude(s) for diagram number 1 + FFV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 5 *** + // Wavefunction(s) for diagram number 2 + FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] ); + FFV1P0_3( w_fp[4], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[7] ); + // Amplitude(s) for diagram number 2 + FFV1_0( w_fp[3], w_fp[5], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 5 *** + // Wavefunction(s) for diagram number 3 + FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] ); + // Amplitude(s) for diagram number 3 + FFV1_0( w_fp[5], w_fp[2], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 5 *** + // Wavefunction(s) for diagram number 4 + FFV1_1( w_fp[1], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[5] ); + // Amplitude(s) for diagram number 4 + FFV1_0( w_fp[4], w_fp[5], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 5 *** + // Wavefunction(s) for diagram number 5 + // (none) + // Amplitude(s) for diagram number 5 + VVV1_0( w_fp[0], w_fp[7], w_fp[6], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/driver.f index c2eadb2c31..10332b6238 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/matrix1.f index c7fdad381b..0a96a485e5 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -326,7 +326,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -372,7 +372,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(0) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -415,31 +416,28 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 4) /1.200000000000000D+01 - $ ,4.000000000000000D+00,4.000000000000000D+00,0.000000000000000D - $ +00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 4) /12,8,8,0/ C 1 T(1,2,4) T(3,5) - DATA (CF(I, 2),I= 1, 4) /4.000000000000000D+00 - $ ,1.200000000000000D+01,0.000000000000000D+00,4.000000000000000D - $ +00/ + DATA (CF(I),I= 5, 7) /12,0,8/ C 1 T(1,2,5) T(3,4) - DATA (CF(I, 3),I= 1, 4) /4.000000000000000D+00 - $ ,0.000000000000000D+00,1.200000000000000D+01,4.000000000000000D - $ +00/ + DATA (CF(I),I= 8, 9) /12,8/ C 1 T(1,3,4) T(2,5) - DATA (CF(I, 4),I= 1, 4) /0.000000000000000D+00 - $ ,4.000000000000000D+00,4.000000000000000D+00,1.200000000000000D - $ +01/ + DATA (CF(I),I= 10, 10) /12/ C 1 T(1,3,5) T(2,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. 
@@ -492,10 +490,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -504,6 +504,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc index 4f41927bc9..b884fba722 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,20 +101,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 4; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,57 +169,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], 
running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page 
(for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -229,333 +282,143 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 8; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif
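// [Illustrative sketch, not from the generated code: one plausible coalesced layout for the
//  global "wf[nwf*nw6*2*nevt]" buffer described above, which lets split per-diagram CUDA
//  kernels share wavefunctions across kernel launches. The accessor name wfAccessSketch and
//  the exact index order are assumptions, not the plugin's actual API.]
__device__ inline fptype& wfAccessSketch( fptype* allWfs, int iwf, int iw6, int ix2, int nevt )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one event per GPU thread
  return allWfs[( ( iwf * nw6 + iw6 ) * 2 + ix2 ) * nevt + ievt]; // ix2 = 0 (real) or 1 (imag)
}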
// === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++) === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g. <<warning #186-D: pointless comparison of unsigned integer with zero>> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 5 *** - - // Wavefunction(s) for diagram number 1 - ixxxxx( momenta, 0., cHel[ihel][0], +1, w_fp[0], 0 ); - - oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - vxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); - - FFV1_2( w_fp[0], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[5] ); - FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[6] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= 1. / 2. * amp_sv[0]; - jamp_sv[3] += 1. / 6. * amp_sv[0];
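// [Illustrative sketch, assuming plain scalar doubles rather than the SIMD/GPU types: the
//  multichannel single-diagram-enhancement (SDE) accumulation pattern visible in the deleted
//  lines above. For each diagram, the numerator picks up |amp|^2 only for the selected
//  channel, while the denominator sums |amp|^2 over all diagrams; sigmaKin later rescales
//  the ME by numerator/denominator. The function name sdeAccumulateSketch is hypothetical.]
inline void sdeAccumulateSketch( unsigned int channelId, unsigned int idiagram, double amp2, double& numerator, double& denominator )
{
  if( channelId == idiagram ) numerator += amp2; // only the single "enhanced" diagram
  if( channelId != 0 ) denominator += amp2;      // all diagrams (channelId == 0 disables SDE)
}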
- - // *** DIAGRAM 2 OF 5 *** - - // Wavefunction(s) for diagram number 2 - FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] ); - FFV1P0_3( w_fp[0], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[7] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[3], w_fp[5], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - jamp_sv[0] += 1. / 6. * amp_sv[0]; - jamp_sv[1] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 3 OF 5 *** - - // Wavefunction(s) for diagram number 3 - FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] ); - // Amplitude(s) for diagram number 3 - FFV1_0( w_fp[5], w_fp[2], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - jamp_sv[0] += 1. / 6. * amp_sv[0]; - jamp_sv[2] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 4 OF 5 *** - - // Wavefunction(s) for diagram number 4 - FFV1_1( w_fp[1], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[5] ); - - // Amplitude(s) for diagram number 4 - FFV1_0( w_fp[0], w_fp[5], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[2] -= 1. / 2. * amp_sv[0]; - jamp_sv[3] += 1. / 6. * amp_sv[0];
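// [Illustrative sketch, assuming std::complex scalars: how each diagram's amplitude is spread
//  over the leading-color flows with fixed coefficients, as in the deleted lines above (e.g.
//  diagram 2 adds +1/6*amp to jamp[0] and -1/2*amp to jamp[1]). The helper below and its
//  colorCoef argument are hypothetical.]
#include <complex>
inline void addDiagramToJampsSketch( std::complex<double>* jamp, const std::complex<double>& amp, const std::complex<double>* colorCoef, int ncol )
{
  for( int icol = 0; icol < ncol; icol++ )
    jamp[icol] += colorCoef[icol] * amp; // colorCoef entries are values such as 0, +1/6, -1/2 or +-i/2 per diagram
}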
- - // *** DIAGRAM 5 OF 5 *** - // Wavefunction(s) for diagram number 5 - // (none) - - // Amplitude(s) for diagram number 5 - VVV1_0( w_fp[4], w_fp[7], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_uux_ttxg()?) - - // The color denominators (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] - - // The color matrix (initialize all array elements, with ncolor=4) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 12, 4, 4, 0 }, - { 4, 12, 0, 4 }, - { 4, 0, 12, 4 }, - { 0, 4, 4, 12 } }; // 2-D array[4][4] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA + iAMB - iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the upper diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
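// [Illustrative sketch, assuming plain std::complex scalars instead of the SIMD/GPU types: the
//  color sum that turns the jamps into |M|^2 = sum_{i,j} conj(jamp_i) * cf[i][j] * jamp_j / denom[i],
//  with the 4x4 color matrix quoted above. Since cf is real, only the Re*Re and Im*Im products
//  survive (the "AMA + BMB" identity in the comment above); colorSumSketch is hypothetical.]
#include <complex>
inline double colorSumSketch( const std::complex<double> jamp[4] )
{
  static constexpr double denom[4] = { 1, 1, 1, 1 };
  static constexpr double cf[4][4] = { { 12, 4, 4, 0 }, { 4, 12, 0, 4 }, { 4, 0, 12, 4 }, { 0, 4, 4, 12 } };
  double me2 = 0;
  for( int icol = 0; icol < 4; icol++ )
  {
    double ztempR = 0, ztempI = 0;
    for( int jcol = 0; jcol < 4; jcol++ )
    {
      ztempR += cf[icol][jcol] * jamp[jcol].real(); // the "AMA" (real x real) contribution
      ztempI += cf[icol][jcol] * jamp[jcol].imag(); // the "BMB" (imag x imag) contribution
    }
    me2 += ( ztempR * jamp[icol].real() + ztempI * jamp[icol].imag() ) / denom[icol];
  }
  return me2;
}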
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol];
- // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 5 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif -#endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -610,7 +473,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -644,6 +511,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory
const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -685,6 +556,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -787,26 +662,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -814,25 +689,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) assert( nevt >= neppV ); + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + add_and_select_hel( int* allselhel, // output: helicity selection[nevt] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection + fptype* ghelAllMEs, // input: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* allMEs, // output: allMEs[nevt], sum over good helicities + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + 
__global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //--------------------------------------------------------------------------
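// [Illustrative sketch of the cumulative-probability draw used both here in select_col and in
//  the helicity choice of add_and_select_hel: with non-negative weights w[i] and their running
//  sums t[i], the first index i with rnd < t[i]/t[n-1] is selected with probability w[i]/sum(w).
//  pickFromWeightsSketch is hypothetical.]
inline int pickFromWeightsSketch( const double* w, int n, double rnd ) // rnd is flat in [0,1)
{
  double total = 0;
  for( int i = 0; i < n; i++ ) total += w[i];
  double running = 0;
  for( int i = 0; i < n; i++ )
  {
    running += w[i];
    if( rnd < running / total ) return i; // running/total is the cumulative probability t[i]/t[n-1]
  }
  return n - 1; // guard against floating-point rounding when rnd is very close to 1
}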
// Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -968,22 +1047,16 @@ namespace mg5amcCpu // These variables are not used anywhere else in the code and their scope is limited to this sanity check { // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) - constexpr int nprocesses = 2; + constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -995,17 +1068,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0;
+ gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1031,93 +1107,63 @@ #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes (jamps) for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators and denominators for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } - }
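// [Illustrative sketch of the per-helicity "super-buffer" slicing used in the loop above:
//  each good helicity ighel owns one contiguous slice of the ghelAll* buffers, so the
//  per-helicity CUDA/HIP streams can run concurrently without synchronisation. The offsets
//  simply restate the layouts documented in the signatures (2 = real/imag parts); the helper
//  names are hypothetical.]
inline fptype* jampsSliceSketch( fptype* ghelAllJamps, int ighel, int nevt, int ncolor ) { return ghelAllJamps + ighel * nevt * ncolor * 2; }
inline fptype* wfsSliceSketch( fptype* ghelAllWfs, int ighel, int nevt, int nwf, int nw6 ) { return ghelAllWfs + ighel * nwf * nevt * nw6 * 2; }
inline fptype* mesSliceSketch( fptype* ghelAllMEs, int ighel, int nevt ) { return ghelAllMEs + ighel * nevt; }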
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) Then, in multichannel mode, also compute the running sums over helicities of the squared jamps (jamp2s) within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? 
ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1159,7 +1205,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1182,7 +1228,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1191,25 +1237,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1219,8 +1271,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1236,11 +1290,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1342,14 +1397,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h index ebf14aca9e..13e15fe5b2 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO 
development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include <vector> @@ -78,17 +79,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 32; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 5; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 4; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 8; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z)
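// [Worked example of the nwf counting for this P1 subprocess, based on the five diagrams
//  shown above: npar = 5 external wavefunctions (u, u~, t, t~, g) plus 3 internal propagator
//  slots (w_fp[5], w_fp[6], w_fp[7], reused across diagrams) give nwf = 5 + 3 = 8.]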
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 5; //static const int ncomb = 32; // CPPProcess::ncomb @@ -125,23 +126,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -155,34 +159,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig.f index d46dad4fcb..bfdf29ed05 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig1.f index d8e94d91bb..f1a7d1c5bc 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,7 +140,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF D1=PDG2PDF(LPP(IB(1)),1, IB(1),XBK(IB(1)), QSCALE) U1=PDG2PDF(LPP(IB(1)),2, IB(1),XBK(IB(1)), QSCALE) @@ -150,7 +150,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) @@ -237,7 +237,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -313,6 +313,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -398,24 +402,24 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) D1(IVEC)=PDG2PDF(LPP(IB(1)),1, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) U1(IVEC)=PDG2PDF(LPP(IB(1)),2, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) S1(IVEC)=PDG2PDF(LPP(IB(1)),3, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) C1(IVEC)=PDG2PDF(LPP(IB(1)),4, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) UX2(IVEC)=PDG2PDF(LPP(IB(2)),-2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -539,6 +543,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. 
IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/color_sum.cc new file mode 100644 index 0000000000..389d8f6535 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/color_sum.cc @@ -0,0 +1,385 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1 }; // 1-D array[4] + + // The color matrix (initialize all array elements, with ncolor=4) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 12, 4, 4, 0 }, + { 4, 12, 0, 4 }, + { 4, 0, 12, 4 }, + { 0, 4, 4, 12 } }; // 2-D array[4][4] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / 
colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA + iAMB - iBMA + BMB, whose real part is AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the upper-triangular part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- +
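As a cross-check of the triangular rewrite above, the identity can be verified in isolation. A minimal standalone sketch (plain C++, no SIMD; the ncolor=4 color matrix and denominators are the ones defined in this file, the jamp test values are arbitrary; an illustration, not plugin code):

#include <cassert>
#include <cmath>
#include <complex>
int main()
{
  constexpr int ncolor = 4;
  const double denom[ncolor] = { 1, 1, 1, 1 };
  const double cf[ncolor][ncolor] = { { 12, 4, 4, 0 }, { 4, 12, 0, 4 }, { 4, 0, 12, 4 }, { 0, 4, 4, 12 } };
  const std::complex<double> jamp[ncolor] = { { 1., 2. }, { -3., 0.5 }, { 0., -1. }, { 2., 2. } };
  // Full complex quadratic form: sum_ij conj(jamp[i]) * cf[i][j]/denom[i] * jamp[j]
  std::complex<double> me1 = 0;
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ )
      me1 += std::conj( jamp[i] ) * cf[i][j] / denom[i] * jamp[j];
  // Triangular real form: diagonal once, off-diagonals doubled (as in cf2 above)
  double me2 = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztR = cf[i][i] / denom[i] * jamp[i].real();
    double ztI = cf[i][i] / denom[i] * jamp[i].imag();
    for( int j = i + 1; j < ncolor; j++ )
    {
      ztR += 2 * cf[i][j] / denom[i] * jamp[j].real();
      ztI += 2 * cf[i][j] / denom[i] * jamp[j].imag();
    }
    me2 += jamp[i].real() * ztR + jamp[i].imag() * ztI; // AMA + BMB
  }
  assert( std::abs( me1.imag() ) < 1e-9 ); // the quadratic form is real
  assert( std::abs( me1.real() - me2 ) < 1e-9 ); // both forms agree
  return 0;
}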
+#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB!
Just in case this may be better for performance reasons, however, the same striding as in calculate_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from
DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/diagram_boilerplate.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/diagrams.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/diagrams.h new file mode 100644 index 0000000000..4281eca976 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/diagrams.h @@ -0,0 +1,174 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
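The color_sum_gpu entry point above is designed to be called once per good helicity, with its own stream and BLAS handle (the ghelStreams and ghelBlasHandles super-buffers introduced in sigmaKin). Stripped of all plugin types, the scheduling idea is the standard one-stream-per-independent-work-item CUDA pattern; a self-contained sketch (illustrative only: the dummy kernel, buffer size and nGoodHel value are placeholders):

#include <cstdio>
#include <cuda_runtime.h>
// Placeholder for the real per-helicity work (diagram kernels + color sum)
__global__ void perHelicityWork( float* out, int ihel )
{
  out[threadIdx.x] += ihel;
}
int main()
{
  constexpr int nGoodHel = 4; // placeholder: the real code uses cNGoodHel <= ncomb
  cudaStream_t streams[nGoodHel];
  float* buffers[nGoodHel];
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
  {
    cudaStreamCreate( &streams[ighel] );
    cudaMalloc( &buffers[ighel], 256 * sizeof( float ) );
    cudaMemsetAsync( buffers[ighel], 0, 256 * sizeof( float ), streams[ighel] );
  }
  // Kernels launched on distinct streams may overlap on the device
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
    perHelicityWork<<<1, 256, 0, streams[ighel]>>>( buffers[ighel], ighel );
  cudaDeviceSynchronize(); // join all helicity streams before the final reduction over helicities
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
  {
    cudaFree( buffers[ighel] );
    cudaStreamDestroy( streams[ighel] );
  }
  printf( "done\n" );
  return 0;
}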
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb-1) + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 5 *** + // Wavefunction(s) for diagram number 1 + ixxxxx( momenta, 0., cHel[ihel][0], +1, w_fp[0], 0 ); + oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + vxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); + FFV1_2( w_fp[0], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[5] ); + FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[6] ); + // Amplitude(s) for diagram number 1 + FFV1_0( w_fp[5], w_fp[1], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 6. * amp_sv[0]; + }
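In the notation of color_sum.cc, each diagramN kernel folds its single amplitude \(a_N\) into the partial color amplitudes \(J_i\) (the jamps) with fixed coefficients; diagram 1 above, for instance, performs

\[ J_1 \mathrel{-}= \tfrac{1}{2}\, a_1, \qquad J_3 \mathrel{+}= \tfrac{1}{6}\, a_1, \]

and once all five diagrams have run for a given helicity, the color sum evaluates

\[ |M|^2_{\mathrm{hel}} \;=\; \sum_{i,j=0}^{n_{\mathrm{color}}-1} J_i^{*}\, \frac{C_{ij}}{d_i}\, J_j, \]

with \(C\) = colorMatrix and \(d\) = colorDenom as defined in color_sum.cc.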
+ + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 5 *** + // Wavefunction(s) for diagram number 2 + FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] ); + FFV1P0_3( w_fp[0], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[7] ); + // Amplitude(s) for diagram number 2 + FFV1_0( w_fp[3], w_fp[5], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 5 *** + // Wavefunction(s) for diagram number 3 + FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[5] ); + // Amplitude(s) for diagram number 3 + FFV1_0( w_fp[5], w_fp[2], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0]; + }
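The numerators and denominators accumulated by each diagramN kernel implement the multichannel single-diagram-enhancement weight applied at the end of sigmaKin (the allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt] rescaling, or its normalise_output kernel equivalent shown earlier): with \(a_d\) the amplitude of diagram \(d\) and \(c\) the channel selected for the event,

\[ |M|^2 \;\longrightarrow\; |M|^2 \times \frac{\sum_{\mathrm{hel}} |a_c|^2}{\sum_{\mathrm{hel}} \sum_{d} |a_d|^2}, \]

i.e. the numerator only accumulates the diagram matching channelId, while the denominator accumulates every diagram whenever single-diagram enhancement is active (channelId != 0).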
+ + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 5 *** + // Wavefunction(s) for diagram number 4 + FFV1_1( w_fp[1], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[5] ); + // Amplitude(s) for diagram number 4 + FFV1_0( w_fp[0], w_fp[5], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 5 *** + // Wavefunction(s) for diagram number 5 + // (none) + // Amplitude(s) for diagram number 5 + VVV1_0( w_fp[4], w_fp[7], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2.
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/driver.f index c2eadb2c31..10332b6238 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/matrix1.f index 787dae76b2..c03378a882 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -326,7 +326,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -372,7 +372,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(0) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -415,31 +416,28 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 4) /1.200000000000000D+01 - $ ,4.000000000000000D+00,4.000000000000000D+00,0.000000000000000D - $ +00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 4) /12,8,8,0/ C 1 T(2,1) T(5,3,4) - DATA (CF(I, 2),I= 1, 4) /4.000000000000000D+00 - $ ,1.200000000000000D+01,0.000000000000000D+00,4.000000000000000D - $ +00/ + DATA (CF(I),I= 5, 7) /12,0,8/ C 1 T(2,4) T(5,3,1) - DATA (CF(I, 3),I= 1, 4) /4.000000000000000D+00 - $ ,0.000000000000000D+00,1.200000000000000D+01,4.000000000000000D - $ +00/ + DATA (CF(I),I= 8, 9) /12,8/ C 1 T(3,1) T(5,2,4) - DATA (CF(I, 4),I= 1, 4) /0.000000000000000D+00 - $ ,4.000000000000000D+00,4.000000000000000D+00,1.200000000000000D - $ +01/ + DATA (CF(I),I= 10, 10) /12/ C 1 T(3,4) T(5,2,1) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. 
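The packed color-matrix storage introduced above can be checked with a few lines of standalone C++ (an illustration using the NCOLOR=4 data of this hunk, not generated code): the upper triangle is stored row by row with off-diagonal entries pre-doubled, so the running-index triangular loop of the next hunk reproduces the full symmetric sum, with the common DENOM applied once at the end. A real test vector is used for simplicity (JAMP is complex in the Fortran, but the packing argument is identical):

#include <cassert>
int main()
{
  constexpr int ncolor = 4;
  // Packed upper triangle, row by row, off-diagonals pre-doubled: matches DATA (CF(I),...) above
  const int cfPacked[ncolor * ( ncolor + 1 ) / 2] = { 12, 8, 8, 0, 12, 0, 8, 12, 8, 12 };
  // Full symmetric matrix, as in the old DATA (CF(I,J),...) statements
  const int cfDense[ncolor][ncolor] = { { 12, 4, 4, 0 }, { 4, 12, 0, 4 }, { 4, 0, 12, 4 }, { 0, 4, 4, 12 } };
  const double jamp[ncolor] = { 0.5, -1.25, 2.0, 0.75 }; // arbitrary (exactly representable) test values
  double full = 0, packed = 0;
  int cfIndex = 0; // running index, as CF_INDEX in MATRIX1
  for( int i = 0; i < ncolor; i++ )
  {
    double ztempFull = 0, ztempPacked = 0;
    for( int j = 0; j < ncolor; j++ ) ztempFull += cfDense[i][j] * jamp[j];
    for( int j = i; j < ncolor; j++ ) ztempPacked += cfPacked[cfIndex++] * jamp[j];
    full += ztempFull * jamp[i];
    packed += ztempPacked * jamp[i];
  }
  assert( packed == full ); // exact equality for these dyadic test values (DENOM=1 here)
  return 0;
}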
@@ -492,10 +490,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -504,6 +504,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc index da962495fd..63e8317212 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,20 +98,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 24; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,57 +166,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], 
running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page 
(for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -226,2412 +279,379 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 26; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV };
+ // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code
 cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles)
- cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram
-
- // Proof of concept for using fptype* in the interface
- fptype* w_fp[nwf];
- for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] );
- fptype* amp_fp;
- amp_fp = reinterpret_cast<fptype*>( amp_sv );
-
- // Local variables for the given CUDA event (ievt) or C++ event page (ipagV)
- // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination]
- cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!)
+ fptype* wfs = reinterpret_cast<fptype*>( w_sv );
+#else
+ // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt)
+ // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting
+ fptype* wfs = allWfs;
+#endif
 // === Calculate wavefunctions and amplitudes for all diagrams in all processes ===
 // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++) ===
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
- // Mixed fptypes #537: float for color algebra and double elsewhere
- // Delay color algebra and ME updates (only on even pages)
- cxtype_sv jamp_sv_previous[ncolor] = {};
- fptype* MEs_previous = 0;
-#endif
+
+ // *****************************
+ // *** START LOOP ON IPARITY ***
+ // *****************************
 for( int iParity = 0; iParity < nParity; ++iParity )
- { // START LOOP ON IPARITY
+ {
#ifndef MGONGPUCPP_GPUIMPL
 const int ievt0 = ievt00 + iParity * neppV;
#endif
- //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823)
- constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823)
- const fptype* allCOUPs[nxcoup];
-#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
-#pragma nv_diagnostic push
-#pragma nv_diag_suppress 186 // e.g. <<pointless comparison of unsigned integer with zero>>
-#endif
- for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
- allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event
- //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823
- for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823
- allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events
+
+ // -----------------
+ // --- COUPLINGS ---
+ // -----------------
#ifdef MGONGPUCPP_GPUIMPL
-#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
-#pragma nv_diagnostic pop
-#endif
- // CUDA kernels take input/output buffers with momenta/MEs for all events
- const fptype* momenta = allmomenta;
- const fptype* COUPs[nxcoup];
- for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
- fptype* MEs = allMEs;
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- fptype* numerators = allNumerators;
- fptype* denominators = allDenominators;
-#endif
+ // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events
+ const fptype* couplings = allcouplings;
#else
- // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page)
- const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 );
+ // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector
+ constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup)
+ const fptype* allCOUPs[nxcoup];
 const fptype* COUPs[nxcoup];
+ // Dependent couplings, vary event-by-event
 for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
- COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event
- //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823
- for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823
- COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events
- fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
- fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-#endif
-#endif
-
- // Reset color flows (reset jamp_sv) at the beginning of a new event or event page
- for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); }
-
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
- fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
- fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-#endif
-
- // *** DIAGRAM 1 OF 123 ***
-
- // Wavefunction(s) for diagram number 1
- vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 );
-
- vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 );
-
- oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 );
-
- ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 );
-
- vxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 );
-
- vxxxxx( momenta, 0., cHel[ihel][5], +1, w_fp[5], 5 );
-
- VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[6] );
- FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[7] );
-
- // Amplitude(s) for diagram number 1
- VVVV1_0( w_fp[6], w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
- VVVV3_0( w_fp[6], w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
- VVVV4_0( w_fp[6], w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 2 OF 123 ***
-
- // Wavefunction(s) for diagram number 2
- VVV1P0_1( w_fp[6], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[8] );
-
- // Amplitude(s) for diagram number 2
- VVV1_0( w_fp[7], w_fp[5], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 3 OF 123 ***
-
- // Wavefunction(s) for diagram number 3
- VVV1P0_1( w_fp[6], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[9] );
-
- // Amplitude(s) for diagram number 3
- VVV1_0( w_fp[7], w_fp[4], w_fp[9], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 4 OF 123 ***
-
- // Wavefunction(s) for diagram number 4
- VVV1P0_1( w_fp[4], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[10] );
-
- // Amplitude(s) for diagram number 4
- VVV1_0( w_fp[6], w_fp[7], w_fp[10], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-
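[Editor's note: the jamp_sv updates after each amplitude call above implement one step of the color decomposition: every diagram's amplitude, weighted by plus or minus 1 or i, is added to the color flows it feeds. A minimal standalone sketch of that pattern, with scalar std::complex standing in for the plugin's SIMD cxtype_sv and an invented amplitude value:

    #include <complex>
    int main()
    {
      using cxtype = std::complex<double>;
      constexpr int ncolor = 24;       // gg->ttgg has 24 color flows
      cxtype jamp[ncolor] = {};        // "= {}" matters: a scalar complex is not zero-initialized otherwise
      const cxtype amp( 0.3, -0.7 );   // toy value for one diagram's amplitude
      jamp[0] += cxtype( 0, 1 ) * amp; // this diagram feeds flow 0 with weight +i
      jamp[1] -= cxtype( 0, 1 ) * amp; // ... and flow 1 with weight -i
      return jamp[0] == -jamp[1] ? 0 : 1;
    }
]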
- // *** DIAGRAM 5 OF 123 ***
-
- // Wavefunction(s) for diagram number 5
- FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] );
- FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] );
-
- // Amplitude(s) for diagram number 5
- FFV1_0( w_fp[12], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 6 OF 123 ***
-
- // Wavefunction(s) for diagram number 6
- // (none)
-
- // Amplitude(s) for diagram number 6
- FFV1_0( w_fp[3], w_fp[11], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[12] += amp_sv[0];
- jamp_sv[14] -= amp_sv[0];
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[17] += amp_sv[0];
-
- // *** DIAGRAM 7 OF 123 ***
-
- // Wavefunction(s) for diagram number 7
- FFV1_2( w_fp[3], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] );
-
- // Amplitude(s) for diagram number 7
- FFV1_0( w_fp[13], w_fp[11], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 8 OF 123 ***
-
- // Wavefunction(s) for diagram number 8
- FFV1_1( w_fp[2], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] );
-
- // Amplitude(s) for diagram number 8
- FFV1_0( w_fp[12], w_fp[14], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 9 OF 123 ***
-
- // Wavefunction(s) for diagram number 9
- // (none)
-
- // Amplitude(s) for diagram number 9
- FFV1_0( w_fp[3], w_fp[14], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[18] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
-
- // *** DIAGRAM 10 OF 123 ***
-
- // Wavefunction(s) for diagram number 10
- FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[15] );
-
- // Amplitude(s) for diagram number 10
- FFV1_0( w_fp[15], w_fp[14], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 11 OF 123 ***
-
- // Wavefunction(s) for diagram number 11
- FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] );
-
- // Amplitude(s) for diagram number 11
- FFV1_0( w_fp[15], w_fp[16], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 12 OF 123 ***
-
- // Wavefunction(s) for diagram number 12
- // (none)
-
- // Amplitude(s) for diagram number 12
- FFV1_0( w_fp[15], w_fp[2], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] += amp_sv[0];
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[20] += amp_sv[0];
-
- // *** DIAGRAM 13 OF 123 ***
-
- // Wavefunction(s) for diagram number 13
- // (none)
-
- // Amplitude(s) for diagram number 13
- FFV1_0( w_fp[13], w_fp[16], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 14 OF 123 ***
-
- // Wavefunction(s) for diagram number 14
- // (none)
-
- // Amplitude(s) for diagram number 14
- FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += amp_sv[0];
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[12] -= amp_sv[0];
- jamp_sv[14] += amp_sv[0];
-
- // *** DIAGRAM 15 OF 123 ***
-
- // Wavefunction(s) for diagram number 15
- // (none)
-
- // Amplitude(s) for diagram number 15
- FFV1_0( w_fp[3], w_fp[16], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += amp_sv[0];
- jamp_sv[1] -= amp_sv[0];
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
-
- // *** DIAGRAM 16 OF 123 ***
-
- // Wavefunction(s) for diagram number 16
- // (none)
-
- // Amplitude(s) for diagram number 16
- FFV1_0( w_fp[12], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[16] += amp_sv[0];
- jamp_sv[17] -= amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
-
- // *** DIAGRAM 17 OF 123 ***
-
- // Wavefunction(s) for diagram number 17
- FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] );
- FFV1_2( w_fp[3], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] );
- FFV1_1( w_fp[12], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] );
-
- // Amplitude(s) for diagram number 17
- FFV1_0( w_fp[16], w_fp[8], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 17 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[3] -= amp_sv[0];
-
- // *** DIAGRAM 18 OF 123 ***
-
- // Wavefunction(s) for diagram number 18
- FFV1_1( w_fp[12], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] );
-
- // Amplitude(s) for diagram number 18
- FFV1_0( w_fp[16], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[5] -= amp_sv[0];
-
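[Editor's note: the w_fp pointer table set up before diagram 1 (w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] )) lets the helicity routines take plain fptype* buffers while the diagram code keeps reading the same storage as complex vectors. A scalar sketch of that dual view, assuming (as in the double-precision build) that one complex value is laid out as two consecutive fptype, which std::complex guarantees:

    #include <complex>
    #include <cstdio>
    int main()
    {
      using fptype = double;
      std::complex<fptype> w[6] = { { 1, 2 } };         // one toy wavefunction, nw6 = 6 components
      fptype* w_fp = reinterpret_cast<fptype*>( w );    // same bytes, viewed as 12 fptype
      std::printf( "re=%f im=%f\n", w_fp[0], w_fp[1] ); // prints 1 and 2: re/im are adjacent
      return 0;
    }
]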
- // *** DIAGRAM 19 OF 123 ***
-
- // Wavefunction(s) for diagram number 19
- // (none)
-
- // Amplitude(s) for diagram number 19
- FFV1_0( w_fp[16], w_fp[12], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 20 OF 123 ***
-
- // Wavefunction(s) for diagram number 20
- VVV1P0_1( w_fp[1], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[6] );
- FFV1P0_3( w_fp[3], w_fp[12], COUPs[1], 1.0, 0., 0., w_fp[17] );
-
- // Amplitude(s) for diagram number 20
- VVV1_0( w_fp[6], w_fp[5], w_fp[17], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += amp_sv[0];
- jamp_sv[2] -= amp_sv[0];
- jamp_sv[4] -= amp_sv[0];
- jamp_sv[5] += amp_sv[0];
-
- // *** DIAGRAM 21 OF 123 ***
-
- // Wavefunction(s) for diagram number 21
- // (none)
-
- // Amplitude(s) for diagram number 21
- FFV1_0( w_fp[3], w_fp[9], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 21 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 22 OF 123 ***
-
- // Wavefunction(s) for diagram number 22
- // (none)
-
- // Amplitude(s) for diagram number 22
- FFV1_0( w_fp[13], w_fp[12], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 23 OF 123 ***
-
- // Wavefunction(s) for diagram number 23
- VVV1P0_1( w_fp[1], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[18] );
-
- // Amplitude(s) for diagram number 23
- VVV1_0( w_fp[18], w_fp[4], w_fp[17], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] += amp_sv[0];
- jamp_sv[2] -= amp_sv[0];
- jamp_sv[3] += amp_sv[0];
- jamp_sv[4] -= amp_sv[0];
-
- // *** DIAGRAM 24 OF 123 ***
-
- // Wavefunction(s) for diagram number 24
- // (none)
-
- // Amplitude(s) for diagram number 24
- FFV1_0( w_fp[3], w_fp[8], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 25 OF 123 ***
-
- // Wavefunction(s) for diagram number 25
- // (none)
-
- // Amplitude(s) for diagram number 25
- FFV1_0( w_fp[15], w_fp[12], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 25 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 26 OF 123 ***
-
- // Wavefunction(s) for diagram number 26
- FFV1_1( w_fp[12], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[19] );
-
- // Amplitude(s) for diagram number 26
- FFV1_0( w_fp[15], w_fp[19], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] -= amp_sv[0];
-
- // *** DIAGRAM 27 OF 123 ***
-
- // Wavefunction(s) for diagram number 27
- // (none)
-
- // Amplitude(s) for diagram number 27
- FFV1_0( w_fp[15], w_fp[9], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[4] -= amp_sv[0];
-
- // *** DIAGRAM 28 OF 123 ***
-
- // Wavefunction(s) for diagram number 28
- // (none)
-
- // Amplitude(s) for diagram number 28
- FFV1_0( w_fp[13], w_fp[19], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] -= amp_sv[0];
-
- // *** DIAGRAM 29 OF 123 ***
-
- // Wavefunction(s) for diagram number 29
- // (none)
-
- // Amplitude(s) for diagram number 29
- FFV1_0( w_fp[13], w_fp[8], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[2] -= amp_sv[0];
-
- // *** DIAGRAM 30 OF 123 ***
-
- // Wavefunction(s) for diagram number 30
- // (none)
-
- // Amplitude(s) for diagram number 30
- FFV1_0( w_fp[3], w_fp[19], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 31 OF 123 ***
-
- // Wavefunction(s) for diagram number 31
- // (none)
-
- // Amplitude(s) for diagram number 31
- VVV1_0( w_fp[1], w_fp[10], w_fp[17], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += amp_sv[0];
- jamp_sv[1] -= amp_sv[0];
- jamp_sv[3] -= amp_sv[0];
- jamp_sv[5] += amp_sv[0];
-
- // *** DIAGRAM 32 OF 123 ***
-
- // Wavefunction(s) for diagram number 32
- VVVV1P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[17] );
- VVVV3P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[19] );
- VVVV4P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[8] );
-
- // Amplitude(s) for diagram number 32
- FFV1_0( w_fp[3], w_fp[12], w_fp[17], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[0] += amp_sv[0];
- jamp_sv[1] -= amp_sv[0];
- jamp_sv[3] -= amp_sv[0];
- jamp_sv[5] += amp_sv[0];
- FFV1_0( w_fp[3], w_fp[12], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[1] -= amp_sv[0];
- jamp_sv[2] += amp_sv[0];
- jamp_sv[3] -= amp_sv[0];
- jamp_sv[4] += amp_sv[0];
- FFV1_0( w_fp[3], w_fp[12], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[0] -= amp_sv[0];
- jamp_sv[2] += amp_sv[0];
- jamp_sv[4] += amp_sv[0];
- jamp_sv[5] -= amp_sv[0];
-
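[Editor's note: every "#ifdef MGONGPU_SUPPORTS_MULTICHANNEL" block above performs the same two updates, which drive single-diagram enhancement: the numerator keeps only the |amplitude|² of the sampled channel, while the denominator sums |amplitude|² over all channels. A scalar sketch with plain doubles instead of fptype_sv:

    #include <complex>
    // Accumulate one diagram's contribution to the multichannel weight.
    void addChannel( unsigned channelId, unsigned thisDiagram, std::complex<double> amp,
                     double& numerator, double& denominator )
    {
      const double amp2 = std::norm( amp );             // |amp|^2, i.e. what cxabs2() returns
      if( channelId == thisDiagram ) numerator += amp2; // only the sampled channel
      if( channelId != 0 ) denominator += amp2;         // all channels (0 disables multichannel)
    }
]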
- // *** DIAGRAM 33 OF 123 ***
-
- // Wavefunction(s) for diagram number 33
- FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] );
- FFV1_1( w_fp[2], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] );
- FFV1_2( w_fp[12], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[20] );
-
- // Amplitude(s) for diagram number 33
- FFV1_0( w_fp[20], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[11] -= amp_sv[0];
-
- // *** DIAGRAM 34 OF 123 ***
-
- // Wavefunction(s) for diagram number 34
- FFV1_2( w_fp[12], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] );
-
- // Amplitude(s) for diagram number 34
- FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 34 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[9] -= amp_sv[0];
-
- // *** DIAGRAM 35 OF 123 ***
-
- // Wavefunction(s) for diagram number 35
- // (none)
-
- // Amplitude(s) for diagram number 35
- FFV1_0( w_fp[12], w_fp[9], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 36 OF 123 ***
-
- // Wavefunction(s) for diagram number 36
- FFV1P0_3( w_fp[12], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[22] );
-
- // Amplitude(s) for diagram number 36
- VVV1_0( w_fp[6], w_fp[5], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[9] += amp_sv[0];
- jamp_sv[15] -= amp_sv[0];
- jamp_sv[21] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
-
- // *** DIAGRAM 37 OF 123 ***
-
- // Wavefunction(s) for diagram number 37
- // (none)
-
- // Amplitude(s) for diagram number 37
- FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 37 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 38 OF 123 ***
-
- // Wavefunction(s) for diagram number 38
- // (none)
-
- // Amplitude(s) for diagram number 38
- FFV1_0( w_fp[12], w_fp[14], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 38 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 39 OF 123 ***
-
- // Wavefunction(s) for diagram number 39
- // (none)
-
- // Amplitude(s) for diagram number 39
- VVV1_0( w_fp[18], w_fp[4], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 39 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+ allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup );
+ // Independent couplings, fixed for all events
+ for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup)
+ allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup );
+ // Dependent couplings, vary event-by-event
+ for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
+ COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 );
+ // Independent couplings, fixed for all events
+ for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup)
+ COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup];
#endif
- jamp_sv[11] += amp_sv[0];
- jamp_sv[15] -= amp_sv[0];
- jamp_sv[17] += amp_sv[0];
- jamp_sv[21] -= amp_sv[0];
-
- // *** DIAGRAM 40 OF 123 ***
- // Wavefunction(s) for diagram number 40
- // (none)
-
- // Amplitude(s) for diagram number 40
- FFV1_0( w_fp[20], w_fp[2], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 40 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 41 OF 123 ***
-
- // Wavefunction(s) for diagram number 41
- // (none)
-
- // Amplitude(s) for diagram number 41
- FFV1_0( w_fp[12], w_fp[11], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 41 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 42 OF 123 ***
-
- // Wavefunction(s) for diagram number 42
- FFV1_2( w_fp[12], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] );
-
- // Amplitude(s) for diagram number 42
- FFV1_0( w_fp[23], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 42 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[17] -= amp_sv[0];
-
- // *** DIAGRAM 43 OF 123 ***
-
- // Wavefunction(s) for diagram number 43
- // (none)
-
- // Amplitude(s) for diagram number 43
- FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 43 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[15] -= amp_sv[0];
-
- // *** DIAGRAM 44 OF 123 ***
-
- // Wavefunction(s) for diagram number 44
- // (none)
-
- // Amplitude(s) for diagram number 44
- FFV1_0( w_fp[23], w_fp[14], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 44 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[23] -= amp_sv[0];
-
- // *** DIAGRAM 45 OF 123 ***
-
- // Wavefunction(s) for diagram number 45
- // (none)
-
- // Amplitude(s) for diagram number 45
- FFV1_0( w_fp[20], w_fp[14], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 45 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[21] -= amp_sv[0];
-
- // *** DIAGRAM 46 OF 123 ***
-
- // Wavefunction(s) for diagram number 46
- // (none)
-
- // Amplitude(s) for diagram number 46
- FFV1_0( w_fp[23], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 46 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-
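[Editor's note: the "+" block just above is the new two-level couplings lookup: dependent couplings live in a per-event buffer, independent couplings in one shared constant array, and COUPs[] unifies both behind a single pointer table. A compilable sketch of the idea with an illustrative flat layout (not the plugin's AOSOA access classes):

    #include <cstddef>
    int main()
    {
      constexpr std::size_t nevt = 4, ndcoup = 2, nicoup = 1;
      double depBuf[ndcoup][nevt] = {};      // dependent: one record per event
      double indBuf[nicoup] = { 1.218 };     // independent: fixed for all events (toy value)
      const double* COUPs[ndcoup + nicoup];
      const std::size_t ievt0 = 2;           // first event of the current "page"
      for( std::size_t i = 0; i < ndcoup; i++ ) COUPs[i] = &depBuf[i][ievt0];   // event-by-event
      for( std::size_t i = 0; i < nicoup; i++ ) COUPs[ndcoup + i] = &indBuf[i]; // shared
      return COUPs[0] != nullptr ? 0 : 1;
    }
]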
- // *** DIAGRAM 47 OF 123 ***
-
- // Wavefunction(s) for diagram number 47
- // (none)
-
- // Amplitude(s) for diagram number 47
- VVV1_0( w_fp[1], w_fp[10], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 47 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[9] += amp_sv[0];
- jamp_sv[11] -= amp_sv[0];
- jamp_sv[17] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
-
- // *** DIAGRAM 48 OF 123 ***
-
- // Wavefunction(s) for diagram number 48
- // (none)
-
- // Amplitude(s) for diagram number 48
- FFV1_0( w_fp[12], w_fp[2], w_fp[17], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[9] += amp_sv[0];
- jamp_sv[11] -= amp_sv[0];
- jamp_sv[17] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
- FFV1_0( w_fp[12], w_fp[2], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[11] -= amp_sv[0];
- jamp_sv[15] += amp_sv[0];
- jamp_sv[17] -= amp_sv[0];
- jamp_sv[21] += amp_sv[0];
- FFV1_0( w_fp[12], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[9] -= amp_sv[0];
- jamp_sv[15] += amp_sv[0];
- jamp_sv[21] += amp_sv[0];
- jamp_sv[23] -= amp_sv[0];
-
- // *** DIAGRAM 49 OF 123 ***
-
- // Wavefunction(s) for diagram number 49
- VVV1P0_1( w_fp[0], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[12] );
- FFV1_2( w_fp[3], w_fp[12], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[22] );
-
- // Amplitude(s) for diagram number 49
- FFV1_0( w_fp[22], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 49 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 50 OF 123 ***
-
- // Wavefunction(s) for diagram number 50
- VVV1P0_1( w_fp[12], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[23] );
-
- // Amplitude(s) for diagram number 50
- FFV1_0( w_fp[3], w_fp[9], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 50 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[6] += amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[11] += amp_sv[0];
-
- // *** DIAGRAM 51 OF 123 ***
-
- // Wavefunction(s) for diagram number 51
- // (none)
-
- // Amplitude(s) for diagram number 51
- FFV1_0( w_fp[13], w_fp[9], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 51 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 52 OF 123 ***
-
- // Wavefunction(s) for diagram number 52
- FFV1_1( w_fp[2], w_fp[12], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[20] );
-
- // Amplitude(s) for diagram number 52
- FFV1_0( w_fp[16], w_fp[20], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 52 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 53 OF 123 ***
-
- // Wavefunction(s) for diagram number 53
- // (none)
-
- // Amplitude(s) for diagram number 53
- FFV1_0( w_fp[16], w_fp[2], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 53 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[3] += amp_sv[0];
- jamp_sv[13] -= amp_sv[0];
- jamp_sv[19] -= amp_sv[0];
- jamp_sv[22] += amp_sv[0];
-
- // *** DIAGRAM 54 OF 123 ***
-
- // Wavefunction(s) for diagram number 54
- // (none)
-
- // Amplitude(s) for diagram number 54
- FFV1_0( w_fp[16], w_fp[14], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 54 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 55 OF 123 ***
-
- // Wavefunction(s) for diagram number 55
- // (none)
-
- // Amplitude(s) for diagram number 55
- FFV1_0( w_fp[3], w_fp[20], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 55 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[2] += amp_sv[0];
- jamp_sv[3] -= amp_sv[0];
- jamp_sv[12] -= amp_sv[0];
- jamp_sv[13] += amp_sv[0];
-
- // *** DIAGRAM 56 OF 123 ***
-
- // Wavefunction(s) for diagram number 56
- // (none)
-
- // Amplitude(s) for diagram number 56
- FFV1_0( w_fp[22], w_fp[2], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 56 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[10] += amp_sv[0];
- jamp_sv[11] -= amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[21] += amp_sv[0];
-
- // *** DIAGRAM 57 OF 123 ***
-
- // Wavefunction(s) for diagram number 57
- // (none)
-
- // Amplitude(s) for diagram number 57
- VVV1_0( w_fp[12], w_fp[18], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 57 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 58 OF 123 ***
-
- // Wavefunction(s) for diagram number 58
- // (none)
-
- // Amplitude(s) for diagram number 58
- VVVV1_0( w_fp[12], w_fp[1], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
- VVVV3_0( w_fp[12], w_fp[1], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
- VVVV4_0( w_fp[12], w_fp[1], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-
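[Editor's note: cxabs2(), used in all the numerator/denominator updates above, is the squared modulus without the square root that std::abs would take. A scalar equivalent, assuming std::complex in place of the plugin's cxtype_sv:

    #include <complex>
    inline double cxabs2( const std::complex<double>& z )
    {
      return z.real() * z.real() + z.imag() * z.imag(); // same value as std::norm( z )
    }
]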
- // *** DIAGRAM 59 OF 123 ***
-
- // Wavefunction(s) for diagram number 59
- VVV1P0_1( w_fp[12], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[21] );
-
- // Amplitude(s) for diagram number 59
- VVV1_0( w_fp[7], w_fp[5], w_fp[21], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 59 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 60 OF 123 ***
-
- // Wavefunction(s) for diagram number 60
- // (none)
-
- // Amplitude(s) for diagram number 60
- VVV1_0( w_fp[1], w_fp[7], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 60 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 61 OF 123 ***
-
- // Wavefunction(s) for diagram number 61
- // (none)
-
- // Amplitude(s) for diagram number 61
- FFV1_0( w_fp[3], w_fp[14], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 61 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[19] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
- jamp_sv[21] += amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
-
- // *** DIAGRAM 62 OF 123 ***
-
- // Wavefunction(s) for diagram number 62
- // (none)
-
- // Amplitude(s) for diagram number 62
- FFV1_0( w_fp[22], w_fp[14], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 62 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 63 OF 123 ***
-
- // Wavefunction(s) for diagram number 63
- // (none)
-
- // Amplitude(s) for diagram number 63
- FFV1_0( w_fp[13], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 63 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[2] += amp_sv[0];
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[8] += amp_sv[0];
- jamp_sv[12] -= amp_sv[0];
-
- // *** DIAGRAM 64 OF 123 ***
-
- // Wavefunction(s) for diagram number 64
- // (none)
-
- // Amplitude(s) for diagram number 64
- FFV1_0( w_fp[13], w_fp[20], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 64 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[2] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 65 OF 123 ***
-
- // Wavefunction(s) for diagram number 65
- VVV1P0_1( w_fp[0], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[20] );
- FFV1_2( w_fp[3], w_fp[20], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] );
-
- // Amplitude(s) for diagram number 65
- FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 65 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 66 OF 123 ***
-
- // Wavefunction(s) for diagram number 66
- VVV1P0_1( w_fp[20], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[22] );
-
- // Amplitude(s) for diagram number 66
- FFV1_0( w_fp[3], w_fp[9], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 66 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[7] += amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[9] += amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
-
- // *** DIAGRAM 67 OF 123 ***
-
- // Wavefunction(s) for diagram number 67
- // (none)
-
- // Amplitude(s) for diagram number 67
- FFV1_0( w_fp[15], w_fp[9], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 67 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 68 OF 123 ***
-
- // Wavefunction(s) for diagram number 68
- FFV1_1( w_fp[2], w_fp[20], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] );
-
- // Amplitude(s) for diagram number 68
- FFV1_0( w_fp[16], w_fp[23], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 68 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 69 OF 123 ***
-
- // Wavefunction(s) for diagram number 69
- // (none)
-
- // Amplitude(s) for diagram number 69
- FFV1_0( w_fp[16], w_fp[2], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 69 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[5] += amp_sv[0];
- jamp_sv[13] -= amp_sv[0];
- jamp_sv[16] += amp_sv[0];
- jamp_sv[19] -= amp_sv[0];
-
- // *** DIAGRAM 70 OF 123 ***
-
- // Wavefunction(s) for diagram number 70
- // (none)
-
- // Amplitude(s) for diagram number 70
- FFV1_0( w_fp[16], w_fp[11], w_fp[20], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 70 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 71 OF 123 ***
-
- // Wavefunction(s) for diagram number 71
- // (none)
-
- // Amplitude(s) for diagram number 71
- FFV1_0( w_fp[3], w_fp[23], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 71 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[4] += amp_sv[0];
- jamp_sv[5] -= amp_sv[0];
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[19] += amp_sv[0];
-
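[Editor's note: in the C++ backend every *_sv quantity in these diagrams is a short vector: one "event page" of neppV events advances through the identical instruction stream in lockstep, which is what the two-page iParity loop above iterates over. A portable stand-in for the compiler vector types the plugin actually uses (a sketch, not the real mgOnGpuVectors.h types):

    #include <cstddef>
    constexpr std::size_t neppV = 4;      // events per page (build-time constant)
    struct fptype_v { double d[neppV]; }; // one slot per event
    // Adding two pages processes neppV events with one auto-vectorizable loop.
    fptype_v operator+( const fptype_v& a, const fptype_v& b )
    {
      fptype_v r;
      for( std::size_t i = 0; i < neppV; i++ ) r.d[i] = a.d[i] + b.d[i];
      return r;
    }
]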
- // *** DIAGRAM 72 OF 123 ***
-
- // Wavefunction(s) for diagram number 72
- // (none)
-
- // Amplitude(s) for diagram number 72
- FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 72 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[8] += amp_sv[0];
- jamp_sv[9] -= amp_sv[0];
- jamp_sv[14] -= amp_sv[0];
- jamp_sv[15] += amp_sv[0];
-
- // *** DIAGRAM 73 OF 123 ***
-
- // Wavefunction(s) for diagram number 73
- // (none)
-
- // Amplitude(s) for diagram number 73
- VVV1_0( w_fp[20], w_fp[6], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 73 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 74 OF 123 ***
-
- // Wavefunction(s) for diagram number 74
- // (none)
-
- // Amplitude(s) for diagram number 74
- VVVV1_0( w_fp[20], w_fp[1], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
- VVVV3_0( w_fp[20], w_fp[1], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- VVVV4_0( w_fp[20], w_fp[1], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 75 OF 123 ***
-
- // Wavefunction(s) for diagram number 75
- VVV1P0_1( w_fp[20], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[12] );
-
- // Amplitude(s) for diagram number 75
- VVV1_0( w_fp[7], w_fp[4], w_fp[12], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 75 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 76 OF 123 ***
-
- // Wavefunction(s) for diagram number 76
- // (none)
-
- // Amplitude(s) for diagram number 76
- VVV1_0( w_fp[1], w_fp[7], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 76 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 77 OF 123 ***
-
- // Wavefunction(s) for diagram number 77
- // (none)
-
- // Amplitude(s) for diagram number 77
- FFV1_0( w_fp[3], w_fp[11], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 77 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[13] += amp_sv[0];
- jamp_sv[14] -= amp_sv[0];
- jamp_sv[15] += amp_sv[0];
- jamp_sv[16] -= amp_sv[0];
-
- // *** DIAGRAM 78 OF 123 ***
-
- // Wavefunction(s) for diagram number 78
- // (none)
-
- // Amplitude(s) for diagram number 78
- FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 78 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 79 OF 123 ***
-
- // Wavefunction(s) for diagram number 79
- // (none)
-
- // Amplitude(s) for diagram number 79
- FFV1_0( w_fp[15], w_fp[2], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 79 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[4] += amp_sv[0];
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[10] += amp_sv[0];
- jamp_sv[18] -= amp_sv[0];
-
- // *** DIAGRAM 80 OF 123 ***
-
- // Wavefunction(s) for diagram number 80
- // (none)
-
- // Amplitude(s) for diagram number 80
- FFV1_0( w_fp[15], w_fp[23], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 80 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[4] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 81 OF 123 ***
-
- // Wavefunction(s) for diagram number 81
- FFV1_1( w_fp[9], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] );
-
- // Amplitude(s) for diagram number 81
- FFV1_0( w_fp[15], w_fp[23], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 81 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[7] -= amp_sv[0];
-
- // *** DIAGRAM 82 OF 123 ***
-
- // Wavefunction(s) for diagram number 82
- FFV1_2( w_fp[15], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] );
-
- // Amplitude(s) for diagram number 82
- FFV1_0( w_fp[12], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 82 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[10] -= amp_sv[0];
-
- // *** DIAGRAM 83 OF 123 ***
-
- // Wavefunction(s) for diagram number 83
- // (none)
-
- // Amplitude(s) for diagram number 83
- FFV1_0( w_fp[13], w_fp[23], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 83 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[6] -= amp_sv[0];
-
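[Editor's note: the mass/width pair cIPD[0], cIPD[1] passed to the FFV1_1/FFV1_2 calls above enters through the internal-particle propagator: the ALOHA routines ending in _1/_2/_3 return an off-shell wavefunction carrying a Breit-Wigner-like denominator. A sketch of just that factor (not the real generated routines):

    #include <complex>
    // Propagator denominator factor 1 / (p^2 - M^2 + i*M*Gamma).
    std::complex<double> propagatorFactor( double p2, double mass, double width )
    {
      return 1.0 / std::complex<double>( p2 - mass * mass, mass * width );
    }
]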
- // *** DIAGRAM 84 OF 123 ***
-
- // Wavefunction(s) for diagram number 84
- FFV1_2( w_fp[13], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] );
-
- // Amplitude(s) for diagram number 84
- FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 84 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[8] -= amp_sv[0];
-
- // *** DIAGRAM 85 OF 123 ***
-
- // Wavefunction(s) for diagram number 85
- // (none)
-
- // Amplitude(s) for diagram number 85
- FFV1_0( w_fp[3], w_fp[23], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 85 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 86 OF 123 ***
-
- // Wavefunction(s) for diagram number 86
- VVV1P0_1( w_fp[0], w_fp[10], COUPs[0], 1.0, 0., 0., w_fp[23] );
-
- // Amplitude(s) for diagram number 86
- FFV1_0( w_fp[3], w_fp[9], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 86 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[6] += amp_sv[0];
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[9] -= amp_sv[0];
- jamp_sv[11] += amp_sv[0];
-
- // *** DIAGRAM 87 OF 123 ***
-
- // Wavefunction(s) for diagram number 87
- FFV1_2( w_fp[16], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[22] );
-
- // Amplitude(s) for diagram number 87
- FFV1_0( w_fp[22], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 87 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[16] -= amp_sv[0];
-
- // *** DIAGRAM 88 OF 123 ***
-
- // Wavefunction(s) for diagram number 88
- FFV1_1( w_fp[11], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[20] );
-
- // Amplitude(s) for diagram number 88
- FFV1_0( w_fp[16], w_fp[20], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 88 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[13] -= amp_sv[0];
-
- // *** DIAGRAM 89 OF 123 ***
-
- // Wavefunction(s) for diagram number 89
- // (none)
-
- // Amplitude(s) for diagram number 89
- FFV1_0( w_fp[22], w_fp[14], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 89 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[22] -= amp_sv[0];
-
- // *** DIAGRAM 90 OF 123 ***
-
- // Wavefunction(s) for diagram number 90
- FFV1_1( w_fp[14], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[24] );
-
- // Amplitude(s) for diagram number 90
- FFV1_0( w_fp[16], w_fp[24], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 90 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[19] -= amp_sv[0];
-
- // *** DIAGRAM 91 OF 123 ***
-
- // Wavefunction(s) for diagram number 91
- // (none)
-
- // Amplitude(s) for diagram number 91
- FFV1_0( w_fp[22], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 91 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 92 OF 123 ***
-
- // Wavefunction(s) for diagram number 92
- // (none)
-
- // Amplitude(s) for diagram number 92
- FFV1_0( w_fp[16], w_fp[2], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 92 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[3] += amp_sv[0];
- jamp_sv[5] -= amp_sv[0];
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[22] += amp_sv[0];
-
- // *** DIAGRAM 93 OF 123 ***
-
- // Wavefunction(s) for diagram number 93
- // (none)
-
- // Amplitude(s) for diagram number 93
- VVVV1_0( w_fp[0], w_fp[6], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
- VVVV3_0( w_fp[0], w_fp[6], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
- VVVV4_0( w_fp[0], w_fp[6], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 94 OF 123 ***
-
- // Wavefunction(s) for diagram number 94
- VVV1P0_1( w_fp[0], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[22] );
-
- // Amplitude(s) for diagram number 94
- VVV1_0( w_fp[7], w_fp[5], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 94 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 95 OF 123 ***
-
- // Wavefunction(s) for diagram number 95
- VVV1P0_1( w_fp[0], w_fp[7], COUPs[0], 1.0, 0., 0., w_fp[25] );
-
- // Amplitude(s) for diagram number 95
- VVV1_0( w_fp[6], w_fp[5], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 95 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-
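[Editor's note: the jamp_sv flows accumulated throughout these diagrams are not squared one by one; after the last diagram they are contracted with the color matrix, |M|² = Σ_ij jamp_i* cf_ij jamp_j / denom_i. A sketch with an invented 2×2 matrix (the real gg→ttgg one is 24×24):

    #include <complex>
    double colorSum( const std::complex<double> jamp[2] )
    {
      static const double cf[2][2] = { { 16, -2 }, { -2, 16 } }; // toy color factors (assumption)
      static const double denom[2] = { 3, 3 };                   // toy denominators (assumption)
      double me2 = 0;
      for( int i = 0; i < 2; i++ )
      {
        std::complex<double> ztemp = 0;
        for( int j = 0; j < 2; j++ ) ztemp += cf[i][j] * jamp[j];
        me2 += ( ztemp * std::conj( jamp[i] ) ).real() / denom[i];
      }
      return me2;
    }
]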
- // *** DIAGRAM 96 OF 123 ***
-
- // Wavefunction(s) for diagram number 96
- // (none)
-
- // Amplitude(s) for diagram number 96
- FFV1_0( w_fp[3], w_fp[14], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 96 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[18] += amp_sv[0];
- jamp_sv[19] -= amp_sv[0];
- jamp_sv[21] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
-
- // *** DIAGRAM 97 OF 123 ***
-
- // Wavefunction(s) for diagram number 97
- // (none)
-
- // Amplitude(s) for diagram number 97
- FFV1_0( w_fp[3], w_fp[24], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 97 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 98 OF 123 ***
-
- // Wavefunction(s) for diagram number 98
- // (none)
-
- // Amplitude(s) for diagram number 98
- FFV1_0( w_fp[13], w_fp[2], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 98 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += amp_sv[0];
- jamp_sv[2] -= amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[14] += amp_sv[0];
-
- // *** DIAGRAM 99 OF 123 ***
-
- // Wavefunction(s) for diagram number 99
- // (none)
-
- // Amplitude(s) for diagram number 99
- FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 99 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 100 OF 123 ***
-
- // Wavefunction(s) for diagram number 100
- // (none)
-
- // Amplitude(s) for diagram number 100
- VVVV1_0( w_fp[0], w_fp[18], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
- VVVV3_0( w_fp[0], w_fp[18], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- VVVV4_0( w_fp[0], w_fp[18], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 101 OF 123 ***
-
- // Wavefunction(s) for diagram number 101
- VVV1P0_1( w_fp[0], w_fp[18], COUPs[0], 1.0, 0., 0., w_fp[6] );
-
- // Amplitude(s) for diagram number 101
- VVV1_0( w_fp[7], w_fp[4], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 101 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 102 OF 123 ***
-
- // Wavefunction(s) for diagram number 102
- // (none)
-
- // Amplitude(s) for diagram number 102
- VVV1_0( w_fp[18], w_fp[4], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 102 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 103 OF 123 ***
-
- // Wavefunction(s) for diagram number 103
- // (none)
-
- // Amplitude(s) for diagram number 103
- FFV1_0( w_fp[3], w_fp[11], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 103 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[12] += amp_sv[0];
- jamp_sv[13] -= amp_sv[0];
- jamp_sv[15] -= amp_sv[0];
- jamp_sv[17] += amp_sv[0];
-
- // *** DIAGRAM 104 OF 123 ***
-
- // Wavefunction(s) for diagram number 104
- // (none)
-
- // Amplitude(s) for diagram number 104
- FFV1_0( w_fp[3], w_fp[20], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 104 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 105 OF 123 ***
-
- // Wavefunction(s) for diagram number 105
- // (none)
-
- // Amplitude(s) for diagram number 105
- FFV1_0( w_fp[15], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 105 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] += amp_sv[0];
- jamp_sv[4] -= amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[20] += amp_sv[0];
-
- // *** DIAGRAM 106 OF 123 ***
-
- // Wavefunction(s) for diagram number 106
- // (none)
-
- // Amplitude(s) for diagram number 106
- FFV1_0( w_fp[12], w_fp[2], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 106 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-
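[Editor's note: all of this sits inside a loop over the helicity index ihel, with cHel[ihel][k] supplying the k-th particle's helicity to the vxxxxx/ixxxxx/oxxxxx calls at the top. For six external particles there are 2^6 = 64 combinations; a sketch of how such a table can be enumerated (an illustration, not the plugin's generated cHel):

    // Fill hel[64][6] with every +-1 helicity assignment for 6 external legs.
    void fillHelicities( short hel[64][6] )
    {
      for( int ihel = 0; ihel < 64; ihel++ )
        for( int k = 0; k < 6; k++ )
          hel[ihel][k] = ( ihel >> k ) & 1 ? +1 : -1;
    }
]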
- VVVV3_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
- VVVV4_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &amp_fp[0] );
- jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 108 OF 123 ***
-
- // Wavefunction(s) for diagram number 108
- // (none)
-
- // Amplitude(s) for diagram number 108
- VVV1_0( w_fp[1], w_fp[10], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 108 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 109 OF 123 ***
-
- // Wavefunction(s) for diagram number 109
- // (none)
-
- // Amplitude(s) for diagram number 109
- VVV1_0( w_fp[1], w_fp[7], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 109 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 110 OF 123 ***
-
- // Wavefunction(s) for diagram number 110
- // (none)
-
- // Amplitude(s) for diagram number 110
- FFV1_0( w_fp[13], w_fp[20], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 110 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[12] -= amp_sv[0];
-
- // *** DIAGRAM 111 OF 123 ***
-
- // Wavefunction(s) for diagram number 111
- // (none)
-
- // Amplitude(s) for diagram number 111
- FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 111 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[14] -= amp_sv[0];
-
- // *** DIAGRAM 112 OF 123 ***
-
- // Wavefunction(s) for diagram number 112
- // (none)
-
- // Amplitude(s) for diagram number 112
- FFV1_0( w_fp[15], w_fp[24], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 112 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[18] -= amp_sv[0];
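The jamp_sv updates in these deleted blocks implement the color decomposition: each diagram's amplitude is scattered into a few of the ncolor=24 color flows, with weights +1, -1, +i or -i fixed by the color algebra. The same scatter pattern, distilled into a hypothetical helper (not part of the generated code):

#include <array>
#include <complex>
#include <initializer_list>

using cxtype = std::complex<double>;
constexpr int ncolor = 24;

struct JampTerm { int icol; cxtype weight; };

// Add one diagram's amplitude into the color flows it feeds.
inline void addToColorFlows( std::array<cxtype, ncolor>& jamp, const cxtype& amp,
                             std::initializer_list<JampTerm> terms )
{
  for( const auto& t : terms ) jamp[t.icol] += t.weight * amp;
}

// e.g. "jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0]; jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];"
// becomes: addToColorFlows( jamp, amp, { { 18, { 0, 1 } }, { 19, { 0, -1 } } } );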
-
- // *** DIAGRAM 113 OF 123 ***
-
- // Wavefunction(s) for diagram number 113
- // (none)
-
- // Amplitude(s) for diagram number 113
- FFV1_0( w_fp[12], w_fp[14], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 113 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[20] -= amp_sv[0];
-
- // *** DIAGRAM 114 OF 123 ***
-
- // Wavefunction(s) for diagram number 114
- VVVV1P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[12] );
- VVVV3P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[24] );
- VVVV4P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[21] );
-
- // Amplitude(s) for diagram number 114
- VVV1_0( w_fp[12], w_fp[7], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
- VVV1_0( w_fp[24], w_fp[7], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
- VVV1_0( w_fp[21], w_fp[7], w_fp[5], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 115 OF 123 ***
-
- // Wavefunction(s) for diagram number 115
- // (none)
-
- // Amplitude(s) for diagram number 115
- FFV1_0( w_fp[3], w_fp[14], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[18] += amp_sv[0];
- jamp_sv[19] -= amp_sv[0];
- jamp_sv[21] -= amp_sv[0];
- jamp_sv[23] += amp_sv[0];
- FFV1_0( w_fp[3], w_fp[14], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[19] -= amp_sv[0];
- jamp_sv[20] += amp_sv[0];
- jamp_sv[21] -= amp_sv[0];
- jamp_sv[22] += amp_sv[0];
- FFV1_0( w_fp[3], w_fp[14], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[18] -= amp_sv[0];
- jamp_sv[20] += amp_sv[0];
- jamp_sv[22] += amp_sv[0];
- jamp_sv[23] -= amp_sv[0];
-
- // *** DIAGRAM 116 OF 123 ***
-
- // Wavefunction(s) for diagram number 116
- // (none)
-
- // Amplitude(s) for diagram number 116
- FFV1_0( w_fp[13], w_fp[2], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[0] += amp_sv[0];
- jamp_sv[2] -= amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[14] += amp_sv[0];
- FFV1_0( w_fp[13], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[2] -= amp_sv[0];
- jamp_sv[6] += amp_sv[0];
- jamp_sv[8] -= amp_sv[0];
- jamp_sv[12] += amp_sv[0];
- FFV1_0( w_fp[13], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[0] -= amp_sv[0];
- jamp_sv[6] += amp_sv[0];
- jamp_sv[12] += amp_sv[0];
- jamp_sv[14] -= amp_sv[0];
-
- // *** DIAGRAM 117 OF 123 ***
-
- // Wavefunction(s) for diagram number 117
- VVVV1P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[21] );
- VVVV3P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[13] );
- VVVV4P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[24] );
-
- // Amplitude(s) for diagram number 117
- VVV1_0( w_fp[21], w_fp[7], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] += cxtype( 0, 1 ) * amp_sv[0];
- VVV1_0( w_fp[13], w_fp[7], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- VVV1_0( w_fp[24], w_fp[7], w_fp[4], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[12] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[14] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[18] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[20] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 118 OF 123 ***
-
- // Wavefunction(s) for diagram number 118
- // (none)
-
- // Amplitude(s) for diagram number 118
- FFV1_0( w_fp[3], w_fp[11], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[12] += amp_sv[0];
- jamp_sv[13] -= amp_sv[0];
- jamp_sv[15] -= amp_sv[0];
- jamp_sv[17] += amp_sv[0];
- FFV1_0( w_fp[3], w_fp[11], w_fp[13], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[13] -= amp_sv[0];
- jamp_sv[14] += amp_sv[0];
- jamp_sv[15] -= amp_sv[0];
- jamp_sv[16] += amp_sv[0];
- FFV1_0( w_fp[3], w_fp[11], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[12] -= amp_sv[0];
- jamp_sv[14] += amp_sv[0];
- jamp_sv[16] += amp_sv[0];
- jamp_sv[17] -= amp_sv[0];
-
- // *** DIAGRAM 119 OF 123 ***
-
- // Wavefunction(s) for diagram number 119
- // (none)
-
- // Amplitude(s) for diagram number 119
- FFV1_0( w_fp[15], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[1] += amp_sv[0];
- jamp_sv[4] -= amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[20] += amp_sv[0];
- FFV1_0( w_fp[15], w_fp[2], w_fp[13], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[4] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
- jamp_sv[10] -= amp_sv[0];
- jamp_sv[18] += amp_sv[0];
- FFV1_0( w_fp[15], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[1] -= amp_sv[0];
- jamp_sv[7] += amp_sv[0];
- jamp_sv[18] += amp_sv[0];
- jamp_sv[20] -= amp_sv[0];
-
- // *** DIAGRAM 120 OF 123 ***
-
- // Wavefunction(s) for diagram number 120
- VVVV1P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[24] );
- VVVV3P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[15] );
- VVVV4P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[13] );
-
- // Amplitude(s) for diagram number 120
- FFV1_0( w_fp[3], w_fp[9], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[6] += amp_sv[0];
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[9] -= amp_sv[0];
- jamp_sv[11] += amp_sv[0];
- FFV1_0( w_fp[3], w_fp[9], w_fp[15], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[7] -= amp_sv[0];
- jamp_sv[8] += amp_sv[0];
- jamp_sv[9] -= amp_sv[0];
- jamp_sv[10] += amp_sv[0];
- FFV1_0( w_fp[3], w_fp[9], w_fp[13], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[6] -= amp_sv[0];
- jamp_sv[8] += amp_sv[0];
- jamp_sv[10] += amp_sv[0];
- jamp_sv[11] -= amp_sv[0];
-
- // *** DIAGRAM 121 OF 123 ***
-
- // Wavefunction(s) for diagram number 121
- // (none)
-
- // Amplitude(s) for diagram number 121
- FFV1_0( w_fp[16], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[3] += amp_sv[0];
- jamp_sv[5] -= amp_sv[0];
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[22] += amp_sv[0];
- FFV1_0( w_fp[16], w_fp[2], w_fp[15], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[5] -= amp_sv[0];
- jamp_sv[13] += amp_sv[0];
- jamp_sv[16] -= amp_sv[0];
- jamp_sv[19] += amp_sv[0];
- FFV1_0( w_fp[16], w_fp[2], w_fp[13], COUPs[1], 1.0, &amp_fp[0] );
- jamp_sv[3] -= amp_sv[0];
- jamp_sv[13] += amp_sv[0];
- jamp_sv[19] += amp_sv[0];
- jamp_sv[22] -= amp_sv[0];
-
- // *** DIAGRAM 122 OF 123 ***
-
- // Wavefunction(s) for diagram number 122
- // (none)
-
- // Amplitude(s) for diagram number 122
- VVV1_0( w_fp[24], w_fp[1], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[3] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] -= cxtype( 0, 1 ) * amp_sv[0];
- VVV1_0( w_fp[15], w_fp[1], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[7] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[16] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
- VVV1_0( w_fp[13], w_fp[1], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[6] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[8] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[10] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[13] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[19] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[22] += cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 123 OF 123 ***
-
- // Wavefunction(s) for diagram number 123
- // (none)
-
- // Amplitude(s) for diagram number 123
- VVV1_0( w_fp[0], w_fp[17], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] += cxtype( 0, 1 ) * amp_sv[0];
- VVV1_0( w_fp[0], w_fp[19], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[11] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[17] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- VVV1_0( w_fp[0], w_fp[8], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
- jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[4] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[5] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[9] -= cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[15] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[21] += cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[23] -= cxtype( 0, 1 ) * amp_sv[0];
-
- // *** COLOR CHOICE BELOW ***
- // Store the leading color flows for choice of color
- if(
jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_2_gg_ttxgg()?) - - // The color denominators (initialize all array elements, with ncolor=24) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54 }; // 1-D array[24] - - // The color matrix (initialize all array elements, with ncolor=24) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 512, -64, -64, 8, 8, 80, -64, 8, 8, -1, -1, -10, 8, -1, 80, -10, 71, 62, -1, -10, -10, 62, 62, -28 }, - { -64, 512, 8, 80, -64, 8, 8, -64, -1, -10, 8, -1, -1, -10, -10, 62, 62, -28, 8, -1, 80, -10, 71, 62 }, - { -64, 8, 512, -64, 80, 8, 8, -1, 80, -10, 71, 62, -64, 8, 8, -1, -1, -10, -10, -1, 62, -28, -10, 62 }, - { 8, 80, -64, 512, 8, -64, -1, -10, -10, 62, 62, -28, 8, -64, -1, -10, 8, -1, -1, 8, 71, 62, 80, -10 }, - { 8, -64, 80, 8, 512, -64, -1, 8, 71, 62, 80, -10, -10, -1, 62, -28, -10, 62, -64, 8, 8, -1, -1, -10 }, - { 80, 8, 8, -64, -64, 512, -10, -1, 62, -28, -10, 62, -1, 8, 71, 62, 80, -10, 8, -64, -1, -10, 8, -1 }, - { -64, 8, 8, -1, -1, -10, 512, -64, -64, 8, 8, 80, 80, -10, 8, -1, 62, 71, -10, 62, -1, -10, -28, 62 }, - { 8, -64, -1, -10, 8, -1, -64, 512, 8, 80, -64, 8, -10, 62, -1, -10, -28, 62, 80, -10, 8, -1, 62, 71 }, - { 8, -1, 80, -10, 71, 62, -64, 8, 512, -64, 80, 8, 8, -1, -64, 8, -10, -1, 62, -28, -10, -1, 62, -10 }, - { -1, -10, -10, 62, 62, -28, 8, 80, -64, 512, 8, -64, -1, -10, 8, -64, -1, 8, 71, 62, -1, 8, -10, 80 }, - { -1, 8, 71, 62, 80, -10, 8, -64, 80, 8, 512, -64, 62, -28, -10, -1, 62, -10, 8, -1, -64, 8, -10, -1 }, - { -10, -1, 62, -28, -10, 62, 80, 8, 8, -64, -64, 512, 71, 62, -1, 8, -10, 80, -1, -10, 8, -64, -1, 8 }, - { 8, -1, -64, 8, -10, -1, 80, -10, 8, -1, 62, 71, 512, -64, -64, 8, 8, 80, 62, -10, -28, 62, -1, -10 }, - { -1, -10, 8, -64, -1, 8, -10, 62, -1, -10, -28, 62, -64, 512, 8, 80, -64, 8, -10, 80, 62, 71, 8, -1 }, - { 80, -10, 8, -1, 62, 71, 8, -1, -64, 8, -10, -1, -64, 8, 512, -64, 80, 8, -28, 62, 62, -10, -10, -1 }, - { -10, 62, -1, -10, -28, 62, -1, -10, 8, -64, -1, 8, 8, 80, -64, 512, 8, -64, 62, 71, -10, 80, -1, 8 }, - { 71, 62, -1, 8, -10, 80, 62, -28, -10, -1, 62, -10, 8, -64, 80, 8, 512, -64, -1, 8, -10, -1, -64, 8 }, - { 62, -28, -10, -1, 62, -10, 71, 62, -1, 8, -10, 80, 80, 8, 8, -64, -64, 512, -10, -1, -1, 8, 8, -64 }, - { -1, 8, -10, -1, -64, 8, -10, 80, 62, 71, 8, -1, 62, -10, -28, 62, -1, -10, 512, -64, -64, 8, 8, 80 }, - { -10, -1, -1, 8, 8, -64, 62, -10, -28, 62, -1, -10, -10, 80, 62, 71, 8, -1, -64, 512, 8, 80, -64, 8 }, - { -10, 80, 62, 71, 8, -1, -1, 8, -10, -1, -64, 8, -28, 62, 62, -10, -10, -1, -64, 8, 512, -64, 80, 8 }, - { 62, -10, -28, 62, -1, -10, -10, -1, -1, 8, 8, -64, 62, 71, -10, 80, -1, 8, 8, 80, -64, 512, 8, -64 }, - { 62, 71, -10, 80, -1, 8, -28, 62, 62, -10, -10, -1, -1, 8, -10, -1, -64, 8, 8, -64, 80, 8, 512, -64 }, - { -28, 62, 62, -10, -10, -1, 62, 71, -10, 80, -1, 8, -10, -1, -1, 8, 8, -64, 80, 8, 8, -64, -64, 512 } }; // 2-D array[24][24] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - 
- __host__ __device__ constexpr TriangularNormalizedColorMatrix()
- : value()
- {
- for( int icol = 0; icol < ncolor; icol++ )
- {
- // Diagonal terms
- value[icol][icol] = cf[icol][icol] / denom[icol];
- // Off-diagonal terms
- for( int jcol = icol + 1; jcol < ncolor; jcol++ )
- value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol];
- }
- }
- fptype2 value[ncolor][ncolor];
- };
- static constexpr auto cf2 = TriangularNormalizedColorMatrix();
-#endif
-
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
- if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages
- {
- // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV
- for( int icol = 0; icol < ncolor; icol++ )
- jamp_sv_previous[icol] = jamp_sv[icol];
- MEs_previous = MEs;
- continue; // go to next iParity in the loop: skip color algebra and ME update on even pages
- }
- fptype_sv deltaMEs_previous = { 0 };
-#endif
-
- // Sum and square the color flows to get the matrix element
- // (compute |M|^2 by squaring |M|, taking into account colours)
- fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes
-
- // Use the property that M is a real matrix (see #475):
- // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB
- // In addition, on C++ use the property that M is symmetric (see #475),
- // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time:
- // we gain (not a factor 2...) in speed here as we only loop over the upper diagonal part of the matrix.
- // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
- fptype2_sv jampR_sv[ncolor] = { 0 };
- fptype2_sv jampI_sv[ncolor] = { 0 };
- for( int icol = 0; icol < ncolor; icol++ )
- {
- jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) );
- jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) );
- }
-#endif
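The deleted comments above spell out the optimization being dropped here: because the color matrix M is real, the quadratic form (A-iB)M(A+iB) collapses to AMA + BMB, and because M is symmetric (with a uniform denominator) only the upper triangle needs to be visited once the "2*" and "/denom" factors are folded in at compile time. A self-contained sketch of that color sum (toy 2x2 values, not the physical 24x24 matrix):

#include <array>

using fptype = double;
constexpr int NCOL = 2; // toy size; the real gg->ttgg matrix is 24x24

constexpr fptype cf[NCOL][NCOL] = { { 512, -64 }, { -64, 512 } }; // toy values
constexpr fptype denom[NCOL] = { 54, 54 };

// |M|^2 = sum_ij jamp_i* (cf_ij/denom_i) jamp_j, using that cf is real and
// symmetric: visit the diagonal once and the upper triangle with a factor 2.
inline fptype colorSum( const std::array<fptype, NCOL>& jampR,
                        const std::array<fptype, NCOL>& jampI )
{
  fptype me2 = 0;
  for( int i = 0; i < NCOL; i++ )
  {
    fptype ztempR = cf[i][i] / denom[i] * jampR[i];
    fptype ztempI = cf[i][i] / denom[i] * jampI[i];
    for( int j = i + 1; j < NCOL; j++ )
    {
      ztempR += 2 * cf[i][j] / denom[i] * jampR[j];
      ztempI += 2 * cf[i][j] / denom[i] * jampI[j];
    }
    me2 += jampR[i] * ztempR + jampI[i] * ztempI; // AMA + BMB
  }
  return me2;
}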
- for( int icol = 0; icol < ncolor; icol++ )
- {
- //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol );
-#ifndef MGONGPUCPP_GPUIMPL
- // === C++ START ===
- // Diagonal terms
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
- fptype2_sv& jampRi_sv = jampR_sv[icol];
- fptype2_sv& jampIi_sv = jampI_sv[icol];
+ // ---------------
+ // --- MOMENTA ---
+ // ---------------
+#ifdef MGONGPUCPP_GPUIMPL
+ // CUDA diagram kernels take input/output buffers with momenta for all events
+ const fptype* momenta = allmomenta;
#else
- fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) );
- fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) );
+ // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector
+ const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 );
#endif
- fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv;
- fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv;
- // Off-diagonal terms
- for( int jcol = icol + 1; jcol < ncolor; jcol++ )
- {
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
- fptype2_sv& jampRj_sv = jampR_sv[jcol];
- fptype2_sv& jampIj_sv = jampI_sv[jcol];
+ // -------------
+ // --- JAMPS ---
+ // -------------
+ // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel)
+#ifdef MGONGPUCPP_GPUIMPL
+ // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument
+ // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol])
+ fptype* jamps = allJamps;
#else
- fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) );
- fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) );
+ // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument
+ // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol])
+ fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) );
#endif
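The MOMENTA and JAMPS plumbing above relies on the cudacpp AOSOA memory layout, in which events are grouped into pages of neppV events so that one C++ call to a diagram kernel processes a full SIMD vector. A simplified sketch of the indexing behind an ieventAccessRecord-style accessor (illustrative, not the plugin's real template machinery):

using fptype = double;

constexpr int npar = 6;  // external particles in gg->ttgg
constexpr int np4 = 4;   // E, px, py, pz
constexpr int neppV = 4; // events per SIMD page (toy value)

// AOSOA layout: momenta[ipagV][ipar][ip4][ieppV]; the 'event record' for
// ievt0 = ipagV * neppV is simply the start of its page.
inline const fptype* ieventAccessRecordConst( const fptype* buffer, int ievt0 )
{
  const int ipagV = ievt0 / neppV;
  return buffer + ipagV * npar * np4 * neppV;
}

// One momentum component of one event inside that record:
inline fptype pIparIp4Ieppv( const fptype* record, int ipar, int ip4, int ieppV )
{
  return record[( ipar * np4 + ip4 ) * neppV + ieppV];
}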
- ztempR_sv += cf2.value[icol][jcol] * jampRj_sv;
- ztempI_sv += cf2.value[icol][jcol] * jampIj_sv;
- }
- fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
- deltaMEs_previous += fpvsplit0( deltaMEs2 );
- deltaMEs += fpvsplit1( deltaMEs2 );
+
+ // ------------------
+ // --- CHANNELIDS ---
+ // ------------------
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+#ifdef MGONGPUCPP_GPUIMPL
+ // CUDA diagram kernels take input/output buffers with channelIDs for all events
+ const unsigned int* channelIds = allChannelIds;
#else
- deltaMEs += deltaMEs2;
+ // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector
+ const unsigned int* channelIds = &channelId;
#endif
- // === C++ END ===
#else
- // === CUDA START ===
- fptype2_sv ztempR_sv = { 0 };
- fptype2_sv ztempI_sv = { 0 };
- for( int jcol = 0; jcol < ncolor; jcol++ )
- {
- fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] );
- fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] );
- ztempR_sv += cf[icol][jcol] * jampRj_sv;
- ztempI_sv += cf[icol][jcol] * jampIj_sv;
- }
- deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol];
- // === CUDA END ===
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+ const unsigned int* channelIds = nullptr;
#endif
- }
- // *** STORE THE RESULTS ***
-
- // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s)
- fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
- MEs_sv += deltaMEs; // fix #435
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
- fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous );
- MEs_sv_previous += deltaMEs_previous;
-#endif
- /*
+ // -------------------------------
+ // --- NUMERATORS/DENOMINATORS ---
+ // -------------------------------
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
#ifdef MGONGPUCPP_GPUIMPL
- if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv );
+ // CUDA diagram kernels take input/output buffers with numerators/denominators for all events
+ fptype* numerators = allNumerators;
+ fptype* denominators = allDenominators;
#else
-#ifdef MGONGPU_CPPSIMD
- if( cNGoodHel > 0 )
- for( int ieppV = 0; ieppV < neppV; ieppV++ )
- printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] );
+ // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector
+ fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+ fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
+#endif
#else
- if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv );
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+ fptype* numerators =
nullptr; + fptype* denominators = nullptr; #endif + + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ + + // *** DIAGRAMS 1 TO 123 *** +#ifdef MGONGPUCPP_GPUIMPL + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram8, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram9, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram10, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram11, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram12, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram13, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram14, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram15, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram16, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram17, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram18, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram19, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram20, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram21, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram22, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram23, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram24, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram25, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + 
gpuLaunchKernelStream( diagram26, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram27, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram28, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram29, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram30, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram31, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram32, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram33, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram34, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram35, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram36, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram37, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram38, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram39, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram40, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram41, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram42, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram43, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram44, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram45, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram46, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram47, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram48, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram49, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram50, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram51, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram52, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, 
couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram53, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram54, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram55, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram56, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram57, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram58, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram59, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram60, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram61, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram62, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram63, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram64, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram65, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram66, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram67, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram68, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram69, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram70, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram71, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram72, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram73, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram74, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram75, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram76, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram77, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram78, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram79, gpublocks, 
gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram80, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram81, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram82, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram83, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram84, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram85, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram86, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram87, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram88, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram89, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram90, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram91, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram92, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram93, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram94, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram95, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram96, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram97, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram98, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram99, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram100, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram101, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram102, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram103, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram104, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram105, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + 
gpuLaunchKernelStream( diagram106, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram107, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram108, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram109, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram110, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram111, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram112, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram113, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram114, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram115, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram116, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram117, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram118, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram119, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram120, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram121, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram122, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram123, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); +#else + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram8( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram9( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram10( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram11( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram12( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram13( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram14( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram15( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram16( wfs, jamps, channelIds, COUPs, 
numerators, denominators ); + diagram17( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram18( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram19( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram20( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram21( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram22( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram23( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram24( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram25( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram26( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram27( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram28( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram29( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram30( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram31( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram32( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram33( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram34( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram35( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram36( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram37( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram38( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram39( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram40( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram41( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram42( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram43( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram44( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram45( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram46( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram47( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram48( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram49( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram50( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram51( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram52( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram53( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram54( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram55( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram56( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram57( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram58( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram59( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram60( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram61( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram62( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram63( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram64( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram65( wfs, jamps, channelIds, COUPs, numerators, denominators 
); + diagram66( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram67( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram68( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram69( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram70( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram71( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram72( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram73( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram74( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram75( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram76( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram77( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram78( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram79( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram80( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram81( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram82( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram83( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram84( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram85( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram86( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram87( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram88( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram89( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram90( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram91( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram92( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram93( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram94( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram95( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram96( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram97( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram98( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram99( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram100( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram101( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram102( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram103( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram104( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram105( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram106( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram107( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram108( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram109( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram110( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram111( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram112( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram113( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram114( wfs, jamps, channelIds, COUPs, numerators, denominators ); + 
diagram115( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram116( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram117( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram118( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram119( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram120( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram121( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram122( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram123( wfs, jamps, channelIds, COUPs, numerators, denominators );
#endif
- */
- } // END LOOP ON IPARITY
- mgDebug( 1, __FUNCTION__ );
+ }
+ // *****************************
+ // *** END LOOP ON IPARITY ***
+ // *****************************
+ return;
}
@@ -2718,7 +738,11 @@ namespace mg5amcCpu
#else
memcpy( cHel, tHel, ncomb * npar * sizeof( short ) );
#endif
- fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions
+
+ // Enable SIGFPE traps for Floating Point Exceptions
+#ifdef MGONGPUCPP_DEBUG
+ fpeEnable();
+#endif
}
//--------------------------------------------------------------------------
@@ -2753,6 +777,10 @@ namespace mg5amcCpu
m_masses.push_back( m_pars->mdl_MT );
m_masses.push_back( m_pars->ZERO );
m_masses.push_back( m_pars->ZERO );
+#ifdef MGONGPUCPP_GPUIMPL
+ // Create the normalized color matrix in device memory
+ createNormalizedColorMatrix();
+#endif
// Read physics parameters like masses and couplings from user configuration files (static: initialize once)
// Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory
const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT };
@@ -2795,6 +823,10 @@ namespace mg5amcCpu
m_masses.push_back( Parameters_sm::mdl_MT );
m_masses.push_back( Parameters_sm::ZERO );
m_masses.push_back( Parameters_sm::ZERO );
+#ifdef MGONGPUCPP_GPUIMPL
+ // Create the normalized color matrix in device memory
+ createNormalizedColorMatrix();
+#endif
}
#endif
@@ -2897,26 +929,26 @@ namespace mg5amcCpu
#ifdef MGONGPUCPP_GPUIMPL
using namespace mg5amcGpu;
using G_ACCESS = DeviceAccessGs;
- using C_ACCESS = DeviceAccessCouplings;
- G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam );
+ using CD_ACCESS = DeviceAccessCouplings;
+ G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam );
#else
using namespace mg5amcCpu;
using G_ACCESS = HostAccessGs;
- using C_ACCESS = HostAccessCouplings;
+ using CD_ACCESS = HostAccessCouplings;
for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV )
{
const int ievt0 = ipagV * neppV;
const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 );
fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 );
- G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam );
+ G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam );
}
#endif
}
//--------------------------------------------------------------------------
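The fpeEnable hunk above makes SIGFPE trapping debug-only. On Linux/glibc the underlying mechanism is feenableexcept; a minimal sketch of such a guard (assuming glibc with _GNU_SOURCE, and the MGONGPUCPP_DEBUG flag introduced in the diff):

#include <fenv.h> // feenableexcept is a glibc extension (compile with -D_GNU_SOURCE)

// Sketch: turn invalid operations, divide-by-zero and overflow into SIGFPE,
// but only in debug builds, mirroring the #ifdef added in the diff above.
inline void fpeEnableSketch()
{
#if defined( MGONGPUCPP_DEBUG ) && defined( __GLIBC__ )
  feenableexcept( FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW );
#endif
}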
-#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
- __global__ void
+#ifdef MGONGPUCPP_GPUIMPL
+ void /* clang-format off */
sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4]
const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
@@ -2924,25 +956,40 @@ namespace mg5amcCpu
fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities
fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
#endif
- bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation)
- { /* clang-format on */
- const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
+ fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop)
+ fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt]
+ bool* isGoodHel, // output: isGoodHel[ncomb] - host array
+ const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+ { /* clang-format on */
+ const int maxtry0 = 16;
+ fptype hstMEs[maxtry0];
+ const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0)
assert( nevt >= neppV );
const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0)
+ for( int ighel = 0; ighel < dcNGoodHel; ighel++ )
+ {
+ allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt];
+ ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection
+ }
+ // Event-by-event random choice of helicity #403
+ //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] );
+ for( int ighel = 0; ighel < dcNGoodHel; ighel++ )
+ {
+ if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) )
+ {
+ const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1]
+ allselhel[ievt] = ihelF;
+ //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF );
+ break;
+ }
+ }
+ return;
+ }
+#endif
+
+ //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ __global__ void
+ update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity
+ fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable)
+ {
+ using J_ACCESS = DeviceAccessJamp;
+ using J2_ACCESS = DeviceAccessJamp2;
+ for( int icol = 0; icol < ncolor; icol++ )
+ // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream!
+ atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) );
+ }
+#endif
+#endif
+
+ //--------------------------------------------------------------------------
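In update_jamp2s above, each helicity may be processed in its own CUDA stream, so several kernel instances can hit the same per-color, per-event |jamp|^2 slot concurrently; that is why the sum must use atomicAdd. A standalone CUDA sketch of the same pattern (illustrative buffer layout; atomicAdd on double needs compute capability 6.0 or later):

// One thread per event: each helicity's kernel adds its |jamp|^2 into the same
// running-sum slot, so the update must be atomic across concurrent streams.
__global__ void updateJamp2sSketch( const double2* jamps, // [ncolor*nevt] complex jamps, one helicity
                                    double* jamp2s,       // [ncolor*nevt] running sums over helicities
                                    int ncolor, int nevt )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  if( ievt >= nevt ) return;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    const double2 j = jamps[icol * nevt + ievt];
    atomicAdd( &jamp2s[icol * nevt + ievt], j.x * j.x + j.y * j.y ); // += |jamp|^2
  }
}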
+#ifdef MGONGPUCPP_GPUIMPL
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  __global__ void
+  select_col( int* allselcol,                    // output: color selection[nevt]
+              const fptype* allrndcol,           // input: random numbers[nevt] for color selection
+              const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+              const fptype_sv* allJamp2s,        // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
+              const int nevt )                   // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+  {
+    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
+    // SCALAR channelId for the current event (CUDA)
+    unsigned int channelId = gpu_channelId( allChannelIds );
+    // Event-by-event random choice of color #402
+    if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
+    {
+      if( channelId > mgOnGpu::nchannels )
+      {
+        printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
+        assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
+      }
+      // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...)
+      fptype_sv jamp2_sv[ncolor] = { 0 };
+      assert( allJamp2s != nullptr ); // sanity check
+      using J2_ACCESS = DeviceAccessJamp2;
+      for( int icolC = 0; icolC < ncolor; icolC++ )
+        jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC );
+      // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
+      // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
+      const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
+      if( iconfig <= 0 )
+      {
+        printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
+        assert( iconfig > 0 ); // SANITY CHECK #917
+      }
+      else if( iconfig > (int)mgOnGpu::nconfigSDE )
+      {
+        printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE );
+        assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
+      }
+      fptype targetamp[ncolor] = { 0 };
+      // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
+      for( int icolC = 0; icolC < ncolor; icolC++ )
+      {
+        if( icolC == 0 )
+          targetamp[icolC] = 0;
+        else
+          targetamp[icolC] = targetamp[icolC - 1];
+        // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1)
+        if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC];
+      }
+      //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
+      for( int icolC = 0; icolC < ncolor; icolC++ )
+      {
+        if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) )
+        {
+          allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
+          //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 );
+          break;
+        }
+      }
+    }
+    else
+    {
+      allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931)
+    }
+    return;
+  }
+#endif
+#endif
+
+  //--------------------------------------------------------------------------
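Aside: the numerator and denominator super-buffers feed the single-diagram-enhancement rescaling that normalise_output applies further down, on top of the spin/color averaging by helcolDenominators. A one-line sketch of that channel weight (illustrative names, assuming multichannel mode):

inline double sdeReweightedME( double me,           // |M|^2 summed over helicities
                               double numerator,    // sum over helicities of |amp_channel|^2
                               double denominator ) // sum over helicities of sum_d |amp_d|^2
{
  return me * numerator / denominator; // enhance the contribution of the selected diagram
}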
   // Evaluate |M|^2, part independent of incoming flavour
-  __global__ void /* clang-format off */
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+  void
   sigmaKin( const fptype* allmomenta,   // input: momenta[nevt*npar*4]
             const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
             const fptype* allrndhel,    // input: random numbers[nevt] for helicity selection
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
             const fptype* allrndcol,    // input: random numbers[nevt] for color selection
+            const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+#endif
             fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
+            int* allselhel, // output: helicity selection[nevt]
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+            int* allselcol,       // output: color selection[nevt]
+            fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
+            fptype* ghelAllNumerators,   // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+            fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+#endif
+            fptype* ghelAllMEs,      // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+            fptype* ghelAllJamps,    // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+            fptype* ghelAllWfs,      // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+            fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+            gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null)
+            gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
+            const int gpublocks,   // input: cuda gpublocks
+            const int gputhreads ) // input: cuda gputhreads
+#else
+  void
+  sigmaKin( const fptype* allmomenta,   // input: momenta[nevt*npar*4]
+            const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
+            const fptype* allrndhel,    // input: random numbers[nevt] for helicity selection
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-            const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
-            fptype* allNumerators,   // output: multichannel numerators[nevt], running_sum_over_helicities
-            fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
+            const fptype* allrndcol, // input: random numbers[nevt] for color selection
+            const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
 #endif
+            fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
             int* allselhel, // output: helicity selection[nevt]
-            int* allselcol  // output: helicity selection[nevt]
-#ifndef MGONGPUCPP_GPUIMPL
-            , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+            int* allselcol,          // output: color selection[nevt]
+            fptype* allNumerators,   // tmp: multichannel numerators[nevt], running_sum_over_helicities
+            fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
 #endif
-            ) /* clang-format on */
+            const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+            )
+#endif /* clang-format on */
   {
     mgDebugInitialise();

@@ -3080,20 +1316,14 @@ namespace mg5amcCpu
     // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754)
     constexpr int nprocesses = 1;
     static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" );
-    constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter
+    constexpr int process_id = 1; // code generation source: standalone_cudacpp
     static_assert( process_id == 1, "Assume process_id == 1" );
   }

   // Denominators: spins, colors and identical particles
   constexpr int helcolDenominators[1] = { 512 }; // assume nprocesses == 1 (#272 and #343)

-#ifdef MGONGPUCPP_GPUIMPL
-  // Remember: in CUDA this is a kernel for one event, in c++ this processes n events
-  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-  using CID_ACCESS =
DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -3105,17 +1335,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -3141,93 +1374,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page
-    unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
-    if( allChannelIds != nullptr )
-    {
-      const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds)
-      const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
-      // NB: channelIds_sv is a scalar in CUDA
-      channelId = channelIds_sv;
-      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
-    }
-#endif
-    // Running sum of partial amplitudes squared for event by event color selection (#402)
-    // (for the single event processed in calculate_wavefunctions)
-    fptype_sv jamp2_sv[nParity * ncolor] = { 0 };
-    fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event)
+    // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream)
+    // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamps for each helicity
+    // In multichannel mode, also compute the running sums over helicities of the numerators and denominators (squared jamp2s are handled in step 1b below)
     for( int ighel = 0; ighel < cNGoodHel; ighel++ )
     {
       const int ihel = cGoodHel[ighel];
+      fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2;
+      fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv );
+      fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
+      fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
+      calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads );
 #else
-      calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv );
+      calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads );
 #endif
-      MEs_ighel[ighel] = allMEs[ievt];
-    }
-    // Event-by-event random choice of helicity #403
-    //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] );
-    for( int ighel = 0; ighel < cNGoodHel; ighel++ )
-    {
-      if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) )
-      {
-        const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1]
-        allselhel[ievt] = ihelF;
-        //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF );
-        break;
-      }
     }
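Aside: the loop above issues one calculate_jamps per good helicity on its own stream, so independent helicities can execute concurrently. A minimal CUDA sketch of the one-helicity-per-stream pattern (illustrative names; the plugin's gpuStream_t and gpuLaunchKernelStream abstract cudaStream_t and hipStream_t):

#include <cuda_runtime.h>

__global__ void helicityKernel( float* buf ) { /* work for one good helicity */ }

void launchPerHelicity( float** helBufs, cudaStream_t* helStreams, int nGoodHel, int blocks, int threads )
{
  for( int ighel = 0; ighel < nGoodHel; ighel++ )
    helicityKernel<<<blocks, threads, 0, helStreams[ighel]>>>( helBufs[ighel] ); // kernels in different streams may overlap
  cudaDeviceSynchronize(); // join all helicity streams before combining their results
}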
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // Event-by-event random choice of color #402
-    if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
+    // (1b) Then, in multichannel mode, also compute the running sums over helicities of the squared jamp2s within each helicity stream
+    for( int ighel = 0; ighel < cNGoodHel; ighel++ )
     {
-      if( channelId > mgOnGpu::nchannels )
-      {
-        printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
-        assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
-      }
-      // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
-      // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
-      const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
-      if( iconfig <= 0 )
-      {
-        printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
-        assert( iconfig > 0 ); // SANITY CHECK #917
-      }
-      else if( iconfig > (int)mgOnGpu::nconfigSDE )
-      {
-        printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE );
-        assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
-      }
-      fptype targetamp[ncolor] = { 0 };
-      // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
-      for( int icolC = 0; icolC < ncolor; icolC++ )
-      {
-        if( icolC == 0 )
-          targetamp[icolC] = 0;
-        else
-          targetamp[icolC] = targetamp[icolC - 1];
-        // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1)
-        if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC];
-      }
-      //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
-      for( int icolC = 0; icolC < ncolor; icolC++ )
-      {
-        if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) )
-        {
-          allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
-          //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
-          break;
-        }
-      }
+      fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2;
+      gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s );
     }
+#endif
+    // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps
+    if( !ghelBlasHandles )
+      assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
+    else
+      assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu)
+    for( int ighel = 0; ighel < cNGoodHel; ighel++ )
     {
-      allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931)
+      fptype* hAllMEs = ghelAllMEs + ighel * nevt;
+      fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2;
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+      fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr );
+      if( hAllBlasTmp )
+        gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0)
+#else
+      fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr );
+      if( hAllBlasTmp )
+        gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...)
+#endif
+#ifndef MGONGPU_HAS_NO_BLAS
+      gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ?
&( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -3269,7 +1472,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -3292,7 +1495,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -3301,25 +1504,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -3329,8 +1538,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -3346,11 +1557,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -3452,14 +1664,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h index b6e3ba16d4..22c61c860f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO 
development team and contributors.
 // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend.
 //==========================================================================
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================

@@ -19,6 +19,7 @@

 #include "mgOnGpuVectors.h"

+#include "GpuAbstraction.h"
 #include "Parameters_sm.h"

 #include <vector>

@@ -75,17 +76,17 @@ namespace mg5amcCpu
     static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu-
     static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar)
     static constexpr int ndiagrams = 123; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu-
+    static constexpr int ncolor = 24; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu-

     // Hardcoded parameters for this process (constant class variables)
     // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)]
     // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)]
-    // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)]
-    //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z)
+    // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)]
+    static const int nwf = 26; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z)

     // Other variables of this instance (???)
     //static const int ninitial = CPPProcess::npari;
     //static const int nexternal = 6; // CPPProcess::npar (nexternal was nioparticles)
     //static const int nwavefuncs = 6; // (?!?! this should be nwf but export_cpp gives the wrong value here)
     //static const int namplitudes = 159;
     //static const int ncomb = 64; // CPPProcess::ncomb

@@ -122,23 +123,26 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------

 #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
-  __global__ void
+  void
   sigmaKin_getGoodHel( const fptype* allmomenta,   // input: momenta[nevt*npar*4]
                        const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
                        fptype* allMEs,             // output: allMEs[nevt], |M|^2 final_avg_over_helicities
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                       fptype* allNumerators,   // output: multichannel numerators[nevt], running_sum_over_helicities
-                       fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
+                       fptype* allNumerators,   // output: numerators[nevt], running_sum_over_helicities
+                       fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities
 #endif
-                       bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation)
+                       fptype* allJamps, // output: jamp[ncolor*2*nevt]
+                       fptype* allWfs,   // output: wf[nwf*nw6*2*nevt]
+                       bool* isGoodHel,  // output: isGoodHel[ncomb] - device array (GPU implementation)
+                       const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #else
-  __global__ void
+  void
   sigmaKin_getGoodHel( const fptype* allmomenta,   // input: momenta[nevt*npar*4]
                        const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
                        fptype* allMEs,             // output: allMEs[nevt], |M|^2 final_avg_over_helicities
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                       fptype* allNumerators,   // output: multichannel numerators[nevt], running_sum_over_helicities
-                       fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
+                       fptype* allNumerators,   // output: numerators[nevt], running_sum_over_helicities
+                       fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities
 #endif
                        bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation)
                        const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)

@@ -152,34 +156,46 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------

 #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
-  __global__ void
+  void
   sigmaKin( const fptype* allmomenta,   // input: momenta[nevt*npar*4]
             const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
             const fptype* allrndhel,    // input: random numbers[nevt] for helicity selection
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
             const fptype* allrndcol,    // input: random numbers[nevt] for color selection
+            const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+#endif
             fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
+            int* allselhel, // output: helicity selection[nevt]
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-            const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
-            fptype* allNumerators,   // output: multichannel numerators[nevt], running_sum_over_helicities
-            fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
+            int* allselcol,       // output: color selection[nevt]
+            fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
+            fptype* ghelAllNumerators, // tmp: allNumerators
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig.f index 850bc73f22..1418b77839 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig1.f index 7af9753fb7..60103eb65c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -137,14 +137,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -219,7 +219,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -290,6 +290,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -373,12 +377,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -484,6 +488,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/color_sum.cc new file mode 100644 index 0000000000..02db3d0204 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/color_sum.cc @@ -0,0 +1,405 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
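Aside: the new color_sum.cc added below computes |M|^2 as the quadratic form J^dagger (CF/denom) J over the ncolor QCD partial amplitudes; the imaginary part vanishes because the color matrix is real and symmetric. For orientation, a naive loop-level reference of that sum (sketch only, with real and imaginary parts stored separately):

// me = sum_ij (Ri*Rj + Ii*Ij) * cf[i][j] / denom[i] (row i of CF divided by denom[i])
double colorSumReference( const double* jampR, const double* jampI,
                          const double* cf, const double* denom, int ncolor )
{
  double me = 0;
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ )
      me += ( jampR[i] * jampR[j] + jampI[i] * jampI[j] ) * cf[i * ncolor + j] / denom[i];
  return me;
}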
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=24) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54 }; // 1-D array[24] + + // The color matrix (initialize all array elements, with ncolor=24) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 512, -64, -64, 8, 8, 80, -64, 8, 8, -1, -1, -10, 8, -1, 80, -10, 71, 62, -1, -10, -10, 62, 62, -28 }, + { -64, 512, 8, 80, -64, 8, 8, -64, -1, -10, 8, -1, -1, -10, -10, 62, 62, -28, 8, -1, 80, -10, 71, 62 }, + { -64, 8, 512, -64, 80, 8, 8, -1, 80, -10, 71, 62, -64, 8, 8, -1, -1, -10, -10, -1, 62, -28, -10, 62 }, + { 8, 80, -64, 512, 8, -64, -1, -10, -10, 62, 62, -28, 8, -64, -1, -10, 8, -1, -1, 8, 71, 62, 80, -10 }, + { 8, -64, 80, 8, 512, -64, -1, 8, 71, 62, 80, -10, -10, -1, 62, -28, -10, 62, -64, 8, 8, -1, -1, -10 }, + { 80, 8, 8, -64, -64, 512, -10, -1, 62, -28, -10, 62, -1, 8, 71, 62, 80, -10, 8, -64, -1, -10, 8, -1 }, + { -64, 8, 8, -1, -1, -10, 512, -64, -64, 8, 8, 80, 80, -10, 8, -1, 62, 71, -10, 62, -1, -10, -28, 62 }, + { 8, -64, -1, -10, 8, -1, -64, 512, 8, 80, -64, 8, -10, 62, -1, -10, -28, 62, 80, -10, 8, -1, 62, 71 }, + { 8, -1, 80, -10, 71, 62, -64, 8, 512, -64, 80, 8, 8, -1, -64, 8, -10, -1, 62, -28, -10, -1, 62, -10 }, + { -1, -10, -10, 62, 62, -28, 8, 80, -64, 512, 8, -64, -1, -10, 8, -64, -1, 8, 71, 62, -1, 8, -10, 80 }, + { -1, 8, 71, 62, 80, -10, 8, -64, 80, 8, 512, -64, 62, -28, -10, -1, 62, -10, 8, -1, -64, 8, -10, -1 }, + { -10, -1, 62, -28, -10, 62, 80, 8, 8, -64, -64, 512, 71, 62, -1, 8, -10, 80, -1, -10, 8, -64, -1, 8 }, + { 8, -1, -64, 8, -10, -1, 80, -10, 8, -1, 62, 71, 512, -64, -64, 8, 8, 80, 62, -10, -28, 62, -1, -10 }, + { -1, -10, 8, -64, -1, 8, -10, 62, -1, -10, -28, 62, -64, 512, 8, 80, -64, 8, -10, 80, 62, 71, 8, -1 }, + { 80, -10, 8, -1, 62, 71, 8, -1, -64, 8, -10, -1, -64, 8, 512, -64, 80, 8, -28, 62, 62, -10, -10, -1 }, + { -10, 62, -1, -10, -28, 62, -1, -10, 8, -64, -1, 8, 8, 80, -64, 512, 8, -64, 62, 71, -10, 80, -1, 8 }, + { 71, 62, -1, 8, -10, 80, 62, -28, -10, -1, 62, -10, 8, -64, 80, 8, 512, -64, -1, 8, -10, -1, -64, 8 }, + { 62, -28, -10, -1, 62, -10, 71, 62, -1, 8, -10, 80, 80, 8, 8, -64, -64, 512, -10, -1, -1, 8, 8, -64 }, + { -1, 8, -10, -1, -64, 8, -10, 80, 62, 71, 8, -1, 62, -10, -28, 62, -1, -10, 512, -64, -64, 8, 8, 80 }, + { -10, -1, -1, 8, 8, -64, 62, -10, -28, 62, -1, -10, -10, 80, 62, 71, 8, -1, -64, 512, 8, 80, -64, 8 }, + { -10, 80, 62, 71, 8, -1, -1, 8, -10, -1, -64, 8, -28, 62, 62, -10, -10, -1, -64, 8, 512, -64, 80, 8 }, + { 62, -10, -28, 62, -1, -10, -10, -1, -1, 8, 8, -64, 62, 71, -10, 80, -1, 8, 8, 80, -64, 512, 8, -64 }, + { 62, 71, -10, 80, -1, 8, -28, 62, 62, -10, -10, -1, -1, 8, -10, -1, -64, 8, 8, -64, 80, 8, 512, -64 }, + { -28, 62, 62, -10, -10, -1, 62, 71, -10, 80, -1, 8, -10, -1, -1, 8, 8, -64, 80, 8, 8, -64, -64, 512 } }; // 2-D array[24][24] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + 
template<typename T>
+  struct NormalizedColorMatrix
+  {
+    constexpr __host__ __device__ NormalizedColorMatrix()
+      : value()
+    {
+      for( int icol = 0; icol < ncolor; icol++ )
+        for( int jcol = 0; jcol < ncolor; jcol++ )
+          value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol];
+    }
+    T value[ncolor * ncolor];
+  };
+  // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas)
+  static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor];
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  void createNormalizedColorMatrix()
+  {
+    static bool first = true;
+    if( first )
+    {
+      first = false;
+      constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2;
+      gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) );
+    }
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifndef MGONGPUCPP_GPUIMPL
+  void
+  color_sum_cpu( fptype* allMEs,              // output: allMEs[nevt], add |M|^2 for this specific helicity
+                 const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity
+                 const int ievt0 )            // input: first event number in current C++ event page (for CUDA, ievt depends on threadid)
+  {
+    // Pre-compute a constexpr triangular color matrix properly normalized #475
+    struct TriangularNormalizedColorMatrix
+    {
+      // See https://stackoverflow.com/a/34465458
+      __host__ __device__ constexpr TriangularNormalizedColorMatrix()
+        : value()
+      {
+        for( int icol = 0; icol < ncolor; icol++ )
+        {
+          // Diagonal terms
+          value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol];
+          // Off-diagonal terms
+          for( int jcol = icol + 1; jcol < ncolor; jcol++ )
+            value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol];
+        }
+      }
+      fptype2 value[ncolor][ncolor];
+    };
+    static constexpr auto cf2 = TriangularNormalizedColorMatrix();
+    // Use the property that M is a real matrix (see #475):
+    // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA + iAMB - iBMA + BMB = AMA + BMB
+    // (the two imaginary terms cancel because M is symmetric, so AMB = BMA).
+    // In addition, on C++ use the property that M is symmetric (see #475),
+    // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time:
+    // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix.
+    // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 
s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = 
allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer
+    fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer
+    fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer
+    // Convert jamps from double to float
+    gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps );
+    // Real and imaginary components
+    const fptype2* allJampsReal = allJampsFpt2;
+    const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt;
+#else
+    static_assert( std::is_same<fptype, fptype2>::value ); // sanity check
+    // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer
+    fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer
+    fptype2* allMEsFpt2 = allMEs;
+    // Real and imaginary components
+    const fptype2* allJampsReal = allJamps;                 // this is not a cast (the two types are identical)
+    const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical)
+#endif
+    // Real and imaginary components
+    fptype2* allZtempReal = allZtempBoth;
+    fptype2* allZtempImag = allZtempBoth + ncolor * nevt;
+
+    // Note, new striding for cuBLAS from DeviceAccessJamp:
+    // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1"
+    // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1"
+
+    // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag
+    // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp
+    fptype2 alpha1 = 1;
+    fptype2 beta1 = 0;
+    const int ncolorM = ncolor;
+    const int nevtN = nevt;
+    const int ncolorK = ncolor;
+    checkGpuBlas( gpuBlasTgemm( *pBlasHandle,
+                                GPUBLAS_OP_N, // do not transpose ColMat
+                                GPUBLAS_OP_T, // transpose JampsV (new1)
+                                ncolorM, nevtN, ncolorK,
+                                &alpha1,
+                                devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK
+                                allJampsReal, nevtN,    // JampsV is nevtN x ncolorK (new1)
+                                &beta1,
+                                allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN
+    checkGpuBlas( gpuBlasTgemm( *pBlasHandle,
+                                GPUBLAS_OP_N, // do not transpose ColMat
+                                GPUBLAS_OP_T, // transpose JampsV (new1)
+                                ncolorM, nevtN, ncolorK,
+                                &alpha1,
+                                devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK
+                                allJampsImag, nevtN,    // JampsV is nevtN x ncolorK (new1)
+                                &beta1,
+                                allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN
+
+    // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt]
+    // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME
+    // Use cublasSgemmStridedBatched to perform these batched dot products in one call
+    fptype2 alpha2 = 1;
+    fptype2 beta2 = 1;
+    checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle,
+                                              GPUBLAS_OP_N, // do not transpose JampsV (new1)
+                                              GPUBLAS_OP_N, // do not transpose Tmp
+                                              1, 1, ncolor, // result is 1x1 (dot product)
+                                              &alpha2,
+                                              allJampsReal, nevt, 1,        // allJamps is nevt x ncolor, stride 1 for each ievt column (new1)
+                                              allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column
+                                              &beta2,
+                                              allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e.
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/diagram_boilerplate.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/diagrams.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/diagrams.h new file mode 100644 index 0000000000..08f07c1187 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/diagrams.h @@ -0,0 +1,4120 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
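Aside: color_sum_blas above maps the same quadratic form onto two BLAS calls, a gemm producing Ztemp = CF_norm x Jamps and a strided-batched gemm of per-event dot products. A loop-level stand-in for the two steps, shown for one real component with an illustrative row-major layout (the code runs it separately for real and imaginary parts):

void colorSumTwoStepSketch( double* MEs, const double* jamp /* [ncolor][nevt] */,
                            const double* cfNorm /* [ncolor][ncolor] */,
                            double* ztemp /* [ncolor][nevt] */, int ncolor, int nevt )
{
  // Step 1 (gemm): ztemp = cfNorm * jamp
  for( int i = 0; i < ncolor; i++ )
    for( int e = 0; e < nevt; e++ )
    {
      double sum = 0;
      for( int j = 0; j < ncolor; j++ ) sum += cfNorm[i * ncolor + j] * jamp[j * nevt + e];
      ztemp[i * nevt + e] = sum;
    }
  // Step 2 (batched dot products): MEs[e] += jamp[:,e] . ztemp[:,e]
  for( int e = 0; e < nevt; e++ )
    for( int i = 0; i < ncolor; i++ )
      MEs[e] += jamp[i * nevt + e] * ztemp[i * nevt + e];
}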
+
+/* clang-format off */
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,    // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators,  // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+            const fptype* momenta, // input: momenta[npar*4*nevtORneppV]
+            const int ihel )       // input: helicity (0 to ncomb-1)
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+#ifdef MGONGPUCPP_GPUIMPL
+    using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events
+#else
+    using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events
+#endif
+    // *** DIAGRAM 1 OF 123 ***
+    // Wavefunction(s) for diagram number 1
+    vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 );
+    vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 );
+    oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 );
+    ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 );
+    vxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 );
+    vxxxxx( momenta, 0., cHel[ihel][5], +1, w_fp[5], 5 );
+    VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[6] );
+    FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[7] );
+    // Amplitude(s) for diagram number 1
+    VVVV1_0( w_fp[6], w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVVV3_0( w_fp[6], w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVVV4_0( w_fp[6], w_fp[7], w_fp[4], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 123 *** + // Wavefunction(s) for diagram number 2 + VVV1P0_1( w_fp[6], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 2 + VVV1_0( w_fp[7], w_fp[5], w_fp[8], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check 
+#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 123 *** + // Wavefunction(s) for diagram number 3 + VVV1P0_1( w_fp[6], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[9] ); + // Amplitude(s) for diagram number 3 + VVV1_0( w_fp[7], w_fp[4], w_fp[9], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 123 *** + // Wavefunction(s) for diagram number 4 + VVV1P0_1( w_fp[4], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 4 + VVV1_0( w_fp[6], w_fp[7], w_fp[10], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent 
couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 123 *** + // Wavefunction(s) for diagram number 5 + FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); + FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); + // Amplitude(s) for diagram number 5 + FFV1_0( w_fp[12], w_fp[11], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 6 OF 123 *** + // Wavefunction(s) for diagram number 6 + // (none) + // Amplitude(s) for diagram number 6 + FFV1_0( w_fp[3], w_fp[11], w_fp[9], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram7( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // 
input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 7 OF 123 *** + // Wavefunction(s) for diagram number 7 + FFV1_2( w_fp[3], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); + // Amplitude(s) for diagram number 7 + FFV1_0( w_fp[13], w_fp[11], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram8( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 8 OF 123 *** + // Wavefunction(s) for diagram number 8 + FFV1_1( w_fp[2], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] ); + // Amplitude(s) for diagram number 8 + FFV1_0( w_fp[12], w_fp[14], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram9( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and 
denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 9 OF 123 *** + // Wavefunction(s) for diagram number 9 + // (none) + // Amplitude(s) for diagram number 9 + FFV1_0( w_fp[3], w_fp[14], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram10( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 10 OF 123 *** + // Wavefunction(s) for diagram number 10 + FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[15] ); + // Amplitude(s) for diagram number 10 + FFV1_0( w_fp[15], w_fp[14], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram11( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 11 OF 123 *** 
+ // Wavefunction(s) for diagram number 11 + FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); + // Amplitude(s) for diagram number 11 + FFV1_0( w_fp[15], w_fp[16], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram12( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 12 OF 123 *** + // Wavefunction(s) for diagram number 12 + // (none) + // Amplitude(s) for diagram number 12 + FFV1_0( w_fp[15], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram13( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 13 OF 123 *** + // Wavefunction(s) for diagram number 13 + // (none) + // Amplitude(s) for diagram number 13 + FFV1_0( w_fp[13], w_fp[16], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 13 ) numerators_sv += 
cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram14( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 14 OF 123 *** + // Wavefunction(s) for diagram number 14 + // (none) + // Amplitude(s) for diagram number 14 + FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram15( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 15 OF 123 *** + // Wavefunction(s) for diagram number 15 + // (none) + // Amplitude(s) for diagram number 15 + FFV1_0( w_fp[3], w_fp[16], w_fp[10], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 
amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram16( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 16 OF 123 *** + // Wavefunction(s) for diagram number 16 + // (none) + // Amplitude(s) for diagram number 16 + FFV1_0( w_fp[12], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram17( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 17 OF 123 *** + // Wavefunction(s) for diagram number 17 + FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); + FFV1_2( w_fp[3], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); + FFV1_1( w_fp[12], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); + // Amplitude(s) for diagram number 17 + FFV1_0( w_fp[16], w_fp[8], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 17 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram18( fptype* wfs, // input/output 
wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 18 OF 123 *** + // Wavefunction(s) for diagram number 18 + FFV1_1( w_fp[12], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); + // Amplitude(s) for diagram number 18 + FFV1_0( w_fp[16], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram19( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 19 OF 123 *** + // Wavefunction(s) for diagram number 19 + // (none) + // Amplitude(s) for diagram number 19 + FFV1_0( w_fp[16], w_fp[12], w_fp[10], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram20( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and 
independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 20 OF 123 *** + // Wavefunction(s) for diagram number 20 + VVV1P0_1( w_fp[1], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[6] ); + FFV1P0_3( w_fp[3], w_fp[12], COUPs[1], 1.0, 0., 0., w_fp[17] ); + // Amplitude(s) for diagram number 20 + VVV1_0( w_fp[6], w_fp[5], w_fp[17], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram21( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 21 OF 123 *** + // Wavefunction(s) for diagram number 21 + // (none) + // Amplitude(s) for diagram number 21 + FFV1_0( w_fp[3], w_fp[9], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 21 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram22( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // 
input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 22 OF 123 *** + // Wavefunction(s) for diagram number 22 + // (none) + // Amplitude(s) for diagram number 22 + FFV1_0( w_fp[13], w_fp[12], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram23( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 23 OF 123 *** + // Wavefunction(s) for diagram number 23 + VVV1P0_1( w_fp[1], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[18] ); + // Amplitude(s) for diagram number 23 + VVV1_0( w_fp[18], w_fp[4], w_fp[17], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram24( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the 
boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 24 OF 123 *** + // Wavefunction(s) for diagram number 24 + // (none) + // Amplitude(s) for diagram number 24 + FFV1_0( w_fp[3], w_fp[8], w_fp[18], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram25( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 25 OF 123 *** + // Wavefunction(s) for diagram number 25 + // (none) + // Amplitude(s) for diagram number 25 + FFV1_0( w_fp[15], w_fp[12], w_fp[18], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 25 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram26( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 26 OF 123 *** + // Wavefunction(s) for diagram number 26 + FFV1_1( w_fp[12], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[19] ); + // Amplitude(s) for diagram number 26 + FFV1_0( w_fp[15], w_fp[19], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram27( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 27 OF 123 *** + // Wavefunction(s) for diagram number 27 + // (none) + // Amplitude(s) for diagram number 27 + FFV1_0( w_fp[15], w_fp[9], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram28( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 28 OF 123 *** + // Wavefunction(s) for diagram number 28 + // (none) + // Amplitude(s) for diagram number 28 + FFV1_0( w_fp[13], w_fp[19], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram29( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ 
(1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 29 OF 123 *** + // Wavefunction(s) for diagram number 29 + // (none) + // Amplitude(s) for diagram number 29 + FFV1_0( w_fp[13], w_fp[8], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram30( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 30 OF 123 *** + // Wavefunction(s) for diagram number 30 + // (none) + // Amplitude(s) for diagram number 30 + FFV1_0( w_fp[3], w_fp[19], w_fp[10], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram31( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A 
uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 31 OF 123 *** + // Wavefunction(s) for diagram number 31 + // (none) + // Amplitude(s) for diagram number 31 + VVV1_0( w_fp[1], w_fp[10], w_fp[17], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram32( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 32 OF 123 *** + // Wavefunction(s) for diagram number 32 + VVVV1P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[17] ); + VVVV3P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[19] ); + VVVV4P0_1( w_fp[1], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 32 + FFV1_0( w_fp[3], w_fp[12], w_fp[17], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[12], w_fp[19], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[12], w_fp[8], COUPs[1], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram33( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* 
couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 33 OF 123 *** + // Wavefunction(s) for diagram number 33 + FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); + FFV1_1( w_fp[2], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); + FFV1_2( w_fp[12], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[20] ); + // Amplitude(s) for diagram number 33 + FFV1_0( w_fp[20], w_fp[9], w_fp[5], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram34( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 34 OF 123 *** + // Wavefunction(s) for diagram number 34 + FFV1_2( w_fp[12], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] ); + // Amplitude(s) for diagram number 34 + FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 34 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram35( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* 
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram35( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 35 OF 123 ***
+    // Wavefunction(s) for diagram number 35
+    // (none)
+    // Amplitude(s) for diagram number 35
+    FFV1_0( w_fp[12], w_fp[9], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram36( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 36 OF 123 ***
+    // Wavefunction(s) for diagram number 36
+    FFV1P0_3( w_fp[12], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[22] );
+    // Amplitude(s) for diagram number 36
+    VVV1_0( w_fp[6], w_fp[5], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram37( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 37 OF 123 ***
+    // Wavefunction(s) for diagram number 37
+    // (none)
+    // Amplitude(s) for diagram number 37
+    FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 37 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram38( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 38 OF 123 ***
+    // Wavefunction(s) for diagram number 38
+    // (none)
+    // Amplitude(s) for diagram number 38
+    FFV1_0( w_fp[12], w_fp[14], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 38 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram39( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 39 OF 123 ***
+    // Wavefunction(s) for diagram number 39
+    // (none)
+    // Amplitude(s) for diagram number 39
+    VVV1_0( w_fp[18], w_fp[4], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 39 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram40( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 40 OF 123 ***
+    // Wavefunction(s) for diagram number 40
+    // (none)
+    // Amplitude(s) for diagram number 40
+    FFV1_0( w_fp[20], w_fp[2], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 40 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram41( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 41 OF 123 ***
+    // Wavefunction(s) for diagram number 41
+    // (none)
+    // Amplitude(s) for diagram number 41
+    FFV1_0( w_fp[12], w_fp[11], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 41 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
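[Editor's note] The two lines guarded by MGONGPU_SUPPORTS_MULTICHANNEL in each diagram implement the single-diagram-enhancement (SDE) bookkeeping: |amp|^2 of the sampler-selected diagram accumulates into the numerators, |amp|^2 of every diagram into the denominators, and their ratio reweights the event. A standalone toy version in plain C++, independent of the plugin and with made-up amplitude values:

  #include <complex>
  #include <iostream>
  #include <vector>

  int main()
  {
    using cxtype = std::complex<double>;
    auto cxabs2 = []( const cxtype& c ) { return c.real() * c.real() + c.imag() * c.imag(); };
    const std::vector<cxtype> amps = { { 0.3, -1.2 }, { 2.0, 0.1 }, { -0.7, 0.4 } }; // toy diagram amplitudes
    const unsigned int channelId = 2; // pretend the sampler chose diagram 2 (0 would disable SDE)
    double numerator = 0., denominator = 0.;
    for( unsigned int d = 1; d <= amps.size(); d++ )
    {
      if( channelId == d ) numerator += cxabs2( amps[d - 1] );   // selected diagram only
      if( channelId != 0 ) denominator += cxabs2( amps[d - 1] ); // all diagrams
    }
    std::cout << "SDE weight for channel 2: " << numerator / denominator << std::endl;
    return 0;
  }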
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram42( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 42 OF 123 ***
+    // Wavefunction(s) for diagram number 42
+    FFV1_2( w_fp[12], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] );
+    // Amplitude(s) for diagram number 42
+    FFV1_0( w_fp[23], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 42 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram43( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 43 OF 123 ***
+    // Wavefunction(s) for diagram number 43
+    // (none)
+    // Amplitude(s) for diagram number 43
+    FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 43 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram44( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 44 OF 123 ***
+    // Wavefunction(s) for diagram number 44
+    // (none)
+    // Amplitude(s) for diagram number 44
+    FFV1_0( w_fp[23], w_fp[14], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 44 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram45( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 45 OF 123 ***
+    // Wavefunction(s) for diagram number 45
+    // (none)
+    // Amplitude(s) for diagram number 45
+    FFV1_0( w_fp[20], w_fp[14], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 45 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram46( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 46 OF 123 ***
+    // Wavefunction(s) for diagram number 46
+    // (none)
+    // Amplitude(s) for diagram number 46
+    FFV1_0( w_fp[23], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 46 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram47( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 47 OF 123 ***
+    // Wavefunction(s) for diagram number 47
+    // (none)
+    // Amplitude(s) for diagram number 47
+    VVV1_0( w_fp[1], w_fp[10], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 47 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram48( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 48 OF 123 ***
+    // Wavefunction(s) for diagram number 48
+    // (none)
+    // Amplitude(s) for diagram number 48
+    FFV1_0( w_fp[12], w_fp[2], w_fp[17], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+    FFV1_0( w_fp[12], w_fp[2], w_fp[19], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+    FFV1_0( w_fp[12], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram49( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 49 OF 123 ***
+    // Wavefunction(s) for diagram number 49
+    VVV1P0_1( w_fp[0], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[12] );
+    FFV1_2( w_fp[3], w_fp[12], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[22] );
+    // Amplitude(s) for diagram number 49
+    FFV1_0( w_fp[22], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 49 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram50( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 50 OF 123 ***
+    // Wavefunction(s) for diagram number 50
+    VVV1P0_1( w_fp[12], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[23] );
+    // Amplitude(s) for diagram number 50
+    FFV1_0( w_fp[3], w_fp[9], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 50 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram51( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 51 OF 123 ***
+    // Wavefunction(s) for diagram number 51
+    // (none)
+    // Amplitude(s) for diagram number 51
+    FFV1_0( w_fp[13], w_fp[9], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 51 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
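[Editor's note] The J_ACCESS::kernelAccessIcol lines scatter each diagram amplitude into the color-ordered partial amplitudes ("jamps") with coefficients that are always +-1 or +-i, which is why only the four patterns "+= amp", "-= amp", "+= cxtype( 0, 1 ) * amp" and "-= cxtype( 0, 1 ) * amp" appear. A standalone illustration in plain C++ (ncolor = 24 is an inference from the jamp indices 0..23 used in this file, not a value stated in this diff):

  #include <complex>
  #include <iostream>
  #include <vector>

  int main()
  {
    using cxtype = std::complex<double>;
    const int ncolor = 24; // inferred: jamp indices up to 23 appear in this process
    std::vector<cxtype> jamp( ncolor, cxtype( 0., 0. ) );
    const cxtype amp( 0.5, -0.25 );     // toy amplitude for one diagram
    jamp[9] += cxtype( 0., 1. ) * amp;  // same +i coefficient pattern as diagram 35 above
    jamp[11] -= cxtype( 0., 1. ) * amp; // same -i coefficient pattern as diagram 35 above
    std::cout << "jamp[9] = " << jamp[9] << ", jamp[11] = " << jamp[11] << std::endl;
    return 0;
  }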
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram52( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 52 OF 123 ***
+    // Wavefunction(s) for diagram number 52
+    FFV1_1( w_fp[2], w_fp[12], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[20] );
+    // Amplitude(s) for diagram number 52
+    FFV1_0( w_fp[16], w_fp[20], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 52 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram53( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 53 OF 123 ***
+    // Wavefunction(s) for diagram number 53
+    // (none)
+    // Amplitude(s) for diagram number 53
+    FFV1_0( w_fp[16], w_fp[2], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 53 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram54( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 54 OF 123 ***
+    // Wavefunction(s) for diagram number 54
+    // (none)
+    // Amplitude(s) for diagram number 54
+    FFV1_0( w_fp[16], w_fp[14], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 54 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram55( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 55 OF 123 ***
+    // Wavefunction(s) for diagram number 55
+    // (none)
+    // Amplitude(s) for diagram number 55
+    FFV1_0( w_fp[3], w_fp[20], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 55 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram56( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 56 OF 123 ***
+    // Wavefunction(s) for diagram number 56
+    // (none)
+    // Amplitude(s) for diagram number 56
+    FFV1_0( w_fp[22], w_fp[2], w_fp[18], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 56 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram57( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 57 OF 123 ***
+    // Wavefunction(s) for diagram number 57
+    // (none)
+    // Amplitude(s) for diagram number 57
+    VVV1_0( w_fp[12], w_fp[18], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 57 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram58( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 58 OF 123 ***
+    // Wavefunction(s) for diagram number 58
+    // (none)
+    // Amplitude(s) for diagram number 58
+    VVVV1_0( w_fp[12], w_fp[1], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVVV3_0( w_fp[12], w_fp[1], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVVV4_0( w_fp[12], w_fp[1], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram59( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 59 OF 123 ***
+    // Wavefunction(s) for diagram number 59
+    VVV1P0_1( w_fp[12], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[21] );
+    // Amplitude(s) for diagram number 59
+    VVV1_0( w_fp[7], w_fp[5], w_fp[21], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 59 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
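[Editor's note] Because every diagramNN kernel has the identical signature, a caller can drive them from a table instead of 123 hand-written calls. The plugin's actual driver is not shown in this diff; the following is a hedged sketch for the C++ build, assuming __global__ expands to nothing there and that one call processes one event page:

  // Hedged sketch (assumption, not from this diff): uniform dispatch over diagram kernels
  typedef void ( *diagram_t )( fptype*, fptype*, const unsigned int*, const fptype**, fptype*, fptype* );
  static const diagram_t s_diagrams[] = { diagram57, diagram58, diagram59 }; // ... all 123 in practice
  inline void runDiagrams( fptype* wfs, fptype* jamps, const unsigned int* channelIds, const fptype** COUPs, fptype* numerators, fptype* denominators )
  {
    for( diagram_t d : s_diagrams ) d( wfs, jamps, channelIds, COUPs, numerators, denominators );
  }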
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram60( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 60 OF 123 ***
+    // Wavefunction(s) for diagram number 60
+    // (none)
+    // Amplitude(s) for diagram number 60
+    VVV1_0( w_fp[1], w_fp[7], w_fp[23], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 60 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram61( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 61 OF 123 ***
+    // Wavefunction(s) for diagram number 61
+    // (none)
+    // Amplitude(s) for diagram number 61
+    FFV1_0( w_fp[3], w_fp[14], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 61 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram62( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 62 OF 123 ***
+    // Wavefunction(s) for diagram number 62
+    // (none)
+    // Amplitude(s) for diagram number 62
+    FFV1_0( w_fp[22], w_fp[14], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 62 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram63( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 63 OF 123 ***
+    // Wavefunction(s) for diagram number 63
+    // (none)
+    // Amplitude(s) for diagram number 63
+    FFV1_0( w_fp[13], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 63 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0];
+  }
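[Editor's note] On the GPU build (MGONGPUCPP_GPUIMPL) the same functions are __global__ kernels reading the dependent couplings for all events. A hedged launch sketch using plain CUDA syntax; nblocks, nthreads and the dev* buffer names are assumptions of this note, not identifiers from this diff:

  // Hedged sketch (assumption, not from this diff): sequential kernel launches for one helicity
  diagram60<<<nblocks, nthreads>>>( devWfs, devJamps, devChannelIds, devCouplings, devNumerators, devDenominators );
  diagram61<<<nblocks, nthreads>>>( devWfs, devJamps, devChannelIds, devCouplings, devNumerators, devDenominators );
  assert( cudaPeekAtLastError() == cudaSuccess ); // cudaPeekAtLastError is standard CUDA; this check style is an assumption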
wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 64 OF 123 *** + // Wavefunction(s) for diagram number 64 + // (none) + // Amplitude(s) for diagram number 64 + FFV1_0( w_fp[13], w_fp[20], w_fp[1], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 64 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram65( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 65 OF 123 *** + // Wavefunction(s) for diagram number 65 + VVV1P0_1( w_fp[0], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[20] ); + FFV1_2( w_fp[3], w_fp[20], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] ); + // Amplitude(s) for diagram number 65 + FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 65 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram66( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef 
MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 66 OF 123 *** + // Wavefunction(s) for diagram number 66 + VVV1P0_1( w_fp[20], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[22] ); + // Amplitude(s) for diagram number 66 + FFV1_0( w_fp[3], w_fp[9], w_fp[22], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 66 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram67( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 67 OF 123 *** + // Wavefunction(s) for diagram number 67 + // (none) + // Amplitude(s) for diagram number 67 + FFV1_0( w_fp[15], w_fp[9], w_fp[20], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 67 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram68( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // 
input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 68 OF 123 *** + // Wavefunction(s) for diagram number 68 + FFV1_1( w_fp[2], w_fp[20], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] ); + // Amplitude(s) for diagram number 68 + FFV1_0( w_fp[16], w_fp[23], w_fp[4], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 68 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram69( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 69 OF 123 *** + // Wavefunction(s) for diagram number 69 + // (none) + // Amplitude(s) for diagram number 69 + FFV1_0( w_fp[16], w_fp[2], w_fp[22], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 69 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram70( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, 
numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 70 OF 123 *** + // Wavefunction(s) for diagram number 70 + // (none) + // Amplitude(s) for diagram number 70 + FFV1_0( w_fp[16], w_fp[11], w_fp[20], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 70 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram71( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 71 OF 123 *** + // Wavefunction(s) for diagram number 71 + // (none) + // Amplitude(s) for diagram number 71 + FFV1_0( w_fp[3], w_fp[23], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 71 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram72( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 72 OF 123 *** + // Wavefunction(s) for diagram number 72 
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram72( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 72 OF 123 ***
+    // Wavefunction(s) for diagram number 72
+    // (none)
+    // Amplitude(s) for diagram number 72
+    FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 72 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram73( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 73 OF 123 ***
+    // Wavefunction(s) for diagram number 73
+    // (none)
+    // Amplitude(s) for diagram number 73
+    VVV1_0( w_fp[20], w_fp[6], w_fp[7], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 73 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram74( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 74 OF 123 ***
+    // Wavefunction(s) for diagram number 74
+    // (none)
+    // Amplitude(s) for diagram number 74
+    VVVV1_0( w_fp[20], w_fp[1], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVVV3_0( w_fp[20], w_fp[1], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVVV4_0( w_fp[20], w_fp[1], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
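diagram74 is the first four-gluon-vertex diagram in this hunk (diagrams 93, 100 and 107 follow the same pattern): the vertex is split into three Lorentz structures (VVVV1_0, VVVV3_0, VVVV4_0) that overwrite the same scratch amplitude in turn, each feeding its own subset of color flows, and no numerators/denominators block is generated for them, presumably because a four-point vertex does not map onto a single propagator channel of the SDE multichannel scheme. A toy scalar model of the overwrite-then-accumulate pattern (all names hypothetical):

    #include <complex>
    typedef std::complex<double> cxtype;
    int main()
    {
      cxtype jamp[24] = {};            // one accumulator per color flow
      cxtype amp;                      // single scratch amplitude, overwritten per Lorentz structure
      amp = cxtype( 0.1, 0.2 );        // stands in for VVVV1_0( ..., &amp_fp[0] )
      jamp[4] += cxtype( 0, 1 ) * amp; // fold structure 1 into its color flows
      jamp[7] -= cxtype( 0, 1 ) * amp;
      amp = cxtype( 0.3, -0.1 );       // stands in for VVVV3_0( ... ): scratch reused, earlier sums unaffected
      jamp[4] += cxtype( 0, 1 ) * amp;
      return 0;
    }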
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram75( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 75 OF 123 ***
+    // Wavefunction(s) for diagram number 75
+    VVV1P0_1( w_fp[20], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[12] );
+    // Amplitude(s) for diagram number 75
+    VVV1_0( w_fp[7], w_fp[4], w_fp[12], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 75 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram76( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 76 OF 123 ***
+    // Wavefunction(s) for diagram number 76
+    // (none)
+    // Amplitude(s) for diagram number 76
+    VVV1_0( w_fp[1], w_fp[7], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 76 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
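In every multichannel block, cxabs2 adds the squared modulus of the diagram amplitude to denominators_sv whenever a channel is active, and to numerators_sv only when this diagram is the selected channel; downstream code (not in this hunk) can then reweight by the numerator/denominator ratio for single-diagram enhancement. A self-contained sketch of cxabs2 and of that assumed reweighting:

    #include <complex>
    typedef double fptype;
    // |z|^2 without the square root that std::abs would take
    inline fptype cxabs2( const std::complex<fptype>& z )
    {
      return z.real() * z.real() + z.imag() * z.imag();
    }
    // assumed downstream use (illustration only, not code from this patch):
    //   channelWeight = matrixElement * numerator / denominator;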
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram77( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 77 OF 123 ***
+    // Wavefunction(s) for diagram number 77
+    // (none)
+    // Amplitude(s) for diagram number 77
+    FFV1_0( w_fp[3], w_fp[11], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 77 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram78( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 78 OF 123 ***
+    // Wavefunction(s) for diagram number 78
+    // (none)
+    // Amplitude(s) for diagram number 78
+    FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 78 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram79( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 79 OF 123 ***
+    // Wavefunction(s) for diagram number 79
+    // (none)
+    // Amplitude(s) for diagram number 79
+    FFV1_0( w_fp[15], w_fp[2], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 79 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram80( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 80 OF 123 ***
+    // Wavefunction(s) for diagram number 80
+    // (none)
+    // Amplitude(s) for diagram number 80
+    FFV1_0( w_fp[15], w_fp[23], w_fp[1], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 80 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
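Each kernel only adds (plus or minus) amp or i*amp into a few of the ncolor color amplitudes; the physical |M|^2 emerges only after all 123 diagrams have run, when the color amplitudes are contracted with the color matrix. A sketch of that final contraction with hypothetical names (the generated code does this with its own cf matrix and memory-access classes):

    // jamp[icol] : complex color amplitudes filled by the diagram kernels above
    // cf[i][j], denom[i] : color matrix and row denominators from the generated code
    fptype deltaME = 0;
    for( int i = 0; i < ncolor; i++ )
    {
      cxtype ztemp = cxtype( 0, 0 );
      for( int j = 0; j < ncolor; j++ ) ztemp += cf[i][j] * jamp[j];
      deltaME += ( ztemp.real() * jamp[i].real() + ztemp.imag() * jamp[i].imag() ) / denom[i];
    }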
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram81( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 81 OF 123 ***
+    // Wavefunction(s) for diagram number 81
+    FFV1_1( w_fp[9], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[23] );
+    // Amplitude(s) for diagram number 81
+    FFV1_0( w_fp[15], w_fp[23], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 81 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram82( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 82 OF 123 ***
+    // Wavefunction(s) for diagram number 82
+    FFV1_2( w_fp[15], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] );
+    // Amplitude(s) for diagram number 82
+    FFV1_0( w_fp[12], w_fp[9], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 82 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram83( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 83 OF 123 ***
+    // Wavefunction(s) for diagram number 83
+    // (none)
+    // Amplitude(s) for diagram number 83
+    FFV1_0( w_fp[13], w_fp[23], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 83 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram84( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 84 OF 123 ***
+    // Wavefunction(s) for diagram number 84
+    FFV1_2( w_fp[13], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[21] );
+    // Amplitude(s) for diagram number 84
+    FFV1_0( w_fp[21], w_fp[9], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 84 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram85( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 85 OF 123 ***
+    // Wavefunction(s) for diagram number 85
+    // (none)
+    // Amplitude(s) for diagram number 85
+    FFV1_0( w_fp[3], w_fp[23], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 85 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
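diagrams 81-85 show the other half of the ALOHA calling convention: besides the *_0 routines that only produce an amplitude, the *_1/*_2 routines (FFV1_1, FFV1_2) write the off-shell wavefunction of leg 1 or leg 2, with mass and width taken from cIPD, into a w_fp slot; the slots are a reused scratch pool (w_fp[23], written by FFV1_1 in diagram81 and read by diagrams 83 and 85, is overwritten by VVV1P0_1 in diagram86). A summary of the naming scheme as used in this hunk (convention summary, not declarations from the patch):

    // FFV1_0( f1, f2, v, coup, sign, amp )         -> all legs on-shell, amplitude into amp
    // FFV1_1( f, v, coup, sign, m, w, fout )       -> off-shell fermion for leg 1 into fout
    // FFV1_2( f, v, coup, sign, m, w, fout )       -> off-shell fermion for leg 2 into fout
    // VVV1P0_1( v1, v2, coup, sign, 0., 0., vout ) -> off-shell vector with zero mass/width into vout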
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram86( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 86 OF 123 ***
+    // Wavefunction(s) for diagram number 86
+    VVV1P0_1( w_fp[0], w_fp[10], COUPs[0], 1.0, 0., 0., w_fp[23] );
+    // Amplitude(s) for diagram number 86
+    FFV1_0( w_fp[3], w_fp[9], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 86 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram87( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 87 OF 123 ***
+    // Wavefunction(s) for diagram number 87
+    FFV1_2( w_fp[16], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[22] );
+    // Amplitude(s) for diagram number 87
+    FFV1_0( w_fp[22], w_fp[11], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 87 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram88( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 88 OF 123 ***
+    // Wavefunction(s) for diagram number 88
+    FFV1_1( w_fp[11], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[20] );
+    // Amplitude(s) for diagram number 88
+    FFV1_0( w_fp[16], w_fp[20], w_fp[5], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 88 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram89( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 89 OF 123 ***
+    // Wavefunction(s) for diagram number 89
+    // (none)
+    // Amplitude(s) for diagram number 89
+    FFV1_0( w_fp[22], w_fp[14], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 89 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram90( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 90 OF 123 ***
+    // Wavefunction(s) for diagram number 90
+    FFV1_1( w_fp[14], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[24] );
+    // Amplitude(s) for diagram number 90
+    FFV1_0( w_fp[16], w_fp[24], w_fp[4], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 90 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram91( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 91 OF 123 ***
+    // Wavefunction(s) for diagram number 91
+    // (none)
+    // Amplitude(s) for diagram number 91
+    FFV1_0( w_fp[22], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 91 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram92( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 92 OF 123 ***
+    // Wavefunction(s) for diagram number 92
+    // (none)
+    // Amplitude(s) for diagram number 92
+    FFV1_0( w_fp[16], w_fp[2], w_fp[23], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 92 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0];
+  }
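Because every diagramN kernel shares this exact signature, a caller can drive all 123 of them from a single table rather than 123 hand-written call sites. A hypothetical sketch for the C++ build, where __global__ expands to nothing and these are plain functions; the calls must stay in diagram order, since later kernels read w_fp slots written by earlier ones (e.g. diagram94 writes w_fp[22], which diagrams 96 and 98 consume):

    typedef void ( *Diagram )( fptype*, fptype*, const unsigned int*, const fptype**, fptype*, fptype* );
    static const Diagram diagrams[] = { diagram70, diagram71, diagram72 /* ... up to diagram123 ... */ };
    for( const Diagram& d : diagrams )
      d( wfs, jamps, channelIds, COUPs, numerators, denominators ); // order matters: wfs carries data between kernels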
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram93( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 93 OF 123 ***
+    // Wavefunction(s) for diagram number 93
+    // (none)
+    // Amplitude(s) for diagram number 93
+    VVVV1_0( w_fp[0], w_fp[6], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVVV3_0( w_fp[0], w_fp[6], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    VVVV4_0( w_fp[0], w_fp[6], w_fp[7], w_fp[5], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram94( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 94 OF 123 ***
+    // Wavefunction(s) for diagram number 94
+    VVV1P0_1( w_fp[0], w_fp[6], COUPs[0], 1.0, 0., 0., w_fp[22] );
+    // Amplitude(s) for diagram number 94
+    VVV1_0( w_fp[7], w_fp[5], w_fp[22], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 94 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram95( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 95 OF 123 ***
+    // Wavefunction(s) for diagram number 95
+    VVV1P0_1( w_fp[0], w_fp[7], COUPs[0], 1.0, 0., 0., w_fp[25] );
+    // Amplitude(s) for diagram number 95
+    VVV1_0( w_fp[6], w_fp[5], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 95 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
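J_ACCESS::kernelAccessIcol addresses one color amplitude inside the packed jamps buffer (ncolor*2*nevtORneppV reals, per the parameter comment); the real access class hides the SIMD/AOSOA event layout. For a single scalar event it would plausibly reduce to plain complex indexing; illustration only, not the real J_ACCESS:

    // hypothetical scalar reduction of J_ACCESS::kernelAccessIcol (one event, [icol][re,im] packing assumed)
    inline cxtype& kernelAccessIcol( fptype* jamps, const int icol )
    {
      return reinterpret_cast<cxtype*>( jamps )[icol];
    }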
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram96( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 96 OF 123 ***
+    // Wavefunction(s) for diagram number 96
+    // (none)
+    // Amplitude(s) for diagram number 96
+    FFV1_0( w_fp[3], w_fp[14], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 96 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram97( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 97 OF 123 ***
+    // Wavefunction(s) for diagram number 97
+    // (none)
+    // Amplitude(s) for diagram number 97
+    FFV1_0( w_fp[3], w_fp[24], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 97 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram98( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 98 OF 123 ***
+    // Wavefunction(s) for diagram number 98
+    // (none)
+    // Amplitude(s) for diagram number 98
+    FFV1_0( w_fp[13], w_fp[2], w_fp[22], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 98 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram99( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 99 OF 123 ***
+    // Wavefunction(s) for diagram number 99
+    // (none)
+    // Amplitude(s) for diagram number 99
+    FFV1_0( w_fp[21], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 99 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0];
+  }
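The coupling slots are used consistently throughout the hunk: every VVV1* call takes COUPs[0], every FFV1* call takes COUPs[1], and the VVVV1/3/4 calls take COUPs[2], i.e. one slot per interaction type (triple-gluon, quark-gluon, four-gluon). A hypothetical enum just to make the observed mapping explicit:

    enum CouplingSlot { kVVV = 0, kFFV = 1, kVVVV = 2 }; // names invented here; indices as observed in the calls above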
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram100( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 100 OF 123 ***
+    // Wavefunction(s) for diagram number 100
+    // (none)
+    // Amplitude(s) for diagram number 100
+    VVVV1_0( w_fp[0], w_fp[18], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVVV3_0( w_fp[0], w_fp[18], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+    VVVV4_0( w_fp[0], w_fp[18], w_fp[7], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram101( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 101 OF 123 ***
+    // Wavefunction(s) for diagram number 101
+    VVV1P0_1( w_fp[0], w_fp[18], COUPs[0], 1.0, 0., 0., w_fp[6] );
+    // Amplitude(s) for diagram number 101
+    VVV1_0( w_fp[7], w_fp[4], w_fp[6], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 101 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram102( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+              fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+              const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+              const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+              const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+              fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+              fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 102 OF 123 ***
+    // Wavefunction(s) for diagram number 102
+    // (none)
+    // Amplitude(s) for diagram number 102
+    VVV1_0( w_fp[18], w_fp[4], w_fp[25], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 102 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0];
+  }
fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 103 OF 123 *** + // Wavefunction(s) for diagram number 103 + // (none) + // Amplitude(s) for diagram number 103 + FFV1_0( w_fp[3], w_fp[11], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 103 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram104( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 104 OF 123 *** + // Wavefunction(s) for diagram number 104 + // (none) + // Amplitude(s) for diagram number 104 + FFV1_0( w_fp[3], w_fp[20], w_fp[18], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 104 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram105( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + 
fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 105 OF 123 *** + // Wavefunction(s) for diagram number 105 + // (none) + // Amplitude(s) for diagram number 105 + FFV1_0( w_fp[15], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 105 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram106( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 106 OF 123 *** + // Wavefunction(s) for diagram number 106 + // (none) + // Amplitude(s) for diagram number 106 + FFV1_0( w_fp[12], w_fp[2], w_fp[18], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 106 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram107( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate 
code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 107 OF 123 *** + // Wavefunction(s) for diagram number 107 + // (none) + // Amplitude(s) for diagram number 107 + VVVV1_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVVV3_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVVV4_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram108( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 108 OF 123 *** + // Wavefunction(s) for diagram number 108 + // (none) + // Amplitude(s) for diagram number 108 + VVV1_0( w_fp[1], w_fp[10], w_fp[25], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 108 ) numerators_sv += cxabs2( 
amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram109( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 109 OF 123 *** + // Wavefunction(s) for diagram number 109 + // (none) + // Amplitude(s) for diagram number 109 + VVV1_0( w_fp[1], w_fp[7], w_fp[23], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 109 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram110( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, 
numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 110 OF 123 *** + // Wavefunction(s) for diagram number 110 + // (none) + // Amplitude(s) for diagram number 110 + FFV1_0( w_fp[13], w_fp[20], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 110 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram111( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 111 OF 123 *** + // Wavefunction(s) for diagram number 111 + // (none) + // Amplitude(s) for diagram number 111 + FFV1_0( w_fp[21], w_fp[11], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 111 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram112( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 112 OF 123 *** + // Wavefunction(s) for diagram number 112 + // (none) + // Amplitude(s) for diagram number 112 + FFV1_0( w_fp[15], w_fp[24], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 112 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) 
denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram113( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 113 OF 123 *** + // Wavefunction(s) for diagram number 113 + // (none) + // Amplitude(s) for diagram number 113 + FFV1_0( w_fp[12], w_fp[14], w_fp[1], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 113 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram114( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 114 OF 123 *** + // Wavefunction(s) for diagram number 114 + VVVV1P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[12] ); + VVVV3P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[24] ); + VVVV4P0_1( w_fp[0], w_fp[1], w_fp[4], COUPs[2], 1.0, 0., 0., w_fp[21] ); + // Amplitude(s) for diagram number 114 + VVV1_0( w_fp[12], w_fp[7], w_fp[5], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[24], w_fp[7], w_fp[5], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[21], w_fp[7], w_fp[5], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram115( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 115 OF 123 *** + // Wavefunction(s) for diagram number 115 + // (none) + // Amplitude(s) for diagram number 115 + FFV1_0( w_fp[3], w_fp[14], w_fp[12], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[14], w_fp[24], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[14], w_fp[21], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 18 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= amp_sv[0]; + } + + 
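[Editorial note, not part of the patch] The diagramXXX kernels above all share one shape: a uniform signature, a textual #include "diagram_boilerplate.h" for the per-event setup, one or more HELAS calls filling amp_sv, and optional single-diagram-enhancement (SDE) bookkeeping. The SDE logic visible in these kernels can be summarized by the following minimal sketch with plain std types; the free function sdeAccumulate is hypothetical, and the production code instead operates on SIMD vector types through the numerators_sv/denominators_sv views:

#include <complex>
// Per event and per diagram: add this diagram's |amp|^2 to the numerator only
// if it is the chosen channel, and to the denominator for every diagram; the
// ratio later reweights the matrix element (channelId == 0 disables SDE).
inline void sdeAccumulate( unsigned int thisDiagram, unsigned int channelId,
                           const std::complex<double>& amp,
                           double& numerator, double& denominator )
{
  const double abs2 = std::norm( amp ); // |amp|^2, the analogue of cxabs2( amp_sv[0] )
  if( channelId == thisDiagram ) numerator += abs2;
  if( channelId != 0 ) denominator += abs2;
}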
//-------------------------------------------------------------------------- + + __global__ void + diagram116( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 116 OF 123 *** + // Wavefunction(s) for diagram number 116 + // (none) + // Amplitude(s) for diagram number 116 + FFV1_0( w_fp[13], w_fp[2], w_fp[12], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + FFV1_0( w_fp[13], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + FFV1_0( w_fp[13], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram117( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 117 OF 123 *** + // Wavefunction(s) for diagram number 117 + VVVV1P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[21] ); + VVVV3P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[13] ); + VVVV4P0_1( w_fp[0], w_fp[1], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[24] ); + // Amplitude(s) for diagram number 117 + VVV1_0( w_fp[21], w_fp[7], w_fp[4], COUPs[0], 1.0, &amp_fp[0] ); + 
J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[13], w_fp[7], w_fp[4], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[24], w_fp[7], w_fp[4], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 12 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram118( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 118 OF 123 *** + // Wavefunction(s) for diagram number 118 + // (none) + // Amplitude(s) for diagram number 118 + FFV1_0( w_fp[3], w_fp[11], w_fp[21], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 12 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[11], w_fp[13], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + 
J_ACCESS::kernelAccessIcol( jamps, 15 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[11], w_fp[24], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 12 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 14 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram119( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 119 OF 123 *** + // Wavefunction(s) for diagram number 119 + // (none) + // Amplitude(s) for diagram number 119 + FFV1_0( w_fp[15], w_fp[2], w_fp[21], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) += amp_sv[0]; + FFV1_0( w_fp[15], w_fp[2], w_fp[13], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + FFV1_0( w_fp[15], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 18 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 20 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram120( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" 
+ // *** DIAGRAM 120 OF 123 *** + // Wavefunction(s) for diagram number 120 + VVVV1P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[24] ); + VVVV3P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[15] ); + VVVV4P0_1( w_fp[0], w_fp[4], w_fp[5], COUPs[2], 1.0, 0., 0., w_fp[13] ); + // Amplitude(s) for diagram number 120 + FFV1_0( w_fp[3], w_fp[9], w_fp[24], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 6 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[9], w_fp[15], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + FFV1_0( w_fp[3], w_fp[9], w_fp[13], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram121( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 121 OF 123 *** + // Wavefunction(s) for diagram number 121 + // (none) + // Amplitude(s) for diagram number 121 + FFV1_0( w_fp[16], w_fp[2], w_fp[24], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += amp_sv[0]; + FFV1_0( w_fp[16], w_fp[2], w_fp[15], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + FFV1_0( w_fp[16], w_fp[2], w_fp[13], COUPs[1], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) += amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram122( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // 
input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 122 OF 123 *** + // Wavefunction(s) for diagram number 122 + // (none) + // Amplitude(s) for diagram number 122 + VVV1_0( w_fp[24], w_fp[1], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[15], w_fp[1], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 16 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[13], w_fp[1], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 13 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 19 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 22 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram123( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity 
ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 123 OF 123 *** + // Wavefunction(s) for diagram number 123 + // (none) + // Amplitude(s) for diagram number 123 + VVV1_0( w_fp[0], w_fp[17], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) += cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[0], w_fp[19], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 17 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + VVV1_0( w_fp[0], w_fp[8], w_fp[7], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 15 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 21 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 23 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/driver.f index f7f23196eb..fc9392238e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. 
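[Editorial note, not part of the patch] For orientation before the matrix1.f colour-matrix hunk that follows: each kernel above accumulates (plus or minus 1, or plus or minus i) times amp into a few colour-ordered amplitudes jamp[icol], and the squared matrix element is then the quadratic form jamp-dagger * CF * jamp over the NCOLOR=24 colour flows. A self-contained sketch with plain std types (hypothetical names; the production code instead uses the J_ACCESS views and the new color_sum.h machinery):

#include <complex>
// Dense colour sum: me2 = sum_{i,j} conj(jamp[i]) * cf[i*ncolor+j] * jamp[j],
// where cf is the real symmetric colour-factor matrix.
double colorSumDense( const std::complex<double>* jamp, const double* cf, int ncolor )
{
  double me2 = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    std::complex<double> ztemp = 0;
    for( int j = 0; j < ncolor; j++ ) ztemp += cf[i * ncolor + j] * jamp[j];
    me2 += ( ztemp * std::conj( jamp[i] ) ).real(); // only the real part survives
  }
  return me2;
}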
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/matrix1.f index 39ecff768a..48a83737ca 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -355,7 +355,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -398,7 +398,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(155) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -441,407 +442,81 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /9.481481481481481D+00, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01,1.481481481481481D - $ +00/ - DATA (CF(I, 1),I= 7, 12) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02, - $ -1.851851851851852D-01/ - DATA (CF(I, 1),I= 13, 18) /1.481481481481481D-01, - $ -1.851851851851852D-02,1.481481481481481D+00, - $ -1.851851851851852D-01,1.314814814814815D+00,1.148148148148148D - $ +00/ - DATA (CF(I, 1),I= 19, 24) /-1.851851851851852D-02, - $ -1.851851851851852D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -5.185185185185185D-01/ + DATA DENOM/54/ + DATA (CF(I),I= 1, 24) /512,-128,-128,16,16,160,-128,16,16,-2,-2 + $ ,-20,16,-2,160,-20,142,124,-2,-20,-20,124,124,-56/ C 1 T(1,2,5,6,3,4) - DATA (CF(I, 2),I= 1, 6) /-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01,1.481481481481481D - $ +00,-1.185185185185185D+00,1.481481481481481D-01/ - DATA (CF(I, 2),I= 7, 12) /1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 2),I= 13, 18) /-1.851851851851852D-02, - $ -1.851851851851852D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -5.185185185185185D-01/ - DATA (CF(I, 2),I= 19, 24) /1.481481481481481D-01, - $ -1.851851851851852D-02,1.481481481481481D+00, - $ -1.851851851851852D-01,1.314814814814815D+00,1.148148148148148D - $ +00/ + DATA (CF(I),I= 25, 47) /512,16,160,-128,16,16,-128,-2,-20,16,-2, + $ -2,-20,-20,124,124,-56,16,-2,160,-20,142,124/ C 1 T(1,2,6,5,3,4) - DATA (CF(I, 3),I= 1, 6) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,9.481481481481481D+00, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01/ - DATA (CF(I, 3),I= 7, 12) /1.481481481481481D-01, - $ -1.851851851851852D-02,1.481481481481481D+00, - $ -1.851851851851852D-01,1.314814814814815D+00,1.148148148148148D - $ +00/ - DATA (CF(I, 3),I= 13, 18) /-1.185185185185185D+00 - $ 
,1.481481481481481D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02, - $ -1.851851851851852D-01/ - DATA (CF(I, 3),I= 19, 24) /-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00/ + DATA (CF(I),I= 48, 69) /512,-128,160,16,16,-2,160,-20,142,124, + $ -128,16,16,-2,-2,-20,-20,-2,124,-56,-20,124/ C 1 T(1,5,2,6,3,4) - DATA (CF(I, 4),I= 1, 6) /1.481481481481481D-01 - $ ,1.481481481481481D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01, - $ -1.185185185185185D+00/ - DATA (CF(I, 4),I= 7, 12) /-1.851851851851852D-02, - $ -1.851851851851852D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -5.185185185185185D-01/ - DATA (CF(I, 4),I= 13, 18) /1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 4),I= 19, 24) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.314814814814815D+00,1.148148148148148D - $ +00,1.481481481481481D+00,-1.851851851851852D-01/ + DATA (CF(I),I= 70, 90) /512,16,-128,-2,-20,-20,124,124,-56,16, + $ -128,-2,-20,16,-2,-2,16,142,124,160,-20/ C 1 T(1,5,6,2,3,4) - DATA (CF(I, 5),I= 1, 6) /1.481481481481481D-01, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01,9.481481481481481D+00,-1.185185185185185D+00/ - DATA (CF(I, 5),I= 7, 12) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.314814814814815D+00,1.148148148148148D - $ +00,1.481481481481481D+00,-1.851851851851852D-01/ - DATA (CF(I, 5),I= 13, 18) /-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00/ - DATA (CF(I, 5),I= 19, 24) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02, - $ -1.851851851851852D-01/ + DATA (CF(I),I= 91,110) /512,-128,-2,16,142,124,160,-20,-20,-2 + $ ,124,-56,-20,124,-128,16,16,-2,-2,-20/ C 1 T(1,6,2,5,3,4) - DATA (CF(I, 6),I= 1, 6) /1.481481481481481D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00/ - DATA (CF(I, 6),I= 7, 12) /-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00/ - DATA (CF(I, 6),I= 13, 18) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.314814814814815D+00,1.148148148148148D - $ +00,1.481481481481481D+00,-1.851851851851852D-01/ - DATA (CF(I, 6),I= 19, 24) /1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02/ + DATA (CF(I),I=111,129) /512,-20,-2,124,-56,-20,124,-2,16,142,124 + $ ,160,-20,16,-128,-2,-20,16,-2/ C 1 T(1,6,5,2,3,4) - DATA (CF(I, 7),I= 1, 6) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02, - $ -1.851851851851852D-01/ - DATA (CF(I, 7),I= 7, 12) /9.481481481481481D+00, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01,1.481481481481481D - $ +00/ - DATA (CF(I, 7),I= 13, 18) /1.481481481481481D+00, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,1.148148148148148D+00,1.314814814814815D - $ +00/ - DATA (CF(I, 7),I= 19, 24) /-1.851851851851852D-01 - $ 
,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00/ + DATA (CF(I),I=130,147) /512,-128,-128,16,16,160,160,-20,16,-2 + $ ,124,142,-20,124,-2,-20,-56,124/ C 1 T(2,1,5,6,3,4) - DATA (CF(I, 8),I= 1, 6) /1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 8),I= 7, 12) /-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01,1.481481481481481D - $ +00,-1.185185185185185D+00,1.481481481481481D-01/ - DATA (CF(I, 8),I= 13, 18) /-1.851851851851852D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00/ - DATA (CF(I, 8),I= 19, 24) /1.481481481481481D+00, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,1.148148148148148D+00,1.314814814814815D - $ +00/ + DATA (CF(I),I=148,164) /512,16,160,-128,16,-20,124,-2,-20,-56 + $ ,124,160,-20,16,-2,124,142/ C 1 T(2,1,6,5,3,4) - DATA (CF(I, 9),I= 1, 6) /1.481481481481481D-01, - $ -1.851851851851852D-02,1.481481481481481D+00, - $ -1.851851851851852D-01,1.314814814814815D+00,1.148148148148148D - $ +00/ - DATA (CF(I, 9),I= 7, 12) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,9.481481481481481D+00, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01/ - DATA (CF(I, 9),I= 13, 18) /1.481481481481481D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 9),I= 19, 24) /1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -1.851851851851852D-01/ + DATA (CF(I),I=165,180) /512,-128,160,16,16,-2,-128,16,-20,-2,124 + $ ,-56,-20,-2,124,-20/ C 1 T(2,5,1,6,3,4) - DATA (CF(I, 10),I= 1, 6) /-1.851851851851852D-02, - $ -1.851851851851852D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -5.185185185185185D-01/ - DATA (CF(I, 10),I= 7, 12) /1.481481481481481D-01 - $ ,1.481481481481481D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01, - $ -1.185185185185185D+00/ - DATA (CF(I, 10),I= 13, 18) /-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ - DATA (CF(I, 10),I= 19, 24) /1.314814814814815D+00 - $ ,1.148148148148148D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01 - $ ,1.481481481481481D+00/ + DATA (CF(I),I=181,195) /512,16,-128,-2,-20,16,-128,-2,16,142,124 + $ ,-2,16,-20,160/ C 1 T(2,5,6,1,3,4) - DATA (CF(I, 11),I= 1, 6) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.314814814814815D+00,1.148148148148148D - $ +00,1.481481481481481D+00,-1.851851851851852D-01/ - DATA (CF(I, 11),I= 7, 12) /1.481481481481481D-01, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01,9.481481481481481D+00,-1.185185185185185D+00/ - DATA (CF(I, 11),I= 13, 18) /1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -1.851851851851852D-01/ - DATA (CF(I, 11),I= 19, 24) /1.481481481481481D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ + DATA (CF(I),I=196,209) /512,-128,124,-56,-20,-2,124,-20,16,-2, + $ -128,16,-20,-2/ C 1 T(2,6,1,5,3,4) - DATA 
(CF(I, 12),I= 1, 6) /-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01 - $ ,1.148148148148148D+00/ - DATA (CF(I, 12),I= 7, 12) /1.481481481481481D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00/ - DATA (CF(I, 12),I= 13, 18) /1.314814814814815D+00 - $ ,1.148148148148148D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01 - $ ,1.481481481481481D+00/ - DATA (CF(I, 12),I= 19, 24) /-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ + DATA (CF(I),I=210,222) /512,142,124,-2,16,-20,160,-2,-20,16,-128 + $ ,-2,16/ C 1 T(2,6,5,1,3,4) - DATA (CF(I, 13),I= 1, 6) /1.481481481481481D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 13),I= 7, 12) /1.481481481481481D+00, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,1.148148148148148D+00,1.314814814814815D - $ +00/ - DATA (CF(I, 13),I= 13, 18) /9.481481481481481D+00, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01,1.481481481481481D - $ +00/ - DATA (CF(I, 13),I= 19, 24) /1.148148148148148D+00, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01/ + DATA (CF(I),I=223,234) /512,-128,-128,16,16,160,124,-20,-56,124, + $ -2,-20/ C 1 T(5,1,2,6,3,4) - DATA (CF(I, 14),I= 1, 6) /-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ - DATA (CF(I, 14),I= 7, 12) /-1.851851851851852D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00/ - DATA (CF(I, 14),I= 13, 18) /-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01,1.481481481481481D - $ +00,-1.185185185185185D+00,1.481481481481481D-01/ - DATA (CF(I, 14),I= 19, 24) /-1.851851851851852D-01 - $ ,1.481481481481481D+00,1.148148148148148D+00,1.314814814814815D - $ +00,1.481481481481481D-01,-1.851851851851852D-02/ + DATA (CF(I),I=235,245) /512,16,160,-128,16,-20,160,124,142,16,-2/ C 1 T(5,1,6,2,3,4) - DATA (CF(I, 15),I= 1, 6) /1.481481481481481D+00, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ -1.851851851851852D-02,1.148148148148148D+00,1.314814814814815D - $ +00/ - DATA (CF(I, 15),I= 7, 12) /1.481481481481481D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 15),I= 13, 18) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,9.481481481481481D+00, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01/ - DATA (CF(I, 15),I= 19, 24) /-5.185185185185185D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -1.851851851851852D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ + DATA (CF(I),I=246,255) /512,-128,160,16,-56,124,124,-20,-20,-2/ C 1 T(5,2,1,6,3,4) - DATA (CF(I, 16),I= 1, 6) /-1.851851851851852D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00/ - DATA (CF(I, 16),I= 7, 12) /-1.851851851851852D-02, - $ -1.851851851851852D-01,1.481481481481481D-01, - $ 
-1.185185185185185D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ - DATA (CF(I, 16),I= 13, 18) /1.481481481481481D-01 - $ ,1.481481481481481D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01, - $ -1.185185185185185D+00/ - DATA (CF(I, 16),I= 19, 24) /1.148148148148148D+00 - $ ,1.314814814814815D+00,-1.851851851851852D-01 - $ ,1.481481481481481D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ + DATA (CF(I),I=256,264) /512,16,-128,124,142,-20,160,-2,16/ C 1 T(5,2,6,1,3,4) - DATA (CF(I, 17),I= 1, 6) /1.314814814814815D+00 - $ ,1.148148148148148D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01 - $ ,1.481481481481481D+00/ - DATA (CF(I, 17),I= 7, 12) /1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -1.851851851851852D-01/ - DATA (CF(I, 17),I= 13, 18) /1.481481481481481D-01, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01,9.481481481481481D+00,-1.185185185185185D+00/ - DATA (CF(I, 17),I= 19, 24) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01/ + DATA (CF(I),I=265,272) /512,-128,-2,16,-20,-2,-128,16/ C 1 T(5,6,1,2,3,4) - DATA (CF(I, 18),I= 1, 6) /1.148148148148148D+00, - $ -5.185185185185185D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,1.148148148148148D+00, - $ -1.851851851851852D-01/ - DATA (CF(I, 18),I= 7, 12) /1.314814814814815D+00 - $ ,1.148148148148148D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01 - $ ,1.481481481481481D+00/ - DATA (CF(I, 18),I= 13, 18) /1.481481481481481D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00/ - DATA (CF(I, 18),I= 19, 24) /-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00/ + DATA (CF(I),I=273,279) /512,-20,-2,-2,16,16,-128/ C 1 T(5,6,2,1,3,4) - DATA (CF(I, 19),I= 1, 6) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01/ - DATA (CF(I, 19),I= 7, 12) /-1.851851851851852D-01 - $ ,1.481481481481481D+00,1.148148148148148D+00,1.314814814814815D - $ +00,1.481481481481481D-01,-1.851851851851852D-02/ - DATA (CF(I, 19),I= 13, 18) /1.148148148148148D+00, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01/ - DATA (CF(I, 19),I= 19, 24) /9.481481481481481D+00, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01,1.481481481481481D - $ +00/ + DATA (CF(I),I=280,285) /512,-128,-128,16,16,160/ C 1 T(6,1,2,5,3,4) - DATA (CF(I, 20),I= 1, 6) /-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00/ - DATA (CF(I, 20),I= 7, 12) /1.148148148148148D+00, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01/ - DATA (CF(I, 20),I= 13, 18) /-1.851851851851852D-01 - $ ,1.481481481481481D+00,1.148148148148148D+00,1.314814814814815D - $ +00,1.481481481481481D-01,-1.851851851851852D-02/ - DATA (CF(I, 20),I= 19, 24) /-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01,1.481481481481481D - $ 
+00,-1.185185185185185D+00,1.481481481481481D-01/ + DATA (CF(I),I=286,290) /512,16,160,-128,16/ C 1 T(6,1,5,2,3,4) - DATA (CF(I, 21),I= 1, 6) /-1.851851851851852D-01 - $ ,1.481481481481481D+00,1.148148148148148D+00,1.314814814814815D - $ +00,1.481481481481481D-01,-1.851851851851852D-02/ - DATA (CF(I, 21),I= 7, 12) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01/ - DATA (CF(I, 21),I= 13, 18) /-5.185185185185185D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -1.851851851851852D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 21),I= 19, 24) /-1.185185185185185D+00 - $ ,1.481481481481481D-01,9.481481481481481D+00, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01/ + DATA (CF(I),I=291,294) /512,-128,160,16/ C 1 T(6,2,1,5,3,4) - DATA (CF(I, 22),I= 1, 6) /1.148148148148148D+00, - $ -1.851851851851852D-01,-5.185185185185185D-01 - $ ,1.148148148148148D+00,-1.851851851851852D-02, - $ -1.851851851851852D-01/ - DATA (CF(I, 22),I= 7, 12) /-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00/ - DATA (CF(I, 22),I= 13, 18) /1.148148148148148D+00 - $ ,1.314814814814815D+00,-1.851851851851852D-01 - $ ,1.481481481481481D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ - DATA (CF(I, 22),I= 19, 24) /1.481481481481481D-01 - $ ,1.481481481481481D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00,1.481481481481481D-01, - $ -1.185185185185185D+00/ + DATA (CF(I),I=295,297) /512,16,-128/ C 1 T(6,2,5,1,3,4) - DATA (CF(I, 23),I= 1, 6) /1.148148148148148D+00 - $ ,1.314814814814815D+00,-1.851851851851852D-01 - $ ,1.481481481481481D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ - DATA (CF(I, 23),I= 7, 12) /-5.185185185185185D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -1.851851851851852D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 23),I= 13, 18) /-1.851851851851852D-02 - $ ,1.481481481481481D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.185185185185185D+00 - $ ,1.481481481481481D-01/ - DATA (CF(I, 23),I= 19, 24) /1.481481481481481D-01, - $ -1.185185185185185D+00,1.481481481481481D+00,1.481481481481481D - $ -01,9.481481481481481D+00,-1.185185185185185D+00/ + DATA (CF(I),I=298,299) /512,-128/ C 1 T(6,5,1,2,3,4) - DATA (CF(I, 24),I= 1, 6) /-5.185185185185185D-01 - $ ,1.148148148148148D+00,1.148148148148148D+00, - $ -1.851851851851852D-01,-1.851851851851852D-01, - $ -1.851851851851852D-02/ - DATA (CF(I, 24),I= 7, 12) /1.148148148148148D+00 - $ ,1.314814814814815D+00,-1.851851851851852D-01 - $ ,1.481481481481481D+00,-1.851851851851852D-02 - $ ,1.481481481481481D-01/ - DATA (CF(I, 24),I= 13, 18) /-1.851851851851852D-01, - $ -1.851851851851852D-02,-1.851851851851852D-02 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00/ - DATA (CF(I, 24),I= 19, 24) /1.481481481481481D+00 - $ ,1.481481481481481D-01,1.481481481481481D-01, - $ -1.185185185185185D+00,-1.185185185185185D+00 - $ ,9.481481481481481D+00/ + DATA (CF(I),I=300,300) /512/ C 1 T(6,5,2,1,3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. 
- IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -1547,10 +1222,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -1559,6 +1236,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(2)=AMP2(2)+AMP(4)*DCONJG(AMP(4)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc index 70d0f7cb8e..da1b425ff0 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,20 +101,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
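// ------------------------------------------------------------------------------------------------
// [Illustrative sketch added in review - NOT part of the patch] The Fortran hunks above replace the
// dense NCOLOR x NCOLOR color matrix of REAL*8 values by a packed integer upper triangle: each row I
// keeps only the columns J >= I (hence the visible DATA blocks shrink from 13 entries down to 1),
// off-diagonal entries are pre-doubled (valid because the color matrix is real and symmetric), and a
// single common denominator is divided out once at the end, via the new "MATRIX1 = MATRIX1/DENOM".
// A minimal self-contained C++ equivalent of the new CF_INDEX loop, with hypothetical names
// (cfPacked, denom, jamp are assumptions, not identifiers from the patch):
#include <complex>
#include <cstddef>
#include <vector>
inline double colorSumPacked( const std::vector<int>& cfPacked,              // rows i, columns j >= i; off-diagonals times 2
                              const std::vector<std::complex<double>>& jamp, // one partial amplitude per color flow
                              double denom )                                 // common denominator of the integer entries
{
  double me2 = 0;
  int cfIndex = 0; // running index into the packed triangle, like CF_INDEX in the Fortran hunk above
  for( std::size_t i = 0; i < jamp.size(); i++ )
  {
    std::complex<double> ztemp = 0;
    for( std::size_t j = i; j < jamp.size(); j++ ) ztemp += (double)cfPacked[cfIndex++] * jamp[j];
    me2 += ( ztemp * std::conj( jamp[i] ) ).real(); // imaginary parts cancel for a real symmetric matrix
  }
  return me2 / denom; // divide by the common denominator once, at the end
}
// For ncolor=24 color flows the packed array holds 24*25/2 = 300 entries instead of 576, which is
// why the last DATA block above is "(CF(I),I=300,300)".
// ------------------------------------------------------------------------------------------------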
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 12; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,57 +169,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel 
single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! 
in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using 
CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -229,804 +282,205 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 15; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! - //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!)
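// ------------------------------------------------------------------------------------------------
// [Illustrative sketch added in review - NOT part of the patch] With the diagram-per-kernel split
// introduced by this patch, CUDA wavefunctions and jamps can no longer live in per-thread local
// arrays: they move to global-memory buffers (allWfs, allJamps) that persist across the kernel
// launches of a single helicity. The accessors above (see DeviceAccessJamp2) lay these buffers out
// structure-of-arrays, buffer[icol * nevt + ievt], so that for a fixed component icol consecutive
// events sit at consecutive addresses and warp accesses coalesce. A minimal host-side model of the
// same indexing (soaAccess is a hypothetical name):
#include <cassert>
#include <vector>
inline double& soaAccess( std::vector<double>& buffer, int icol, int ievt, int nevt )
{
  assert( ievt >= 0 && ievt < nevt );
  return buffer[icol * nevt + ievt]; // component-major layout: the event index is the fastest-running one
}
// Usage: a buffer holding ncolor components for nevt events is allocated as ncolor * nevt values,
// and component 3 of event 7 is soaAccess( buffer, 3, 7, nevt ).
// ------------------------------------------------------------------------------------------------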
+ fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g. <> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs =
E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 36 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - oxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); - - ixxxxx( momenta, 0., cHel[ihel][5], -1, w_fp[5], 5 ); - - VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[6] ); - FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[7] ); - FFV1_1( w_fp[4], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[5], w_fp[8], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 2 OF 36 *** - - // Wavefunction(s) for diagram number 2 - FFV1_2( w_fp[5], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[8], w_fp[4], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 3 OF 36 *** - - // Wavefunction(s) for diagram number 3 - FFV1P0_3( w_fp[5], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 3 - VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += 1. / 2. * amp_sv[0]; - jamp_sv[2] -= 1. / 2. * amp_sv[0]; - jamp_sv[9] -= 1. / 2. * amp_sv[0]; - jamp_sv[10] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 4 OF 36 *** - - // Wavefunction(s) for diagram number 4 - FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); - - // Amplitude(s) for diagram number 4 - FFV1_0( w_fp[3], w_fp[9], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] -= 1. / 6.
* cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 5 OF 36 *** - - // Wavefunction(s) for diagram number 5 - FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); - - // Amplitude(s) for diagram number 5 - FFV1_0( w_fp[9], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 6 OF 36 *** - - // Wavefunction(s) for diagram number 6 - FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); - FFV1_2( w_fp[3], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[6] ); - - // Amplitude(s) for diagram number 6 - FFV1_0( w_fp[6], w_fp[9], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 6. * amp_sv[0]; - jamp_sv[5] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 7 OF 36 *** - - // Wavefunction(s) for diagram number 7 - FFV1_1( w_fp[4], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[10] ); - FFV1P0_3( w_fp[3], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[11] ); - - // Amplitude(s) for diagram number 7 - FFV1_0( w_fp[5], w_fp[10], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] -= 1. / 6. * amp_sv[0]; - jamp_sv[5] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 8 OF 36 *** - - // Wavefunction(s) for diagram number 8 - FFV1_2( w_fp[5], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[12] ); - - // Amplitude(s) for diagram number 8 - FFV1_0( w_fp[12], w_fp[4], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += 1. / 2. * amp_sv[0]; - jamp_sv[4] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 9 OF 36 *** - - // Wavefunction(s) for diagram number 9 - FFV1_1( w_fp[9], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); - - // Amplitude(s) for diagram number 9 - FFV1_0( w_fp[3], w_fp[13], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 6. * amp_sv[0]; - jamp_sv[1] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 10 OF 36 *** - - // Wavefunction(s) for diagram number 10 - // (none) - - // Amplitude(s) for diagram number 10 - VVV1_0( w_fp[1], w_fp[8], w_fp[11], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] += 1. / 2.
* cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 11 OF 36 *** - - // Wavefunction(s) for diagram number 11 - FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); - FFV1_1( w_fp[2], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); - - // Amplitude(s) for diagram number 11 - FFV1_0( w_fp[11], w_fp[13], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[6] += 1. / 2. * amp_sv[0]; - jamp_sv[8] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 12 OF 36 *** - - // Wavefunction(s) for diagram number 12 - FFV1P0_3( w_fp[11], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[9] ); - - // Amplitude(s) for diagram number 12 - FFV1_0( w_fp[5], w_fp[10], w_fp[9], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] -= 1. / 6. * amp_sv[0]; - jamp_sv[10] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 13 OF 36 *** - - // Wavefunction(s) for diagram number 13 - // (none) - - // Amplitude(s) for diagram number 13 - FFV1_0( w_fp[12], w_fp[4], w_fp[9], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] -= 1. / 6. * amp_sv[0]; - jamp_sv[6] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 14 OF 36 *** - - // Wavefunction(s) for diagram number 14 - FFV1_2( w_fp[11], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] ); - - // Amplitude(s) for diagram number 14 - FFV1_0( w_fp[14], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[8] -= 1. / 6. * amp_sv[0]; - jamp_sv[10] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 15 OF 36 *** - - // Wavefunction(s) for diagram number 15 - // (none) - - // Amplitude(s) for diagram number 15 - VVV1_0( w_fp[1], w_fp[8], w_fp[9], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[6] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 16 OF 36 *** - - // Wavefunction(s) for diagram number 16 - FFV1_1( w_fp[4], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[9] ); - FFV1P0_3( w_fp[5], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[14] ); - - // Amplitude(s) for diagram number 16 - FFV1_0( w_fp[3], w_fp[13], w_fp[14], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[6] += 1. / 2. * amp_sv[0]; - jamp_sv[7] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 17 OF 36 *** - - // Wavefunction(s) for diagram number 17 - // (none) - - // Amplitude(s) for diagram number 17 - FFV1_0( w_fp[6], w_fp[2], w_fp[14], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 17 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] += 1. / 2. * amp_sv[0]; - jamp_sv[7] -= 1. / 6.
* amp_sv[0]; - - // *** DIAGRAM 18 OF 36 *** - - // Wavefunction(s) for diagram number 18 - // (none) - - // Amplitude(s) for diagram number 18 - FFV1_0( w_fp[12], w_fp[9], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] -= 1. / 6. * amp_sv[0]; - jamp_sv[6] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 19 OF 36 *** - - // Wavefunction(s) for diagram number 19 - FFV1_1( w_fp[9], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[11] ); - - // Amplitude(s) for diagram number 19 - FFV1_0( w_fp[5], w_fp[11], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] += 1. / 2. * amp_sv[0]; - jamp_sv[3] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 20 OF 36 *** - - // Wavefunction(s) for diagram number 20 - // (none) - - // Amplitude(s) for diagram number 20 - VVV1_0( w_fp[1], w_fp[7], w_fp[14], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 21 OF 36 *** - - // Wavefunction(s) for diagram number 21 - FFV1_2( w_fp[5], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[14] ); - FFV1P0_3( w_fp[14], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[11] ); - - // Amplitude(s) for diagram number 21 - FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 21 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[7] -= 1. / 6. * amp_sv[0]; - jamp_sv[9] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 22 OF 36 *** - - // Wavefunction(s) for diagram number 22 - // (none) - - // Amplitude(s) for diagram number 22 - FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[5] += 1. / 2. * amp_sv[0]; - jamp_sv[7] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 23 OF 36 *** - - // Wavefunction(s) for diagram number 23 - // (none) - - // Amplitude(s) for diagram number 23 - FFV1_0( w_fp[14], w_fp[10], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[5] += 1. / 2. * amp_sv[0]; - jamp_sv[11] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 24 OF 36 *** - - // Wavefunction(s) for diagram number 24 - FFV1_2( w_fp[14], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[9] ); - - // Amplitude(s) for diagram number 24 - FFV1_0( w_fp[9], w_fp[4], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[9] += 1. / 2. * amp_sv[0]; - jamp_sv[11] -= 1. / 6.
* amp_sv[0]; - - // *** DIAGRAM 25 OF 36 *** - - // Wavefunction(s) for diagram number 25 - // (none) - - // Amplitude(s) for diagram number 25 - VVV1_0( w_fp[1], w_fp[7], w_fp[11], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 25 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[5] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 26 OF 36 *** - - // Wavefunction(s) for diagram number 26 - FFV1_1( w_fp[13], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); - - // Amplitude(s) for diagram number 26 - FFV1_0( w_fp[3], w_fp[11], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[8] -= 1. / 6. * amp_sv[0]; - jamp_sv[9] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 27 OF 36 *** - - // Wavefunction(s) for diagram number 27 - VVV1P0_1( w_fp[0], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[11] ); - - // Amplitude(s) for diagram number 27 - FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[6] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 28 OF 36 *** - - // Wavefunction(s) for diagram number 28 - FFV1_2( w_fp[6], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); - - // Amplitude(s) for diagram number 28 - FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 6. * amp_sv[0]; - jamp_sv[2] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 29 OF 36 *** - - // Wavefunction(s) for diagram number 29 - // (none) - - // Amplitude(s) for diagram number 29 - FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 30 OF 36 *** - - // Wavefunction(s) for diagram number 30 - FFV1_1( w_fp[10], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[6] ); - - // Amplitude(s) for diagram number 30 - FFV1_0( w_fp[5], w_fp[6], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[10] += 1. / 2. * amp_sv[0]; - jamp_sv[11] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 31 OF 36 *** - - // Wavefunction(s) for diagram number 31 - VVV1P0_1( w_fp[0], w_fp[7], COUPs[0], 1.0, 0., 0., w_fp[6] ); - - // Amplitude(s) for diagram number 31 - FFV1_0( w_fp[5], w_fp[10], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[5] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] -= 1. / 2.
* cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 32 OF 36 *** - - // Wavefunction(s) for diagram number 32 - FFV1_2( w_fp[12], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 32 - FFV1_0( w_fp[10], w_fp[4], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 32 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += 1. / 2. * amp_sv[0]; - jamp_sv[3] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 33 OF 36 *** - - // Wavefunction(s) for diagram number 33 - // (none) - - // Amplitude(s) for diagram number 33 - FFV1_0( w_fp[12], w_fp[4], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 34 OF 36 *** - - // Wavefunction(s) for diagram number 34 - // (none) - - // Amplitude(s) for diagram number 34 - VVVV1_0( w_fp[0], w_fp[1], w_fp[7], w_fp[8], COUPs[2], 1.0, &amp_fp[0] ); - jamp_sv[1] += 1. / 2. * amp_sv[0]; - jamp_sv[2] -= 1. / 2. * amp_sv[0]; - jamp_sv[9] -= 1. / 2. * amp_sv[0]; - jamp_sv[10] += 1. / 2. * amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[1], w_fp[7], w_fp[8], COUPs[2], 1.0, &amp_fp[0] ); - jamp_sv[1] += 1. / 2. * amp_sv[0]; - jamp_sv[5] -= 1. / 2. * amp_sv[0]; - jamp_sv[6] -= 1. / 2. * amp_sv[0]; - jamp_sv[10] += 1. / 2. * amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[1], w_fp[7], w_fp[8], COUPs[2], 1.0, &amp_fp[0] ); - jamp_sv[2] += 1. / 2. * amp_sv[0]; - jamp_sv[5] -= 1. / 2. * amp_sv[0]; - jamp_sv[6] -= 1. / 2. * amp_sv[0]; - jamp_sv[9] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 35 OF 36 *** - - // Wavefunction(s) for diagram number 35 - // (none) - - // Amplitude(s) for diagram number 35 - VVV1_0( w_fp[1], w_fp[8], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += 1. / 2. * amp_sv[0]; - jamp_sv[5] -= 1. / 2. * amp_sv[0]; - jamp_sv[6] -= 1. / 2. * amp_sv[0]; - jamp_sv[10] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 36 OF 36 *** - - // Wavefunction(s) for diagram number 36 - // (none) - - // Amplitude(s) for diagram number 36 - VVV1_0( w_fp[1], w_fp[7], w_fp[11], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] += 1. / 2. * amp_sv[0]; - jamp_sv[5] -= 1. / 2. * amp_sv[0]; - jamp_sv[6] -= 1. / 2. * amp_sv[0]; - jamp_sv[9] += 1. / 2. * amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_2_gg_ttxuux()?)
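// ------------------------------------------------------------------------------------------------
// [Illustrative sketch added in review - NOT part of the patch] The color matrix and the removed
// color-sum code below exploit the fact that the matrix M is real (#475): writing the vector of
// jamps as A + iB, the quadratic form (A - iB)^T M (A + iB) expands to
// A^T M A + B^T M B + i( A^T M B - B^T M A ), and the imaginary term vanishes because M is also
// symmetric, so |M|^2 can be accumulated from the real and imaginary parts separately, with no
// complex arithmetic. A minimal sketch of the dense (CUDA-style) variant, with hypothetical names:
#include <cstddef>
#include <vector>
inline double colorSumRealImag( const std::vector<double>& jampR,           // real parts of the jamps (A)
                                const std::vector<double>& jampI,           // imaginary parts of the jamps (B)
                                const std::vector<std::vector<double>>& cf, // the real symmetric color matrix M
                                const std::vector<double>& denom )          // per-row denominators
{
  double me2 = 0;
  for( std::size_t icol = 0; icol < jampR.size(); icol++ )
  {
    double ztempR = 0, ztempI = 0;
    for( std::size_t jcol = 0; jcol < jampR.size(); jcol++ )
    {
      ztempR += cf[icol][jcol] * jampR[jcol]; // M acting on the real parts
      ztempI += cf[icol][jcol] * jampI[jcol]; // M acting on the imaginary parts
    }
    me2 += ( ztempR * jampR[icol] + ztempI * jampI[icol] ) / denom[icol]; // A.M.A + B.M.B
  }
  return me2;
}
// The C++ SIMD path below goes one step further (TriangularNormalizedColorMatrix): it folds the
// "/denom" and the factor 2 for off-diagonal terms into a constexpr matrix at compile time and
// loops only over the upper triangle.
// ------------------------------------------------------------------------------------------------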
- - // The color denominators (initialize all array elements, with ncolor=12) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }; // 1-D array[12] - - // The color matrix (initialize all array elements, with ncolor=12) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 48, 16, 16, 6, 0, 16, -2, 0, -6, -2, -2, 6 }, - { 16, 48, 6, 16, 16, 0, 0, -2, -2, -6, 6, -2 }, - { 16, 6, 48, 16, -2, 0, 0, 16, -2, 6, -6, -2 }, - { 6, 16, 16, 48, 0, -2, 16, 0, 6, -2, -2, -6 }, - { 0, 16, -2, 0, 48, 16, 16, 6, 0, -2, 16, 0 }, - { 16, 0, 0, -2, 16, 48, 6, 16, -2, 0, 0, 16 }, - { -2, 0, 0, 16, 16, 6, 48, 16, 16, 0, 0, -2 }, - { 0, -2, 16, 0, 6, 16, 16, 48, 0, 16, -2, 0 }, - { -6, -2, -2, 6, 0, -2, 16, 0, 48, 16, 16, 6 }, - { -2, -6, 6, -2, -2, 0, 0, 16, 16, 48, 6, 16 }, - { -2, 6, -6, -2, 16, 0, 0, -2, 16, 6, 48, 16 }, - { 6, -2, -2, -6, 0, 16, -2, 0, 6, 16, 16, 48 } }; // 2-D array[12][12] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! 
skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... 
icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; #else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ?
jamp_sv : &( jamp_sv[ncolor] ) ); #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------ + // --- CHANNELIDS --- + // ------------------ +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - } - // *** STORE THE RESULTS *** - - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); +#endif #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + fptype* numerators =
nullptr; + fptype* denominators = nullptr; #endif + + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ + + // *** DIAGRAMS 1 TO 36 *** +#ifdef MGONGPUCPP_GPUIMPL + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram8, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram9, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram10, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram11, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram12, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram13, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram14, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram15, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram16, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram17, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram18, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram19, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram20, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram21, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram22, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram23, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram24, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram25, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + 
gpuLaunchKernelStream( diagram26, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram27, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram28, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram29, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram30, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram31, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram32, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram33, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram34, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram35, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram36, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); +#else + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram8( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram9( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram10( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram11( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram12( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram13( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram14( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram15( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram16( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram17( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram18( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram19( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram20( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram21( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram22( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram23( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram24( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram25( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram26( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram27( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram28( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram29( wfs, jamps, channelIds, COUPs, numerators, 
denominators ); + diagram30( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram31( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram32( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram33( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram34( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram35( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram36( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -1113,7 +567,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -1148,6 +606,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -1190,6 +652,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -1292,26 +758,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings, bsmIndepParam ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -1319,25 +785,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + 
threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...)
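// The helicity choice in the loop above and the color choice in select_col share the same
// inverse-CDF sampling idea: accumulate unnormalized non-negative weights into a running sum
// and pick the first index whose cumulative fraction exceeds a uniform random number in [0,1).
// A minimal standalone illustration in plain C++ (hypothetical names, not part of this diff):
#include <cassert>
#include <vector>
static int selectIndexFromWeights( const std::vector<double>& weights, double rnd ) // rnd in [0,1)
{
  double total = 0;
  std::vector<double> runsum( weights.size() );
  for( size_t i = 0; i < weights.size(); i++ ) { total += weights[i]; runsum[i] = total; }
  assert( total > 0 ); // mirrors the sanity checks on channelId and MEs in the kernels
  for( size_t i = 0; i < weights.size(); i++ )
    if( rnd < runsum[i] / total ) return (int)i; // same test as 'allrndhel[ievt] < runningME / totalME'
  return (int)weights.size() - 1; // guard against rnd ~ 1 under rounding
}
// E.g. selectIndexFromWeights( jamp2weights, allrndcol[ievt] ) would return an index in
// [0, ncolor-1], which the kernel then stores as icolC + 1 in the Fortran convention [1, ncolor].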
+ fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb 
individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -1475,20 +1145,14 @@ namespace mg5amcCpu // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY 
CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1500,17 +1164,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1536,93 +1203,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, 
allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } - } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) Then, in multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? 
&( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1664,7 +1301,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1687,7 +1324,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1696,25 +1333,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1724,8 +1367,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1741,11 +1386,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1847,14 +1493,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h index 84a8066974..35a0f978a3 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO 
development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include <vector> @@ -78,17 +79,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 36; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 12; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 15; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 6; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 38; //static const int ncomb = 64; // CPPProcess::ncomb @@ -125,23 +126,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -155,34 +159,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators 
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig.f index 49cac7230f..ef6c2d98a2 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig1.f index 6e1c3f774f..44b8eb0a9b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,14 +140,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -231,7 +231,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -305,6 +305,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -388,12 +392,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -517,6 +521,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/color_sum.cc new file mode 100644 index 0000000000..065151d9f1 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/color_sum.cc @@ -0,0 +1,393 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
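// What this new file computes: for each event and helicity, the ME increment is the color
// quadratic form deltaME = sum_{i,j} conj(jamp[i]) * ( colorMatrix[i][j] / colorDenom[i] ) * jamp[j],
// which is real because the color matrix is real and symmetric. A minimal single-event,
// double-precision sketch of that formula (hypothetical standalone names; the code below instead
// vectorizes over events, exploits symmetry on CPU, and optionally offloads to cuBLAS/hipBLAS):
#include <complex>
static double colorSumOneEvent( const std::complex<double>* jamp, // input: jamp[ncol]
                                const double cf[12][12],          // input: color matrix (ncol=12 here)
                                const double* denom,              // input: color denominators[ncol]
                                int ncol )
{
  double deltaME = 0;
  for( int i = 0; i < ncol; i++ )
  {
    std::complex<double> ztemp = 0;
    for( int j = 0; j < ncol; j++ ) ztemp += cf[i][j] / denom[i] * jamp[j];
    deltaME += ( std::conj( jamp[i] ) * ztemp ).real(); // imaginary parts cancel by symmetry
  }
  return deltaME;
}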
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=12) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }; // 1-D array[12] + + // The color matrix (initialize all array elements, with ncolor=12) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 48, 16, 16, 6, 0, 16, -2, 0, -6, -2, -2, 6 }, + { 16, 48, 6, 16, 16, 0, 0, -2, -2, -6, 6, -2 }, + { 16, 6, 48, 16, -2, 0, 0, 16, -2, 6, -6, -2 }, + { 6, 16, 16, 48, 0, -2, 16, 0, 6, -2, -2, -6 }, + { 0, 16, -2, 0, 48, 16, 16, 6, 0, -2, 16, 0 }, + { 16, 0, 0, -2, 16, 48, 6, 16, -2, 0, 0, 16 }, + { -2, 0, 0, 16, 16, 6, 48, 16, 16, 0, 0, -2 }, + { 0, -2, 16, 0, 6, 16, 16, 48, 0, 16, -2, 0 }, + { -6, -2, -2, 6, 0, -2, 16, 0, 48, 16, 16, 6 }, + { -2, -6, 6, -2, -2, 0, 0, 16, 16, 48, 6, 16 }, + { -2, 6, -6, -2, 16, 0, 0, -2, 16, 6, 48, 16 }, + { 6, -2, -2, -6, 0, 16, -2, 0, 6, 16, 16, 48 } }; // 2-D array[12][12] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template<typename T> + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = 
TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the upper-triangular part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = 
DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from 
DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/color_sum.h
new file mode 120000
index 0000000000..24b0157011
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/color_sum.h
@@ -0,0 +1 @@
+../color_sum.h
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/diagram_boilerplate.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/diagram_boilerplate.h
new file mode 120000
index 0000000000..e657b15c20
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/diagram_boilerplate.h
@@ -0,0 +1 @@
+../diagram_boilerplate.h
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/diagrams.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/diagrams.h
new file mode 100644
index 0000000000..321eb5303f
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/diagrams.h
@@ -0,0 +1,1132 @@
+// Copyright (C) 2020-2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin.
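+
+// Each diagramXXX kernel below evaluates one Feynman diagram for the current helicity
+// and accumulates its contribution into the color amplitudes jamps[ncolor*2*nevtORneppV].
+// A minimal sketch of the intended call sequence (hypothetical driver code, for
+// illustration only - the actual loop lives in the generated CPPProcess.cc):
+//   for( int ihel = 0; ihel < ncomb; ihel++ ) // loop over helicity combinations
+//   {
+//     // diagram1 also computes the external wavefunctions, hence the extra momenta/ihel arguments
+//     diagram1( wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel );
+//     diagram2( wfs, jamps, channelIds, couplings, numerators, denominators ); // reuses wfs from diagram1
+//     // ... diagram3 through diagram36 ...
+//     // finally reduce jamps to |M|^2 via the color sum (see color_sum.h)
+//   }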
+
+/* clang-format off */
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators,           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+            const fptype* momenta,          // input: momenta[npar*4*nevtORneppV]
+            const int ihel )                // input: helicity (0 to ncomb-1)
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+#ifdef MGONGPUCPP_GPUIMPL
+    using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events
+#else
+    using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events
+#endif
+    // *** DIAGRAM 1 OF 36 ***
+    // Wavefunction(s) for diagram number 1
+    vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 );
+    vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 );
+    oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 );
+    ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 );
+    oxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 );
+    ixxxxx( momenta, 0., cHel[ihel][5], -1, w_fp[5], 5 );
+    VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[6] );
+    FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[7] );
+    FFV1_1( w_fp[4], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[8] );
+    // Amplitude(s) for diagram number 1
+    FFV1_0( w_fp[5], w_fp[8], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0];
+  }
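+
+  // Note on the jamp updates above (illustration only): J_ACCESS::kernelAccessIcol( jamps, icol )
+  // returns a reference to the color amplitude for color index icol and the current event within
+  // the SOA buffer jamps[ncolor*2*nevtORneppV]; with the "new1" striding used for cuBLAS this
+  // corresponds to the manual indexing
+  //   jamps[0 * ncolor * nevt + icol * nevt + ievt] // real part
+  //   jamps[1 * ncolor * nevt + icol * nevt + ievt] // imaginary part
+  // The rational coefficients (e.g. -1/2, +1/6) multiplying amp_sv[0] are the projections of
+  // this diagram's amplitude onto the ncolor=12 color basis vectors (color-flow decomposition).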
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram2( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 2 OF 36 ***
+    // Wavefunction(s) for diagram number 2
+    FFV1_2( w_fp[5], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[8] );
+    // Amplitude(s) for diagram number 2
+    FFV1_0( w_fp[8], w_fp[4], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram3( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 3 OF 36 ***
+    // Wavefunction(s) for diagram number 3
+    FFV1P0_3( w_fp[5], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[8] );
+    // Amplitude(s) for diagram number 3
+    VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 2. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= 1. / 2.
* amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 36 *** + // Wavefunction(s) for diagram number 4 + FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); + // Amplitude(s) for diagram number 4 + FFV1_0( w_fp[3], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 36 *** + // Wavefunction(s) for diagram number 5 + FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); + // Amplitude(s) for diagram number 5 + FFV1_0( w_fp[9], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 6 OF 36 *** + // Wavefunction(s) for diagram number 6 + FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); + FFV1_2( w_fp[3], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[6] ); + // Amplitude(s) for diagram number 6 + FFV1_0( w_fp[6], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram7( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 7 OF 36 *** + // Wavefunction(s) for diagram number 7 + FFV1_1( w_fp[4], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[10] ); + FFV1P0_3( w_fp[3], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[11] ); + // Amplitude(s) for diagram number 7 + FFV1_0( w_fp[5], w_fp[10], w_fp[11], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 6. 
* amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram8( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 8 OF 36 *** + // Wavefunction(s) for diagram number 8 + FFV1_2( w_fp[5], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[12] ); + // Amplitude(s) for diagram number 8 + FFV1_0( w_fp[12], w_fp[4], w_fp[11], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram9( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 9 OF 36 *** + // Wavefunction(s) for diagram number 9 + FFV1_1( w_fp[9], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); + // Amplitude(s) for diagram number 9 + FFV1_0( w_fp[3], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram10( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 10 OF 36 *** + // Wavefunction(s) for diagram number 10 + // (none) + // Amplitude(s) for diagram number 10 + VVV1_0( w_fp[1], w_fp[8], w_fp[11], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram11( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 11 OF 36 *** + // Wavefunction(s) for diagram number 11 + FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); + FFV1_1( w_fp[2], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); + // Amplitude(s) for diagram number 11 + FFV1_0( w_fp[11], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram12( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 12 OF 36 *** + // Wavefunction(s) for diagram number 12 + FFV1P0_3( w_fp[11], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[9] ); + // Amplitude(s) for diagram number 12 + FFV1_0( w_fp[5], w_fp[10], w_fp[9], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram13( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 13 OF 36 *** + // Wavefunction(s) for diagram number 13 + // (none) + // Amplitude(s) for diagram number 13 + FFV1_0( w_fp[12], w_fp[4], w_fp[9], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram14( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 14 OF 36 *** + // Wavefunction(s) for diagram number 14 + FFV1_2( w_fp[11], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] ); + // Amplitude(s) for diagram number 14 + FFV1_0( w_fp[14], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram15( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 15 OF 36 *** + // Wavefunction(s) for diagram number 15 + // (none) + // Amplitude(s) for diagram number 15 + VVV1_0( w_fp[1], w_fp[8], w_fp[9], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram16( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 16 OF 36 *** + // Wavefunction(s) for diagram number 16 + FFV1_1( w_fp[4], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[9] ); + FFV1P0_3( w_fp[5], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[14] ); + // Amplitude(s) for diagram number 16 + FFV1_0( w_fp[3], w_fp[13], w_fp[14], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram17( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 17 OF 36 *** + // Wavefunction(s) for diagram number 17 + // (none) + // Amplitude(s) for diagram number 17 + FFV1_0( w_fp[6], w_fp[2], w_fp[14], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 17 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram18( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 18 OF 36 *** + // Wavefunction(s) for diagram number 18 + // (none) + // Amplitude(s) for diagram number 18 + FFV1_0( w_fp[12], w_fp[9], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram19( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 19 OF 36 *** + // Wavefunction(s) for diagram number 19 + FFV1_1( w_fp[9], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[11] ); + // Amplitude(s) for diagram number 19 + FFV1_0( w_fp[5], w_fp[11], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram20( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 20 OF 36 *** + // Wavefunction(s) for diagram number 20 + // (none) + // Amplitude(s) for diagram number 20 + VVV1_0( w_fp[1], w_fp[7], w_fp[14], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram21( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 21 OF 36 *** + // Wavefunction(s) for diagram number 21 + FFV1_2( w_fp[5], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[14] ); + FFV1P0_3( w_fp[14], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[11] ); + // Amplitude(s) for diagram number 21 + FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 21 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram22( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 22 OF 36 *** + // Wavefunction(s) for diagram number 22 + // (none) + // Amplitude(s) for diagram number 22 + FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram23( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 23 OF 36 *** + // Wavefunction(s) for diagram number 23 + // (none) + // Amplitude(s) for diagram number 23 + FFV1_0( w_fp[14], w_fp[10], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram24( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 24 OF 36 *** + // Wavefunction(s) for diagram number 24 + FFV1_2( w_fp[14], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[9] ); + // Amplitude(s) for diagram number 24 + FFV1_0( w_fp[9], w_fp[4], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram25( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 25 OF 36 *** + // Wavefunction(s) for diagram number 25 + // (none) + // Amplitude(s) for diagram number 25 + VVV1_0( w_fp[1], w_fp[7], w_fp[11], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 25 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram26( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 26 OF 36 *** + // Wavefunction(s) for diagram number 26 + FFV1_1( w_fp[13], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); + // Amplitude(s) for diagram number 26 + FFV1_0( w_fp[3], w_fp[11], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram27( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 27 OF 36 *** + // Wavefunction(s) for diagram number 27 + VVV1P0_1( w_fp[0], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[11] ); + // Amplitude(s) for diagram number 27 + FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram28( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 28 OF 36 *** + // Wavefunction(s) for diagram number 28 + FFV1_2( w_fp[6], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); + // Amplitude(s) for diagram number 28 + FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram29( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 29 OF 36 *** + // Wavefunction(s) for diagram number 29 + // (none) + // Amplitude(s) for diagram number 29 + FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram30( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 30 OF 36 *** + // Wavefunction(s) for diagram number 30 + FFV1_1( w_fp[10], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[6] ); + // Amplitude(s) for diagram number 30 + FFV1_0( w_fp[5], w_fp[6], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram31( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 31 OF 36 *** + // Wavefunction(s) for diagram number 31 + VVV1P0_1( w_fp[0], w_fp[7], COUPs[0], 1.0, 0., 0., w_fp[6] ); + // Amplitude(s) for diagram number 31 + FFV1_0( w_fp[5], w_fp[10], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram32( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 32 OF 36 *** + // Wavefunction(s) for diagram number 32 + FFV1_2( w_fp[12], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 32 + FFV1_0( w_fp[10], w_fp[4], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 32 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram33( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 33 OF 36 *** + // Wavefunction(s) for diagram number 33 + // (none) + // Amplitude(s) for diagram number 33 + FFV1_0( w_fp[12], w_fp[4], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram34( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 34 OF 36 *** + // Wavefunction(s) for diagram number 34 + // (none) + // Amplitude(s) for diagram number 34 + VVVV1_0( w_fp[0], w_fp[1], w_fp[7], w_fp[8], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * amp_sv[0]; + VVVV3_0( w_fp[0], w_fp[1], w_fp[7], w_fp[8], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * amp_sv[0]; + VVVV4_0( w_fp[0], w_fp[1], w_fp[7], w_fp[8], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2.
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram35( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 35 OF 36 *** + // Wavefunction(s) for diagram number 35 + // (none) + // Amplitude(s) for diagram number 35 + VVV1_0( w_fp[1], w_fp[8], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram36( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 36 OF 36 *** + // Wavefunction(s) for diagram number 36 + // (none) + // Amplitude(s) for diagram number 36 + VVV1_0( w_fp[1], w_fp[7], w_fp[11], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2.
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/driver.f index f7f23196eb..fc9392238e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/matrix1.f index 9fb8f4d180..ffaa3cde67 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -358,7 +358,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -404,7 +404,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(17) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -447,111 +448,44 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /1.600000000000000D+01 - $ ,5.333333333333333D+00,5.333333333333333D+00,2.000000000000000D - $ +00,0.000000000000000D+00,5.333333333333333D+00/ - DATA (CF(I, 1),I= 7, 12) /-6.666666666666666D-01 - $ ,0.000000000000000D+00,-2.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01 - $ ,2.000000000000000D+00/ + DATA DENOM/3/ + DATA (CF(I),I= 1, 12) /48,32,32,12,0,32,-4,0,-12,-4,-4,12/ C 1 T(1,2,3,4) T(5,6) - DATA (CF(I, 2),I= 1, 6) /5.333333333333333D+00 - $ ,1.600000000000000D+01,2.000000000000000D+00,5.333333333333333D - $ +00,5.333333333333333D+00,0.000000000000000D+00/ - DATA (CF(I, 2),I= 7, 12) /0.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01, - $ -2.000000000000000D+00,2.000000000000000D+00, - $ -6.666666666666666D-01/ + DATA (CF(I),I= 13, 23) /48,12,32,32,0,0,-4,-4,-12,12,-4/ C 1 T(1,2,3,6) T(5,4) - DATA (CF(I, 3),I= 1, 6) /5.333333333333333D+00 - $ ,2.000000000000000D+00,1.600000000000000D+01,5.333333333333333D - $ +00,-6.666666666666666D-01,0.000000000000000D+00/ - DATA (CF(I, 3),I= 7, 12) /0.000000000000000D+00 - $ ,5.333333333333333D+00,-6.666666666666666D-01 - $ ,2.000000000000000D+00,-2.000000000000000D+00, - $ -6.666666666666666D-01/ + DATA (CF(I),I= 24, 33) /48,32,-4,0,0,32,-4,12,-12,-4/ C 1 T(1,2,5,4) T(3,6) - DATA (CF(I, 4),I= 1, 6) /2.000000000000000D+00 - $ 
,5.333333333333333D+00,5.333333333333333D+00,1.600000000000000D - $ +01,0.000000000000000D+00,-6.666666666666666D-01/ - DATA (CF(I, 4),I= 7, 12) /5.333333333333333D+00 - $ ,0.000000000000000D+00,2.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01, - $ -2.000000000000000D+00/ + DATA (CF(I),I= 34, 42) /48,0,-4,32,0,12,-4,-4,-12/ C 1 T(1,2,5,6) T(3,4) - DATA (CF(I, 5),I= 1, 6) /0.000000000000000D+00 - $ ,5.333333333333333D+00,-6.666666666666666D-01 - $ ,0.000000000000000D+00,1.600000000000000D+01,5.333333333333333D - $ +00/ - DATA (CF(I, 5),I= 7, 12) /5.333333333333333D+00 - $ ,2.000000000000000D+00,0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D - $ +00/ + DATA (CF(I),I= 43, 50) /48,32,32,12,0,-4,32,0/ C 1 T(1,3,4) T(2,5,6) - DATA (CF(I, 6),I= 1, 6) /5.333333333333333D+00 - $ ,0.000000000000000D+00,0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,1.600000000000000D - $ +01/ - DATA (CF(I, 6),I= 7, 12) /2.000000000000000D+00 - $ ,5.333333333333333D+00,-6.666666666666666D-01 - $ ,0.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00/ + DATA (CF(I),I= 51, 57) /48,12,32,-4,0,0,32/ C 1 T(1,3,6) T(2,5,4) - DATA (CF(I, 7),I= 1, 6) /-6.666666666666666D-01 - $ ,0.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00,5.333333333333333D+00,2.000000000000000D+00/ - DATA (CF(I, 7),I= 7, 12) /1.600000000000000D+01 - $ ,5.333333333333333D+00,5.333333333333333D+00,0.000000000000000D - $ +00,0.000000000000000D+00,-6.666666666666666D-01/ + DATA (CF(I),I= 58, 63) /48,32,32,0,0,-4/ C 1 T(1,5,4) T(2,3,6) - DATA (CF(I, 8),I= 1, 6) /0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D - $ +00,2.000000000000000D+00,5.333333333333333D+00/ - DATA (CF(I, 8),I= 7, 12) /5.333333333333333D+00 - $ ,1.600000000000000D+01,0.000000000000000D+00,5.333333333333333D - $ +00,-6.666666666666666D-01,0.000000000000000D+00/ + DATA (CF(I),I= 64, 68) /48,0,32,-4,0/ C 1 T(1,5,6) T(2,3,4) - DATA (CF(I, 9),I= 1, 6) /-2.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01 - $ ,2.000000000000000D+00,0.000000000000000D+00, - $ -6.666666666666666D-01/ - DATA (CF(I, 9),I= 7, 12) /5.333333333333333D+00 - $ ,0.000000000000000D+00,1.600000000000000D+01,5.333333333333333D - $ +00,5.333333333333333D+00,2.000000000000000D+00/ + DATA (CF(I),I= 69, 72) /48,32,32,12/ C 1 T(2,1,3,4) T(5,6) - DATA (CF(I, 10),I= 1, 6) /-6.666666666666666D-01, - $ -2.000000000000000D+00,2.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01 - $ ,0.000000000000000D+00/ - DATA (CF(I, 10),I= 7, 12) /0.000000000000000D+00 - $ ,5.333333333333333D+00,5.333333333333333D+00,1.600000000000000D - $ +01,2.000000000000000D+00,5.333333333333333D+00/ + DATA (CF(I),I= 73, 75) /48,12,32/ C 1 T(2,1,3,6) T(5,4) - DATA (CF(I, 11),I= 1, 6) /-6.666666666666666D-01 - $ ,2.000000000000000D+00,-2.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D - $ +00/ - DATA (CF(I, 11),I= 7, 12) /0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,2.000000000000000D - $ +00,1.600000000000000D+01,5.333333333333333D+00/ + DATA (CF(I),I= 76, 77) /48,32/ C 1 T(2,1,5,4) T(3,6) - DATA (CF(I, 12),I= 1, 6) /2.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01, - $ -2.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00/ - DATA (CF(I, 12),I= 7, 12) /-6.666666666666666D-01 - $ 
,0.000000000000000D+00,2.000000000000000D+00,5.333333333333333D - $ +00,5.333333333333333D+00,1.600000000000000D+01/ + DATA (CF(I),I= 78, 78) /48/ C 1 T(2,1,5,6) T(3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -760,10 +694,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -772,6 +708,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc index ac4bf091b7..421e3e13fc 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,20 +101,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 12; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,57 +169,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: 
couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! 
in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using 
CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -229,804 +282,205 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 15; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structures for wavefunctions/amplitudes in all events (no kernel splitting yet)! - //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!)
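      // [Illustrative note, not part of the generated code] The cast below views
      // the local w_sv array of SIMD complex values as the flat fptype buffer
      // that the C++ diagram kernels expect for their "wfs" argument; this
      // relies on cxtype_sv being layout-compatible with one real plus one
      // imaginary fptype_sv, an assumption that could be made explicit with
      // checks such as:
      //
      //   static_assert( sizeof( cxtype_sv ) == 2 * sizeof( fptype_sv ),
      //                  "cxtype_sv must pack exactly one real and one imaginary fptype_sv" );
      //   static_assert( sizeof( w_sv ) == nwf * nw6 * 2 * sizeof( fptype_sv ),
      //                  "w_sv must be viewable as a flat wavefunction buffer" );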
+ fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g. <<warning #186-D: pointless comparison of unsigned integer with zero>> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs =
E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 36 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - ixxxxx( momenta, 0., cHel[ihel][1], +1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - vxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); - - oxxxxx( momenta, 0., cHel[ihel][5], +1, w_fp[5], 5 ); - - VVV1P0_1( w_fp[0], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[6] ); - FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[7] ); - FFV1_1( w_fp[5], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[1], w_fp[8], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 2 OF 36 *** - - // Wavefunction(s) for diagram number 2 - FFV1_2( w_fp[1], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[8], w_fp[5], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 3 OF 36 *** - - // Wavefunction(s) for diagram number 3 - FFV1P0_3( w_fp[1], w_fp[5], COUPs[1], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 3 - VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= 1. / 2. * amp_sv[0]; - jamp_sv[5] += 1. / 2. * amp_sv[0]; - jamp_sv[8] -= 1. / 2. * amp_sv[0]; - jamp_sv[10] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 4 OF 36 *** - - // Wavefunction(s) for diagram number 4 - FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); - - // Amplitude(s) for diagram number 4 - FFV1_0( w_fp[3], w_fp[9], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] -= 1. / 2.
* cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 5 OF 36 *** - - // Wavefunction(s) for diagram number 5 - FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); - - // Amplitude(s) for diagram number 5 - FFV1_0( w_fp[9], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 6 OF 36 *** - - // Wavefunction(s) for diagram number 6 - FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); - FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[6] ); - - // Amplitude(s) for diagram number 6 - FFV1_0( w_fp[6], w_fp[9], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[3] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 7 OF 36 *** - - // Wavefunction(s) for diagram number 7 - FFV1_1( w_fp[5], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[10] ); - FFV1P0_3( w_fp[3], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[11] ); - - // Amplitude(s) for diagram number 7 - FFV1_0( w_fp[1], w_fp[10], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[1] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 8 OF 36 *** - - // Wavefunction(s) for diagram number 8 - FFV1_2( w_fp[1], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[12] ); - - // Amplitude(s) for diagram number 8 - FFV1_0( w_fp[12], w_fp[5], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += 1. / 6. * amp_sv[0]; - jamp_sv[2] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 9 OF 36 *** - - // Wavefunction(s) for diagram number 9 - FFV1_1( w_fp[9], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); - - // Amplitude(s) for diagram number 9 - FFV1_0( w_fp[3], w_fp[13], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= 1. / 2. * amp_sv[0]; - jamp_sv[3] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 10 OF 36 *** - - // Wavefunction(s) for diagram number 10 - // (none) - - // Amplitude(s) for diagram number 10 - VVV1_0( w_fp[4], w_fp[8], w_fp[11], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] += 1. / 2.
* cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 11 OF 36 *** - - // Wavefunction(s) for diagram number 11 - FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); - FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); - - // Amplitude(s) for diagram number 11 - FFV1_0( w_fp[11], w_fp[13], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[7] -= 1. / 2. * amp_sv[0]; - jamp_sv[11] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 12 OF 36 *** - - // Wavefunction(s) for diagram number 12 - FFV1P0_3( w_fp[11], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[9] ); - - // Amplitude(s) for diagram number 12 - FFV1_0( w_fp[1], w_fp[10], w_fp[9], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += 1. / 6. * amp_sv[0]; - jamp_sv[8] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 13 OF 36 *** - - // Wavefunction(s) for diagram number 13 - // (none) - - // Amplitude(s) for diagram number 13 - FFV1_0( w_fp[12], w_fp[5], w_fp[9], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += 1. / 6. * amp_sv[0]; - jamp_sv[7] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 14 OF 36 *** - - // Wavefunction(s) for diagram number 14 - FFV1_2( w_fp[11], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] ); - - // Amplitude(s) for diagram number 14 - FFV1_0( w_fp[14], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[8] -= 1. / 2. * amp_sv[0]; - jamp_sv[11] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 15 OF 36 *** - - // Wavefunction(s) for diagram number 15 - // (none) - - // Amplitude(s) for diagram number 15 - VVV1_0( w_fp[4], w_fp[8], w_fp[9], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[7] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 16 OF 36 *** - - // Wavefunction(s) for diagram number 16 - FFV1_1( w_fp[5], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[9] ); - FFV1P0_3( w_fp[1], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[14] ); - - // Amplitude(s) for diagram number 16 - FFV1_0( w_fp[3], w_fp[13], w_fp[14], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[6] += 1. / 6. * amp_sv[0]; - jamp_sv[7] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 17 OF 36 *** - - // Wavefunction(s) for diagram number 17 - // (none) - - // Amplitude(s) for diagram number 17 - FFV1_0( w_fp[6], w_fp[2], w_fp[14], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 17 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[5] -= 1. / 2. * amp_sv[0]; - jamp_sv[6] += 1. / 6.
* amp_sv[0]; - - // *** DIAGRAM 18 OF 36 *** - - // Wavefunction(s) for diagram number 18 - // (none) - - // Amplitude(s) for diagram number 18 - FFV1_0( w_fp[12], w_fp[9], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] += 1. / 6. * amp_sv[0]; - jamp_sv[7] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 19 OF 36 *** - - // Wavefunction(s) for diagram number 19 - FFV1_1( w_fp[9], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[11] ); - - // Amplitude(s) for diagram number 19 - FFV1_0( w_fp[1], w_fp[11], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] += 1. / 6. * amp_sv[0]; - jamp_sv[5] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 20 OF 36 *** - - // Wavefunction(s) for diagram number 20 - // (none) - - // Amplitude(s) for diagram number 20 - VVV1_0( w_fp[4], w_fp[7], w_fp[14], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[5] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 21 OF 36 *** - - // Wavefunction(s) for diagram number 21 - FFV1_2( w_fp[1], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[14] ); - FFV1P0_3( w_fp[14], w_fp[5], COUPs[1], 1.0, 0., 0., w_fp[11] ); - - // Amplitude(s) for diagram number 21 - FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 21 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[6] += 1. / 6. * amp_sv[0]; - jamp_sv[10] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 22 OF 36 *** - - // Wavefunction(s) for diagram number 22 - // (none) - - // Amplitude(s) for diagram number 22 - FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[6] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 23 OF 36 *** - - // Wavefunction(s) for diagram number 23 - // (none) - - // Amplitude(s) for diagram number 23 - FFV1_0( w_fp[14], w_fp[10], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[9] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 24 OF 36 *** - - // Wavefunction(s) for diagram number 24 - FFV1_2( w_fp[14], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[9] ); - - // Amplitude(s) for diagram number 24 - FFV1_0( w_fp[9], w_fp[5], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[9] += 1. / 6. * amp_sv[0]; - jamp_sv[10] -= 1. / 2.
* amp_sv[0]; - - // *** DIAGRAM 25 OF 36 *** - - // Wavefunction(s) for diagram number 25 - // (none) - - // Amplitude(s) for diagram number 25 - VVV1_0( w_fp[4], w_fp[7], w_fp[11], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 25 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 26 OF 36 *** - - // Wavefunction(s) for diagram number 26 - FFV1_1( w_fp[13], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); - - // Amplitude(s) for diagram number 26 - FFV1_0( w_fp[3], w_fp[11], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[10] -= 1. / 2. * amp_sv[0]; - jamp_sv[11] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 27 OF 36 *** - - // Wavefunction(s) for diagram number 27 - VVV1P0_1( w_fp[0], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[11] ); - - // Amplitude(s) for diagram number 27 - FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[7] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 28 OF 36 *** - - // Wavefunction(s) for diagram number 28 - FFV1_2( w_fp[6], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); - - // Amplitude(s) for diagram number 28 - FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] += 1. / 6. * amp_sv[0]; - jamp_sv[5] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 29 OF 36 *** - - // Wavefunction(s) for diagram number 29 - // (none) - - // Amplitude(s) for diagram number 29 - FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 30 OF 36 *** - - // Wavefunction(s) for diagram number 30 - FFV1_1( w_fp[10], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[6] ); - - // Amplitude(s) for diagram number 30 - FFV1_0( w_fp[1], w_fp[6], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[8] -= 1. / 2. * amp_sv[0]; - jamp_sv[9] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 31 OF 36 *** - - // Wavefunction(s) for diagram number 31 - VVV1P0_1( w_fp[0], w_fp[7], COUPs[0], 1.0, 0., 0., w_fp[6] ); - - // Amplitude(s) for diagram number 31 - FFV1_0( w_fp[1], w_fp[10], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] += 1. / 2.
* cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 32 OF 36 *** - - // Wavefunction(s) for diagram number 32 - FFV1_2( w_fp[12], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 32 - FFV1_0( w_fp[10], w_fp[5], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 32 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= 1. / 2. * amp_sv[0]; - jamp_sv[4] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 33 OF 36 *** - - // Wavefunction(s) for diagram number 33 - // (none) - - // Amplitude(s) for diagram number 33 - FFV1_0( w_fp[12], w_fp[5], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 34 OF 36 *** - - // Wavefunction(s) for diagram number 34 - // (none) - - // Amplitude(s) for diagram number 34 - VVVV1_0( w_fp[0], w_fp[4], w_fp[7], w_fp[8], COUPs[2], 1.0, &amp_fp[0] ); - jamp_sv[2] -= 1. / 2. * amp_sv[0]; - jamp_sv[5] += 1. / 2. * amp_sv[0]; - jamp_sv[8] -= 1. / 2. * amp_sv[0]; - jamp_sv[10] += 1. / 2. * amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[4], w_fp[7], w_fp[8], COUPs[2], 1.0, &amp_fp[0] ); - jamp_sv[0] += 1. / 2. * amp_sv[0]; - jamp_sv[2] -= 1. / 2. * amp_sv[0]; - jamp_sv[7] += 1. / 2. * amp_sv[0]; - jamp_sv[8] -= 1. / 2. * amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[4], w_fp[7], w_fp[8], COUPs[2], 1.0, &amp_fp[0] ); - jamp_sv[0] += 1. / 2. * amp_sv[0]; - jamp_sv[5] -= 1. / 2. * amp_sv[0]; - jamp_sv[7] += 1. / 2. * amp_sv[0]; - jamp_sv[10] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 35 OF 36 *** - - // Wavefunction(s) for diagram number 35 - // (none) - - // Amplitude(s) for diagram number 35 - VVV1_0( w_fp[4], w_fp[8], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 2. * amp_sv[0]; - jamp_sv[2] -= 1. / 2. * amp_sv[0]; - jamp_sv[7] += 1. / 2. * amp_sv[0]; - jamp_sv[8] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 36 OF 36 *** - - // Wavefunction(s) for diagram number 36 - // (none) - - // Amplitude(s) for diagram number 36 - VVV1_0( w_fp[4], w_fp[7], w_fp[11], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 2. * amp_sv[0]; - jamp_sv[5] -= 1. / 2. * amp_sv[0]; - jamp_sv[7] += 1. / 2. * amp_sv[0]; - jamp_sv[10] -= 1. / 2. * amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_2_gu_ttxgu()?)
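    //--------------------------------------------------------------------------
    // [Illustrative sketch, not part of the generated code] Both the packed
    // INTEGER CF(NCOLOR*(NCOLOR+1)/2) rewrite in matrix1.f above and the C++
    // color algebra removed below evaluate the same quadratic form
    // |M|^2 = sum_{i,j} conj(jamp[i]) * cf[i][j] * jamp[j] / denom,
    // exploiting that cf is real and symmetric: only the upper triangle is
    // kept, with off-diagonal entries counted twice. The Fortran packed array
    // stores CF*DENOM with off-diagonals pre-doubled (e.g. 5.333... = 16/3
    // becomes 32, with DENOM=3 divided out once at the end), while the removed
    // TriangularNormalizedColorMatrix below baked the factor 2 and the 1/denom
    // into a constexpr table instead. A scalar reference version of the packed
    // sum (hypothetical helper, for clarity only):
    //
    //   #include <complex>
    //   double colorSum( int ncolor, const int* cfPacked, int denom,
    //                    const std::complex<double>* jamp )
    //   {
    //     double me = 0.;
    //     int idx = 0;
    //     for( int i = 0; i < ncolor; i++ )
    //     {
    //       std::complex<double> ztemp = 0.;
    //       for( int j = i; j < ncolor; j++ ) ztemp += (double)cfPacked[idx++] * jamp[j]; // upper triangle only
    //       me += ( ztemp * std::conj( jamp[i] ) ).real(); // imaginary parts cancel in the full sum
    //     }
    //     return me / denom;
    //   }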
- - // The color denominators (initialize all array elements, with ncolor=12) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }; // 1-D array[12] - - // The color matrix (initialize all array elements, with ncolor=12) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 48, 16, 0, 16, -2, 0, 16, 6, 0, 16, 0, -2 }, - { 16, 48, 16, 0, 0, -2, 6, 16, 16, 0, -2, 0 }, - { 0, 16, 48, 16, 16, 6, -2, 0, 6, -2, -6, -2 }, - { 16, 0, 16, 48, 6, 16, 0, -2, -2, 6, -2, -6 }, - { -2, 0, 16, 6, 48, 16, 0, 16, -2, -6, -2, 6 }, - { 0, -2, 6, 16, 16, 48, 16, 0, -6, -2, 6, -2 }, - { 16, 6, -2, 0, 0, 16, 48, 16, -2, 0, 16, 0 }, - { 6, 16, 0, -2, 16, 0, 16, 48, 0, -2, 0, 16 }, - { 0, 16, 6, -2, -2, -6, -2, 0, 48, 16, 6, 16 }, - { 16, 0, -2, 6, -6, -2, 0, -2, 16, 48, 16, 6 }, - { 0, -2, -6, -2, -2, 6, 16, 0, 6, 16, 48, 16 }, - { -2, 0, -2, -6, 6, -2, 0, 16, 16, 6, 16, 48 } }; // 2-D array[12][12] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! 
skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... 
icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ?
jamp_sv : &( jamp_sv[ncolor] ) ); #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------ + // --- CHANNELIDS --- + // ------------------ +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - } - // *** STORE THE RESULTS *** - - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); +#endif #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + fptype* numerators = 
nullptr; + fptype* denominators = nullptr; #endif + + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ + + // *** DIAGRAMS 1 TO 36 *** +#ifdef MGONGPUCPP_GPUIMPL + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram8, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram9, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram10, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram11, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram12, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram13, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram14, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram15, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram16, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram17, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram18, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram19, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram20, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram21, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram22, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram23, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram24, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram25, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + 
gpuLaunchKernelStream( diagram26, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram27, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram28, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram29, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram30, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram31, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram32, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram33, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram34, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram35, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram36, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); +#else + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram8( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram9( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram10( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram11( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram12( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram13( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram14( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram15( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram16( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram17( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram18( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram19( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram20( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram21( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram22( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram23( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram24( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram25( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram26( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram27( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram28( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram29( wfs, jamps, channelIds, COUPs, numerators, 
denominators ); + diagram30( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram31( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram32( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram33( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram34( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram35( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram36( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -1113,7 +567,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -1148,6 +606,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -1190,6 +652,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -1292,26 +758,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //--------------------------------------------------------------------------
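
For context on the dependent/independent coupling split used by G2COUP above (and by the allCOUPs/COUPs setup earlier in this patch): dependent couplings are recomputed event-by-event from each event's running alpha_s, while independent couplings are a single set of constants shared by all events. A minimal sketch of that lookup, assuming a hypothetical flat [ncoup][nevt] layout rather than the plugin's CD_ACCESS/CI_ACCESS memory accessors:

    // Sketch only: depCoups[idcoup*nevt+ievt] vary per event, indepCoups[iicoup] do not.
    const double* eventCoupling( const double* depCoups, const double* indepCoups,
                                 int ndcoup, int nevt, int icoup, int ievt )
    {
      if( icoup < ndcoup ) return &depCoups[icoup * nevt + ievt]; // dependent: event-by-event
      return &indepCoups[icoup - ndcoup];                         // independent: fixed for all events
    }
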
-#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -1319,25 +785,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + add_and_select_hel( int* allselhel, // output: helicity selection[nevt] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* allMEs, // output: allMEs[nevt], sum over good helicities + const int nevt ) // input: #events + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //--------------------------------------------------------------------------
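
Note on the atomicAdd in update_jamp2s: with one good helicity per CUDA/HIP stream, several instances of this kernel may be in flight at once, all adding their helicity's |jamp|^2 into the same colAllJamp2s slot, so the per-color accumulation must be atomic. A standalone sketch of the same pattern, assuming plain doubles and a flat [ncolor][nevt] layout instead of the plugin's DeviceAccessJamp/DeviceAccessJamp2 accessors:

    // Sketch only: launched once per helicity, possibly on concurrent streams.
    __global__ void accumulateJamp2Sketch( const double* jamp2OneHel, // |jamp|^2 for one helicity [ncolor*nevt]
                                           double* jamp2Sum,          // running sum over helicities [ncolor*nevt]
                                           int ncolor, int nevt )
    {
      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
      if( ievt >= nevt ) return;
      for( int icol = 0; icol < ncolor; icol++ )
        atomicAdd( &jamp2Sum[icol * nevt + ievt], jamp2OneHel[icol * nevt + ievt] ); // streams race on this slot
    }
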
+ +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //--------------------------------------------------------------------------
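
Both select_col above and the helicity choice in add_and_select_hel follow the same selection pattern: build a running cumulative sum of non-negative weights (targetamp, or the running ME sums) and return the first bin whose cumulative fraction exceeds a uniform random number in [0,1). A scalar sketch of that pattern with a hypothetical helper name and plain doubles:

    // Sketch only: pick index i with probability weights[i] / sum(weights).
    int selectIndexSketch( const double* weights, int n, double rnd )
    {
      double total = 0;
      for( int i = 0; i < n; i++ ) total += weights[i];
      double cumul = 0;
      for( int i = 0; i < n; i++ )
      {
        cumul += weights[i];
        if( rnd < cumul / total ) return i; // first bin whose cumulative fraction exceeds rnd
      }
      return n - 1; // guard against rounding when rnd is very close to 1
    }
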
// Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -1473,22 +1143,16 @@ namespace mg5amcCpu // These variables are not used anywhere else in the code and their scope is limited to this sanity check { // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) - constexpr int nprocesses = 2; + constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343)
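
For this P2_gu_ttxgu subprocess the value 96 is the average over initial-state spins and colors: the gluon has 2 helicities and 8 colors, the up quark 2 helicities and 3 colors, so

    helcolDenominators[0] = ( 2 * 2 ) spins * ( 8 * 3 ) colors = 4 * 24 = 96
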
-#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1500,17 +1164,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1536,93 +1203,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamps for each helicity + // (in multichannel mode, also compute the per-helicity numerators and denominators; squared jamp2s are handled in step 1b) for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, 
allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } - } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) Then, in multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); + } +#endif
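
Steps (1a) and (1b) rely on one CUDA/HIP stream per good helicity, so that the independent per-helicity kernels can overlap on the device; cross-helicity steps only happen after a single synchronization point (the gpuDeviceSynchronize in step 3 below). A self-contained CUDA sketch of that launch pattern, with a toy kernel and buffer names that are not part of the plugin:

    #include <cuda_runtime.h>

    __global__ void oneHelicitySketch( float* out, int ihel )
    {
      out[blockIdx.x * blockDim.x + threadIdx.x] += ihel; // stand-in for the per-helicity work
    }

    int main()
    {
      const int nhel = 4, nthr = 32;
      cudaStream_t streams[nhel];
      float* buf[nhel];
      for( int i = 0; i < nhel; i++ )
      {
        cudaStreamCreate( &streams[i] );
        cudaMalloc( &buf[i], nthr * sizeof( float ) );
        cudaMemsetAsync( buf[i], 0, nthr * sizeof( float ), streams[i] );
        oneHelicitySketch<<<1, nthr, 0, streams[i]>>>( buf[i], i ); // kernels on different streams may overlap
      }
      cudaDeviceSynchronize(); // join all helicity streams before any cross-helicity step
      for( int i = 0; i < nhel; i++ )
      {
        cudaFree( buf[i] );
        cudaStreamDestroy( streams[i] );
      }
      return 0;
    }
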
+ // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) + { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? 
&( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1664,7 +1301,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1687,7 +1324,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1696,25 +1333,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1724,8 +1367,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1741,11 +1386,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1847,14 +1493,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h index f75309f403..1a5a996480 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO 
development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include <vector> @@ -78,17 +79,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 36; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 12; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 15; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z)
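
The new ncolor and nwf constants also fix the sizes of the per-helicity scratch buffers declared below (jamp[ncolor*2*nevt] and wf[nwf*nw6*2*nevt]). Per event this gives the following counts, assuming nw6 = 6 wavefunction components and a factor nx2 = 2 for real and imaginary parts (both taken from the buffer comments rather than from this header):

    // Sketch only: per-event scratch sizes implied by the constants above.
    constexpr int ncolor = 12, nwf = 15, nw6 = 6, nx2 = 2;
    constexpr int jampValuesPerEvent = ncolor * nx2;  // 12 complex jamps -> 24 fptype values
    constexpr int wfValuesPerEvent = nwf * nw6 * nx2; // 15 wavefunctions x 6 components -> 180 fptype values
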
// Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 6; // CPPProcess::npar (nexternal was nioparticles) //static const int nwavefuncs = 6; // (?!?! this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 38; //static const int ncomb = 64; // CPPProcess::ncomb @@ -125,23 +126,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -155,34 +159,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators 
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig.f index 47e378e255..eb269f804d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig1.f index 756e98881c..3d6ffe6ba1 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,14 +140,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF D2=PDG2PDF(LPP(IB(2)),1, IB(2),XBK(IB(2)), QSCALE) U2=PDG2PDF(LPP(IB(2)),2, IB(2),XBK(IB(2)), QSCALE) @@ -234,7 +234,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -309,6 +309,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -393,18 +397,18 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) D2(IVEC)=PDG2PDF(LPP(IB(2)),1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) U2(IVEC)=PDG2PDF(LPP(IB(2)),2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) S2(IVEC)=PDG2PDF(LPP(IB(2)),3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) C2(IVEC)=PDG2PDF(LPP(IB(2)),4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -528,6 +532,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/color_sum.cc new file mode 100644 index 0000000000..088e843da8 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/color_sum.cc @@ -0,0 +1,393 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. 
Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=12) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }; // 1-D array[12] + + // The color matrix (initialize all array elements, with ncolor=12) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 48, 16, 0, 16, -2, 0, 16, 6, 0, 16, 0, -2 }, + { 16, 48, 16, 0, 0, -2, 6, 16, 16, 0, -2, 0 }, + { 0, 16, 48, 16, 16, 6, -2, 0, 6, -2, -6, -2 }, + { 16, 0, 16, 48, 6, 16, 0, -2, -2, 6, -2, -6 }, + { -2, 0, 16, 6, 48, 16, 0, 16, -2, -6, -2, 6 }, + { 0, -2, 6, 16, 16, 48, 16, 0, -6, -2, 6, -2 }, + { 16, 6, -2, 0, 0, 16, 48, 16, -2, 0, 16, 0 }, + { 6, 16, 0, -2, 16, 0, 16, 48, 0, -2, 0, 16 }, + { 0, 16, 6, -2, -2, -6, -2, 0, 48, 16, 6, 16 }, + { 16, 0, -2, 6, -6, -2, 0, -2, 16, 48, 16, 6 }, + { 0, -2, -6, -2, -2, 6, 16, 0, 6, 16, 48, 16 }, + { -2, 0, -2, -6, 6, -2, 0, 16, 16, 6, 16, 48 } }; // 2-D array[12][12] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each row by its denominator) + template<typename T> + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //--------------------------------------------------------------------------
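
color_sum_cpu below evaluates |M|^2 += sum_ij Re( conj(jamp_i) * jamp_j ) * colorMatrix[i][j] / colorDenom[i]. Because the color matrix is real and symmetric (and all denominators are equal for this process), the quadratic form splits into two real quadratic forms (real and imaginary parts), and only the upper triangle needs visiting, with off-diagonal entries counted twice; that is exactly what the TriangularNormalizedColorMatrix precomputes. A scalar sketch of the same algebra with plain std::complex doubles and a flat matrix, not the plugin's SIMD types:

    #include <complex>

    // Sketch only: J^dagger (C/d) J for real symmetric C with equal denominators d[i].
    double colorSumSketch( const std::complex<double>* jamp, const double* C, const double* d, int ncolor )
    {
      double me2 = 0;
      for( int i = 0; i < ncolor; i++ )
      {
        double ztR = C[i * ncolor + i] / d[i] * jamp[i].real();
        double ztI = C[i * ncolor + i] / d[i] * jamp[i].imag();
        for( int j = i + 1; j < ncolor; j++ )
        {
          ztR += 2 * C[i * ncolor + j] / d[i] * jamp[j].real(); // symmetry: fold (i,j) and (j,i) together
          ztI += 2 * C[i * ncolor + j] / d[i] * jamp[j].imag();
        }
        me2 += jamp[i].real() * ztR + jamp[i].imag() * ztI; // imaginary cross terms cancel for real C
      }
      return me2;
    }
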
colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps 
) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from 
DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/diagram_boilerplate.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/diagrams.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/diagrams.h new file mode 100644 index 0000000000..8e0fc00307 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/diagrams.h @@ -0,0 +1,1132 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
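  //--------------------------------------------------------------------------
  // [Editorial aside, not part of the generated diff] A minimal, single-event
  // sketch of the color sum implemented in color_sum.cc above, using plain
  // std::complex<double> in place of the plugin's fptype_sv/fptype2_sv SIMD
  // types (the function name and signature below are hypothetical). It shows
  // the two properties that color_sum_cpu exploits: the color matrix M is
  // real, so the quadratic form (A-iB)(M)(A+iB) reduces to AMA + BMB, and the
  // normalized matrix is symmetric, so only the upper triangle is visited,
  // with off-diagonal entries folded in with a factor 2 and each row
  // pre-divided by its denominator.
  #include <complex>
  double colorSumOneEvent( const std::complex<double>* jamp, // input: ncolor partial amplitudes for one event and helicity
                           const double* colorMatrix,        // input: ncolor*ncolor symmetric color matrix (row-major), as above
                           const double* colorDenom,         // input: ncolor per-row denominators, as above
                           const int ncolor )                // input: number of leading colors (12 in this subprocess)
  {
    double me2 = 0; // |M|^2 for this helicity (color_sum ADDS this to the running sum over helicities)
    for( int icol = 0; icol < ncolor; icol++ )
    {
      // ztemp[icol] = sum over jcol>=icol of Mnorm[icol][jcol] * jamp[jcol],
      // folding the lower triangle into the upper one (factor 2 off-diagonal)
      double ztempR = colorMatrix[icol * ncolor + icol] / colorDenom[icol] * jamp[icol].real();
      double ztempI = colorMatrix[icol * ncolor + icol] / colorDenom[icol] * jamp[icol].imag();
      for( int jcol = icol + 1; jcol < ncolor; jcol++ )
      {
        ztempR += 2 * colorMatrix[icol * ncolor + jcol] / colorDenom[icol] * jamp[jcol].real();
        ztempI += 2 * colorMatrix[icol * ncolor + jcol] / colorDenom[icol] * jamp[jcol].imag();
      }
      me2 += jamp[icol].real() * ztempR + jamp[icol].imag() * ztempI; // Re( conj(jamp[icol]) * ztemp[icol] )
    }
    return me2;
  }
  // The BLAS path in color_sum_blas computes the same quantity for all events
  // at once: Step 1 is one gemm per component, Ztemp = Mnorm * Jamp (real and
  // imaginary parts separately, since Mnorm is real); Step 2 is one strided
  // batched gemm of 1x1 results per component, ME[ievt] = Jamp(:,ievt) dot
  // Ztemp(:,ievt), with beta=1 so that the real and imaginary contributions
  // accumulate into the same MEs buffer.
  //--------------------------------------------------------------------------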
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 36 *** + // Wavefunction(s) for diagram number 1 + vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); + ixxxxx( momenta, 0., cHel[ihel][1], +1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + vxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); + oxxxxx( momenta, 0., cHel[ihel][5], +1, w_fp[5], 5 ); + VVV1P0_1( w_fp[0], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[6] ); + FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[7] ); + FFV1_1( w_fp[5], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 1 + FFV1_0( w_fp[1], w_fp[8], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 6. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 36 *** + // Wavefunction(s) for diagram number 2 + FFV1_2( w_fp[1], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 2 + FFV1_0( w_fp[8], w_fp[5], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 36 *** + // Wavefunction(s) for diagram number 3 + FFV1P0_3( w_fp[1], w_fp[5], COUPs[1], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 3 + VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2. 
* amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 36 *** + // Wavefunction(s) for diagram number 4 + FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); + // Amplitude(s) for diagram number 4 + FFV1_0( w_fp[3], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 36 *** + // Wavefunction(s) for diagram number 5 + FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); + // Amplitude(s) for diagram number 5 + FFV1_0( w_fp[9], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 6 OF 36 *** + // Wavefunction(s) for diagram number 6 + FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); + FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[6] ); + // Amplitude(s) for diagram number 6 + FFV1_0( w_fp[6], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram7( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 7 OF 36 *** + // Wavefunction(s) for diagram number 7 + FFV1_1( w_fp[5], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[10] ); + FFV1P0_3( w_fp[3], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[11] ); + // Amplitude(s) for diagram number 7 + FFV1_0( w_fp[1], w_fp[10], w_fp[11], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. 
* amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram8( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 8 OF 36 *** + // Wavefunction(s) for diagram number 8 + FFV1_2( w_fp[1], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[12] ); + // Amplitude(s) for diagram number 8 + FFV1_0( w_fp[12], w_fp[5], w_fp[11], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram9( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 9 OF 36 *** + // Wavefunction(s) for diagram number 9 + FFV1_1( w_fp[9], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); + // Amplitude(s) for diagram number 9 + FFV1_0( w_fp[3], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram10( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 10 OF 36 *** + // Wavefunction(s) for diagram number 10 + // (none) + // Amplitude(s) for diagram number 10 + VVV1_0( w_fp[4], w_fp[8], w_fp[11], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram11( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 11 OF 36 *** + // Wavefunction(s) for diagram number 11 + FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); + FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); + // Amplitude(s) for diagram number 11 + FFV1_0( w_fp[11], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram12( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 12 OF 36 *** + // Wavefunction(s) for diagram number 12 + FFV1P0_3( w_fp[11], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[9] ); + // Amplitude(s) for diagram number 12 + FFV1_0( w_fp[1], w_fp[10], w_fp[9], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram13( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 13 OF 36 *** + // Wavefunction(s) for diagram number 13 + // (none) + // Amplitude(s) for diagram number 13 + FFV1_0( w_fp[12], w_fp[5], w_fp[9], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram14( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 14 OF 36 *** + // Wavefunction(s) for diagram number 14 + FFV1_2( w_fp[11], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] ); + // Amplitude(s) for diagram number 14 + FFV1_0( w_fp[14], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram15( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 15 OF 36 *** + // Wavefunction(s) for diagram number 15 + // (none) + // Amplitude(s) for diagram number 15 + VVV1_0( w_fp[4], w_fp[8], w_fp[9], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram16( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 16 OF 36 *** + // Wavefunction(s) for diagram number 16 + FFV1_1( w_fp[5], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[9] ); + FFV1P0_3( w_fp[1], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[14] ); + // Amplitude(s) for diagram number 16 + FFV1_0( w_fp[3], w_fp[13], w_fp[14], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram17( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 17 OF 36 *** + // Wavefunction(s) for diagram number 17 + // (none) + // Amplitude(s) for diagram number 17 + FFV1_0( w_fp[6], w_fp[2], w_fp[14], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 17 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram18( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 18 OF 36 *** + // Wavefunction(s) for diagram number 18 + // (none) + // Amplitude(s) for diagram number 18 + FFV1_0( w_fp[12], w_fp[9], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram19( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 19 OF 36 *** + // Wavefunction(s) for diagram number 19 + FFV1_1( w_fp[9], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[11] ); + // Amplitude(s) for diagram number 19 + FFV1_0( w_fp[1], w_fp[11], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram20( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 20 OF 36 *** + // Wavefunction(s) for diagram number 20 + // (none) + // Amplitude(s) for diagram number 20 + VVV1_0( w_fp[4], w_fp[7], w_fp[14], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram21( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 21 OF 36 *** + // Wavefunction(s) for diagram number 21 + FFV1_2( w_fp[1], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[14] ); + FFV1P0_3( w_fp[14], w_fp[5], COUPs[1], 1.0, 0., 0., w_fp[11] ); + // Amplitude(s) for diagram number 21 + FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 21 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram22( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 22 OF 36 *** + // Wavefunction(s) for diagram number 22 + // (none) + // Amplitude(s) for diagram number 22 + FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram23( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 23 OF 36 *** + // Wavefunction(s) for diagram number 23 + // (none) + // Amplitude(s) for diagram number 23 + FFV1_0( w_fp[14], w_fp[10], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 6. 
+ + //-------------------------------------------------------------------------- + + __global__ void + diagram24( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 24 OF 36 *** + // Wavefunction(s) for diagram number 24 + FFV1_2( w_fp[14], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[9] ); + // Amplitude(s) for diagram number 24 + FFV1_0( w_fp[9], w_fp[5], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram25( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 25 OF 36 *** + // Wavefunction(s) for diagram number 25 + // (none) + // Amplitude(s) for diagram number 25 + VVV1_0( w_fp[4], w_fp[7], w_fp[11], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 25 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= 1. / 2.
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram26( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 26 OF 36 *** + // Wavefunction(s) for diagram number 26 + FFV1_1( w_fp[13], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); + // Amplitude(s) for diagram number 26 + FFV1_0( w_fp[3], w_fp[11], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram27( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 27 OF 36 *** + // Wavefunction(s) for diagram number 27 + VVV1P0_1( w_fp[0], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[11] ); + // Amplitude(s) for diagram number 27 + FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + }
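The rational weights (1/2, 1/6) and the factors of cxtype( 0, 1 ) applied in the jamps updates are the color-flow coefficients of the usual leading-color decomposition. In generic notation (a reminder only, not something spelled out in this diff; the coefficient set shown is just what appears in these kernels):

\mathcal{M} \;=\; \sum_{c=1}^{n_{\mathrm{color}}} \lambda_c\, J_c, \qquad J_c \;=\; \sum_{d=1}^{n_{\mathrm{diag}}} C_{cd}\, \mathrm{amp}_d, \qquad C_{cd} \in \Big\{ 0,\ \pm\tfrac{1}{2},\ \pm\tfrac{1}{6},\ \pm\tfrac{i}{2},\ \pm\tfrac{i}{6} \Big\}

|\mathcal{M}|^2 \;=\; \frac{1}{\mathrm{DENOM}} \sum_{c,c'} J_c^{*}\, \mathrm{CF}_{cc'}\, J_{c'}

where CF and DENOM are the symmetric color matrix and common denominator that also appear in the matrix1.f rewrite further below.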
+ + //-------------------------------------------------------------------------- + + __global__ void + diagram28( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 28 OF 36 *** + // Wavefunction(s) for diagram number 28 + FFV1_2( w_fp[6], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); + // Amplitude(s) for diagram number 28 + FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram29( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 29 OF 36 *** + // Wavefunction(s) for diagram number 29 + // (none) + // Amplitude(s) for diagram number 29 + FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2.
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram30( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 30 OF 36 *** + // Wavefunction(s) for diagram number 30 + FFV1_1( w_fp[10], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[6] ); + // Amplitude(s) for diagram number 30 + FFV1_0( w_fp[1], w_fp[6], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram31( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 31 OF 36 *** + // Wavefunction(s) for diagram number 31 + VVV1P0_1( w_fp[0], w_fp[7], COUPs[0], 1.0, 0., 0., w_fp[6] ); + // Amplitude(s) for diagram number 31 + FFV1_0( w_fp[1], w_fp[10], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += 1. / 2.
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram32( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 32 OF 36 *** + // Wavefunction(s) for diagram number 32 + FFV1_2( w_fp[12], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 32 + FFV1_0( w_fp[10], w_fp[5], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 32 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram33( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 33 OF 36 *** + // Wavefunction(s) for diagram number 33 + // (none) + // Amplitude(s) for diagram number 33 + FFV1_0( w_fp[12], w_fp[5], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 2.
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram34( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 34 OF 36 *** + // Wavefunction(s) for diagram number 34 + // (none) + // Amplitude(s) for diagram number 34 + VVVV1_0( w_fp[0], w_fp[4], w_fp[7], w_fp[8], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * amp_sv[0]; + VVVV3_0( w_fp[0], w_fp[4], w_fp[7], w_fp[8], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2. * amp_sv[0]; + VVVV4_0( w_fp[0], w_fp[4], w_fp[7], w_fp[8], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= 1. / 2.
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram35( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 35 OF 36 *** + // Wavefunction(s) for diagram number 35 + // (none) + // Amplitude(s) for diagram number 35 + VVV1_0( w_fp[4], w_fp[8], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram36( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 36 OF 36 *** + // Wavefunction(s) for diagram number 36 + // (none) + // Amplitude(s) for diagram number 36 + VVV1_0( w_fp[4], w_fp[7], w_fp[11], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= 1. / 2.
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/driver.f index f7f23196eb..fc9392238e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/matrix1.f index 0079f40417..616ba2e46c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -358,7 +358,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -404,7 +404,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(17) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -447,111 +448,44 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /1.600000000000000D+01 - $ ,5.333333333333333D+00,0.000000000000000D+00,5.333333333333333D - $ +00,-6.666666666666666D-01,0.000000000000000D+00/ - DATA (CF(I, 1),I= 7, 12) /5.333333333333333D+00 - $ ,2.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00,0.000000000000000D+00,-6.666666666666666D-01/ + DATA DENOM/3/ + DATA (CF(I),I= 1, 12) /48,32,0,32,-4,0,32,12,0,32,0,-4/ C 1 T(1,3,2) T(5,6,4) - DATA (CF(I, 2),I= 1, 6) /5.333333333333333D+00 - $ ,1.600000000000000D+01,5.333333333333333D+00,0.000000000000000D - $ +00,0.000000000000000D+00,-6.666666666666666D-01/ - DATA (CF(I, 2),I= 7, 12) /2.000000000000000D+00 - $ ,5.333333333333333D+00,5.333333333333333D+00,0.000000000000000D - $ +00,-6.666666666666666D-01,0.000000000000000D+00/ + DATA (CF(I),I= 13, 23) /48,32,0,0,-4,12,32,32,0,-4,0/ C 1 T(1,3,4) T(5,6,2) - DATA (CF(I, 3),I= 1, 6) /0.000000000000000D+00 - $ ,5.333333333333333D+00,1.600000000000000D+01,5.333333333333333D - $ +00,5.333333333333333D+00,2.000000000000000D+00/ - DATA (CF(I, 3),I= 7, 12) /-6.666666666666666D-01 - $ ,0.000000000000000D+00,2.000000000000000D+00, - $ -6.666666666666666D-01,-2.000000000000000D+00, - $ -6.666666666666666D-01/ + DATA (CF(I),I= 24, 33) /48,32,32,12,-4,0,12,-4,-12,-4/ C 1 T(1,5,3,2) T(6,4) - DATA (CF(I, 4),I= 1, 6) /5.333333333333333D+00 - $ 
,0.000000000000000D+00,5.333333333333333D+00,1.600000000000000D - $ +01,2.000000000000000D+00,5.333333333333333D+00/ - DATA (CF(I, 4),I= 7, 12) /0.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01 - $ ,2.000000000000000D+00,-6.666666666666666D-01, - $ -2.000000000000000D+00/ + DATA (CF(I),I= 34, 42) /48,12,32,0,-4,-4,12,-4,-12/ C 1 T(1,5,3,4) T(6,2) - DATA (CF(I, 5),I= 1, 6) /-6.666666666666666D-01 - $ ,0.000000000000000D+00,5.333333333333333D+00,2.000000000000000D - $ +00,1.600000000000000D+01,5.333333333333333D+00/ - DATA (CF(I, 5),I= 7, 12) /0.000000000000000D+00 - $ ,5.333333333333333D+00,-6.666666666666666D-01, - $ -2.000000000000000D+00,-6.666666666666666D-01 - $ ,2.000000000000000D+00/ + DATA (CF(I),I= 43, 50) /48,32,0,32,-4,-12,-4,12/ C 1 T(1,5,6,2) T(3,4) - DATA (CF(I, 6),I= 1, 6) /0.000000000000000D+00, - $ -6.666666666666666D-01,2.000000000000000D+00,5.333333333333333D - $ +00,5.333333333333333D+00,1.600000000000000D+01/ - DATA (CF(I, 6),I= 7, 12) /5.333333333333333D+00 - $ ,0.000000000000000D+00,-2.000000000000000D+00, - $ -6.666666666666666D-01,2.000000000000000D+00, - $ -6.666666666666666D-01/ + DATA (CF(I),I= 51, 57) /48,32,0,-12,-4,12,-4/ C 1 T(1,5,6,4) T(3,2) - DATA (CF(I, 7),I= 1, 6) /5.333333333333333D+00 - $ ,2.000000000000000D+00,-6.666666666666666D-01 - $ ,0.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00/ - DATA (CF(I, 7),I= 7, 12) /1.600000000000000D+01 - $ ,5.333333333333333D+00,-6.666666666666666D-01 - $ ,0.000000000000000D+00,5.333333333333333D+00,0.000000000000000D - $ +00/ + DATA (CF(I),I= 58, 63) /48,32,-4,0,32,0/ C 1 T(1,6,2) T(5,3,4) - DATA (CF(I, 8),I= 1, 6) /2.000000000000000D+00 - $ ,5.333333333333333D+00,0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D - $ +00/ - DATA (CF(I, 8),I= 7, 12) /5.333333333333333D+00 - $ ,1.600000000000000D+01,0.000000000000000D+00, - $ -6.666666666666666D-01,0.000000000000000D+00,5.333333333333333D - $ +00/ + DATA (CF(I),I= 64, 68) /48,0,-4,0,32/ C 1 T(1,6,4) T(5,3,2) - DATA (CF(I, 9),I= 1, 6) /0.000000000000000D+00 - $ ,5.333333333333333D+00,2.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01, - $ -2.000000000000000D+00/ - DATA (CF(I, 9),I= 7, 12) /-6.666666666666666D-01 - $ ,0.000000000000000D+00,1.600000000000000D+01,5.333333333333333D - $ +00,2.000000000000000D+00,5.333333333333333D+00/ + DATA (CF(I),I= 69, 72) /48,32,12,32/ C 1 T(3,2) T(5,1,6,4) - DATA (CF(I, 10),I= 1, 6) /5.333333333333333D+00 - $ ,0.000000000000000D+00,-6.666666666666666D-01 - $ ,2.000000000000000D+00,-2.000000000000000D+00, - $ -6.666666666666666D-01/ - DATA (CF(I, 10),I= 7, 12) /0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,1.600000000000000D - $ +01,5.333333333333333D+00,2.000000000000000D+00/ + DATA (CF(I),I= 73, 75) /48,32,12/ C 1 T(3,4) T(5,1,6,2) - DATA (CF(I, 11),I= 1, 6) /0.000000000000000D+00, - $ -6.666666666666666D-01,-2.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01 - $ ,2.000000000000000D+00/ - DATA (CF(I, 11),I= 7, 12) /5.333333333333333D+00 - $ ,0.000000000000000D+00,2.000000000000000D+00,5.333333333333333D - $ +00,1.600000000000000D+01,5.333333333333333D+00/ + DATA (CF(I),I= 76, 77) /48,32/ C 1 T(5,1,3,2) T(6,4) - DATA (CF(I, 12),I= 1, 6) /-6.666666666666666D-01 - $ ,0.000000000000000D+00,-6.666666666666666D-01, - $ -2.000000000000000D+00,2.000000000000000D+00, - $ -6.666666666666666D-01/ - DATA (CF(I, 12),I= 7, 12) /0.000000000000000D+00 - $ 
,5.333333333333333D+00,5.333333333333333D+00,2.000000000000000D - $ +00,5.333333333333333D+00,1.600000000000000D+01/ + DATA (CF(I),I= 78, 78) /48/ C 1 T(5,1,3,4) T(6,2) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -761,10 +695,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -773,6 +709,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc index acf1b836af..afd9438ba0 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,20 +101,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 12; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,57 +169,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: 
couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! 
in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using 
CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -229,804 +282,205 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 15; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! - //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!)
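The w_fp/amp_fp "proof of concept" removed above, and the wfs pointer added below, both rely on the same layout fact: a buffer of complex values can be reinterpreted as contiguous fptype storage. A stand-alone illustration, using std::complex as a stand-in (the plugin's SIMD cxtype_v instead stores a vector of real parts followed by a vector of imaginary parts, but the pointer reinterpretation logic is the same):

// Sketch only: why reinterpret_cast<fptype*> on a complex wavefunction buffer
// is legitimate - std::complex<T> is guaranteed to be layout-compatible with
// T[2] (real part first, imaginary part second).
#include <cassert>
#include <complex>
using fptype = double;
using cxtype = std::complex<fptype>;
int main()
{
  cxtype w[6] = { { 1., 2. }, { 3., 4. } };      // one wavefunction of nw6 = 6 components
  fptype* w_fp = reinterpret_cast<fptype*>( w ); // the same storage viewed as 12 fptypes
  assert( w_fp[0] == 1. && w_fp[1] == 2. && w_fp[2] == 3. && w_fp[3] == 4. );
  return 0;
}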
+ fptype* wfs = reinterpret_cast( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g. <> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = 
E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 36 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - vxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); - - ixxxxx( momenta, 0., cHel[ihel][5], -1, w_fp[5], 5 ); - - VVV1P0_1( w_fp[0], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[6] ); - FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[7] ); - FFV1_1( w_fp[1], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[5], w_fp[8], w_fp[7], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 2 OF 36 *** - - // Wavefunction(s) for diagram number 2 - FFV1_2( w_fp[5], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[8], w_fp[1], w_fp[7], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[5] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 3 OF 36 *** - - // Wavefunction(s) for diagram number 3 - FFV1P0_3( w_fp[5], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 3 - VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] -= 1. / 2. * amp_sv[0]; - jamp_sv[7] += 1. / 2. * amp_sv[0]; - jamp_sv[8] -= 1. / 2. * amp_sv[0]; - jamp_sv[11] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 4 OF 36 *** - - // Wavefunction(s) for diagram number 4 - FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); - - // Amplitude(s) for diagram number 4 - FFV1_0( w_fp[3], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[6] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 5 OF 36 *** - - // Wavefunction(s) for diagram number 5 - FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); - - // Amplitude(s) for diagram number 5 - FFV1_0( w_fp[9], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 6 OF 36 *** - - // Wavefunction(s) for diagram number 6 - FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); - FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[6] ); - - // Amplitude(s) for diagram number 6 - FFV1_0( w_fp[6], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] += 1. / 2. * amp_sv[0]; - jamp_sv[6] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 7 OF 36 *** - - // Wavefunction(s) for diagram number 7 - FFV1_1( w_fp[1], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[10] ); - FFV1P0_3( w_fp[3], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[11] ); - - // Amplitude(s) for diagram number 7 - FFV1_0( w_fp[5], w_fp[10], w_fp[11], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= 1. / 6. * amp_sv[0]; - jamp_sv[3] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 8 OF 36 *** - - // Wavefunction(s) for diagram number 8 - FFV1_2( w_fp[5], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[12] ); - - // Amplitude(s) for diagram number 8 - FFV1_0( w_fp[12], w_fp[1], w_fp[11], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= 1. / 6. * amp_sv[0]; - jamp_sv[7] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 9 OF 36 *** - - // Wavefunction(s) for diagram number 9 - FFV1_1( w_fp[9], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); - - // Amplitude(s) for diagram number 9 - FFV1_0( w_fp[3], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[6] -= 1. / 6. * amp_sv[0]; - jamp_sv[7] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 10 OF 36 *** - - // Wavefunction(s) for diagram number 10 - // (none) - - // Amplitude(s) for diagram number 10 - VVV1_0( w_fp[4], w_fp[8], w_fp[11], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 11 OF 36 *** - - // Wavefunction(s) for diagram number 11 - FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); - FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); - - // Amplitude(s) for diagram number 11 - FFV1_0( w_fp[11], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 2. * amp_sv[0]; - jamp_sv[9] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 12 OF 36 *** - - // Wavefunction(s) for diagram number 12 - FFV1P0_3( w_fp[11], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[9] ); - - // Amplitude(s) for diagram number 12 - FFV1_0( w_fp[5], w_fp[10], w_fp[9], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= 1. / 6. * amp_sv[0]; - jamp_sv[11] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 13 OF 36 *** - - // Wavefunction(s) for diagram number 13 - // (none) - - // Amplitude(s) for diagram number 13 - FFV1_0( w_fp[12], w_fp[1], w_fp[9], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 2. * amp_sv[0]; - jamp_sv[2] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 14 OF 36 *** - - // Wavefunction(s) for diagram number 14 - FFV1_2( w_fp[11], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] ); - - // Amplitude(s) for diagram number 14 - FFV1_0( w_fp[14], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[9] -= 1. / 6. * amp_sv[0]; - jamp_sv[11] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 15 OF 36 *** - - // Wavefunction(s) for diagram number 15 - // (none) - - // Amplitude(s) for diagram number 15 - VVV1_0( w_fp[4], w_fp[8], w_fp[9], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 16 OF 36 *** - - // Wavefunction(s) for diagram number 16 - FFV1_1( w_fp[1], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[9] ); - FFV1P0_3( w_fp[5], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[14] ); - - // Amplitude(s) for diagram number 16 - FFV1_0( w_fp[3], w_fp[13], w_fp[14], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 2. * amp_sv[0]; - jamp_sv[1] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 17 OF 36 *** - - // Wavefunction(s) for diagram number 17 - // (none) - - // Amplitude(s) for diagram number 17 - FFV1_0( w_fp[6], w_fp[2], w_fp[14], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 17 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= 1. / 6. * amp_sv[0]; - jamp_sv[4] += 1. / 2. 
* amp_sv[0]; - - // *** DIAGRAM 18 OF 36 *** - - // Wavefunction(s) for diagram number 18 - // (none) - - // Amplitude(s) for diagram number 18 - FFV1_0( w_fp[12], w_fp[9], w_fp[7], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 2. * amp_sv[0]; - jamp_sv[5] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 19 OF 36 *** - - // Wavefunction(s) for diagram number 19 - FFV1_1( w_fp[9], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[11] ); - - // Amplitude(s) for diagram number 19 - FFV1_0( w_fp[5], w_fp[11], w_fp[7], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] += 1. / 2. * amp_sv[0]; - jamp_sv[5] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 20 OF 36 *** - - // Wavefunction(s) for diagram number 20 - // (none) - - // Amplitude(s) for diagram number 20 - VVV1_0( w_fp[4], w_fp[7], w_fp[14], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 21 OF 36 *** - - // Wavefunction(s) for diagram number 21 - FFV1_2( w_fp[5], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[14] ); - FFV1P0_3( w_fp[14], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[11] ); - - // Amplitude(s) for diagram number 21 - FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 21 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= 1. / 6. * amp_sv[0]; - jamp_sv[8] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 22 OF 36 *** - - // Wavefunction(s) for diagram number 22 - // (none) - - // Amplitude(s) for diagram number 22 - FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= 1. / 6. * amp_sv[0]; - jamp_sv[3] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 23 OF 36 *** - - // Wavefunction(s) for diagram number 23 - // (none) - - // Amplitude(s) for diagram number 23 - FFV1_0( w_fp[14], w_fp[10], w_fp[7], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] += 1. / 2. * amp_sv[0]; - jamp_sv[10] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 24 OF 36 *** - - // Wavefunction(s) for diagram number 24 - FFV1_2( w_fp[14], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[9] ); - - // Amplitude(s) for diagram number 24 - FFV1_0( w_fp[9], w_fp[1], w_fp[7], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[8] += 1. / 2. * amp_sv[0]; - jamp_sv[10] -= 1. / 6. 
* amp_sv[0]; - - // *** DIAGRAM 25 OF 36 *** - - // Wavefunction(s) for diagram number 25 - // (none) - - // Amplitude(s) for diagram number 25 - VVV1_0( w_fp[4], w_fp[7], w_fp[11], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 25 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 26 OF 36 *** - - // Wavefunction(s) for diagram number 26 - FFV1_1( w_fp[13], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); - - // Amplitude(s) for diagram number 26 - FFV1_0( w_fp[3], w_fp[11], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[8] += 1. / 2. * amp_sv[0]; - jamp_sv[9] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 27 OF 36 *** - - // Wavefunction(s) for diagram number 27 - VVV1P0_1( w_fp[0], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[11] ); - - // Amplitude(s) for diagram number 27 - FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 28 OF 36 *** - - // Wavefunction(s) for diagram number 28 - FFV1_2( w_fp[6], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); - - // Amplitude(s) for diagram number 28 - FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] += 1. / 2. * amp_sv[0]; - jamp_sv[6] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 29 OF 36 *** - - // Wavefunction(s) for diagram number 29 - // (none) - - // Amplitude(s) for diagram number 29 - FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 30 OF 36 *** - - // Wavefunction(s) for diagram number 30 - FFV1_1( w_fp[10], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[6] ); - - // Amplitude(s) for diagram number 30 - FFV1_0( w_fp[5], w_fp[6], w_fp[7], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[10] -= 1. / 6. * amp_sv[0]; - jamp_sv[11] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 31 OF 36 *** - - // Wavefunction(s) for diagram number 31 - VVV1P0_1( w_fp[0], w_fp[7], COUPs[0], 1.0, 0., 0., w_fp[6] ); - - // Amplitude(s) for diagram number 31 - FFV1_0( w_fp[5], w_fp[10], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 32 OF 36 *** - - // Wavefunction(s) for diagram number 32 - FFV1_2( w_fp[12], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 32 - FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 32 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[5] -= 1. / 6. * amp_sv[0]; - jamp_sv[7] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 33 OF 36 *** - - // Wavefunction(s) for diagram number 33 - // (none) - - // Amplitude(s) for diagram number 33 - FFV1_0( w_fp[12], w_fp[1], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 34 OF 36 *** - - // Wavefunction(s) for diagram number 34 - // (none) - - // Amplitude(s) for diagram number 34 - VVVV1_0( w_fp[0], w_fp[4], w_fp[7], w_fp[8], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[4] -= 1. / 2. * amp_sv[0]; - jamp_sv[7] += 1. / 2. * amp_sv[0]; - jamp_sv[8] -= 1. / 2. * amp_sv[0]; - jamp_sv[11] += 1. / 2. * amp_sv[0]; - VVVV3_0( w_fp[0], w_fp[4], w_fp[7], w_fp[8], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[3] -= 1. / 2. * amp_sv[0]; - jamp_sv[7] += 1. / 2. * amp_sv[0]; - jamp_sv[11] += 1. / 2. * amp_sv[0]; - VVVV4_0( w_fp[0], w_fp[4], w_fp[7], w_fp[8], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[3] -= 1. / 2. * amp_sv[0]; - jamp_sv[4] += 1. / 2. * amp_sv[0]; - jamp_sv[8] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 35 OF 36 *** - - // Wavefunction(s) for diagram number 35 - // (none) - - // Amplitude(s) for diagram number 35 - VVV1_0( w_fp[4], w_fp[8], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[3] -= 1. / 2. * amp_sv[0]; - jamp_sv[7] += 1. / 2. * amp_sv[0]; - jamp_sv[11] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 36 OF 36 *** - - // Wavefunction(s) for diagram number 36 - // (none) - - // Amplitude(s) for diagram number 36 - VVV1_0( w_fp[4], w_fp[7], w_fp[11], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[3] -= 1. / 2. * amp_sv[0]; - jamp_sv[4] += 1. / 2. * amp_sv[0]; - jamp_sv[8] += 1. / 2. * amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_2_gux_ttxgux()?) 
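A minimal standalone sketch of the 'color choice' bookkeeping above — one running sum of |jamp|^2 per leading-color flow, later sampled for the event-by-event color selection — assuming scalar std::complex amplitudes rather than the generated cxtype_sv SIMD types (names here are illustrative, not the plugin API):

#include <complex>
// Accumulate the squared modulus of each color-flow partial amplitude;
// std::norm(z) returns |z|^2 and plays the role of the plugin's cxabs2.
inline void accumulateJamp2( const std::complex<double>* jamp, // input: one partial amplitude per color flow
                             double* jamp2,                    // in/out: running sums of |jamp|^2 per color flow
                             int ncolor )
{
  for( int icol = 0; icol < ncolor; icol++ )
    jamp2[icol] += std::norm( jamp[icol] ); // may underflow for tiny amplitudes (cf. #831)
}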
- - // The color denominators (initialize all array elements, with ncolor=12) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }; // 1-D array[12] - - // The color matrix (initialize all array elements, with ncolor=12) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 48, 16, 16, 6, 0, 16, -2, 0, 0, 16, -2, 0 }, - { 16, 48, 6, 16, 16, 0, 0, -2, 16, 0, 0, -2 }, - { 16, 6, 48, 16, -2, 0, 0, 16, -2, 0, 0, 16 }, - { 6, 16, 16, 48, 0, -2, 16, 0, 0, -2, 16, 0 }, - { 0, 16, -2, 0, 48, 16, 16, 6, 6, -2, -2, -6 }, - { 16, 0, 0, -2, 16, 48, 6, 16, -2, 6, -6, -2 }, - { -2, 0, 0, 16, 16, 6, 48, 16, -2, -6, 6, -2 }, - { 0, -2, 16, 0, 6, 16, 16, 48, -6, -2, -2, 6 }, - { 0, 16, -2, 0, 6, -2, -2, -6, 48, 16, 16, 6 }, - { 16, 0, 0, -2, -2, 6, -6, -2, 16, 48, 6, 16 }, - { -2, 0, 0, 16, -2, -6, 6, -2, 16, 6, 48, 16 }, - { 0, -2, 16, 0, -6, -2, -2, 6, 6, 16, 16, 48 } }; // 2-D array[12][12] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! 
skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... 
icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast( iParity == 0 ? 
jamp_sv : &( jamp_sv[ncolor] ) ); #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------ + // --- CHANNELIDS --- + // ------------------ +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - } - // *** STORE THE RESULTS *** - - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); +#endif #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + fptype* numerators = 
nullptr; + fptype* denominators = nullptr; #endif + + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ + + // *** DIAGRAMS 1 TO 36 *** +#ifdef MGONGPUCPP_GPUIMPL + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram8, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram9, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram10, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram11, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram12, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram13, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram14, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram15, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram16, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram17, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram18, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram19, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram20, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram21, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram22, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram23, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram24, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram25, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + 
gpuLaunchKernelStream( diagram26, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram27, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram28, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram29, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram30, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram31, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram32, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram33, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram34, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram35, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram36, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); +#else + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram8( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram9( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram10( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram11( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram12( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram13( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram14( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram15( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram16( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram17( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram18( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram19( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram20( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram21( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram22( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram23( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram24( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram25( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram26( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram27( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram28( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram29( wfs, jamps, channelIds, COUPs, numerators, 
denominators ); + diagram30( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram31( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram32( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram33( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram34( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram35( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram36( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -1113,7 +567,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -1148,6 +606,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -1190,6 +652,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -1292,26 +758,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + 
threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) + __global__ void + add_and_select_hel( int* allselhel, // output: helicity selection[nevt] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* allMEs, // output: allMEs[nevt], sum over good helicities + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // Sum of MEs over all good helicities + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) 
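The helicity selection in add_and_select_hel above and the color selection completed below share one sampling pattern: build a cumulative sum of non-negative weights (MEs per good helicity, or jamp2 per allowed color) and pick the first index whose normalized cumulative value exceeds a random number in [0,1). A host-side sketch of that pattern with illustrative names — the kernels apply it per event, one GPU thread per event:

#include <cassert>
// Return the first index whose cumulative weight bin contains rnd in [0,1).
inline int selectByCumulativeWeight( const double* weights, int n, double rnd )
{
  double total = 0;
  for( int i = 0; i < n; i++ ) total += weights[i]; // e.g. the running ME sums, or targetamp[ncolor-1]
  assert( total > 0 ); // a zero total would mean no selectable helicity or color
  double running = 0;
  for( int i = 0; i < n; i++ )
  {
    running += weights[i];
    if( rnd < running / total ) return i; // same test as 'allrndhel[ievt] < running/total'
  }
  return n - 1; // numerical guard for rnd very close to 1
}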
+ fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb 
individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -1473,22 +1143,16 @@ namespace mg5amcCpu // These variables are not used anywhere else in the code and their scope is limited to this sanity check { // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) - constexpr int nprocesses = 2; + constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY 
CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1500,17 +1164,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1536,93 +1203,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, 
allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) Then, In multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? 
&( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1664,7 +1301,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1687,7 +1324,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1696,25 +1333,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1724,8 +1367,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1741,11 +1386,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1847,14 +1493,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h index 531d6bcd03..3324b8da0f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO 
development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include <vector> @@ -78,17 +79,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 36; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 12; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 15; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 6; // CPPProcess::npar (nexternal was nioparticles) 
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 38; //static const int ncomb = 64; // CPPProcess::ncomb @@ -125,23 +126,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -155,34 +159,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators 
#else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig.f index f13f023e7d..887e00cba5 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig1.f index a59705bfaf..5de24d634f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,14 +140,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) @@ -234,7 +234,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -309,6 +309,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -393,18 +397,18 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) UX2(IVEC)=PDG2PDF(LPP(IB(2)),-2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -528,6 +532,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/color_sum.cc new file mode 100644 index 0000000000..0bd6c47075 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/color_sum.cc @@ -0,0 +1,393 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). 
+// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=12) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }; // 1-D array[12] + + // The color matrix (initialize all array elements, with ncolor=12) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 48, 16, 16, 6, 0, 16, -2, 0, 0, 16, -2, 0 }, + { 16, 48, 6, 16, 16, 0, 0, -2, 16, 0, 0, -2 }, + { 16, 6, 48, 16, -2, 0, 0, 16, -2, 0, 0, 16 }, + { 6, 16, 16, 48, 0, -2, 16, 0, 0, -2, 16, 0 }, + { 0, 16, -2, 0, 48, 16, 16, 6, 6, -2, -2, -6 }, + { 16, 0, 0, -2, 16, 48, 6, 16, -2, 6, -6, -2 }, + { -2, 0, 0, 16, 16, 6, 48, 16, -2, -6, 6, -2 }, + { 0, -2, 16, 0, 6, 16, 16, 48, -6, -2, -2, 6 }, + { 0, 16, -2, 0, 6, -2, -2, -6, 48, 16, 16, 6 }, + { 16, 0, 0, -2, -2, 6, -6, -2, 16, 48, 6, 16 }, + { -2, 0, 0, 16, -2, -6, 6, -2, 16, 6, 48, 16 }, + { 0, -2, 16, 0, -6, -2, -2, 6, 6, 16, 16, 48 } }; // 2-D array[12][12] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template<typename T> + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- +
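(All the color-sum implementations below evaluate the same quadratic form: for each event and helicity, ME += Re( jamp† M̂ jamp ), where M̂[icol][jcol] = colorMatrix[icol][jcol] / colorDenom[icol] is the normalized color matrix defined above. A self-contained scalar illustration on a 2x2 toy corner of the matrix; the production kernels are vectorized and split the sum into real and imaginary parts exactly as sketched here.)

#include <complex>
#include <cstdio>
int main()
{
  const int ncolor = 2; // toy size, using the top-left 2x2 corner of the 12x12 matrix above
  const double colorMatrix[2][2] = { { 48, 16 }, { 16, 48 } };
  const double colorDenom[2] = { 3, 3 };
  const std::complex<double> jamp[2] = { { 0.1, 0.2 }, { -0.3, 0.4 } }; // dummy color-ordered amplitudes
  double me = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    double ztempR = 0, ztempI = 0; // row icol of the normalized matrix times Re(jamp) and Im(jamp)
    for( int jcol = 0; jcol < ncolor; jcol++ )
    {
      const double mhat = colorMatrix[icol][jcol] / colorDenom[icol]; // normalized color matrix
      ztempR += mhat * jamp[jcol].real();
      ztempI += mhat * jamp[jcol].imag();
    }
    me += jamp[icol].real() * ztempR + jamp[icol].imag() * ztempI; // real part of jamp† M̂ jamp
  }
  std::printf( "|M|^2 contribution from this helicity: %f\n", me );
  return 0;
}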
+#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA + iAMB - iBMA + BMB = AMA + BMB (the cross terms cancel since M is also symmetric). + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the upper triangular part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific
helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from
DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use a strided-batched gemm (cublasSgemmStridedBatched or its double-precision/hipBLAS analogues) to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches"
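+ // Shape summary for the two steps above (descriptive note): Step 1 is a gemm Ztemp(ncolor x nevt) = NormColMat(ncolor x ncolor) * Jamps^T(ncolor x nevt), done once for the real and once for the imaginary components;
+ // Step 2 is a batched 1x1 gemm over nevt batches, ME(ievt) += Jamps(ievt,:) dot Ztemp(:,ievt), again once per component.
+ // Together the two steps accumulate ME += Re( jamp^dagger * NormColMat * jamp ) for every event in this helicity slice.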
+ +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/diagram_boilerplate.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/diagrams.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/diagrams.h new file mode 100644 index 0000000000..513029c15f --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/diagrams.h @@ -0,0 +1,1132 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin.
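(Each diagramN kernel in this generated header scatters one diagram's amplitude into the color-ordered jamps as jamp[icol] += c * amp, where the coefficients c, typically ±1/2 or ±1/6 and possibly times i, encode the color flow of that diagram. A self-contained illustration using diagram 1's coefficients, copied verbatim from the kernel below; this is an illustration only, with a dummy amplitude value, whereas the generated kernels use the vectorized fptype/cxtype types instead of std::complex.)

#include <complex>
#include <cstdio>
int main()
{
  using cxtype = std::complex<double>;
  const cxtype I( 0, 1 );
  cxtype jamp[12] = {}; // the 12 color-ordered amplitudes for one event and helicity
  const cxtype amp( 0.3, -0.7 ); // dummy stand-in for diagram 1's amplitude amp_sv[0]
  // The four color flows fed by diagram 1 (same coefficients as in diagram1 below)
  jamp[4] -= 1. / 2. * I * amp;
  jamp[5] += 1. / 6. * I * amp;
  jamp[10] -= 1. / 6. * I * amp;
  jamp[11] += 1. / 2. * I * amp;
  std::printf( "jamp[4] = ( %f, %f )\n", jamp[4].real(), jamp[4].imag() );
  return 0;
}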
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 36 *** + // Wavefunction(s) for diagram number 1 + vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); + oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + vxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); + ixxxxx( momenta, 0., cHel[ihel][5], -1, w_fp[5], 5 ); + VVV1P0_1( w_fp[0], w_fp[4], COUPs[0], 1.0, 0., 0., w_fp[6] ); + FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[7] ); + FFV1_1( w_fp[1], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 1 + FFV1_0( w_fp[5], w_fp[8], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 36 *** + // Wavefunction(s) for diagram number 2 + FFV1_2( w_fp[5], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 2 + FFV1_0( w_fp[8], w_fp[1], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 36 *** + // Wavefunction(s) for diagram number 3 + FFV1P0_3( w_fp[5], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 3 + VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2. 
* amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 36 *** + // Wavefunction(s) for diagram number 4 + FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); + // Amplitude(s) for diagram number 4 + FFV1_0( w_fp[3], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 36 *** + // Wavefunction(s) for diagram number 5 + FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); + // Amplitude(s) for diagram number 5 + FFV1_0( w_fp[9], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 6. 
* cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 6 OF 36 *** + // Wavefunction(s) for diagram number 6 + FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); + FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[6] ); + // Amplitude(s) for diagram number 6 + FFV1_0( w_fp[6], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram7( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 7 OF 36 *** + // Wavefunction(s) for diagram number 7 + FFV1_1( w_fp[1], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[10] ); + FFV1P0_3( w_fp[3], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[11] ); + // Amplitude(s) for diagram number 7 + FFV1_0( w_fp[5], w_fp[10], w_fp[11], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 6. 
* amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram8( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 8 OF 36 *** + // Wavefunction(s) for diagram number 8 + FFV1_2( w_fp[5], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[12] ); + // Amplitude(s) for diagram number 8 + FFV1_0( w_fp[12], w_fp[1], w_fp[11], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram9( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 9 OF 36 *** + // Wavefunction(s) for diagram number 9 + FFV1_1( w_fp[9], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); + // Amplitude(s) for diagram number 9 + FFV1_0( w_fp[3], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram10( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 10 OF 36 *** + // Wavefunction(s) for diagram number 10 + // (none) + // Amplitude(s) for diagram number 10 + VVV1_0( w_fp[4], w_fp[8], w_fp[11], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram11( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 11 OF 36 *** + // Wavefunction(s) for diagram number 11 + FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); + FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); + // Amplitude(s) for diagram number 11 + FFV1_0( w_fp[11], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram12( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 12 OF 36 *** + // Wavefunction(s) for diagram number 12 + FFV1P0_3( w_fp[11], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[9] ); + // Amplitude(s) for diagram number 12 + FFV1_0( w_fp[5], w_fp[10], w_fp[9], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram13( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 13 OF 36 *** + // Wavefunction(s) for diagram number 13 + // (none) + // Amplitude(s) for diagram number 13 + FFV1_0( w_fp[12], w_fp[1], w_fp[9], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram14( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 14 OF 36 *** + // Wavefunction(s) for diagram number 14 + FFV1_2( w_fp[11], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] ); + // Amplitude(s) for diagram number 14 + FFV1_0( w_fp[14], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram15( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 15 OF 36 *** + // Wavefunction(s) for diagram number 15 + // (none) + // Amplitude(s) for diagram number 15 + VVV1_0( w_fp[4], w_fp[8], w_fp[9], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram16( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 16 OF 36 *** + // Wavefunction(s) for diagram number 16 + FFV1_1( w_fp[1], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[9] ); + FFV1P0_3( w_fp[5], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[14] ); + // Amplitude(s) for diagram number 16 + FFV1_0( w_fp[3], w_fp[13], w_fp[14], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram17( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 17 OF 36 *** + // Wavefunction(s) for diagram number 17 + // (none) + // Amplitude(s) for diagram number 17 + FFV1_0( w_fp[6], w_fp[2], w_fp[14], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 17 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram18( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 18 OF 36 *** + // Wavefunction(s) for diagram number 18 + // (none) + // Amplitude(s) for diagram number 18 + FFV1_0( w_fp[12], w_fp[9], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram19( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 19 OF 36 *** + // Wavefunction(s) for diagram number 19 + FFV1_1( w_fp[9], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[11] ); + // Amplitude(s) for diagram number 19 + FFV1_0( w_fp[5], w_fp[11], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 6. 
 + + __global__ void + diagram20( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 20 OF 36 *** + // Wavefunction(s) for diagram number 20 + // (none) + // Amplitude(s) for diagram number 20 + VVV1_0( w_fp[4], w_fp[7], w_fp[14], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram21( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 21 OF 36 *** + // Wavefunction(s) for diagram number 21 + FFV1_2( w_fp[5], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[14] ); + FFV1P0_3( w_fp[14], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[11] ); + // Amplitude(s) for diagram number 21 + FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 21 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram22( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 22 OF 36 *** + // Wavefunction(s) for diagram number 22 + // (none) + // Amplitude(s) for diagram number 22 + FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram23( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 23 OF 36 *** + // Wavefunction(s) for diagram number 23 + // (none) + // Amplitude(s) for diagram number 23 + FFV1_0( w_fp[14], w_fp[10], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= 1. / 6. * amp_sv[0]; + } + + //--------------------------------------------------------------------------
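The J_ACCESS::kernelAccessIcol calls above, like the DeviceAccessJamp2 helper elsewhere in this diff, address the jamps/jamp2 buffers with the colour index as the slow dimension and the event index as the fast one, so that consecutive GPU threads touch consecutive addresses (coalesced access). A self-contained host-side mock of that indexing, with purely illustrative sizes:

#include <cstdio>
#include <vector>
int main()
{
  const int ncolor = 12, nevt = 8;                // illustrative sizes only
  std::vector<double> jamp2( ncolor * nevt, 0. ); // flattened [icol][ievt] buffer
  for( int ievt = 0; ievt < nevt; ievt++ )        // ievt plays the role of blockDim.x * blockIdx.x + threadIdx.x
    for( int icol = 0; icol < ncolor; icol++ )
      jamp2[icol * nevt + ievt] += 1.;            // same address arithmetic as kernelAccessIcol
  printf( "jamp2[icol=0][ievt=0] = %f\n", jamp2[0] );
  return 0;
}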
 + + __global__ void + diagram24( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 24 OF 36 *** + // Wavefunction(s) for diagram number 24 + FFV1_2( w_fp[14], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[9] ); + // Amplitude(s) for diagram number 24 + FFV1_0( w_fp[9], w_fp[1], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram25( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 25 OF 36 *** + // Wavefunction(s) for diagram number 25 + // (none) + // Amplitude(s) for diagram number 25 + VVV1_0( w_fp[4], w_fp[7], w_fp[11], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 25 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram26( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 26 OF 36 *** + // Wavefunction(s) for diagram number 26 + FFV1_1( w_fp[13], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); + // Amplitude(s) for diagram number 26 + FFV1_0( w_fp[3], w_fp[11], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram27( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 27 OF 36 *** + // Wavefunction(s) for diagram number 27 + VVV1P0_1( w_fp[0], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[11] ); + // Amplitude(s) for diagram number 27 + FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram28( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 28 OF 36 *** + // Wavefunction(s) for diagram number 28 + FFV1_2( w_fp[6], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); + // Amplitude(s) for diagram number 28 + FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram29( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 29 OF 36 *** + // Wavefunction(s) for diagram number 29 + // (none) + // Amplitude(s) for diagram number 29 + FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //--------------------------------------------------------------------------
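The per-diagram numerators_sv/denominators_sv updates above accumulate cxabs2 of the amplitude for the selected channel and for all channels respectively. The final step is not shown in this diff, but in the MadEvent single-diagram-enhancement (SDE) scheme such running sums are typically used to reweight the summed matrix element once all diagrams and helicities have been accumulated. A minimal sketch, assuming per-event scalar sums (the function name and signature are illustrative, not from this diff):

// Minimal sketch (assumed, not shown in this diff) of SDE reweighting:
// scale |M|^2 by the fraction contributed by the selected channel's diagram.
inline double applySDE( double me2, double numerator, double denominator, unsigned int channelId )
{
  if( channelId != 0 && denominator != 0. ) return me2 * numerator / denominator;
  return me2; // channelId == 0 disables SDE, as in the comments above
}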
 + + __global__ void + diagram30( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 30 OF 36 *** + // Wavefunction(s) for diagram number 30 + FFV1_1( w_fp[10], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[6] ); + // Amplitude(s) for diagram number 30 + FFV1_0( w_fp[5], w_fp[6], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram31( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 31 OF 36 *** + // Wavefunction(s) for diagram number 31 + VVV1P0_1( w_fp[0], w_fp[7], COUPs[0], 1.0, 0., 0., w_fp[6] ); + // Amplitude(s) for diagram number 31 + FFV1_0( w_fp[5], w_fp[10], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram32( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 32 OF 36 *** + // Wavefunction(s) for diagram number 32 + FFV1_2( w_fp[12], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 32 + FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 32 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram33( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 33 OF 36 *** + // Wavefunction(s) for diagram number 33 + // (none) + // Amplitude(s) for diagram number 33 + FFV1_0( w_fp[12], w_fp[1], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram34( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 34 OF 36 *** + // Wavefunction(s) for diagram number 34 + // (none) + // Amplitude(s) for diagram number 34 + VVVV1_0( w_fp[0], w_fp[4], w_fp[7], w_fp[8], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += 1. / 2. * amp_sv[0]; + VVVV3_0( w_fp[0], w_fp[4], w_fp[7], w_fp[8], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += 1. / 2. * amp_sv[0]; + VVVV4_0( w_fp[0], w_fp[4], w_fp[7], w_fp[8], COUPs[2], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram35( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 35 OF 36 *** + // Wavefunction(s) for diagram number 35 + // (none) + // Amplitude(s) for diagram number 35 + VVV1_0( w_fp[4], w_fp[8], w_fp[6], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram36( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 36 OF 36 *** + // Wavefunction(s) for diagram number 36 + // (none) + // Amplitude(s) for diagram number 36 + VVV1_0( w_fp[4], w_fp[7], w_fp[11], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/driver.f index f7f23196eb..fc9392238e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/matrix1.f index 7cd8b962cc..db9ee54bf0 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -358,7 +358,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -404,7 +404,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(17) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -447,109 +448,44 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /1.600000000000000D+01 - $ ,5.333333333333333D+00,5.333333333333333D+00,2.000000000000000D - $ +00,0.000000000000000D+00,5.333333333333333D+00/ - DATA (CF(I, 1),I= 7, 12) /-6.666666666666666D-01 - $ ,0.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00,-6.666666666666666D-01,0.000000000000000D+00/ + DATA DENOM/3/ + DATA (CF(I),I= 1, 12) /48,32,32,12,0,32,-4,0,0,32,-4,0/ C 1 T(1,2,4) T(5,3,6) - DATA (CF(I, 2),I= 1, 6) /5.333333333333333D+00 - $ ,1.600000000000000D+01,2.000000000000000D+00,5.333333333333333D - $ +00,5.333333333333333D+00,0.000000000000000D+00/ - DATA (CF(I, 2),I= 7, 12) /0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D - $ +00,0.000000000000000D+00,-6.666666666666666D-01/ + DATA (CF(I),I= 13, 23) /48,12,32,32,0,0,-4,32,0,0,-4/ C 1 T(1,2,6) T(5,3,4) - DATA (CF(I, 3),I= 1, 6) /5.333333333333333D+00 - $ ,2.000000000000000D+00,1.600000000000000D+01,5.333333333333333D - $ +00,-6.666666666666666D-01,0.000000000000000D+00/ - DATA (CF(I, 3),I= 7, 12) /0.000000000000000D+00 - $ ,5.333333333333333D+00,-6.666666666666666D-01 - $ ,0.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00/ + DATA (CF(I),I= 24, 33) /48,32,-4,0,0,32,-4,0,0,32/ C 1 T(1,3,4) T(5,2,6) - DATA (CF(I, 4),I= 1, 6) /2.000000000000000D+00 - $ 
,5.333333333333333D+00,5.333333333333333D+00,1.600000000000000D - $ +01,0.000000000000000D+00,-6.666666666666666D-01/ - DATA (CF(I, 4),I= 7, 12) /5.333333333333333D+00 - $ ,0.000000000000000D+00,0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D - $ +00/ + DATA (CF(I),I= 34, 42) /48,0,-4,32,0,0,-4,32,0/ C 1 T(1,3,6) T(5,2,4) - DATA (CF(I, 5),I= 1, 6) /0.000000000000000D+00 - $ ,5.333333333333333D+00,-6.666666666666666D-01 - $ ,0.000000000000000D+00,1.600000000000000D+01,5.333333333333333D - $ +00/ - DATA (CF(I, 5),I= 7, 12) /5.333333333333333D+00 - $ ,2.000000000000000D+00,2.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01, - $ -2.000000000000000D+00/ + DATA (CF(I),I= 43, 50) /48,32,32,12,12,-4,-4,-12/ C 1 T(1,5,2,4) T(3,6) - DATA (CF(I, 6),I= 1, 6) /5.333333333333333D+00 - $ ,0.000000000000000D+00,0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,1.600000000000000D - $ +01/ - DATA (CF(I, 6),I= 7, 12) /2.000000000000000D+00 - $ ,5.333333333333333D+00,-6.666666666666666D-01 - $ ,2.000000000000000D+00,-2.000000000000000D+00, - $ -6.666666666666666D-01/ + DATA (CF(I),I= 51, 57) /48,12,32,-4,12,-12,-4/ C 1 T(1,5,2,6) T(3,4) - DATA (CF(I, 7),I= 1, 6) /-6.666666666666666D-01 - $ ,0.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00,5.333333333333333D+00,2.000000000000000D+00/ - DATA (CF(I, 7),I= 7, 12) /1.600000000000000D+01 - $ ,5.333333333333333D+00,-6.666666666666666D-01, - $ -2.000000000000000D+00,2.000000000000000D+00, - $ -6.666666666666666D-01/ + DATA (CF(I),I= 58, 63) /48,32,-4,-12,12,-4/ C 1 T(1,5,3,4) T(2,6) - DATA (CF(I, 8),I= 1, 6) /0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D - $ +00,2.000000000000000D+00,5.333333333333333D+00/ - DATA (CF(I, 8),I= 7, 12) /5.333333333333333D+00 - $ ,1.600000000000000D+01,-2.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01 - $ ,2.000000000000000D+00/ + DATA (CF(I),I= 64, 68) /48,-12,-4,-4,12/ C 1 T(1,5,3,6) T(2,4) - DATA (CF(I, 9),I= 1, 6) /0.000000000000000D+00 - $ ,5.333333333333333D+00,-6.666666666666666D-01 - $ ,0.000000000000000D+00,2.000000000000000D+00, - $ -6.666666666666666D-01/ - DATA (CF(I, 9),I= 7, 12) /-6.666666666666666D-01, - $ -2.000000000000000D+00,1.600000000000000D+01,5.333333333333333D - $ +00,5.333333333333333D+00,2.000000000000000D+00/ + DATA (CF(I),I= 69, 72) /48,32,32,12/ C 1 T(2,4) T(5,1,3,6) - DATA (CF(I, 10),I= 1, 6) /5.333333333333333D+00 - $ ,0.000000000000000D+00,0.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01 - $ ,2.000000000000000D+00/ - DATA (CF(I, 10),I= 7, 12) /-2.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,1.600000000000000D - $ +01,2.000000000000000D+00,5.333333333333333D+00/ + DATA (CF(I),I= 73, 75) /48,12,32/ C 1 T(2,6) T(5,1,3,4) - DATA (CF(I, 11),I= 1, 6) /-6.666666666666666D-01 - $ ,0.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00,-6.666666666666666D-01,-2.000000000000000D+00/ - DATA (CF(I, 11),I= 7, 12) /2.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,2.000000000000000D - $ +00,1.600000000000000D+01,5.333333333333333D+00/ + DATA (CF(I),I= 76, 77) /48,32/ C 1 T(3,4) T(5,1,2,6) - DATA (CF(I, 12),I= 1, 6) /0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D - $ +00,-2.000000000000000D+00,-6.666666666666666D-01/ - DATA (CF(I, 12),I= 7, 12) /-6.666666666666666D-01 - $ 
,2.000000000000000D+00,2.000000000000000D+00,5.333333333333333D - $ +00,5.333333333333333D+00,1.600000000000000D+01/ + DATA (CF(I),I= 78, 78) /48/ C 1 T(3,6) T(5,1,2,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -758,10 +694,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -770,6 +708,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc index d34888db6a..fcc3d8d8af 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -101,20 +103,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 6; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -173,57 +171,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: 
couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! 
in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using 
CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -231,377 +284,147 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 9; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structures for wavefunctions/amplitudes in all events (no kernel splitting yet)! - //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) 
+ fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g. <<warning #186-D: pointless comparison of unsigned integer with zero>> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = 
E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 7 *** - - // Wavefunction(s) for diagram number 1 - ixxxxx( momenta, 0., cHel[ihel][0], +1, w_fp[0], 0 ); - - ixxxxx( momenta, 0., cHel[ihel][1], +1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - oxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); - - oxxxxx( momenta, 0., cHel[ihel][5], +1, w_fp[5], 5 ); - - FFV1P0_3( w_fp[0], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[6] ); - FFV1P0_3( w_fp[1], w_fp[5], COUPs[1], 1.0, 0., 0., w_fp[7] ); - FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[3], w_fp[8], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 4. * amp_sv[0]; - jamp_sv[1] -= 1. / 12. * amp_sv[0]; - jamp_sv[2] -= 1. / 12. * amp_sv[0]; - jamp_sv[4] += 1. / 36. * amp_sv[0]; - - // *** DIAGRAM 2 OF 7 *** - - // Wavefunction(s) for diagram number 2 - FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= 1. / 12. * amp_sv[0]; - jamp_sv[2] -= 1. / 12. * amp_sv[0]; - jamp_sv[3] += 1. / 4. * amp_sv[0]; - jamp_sv[4] += 1. / 36. * amp_sv[0]; - - // *** DIAGRAM 3 OF 7 *** - - // Wavefunction(s) for diagram number 3 - FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 3 - VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += 1. / 4. 
- - // *** DIAGRAM 4 OF 7 *** - - // Wavefunction(s) for diagram number 4 - FFV1_2( w_fp[1], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[3] ); - - // Amplitude(s) for diagram number 4 - FFV1_0( w_fp[3], w_fp[5], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - jamp_sv[0] += 1. / 4. * amp_sv[0]; - jamp_sv[2] -= 1. / 12. * amp_sv[0]; - jamp_sv[4] += 1. / 36. * amp_sv[0]; - jamp_sv[5] -= 1. / 12. * amp_sv[0]; - - // *** DIAGRAM 5 OF 7 *** - - // Wavefunction(s) for diagram number 5 - FFV1_1( w_fp[5], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[3] ); - // Amplitude(s) for diagram number 5 - FFV1_0( w_fp[1], w_fp[3], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - jamp_sv[2] -= 1. / 12. * amp_sv[0]; - jamp_sv[3] += 1. / 4. * amp_sv[0]; - jamp_sv[4] += 1. / 36. * amp_sv[0]; - jamp_sv[5] -= 1. / 12. * amp_sv[0]; - - // *** DIAGRAM 6 OF 7 *** - - // Wavefunction(s) for diagram number 6 - FFV1_2( w_fp[0], w_fp[7], COUPs[1], 1.0, 0., 0., w_fp[3] ); - - // Amplitude(s) for diagram number 6 - FFV1_0( w_fp[3], w_fp[4], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[1] -= 1. / 12. * amp_sv[0]; - jamp_sv[3] += 1. / 4. * amp_sv[0]; - jamp_sv[4] += 1. / 36. * amp_sv[0]; - jamp_sv[5] -= 1. / 12. * amp_sv[0];
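The jamps pointer above reinterprets an array of complex SIMD values as a flat fptype buffer, which relies on the complex type being laid out as adjacent (real, imag) scalars. A standalone sketch of that layout assumption, using std::complex (which guarantees it) in place of cxtype_sv and an invented array size:

#include <complex>
#include <cstdio>
int main()
{
  constexpr int ncolor = 3; // invented size for illustration
  std::complex<double> jamp[ncolor] = { { 1, 2 }, { 3, 4 }, { 5, 6 } };
  // std::complex guarantees the (re, im) adjacent-scalar layout used here
  double* flat = reinterpret_cast<double*>( jamp ); // [re0, im0, re1, im1, ...]
  for( int i = 0; i < 2 * ncolor; i++ ) printf( "flat[%d]=%f\n", i, flat[i] );
  return 0;
}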
- - // *** DIAGRAM 7 OF 7 *** - // Wavefunction(s) for diagram number 7 - FFV1_2( w_fp[0], w_fp[8], COUPs[1], 1.0, 0., 0., w_fp[3] ); - - // Amplitude(s) for diagram number 7 - FFV1_0( w_fp[3], w_fp[4], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 4. * amp_sv[0]; - jamp_sv[1] -= 1. / 12. * amp_sv[0]; - jamp_sv[4] += 1. / 36. * amp_sv[0]; - jamp_sv[5] -= 1. / 12. * amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_2_uc_ttxuc()?) - - // The color denominators (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6] - - // The color matrix (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 27, 9, 9, 3, 3, 9 }, - { 9, 27, 3, 9, 9, 3 }, - { 9, 3, 27, 9, 9, 3 }, - { 3, 9, 9, 27, 3, 9 }, - { 3, 9, 9, 3, 27, 9 }, - { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix.
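The #475 rewrite referenced in the comments above can be checked numerically: with A = Re(jamp), B = Im(jamp) and a real symmetric color matrix M, the quadratic form reduces to AMA + BMB, and the symmetric triangular loop (diagonal once, off-diagonal doubled, as in cf2) gives the same result. A toy-sized standalone check with an invented 3x3 matrix:

#include <complex>
#include <cstdio>
int main()
{
  constexpr int n = 3; // invented toy size
  const double cf[n][n] = { { 4, 1, 2 }, { 1, 5, 3 }, { 2, 3, 6 } }; // symmetric real toy "color matrix"
  const std::complex<double> jamp[n] = { { 1, 2 }, { 3, -1 }, { 0.5, 0.25 } };
  double me1 = 0, me2 = 0;
  // (a) full quadratic form on A=Re(jamp), B=Im(jamp): AMA + BMB
  for( int i = 0; i < n; i++ )
    for( int j = 0; j < n; j++ )
      me1 += jamp[i].real() * cf[i][j] * jamp[j].real() + jamp[i].imag() * cf[i][j] * jamp[j].imag();
  // (b) triangular form exploiting symmetry: diagonal once, off-diagonal doubled
  for( int i = 0; i < n; i++ )
  {
    double ztR = cf[i][i] * jamp[i].real(), ztI = cf[i][i] * jamp[i].imag();
    for( int j = i + 1; j < n; j++ )
    {
      ztR += 2 * cf[i][j] * jamp[j].real(); // the "2*" folded in like cf2 above
      ztI += 2 * cf[i][j] * jamp[j].imag();
    }
    me2 += jamp[i].real() * ztR + jamp[i].imag() * ztI;
  }
  printf( "full=%f triangular=%f\n", me1, me2 ); // identical up to rounding
  return 0;
}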
- // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] );
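The fpvmerge/fpvsplit0/fpvsplit1 calls above implement the mixed-precision trick of #537: two double SIMD pages are packed into one float vector of twice the width, the color algebra runs once in single precision, and the two halves are routed back to their pages. A scalar-loop analogue of that packing, with invented sizes (the real code uses compiler vector extensions):

#include <cstdio>
int main()
{
  constexpr int neppV = 4; // invented SIMD page width
  double page0[neppV] = { 1.5, 2.5, 3.5, 4.5 };
  double page1[neppV] = { 5.5, 6.5, 7.5, 8.5 };
  float merged[2 * neppV]; // "fpvmerge": one float vector covering both pages
  for( int i = 0; i < neppV; i++ )
  {
    merged[i] = (float)page0[i];
    merged[neppV + i] = (float)page1[i];
  }
  // ... the single-precision color algebra would run on 'merged' here ...
  double out0[neppV], out1[neppV]; // "fpvsplit0/1": route results back per page
  for( int i = 0; i < neppV; i++ )
  {
    out0[i] = merged[i];
    out1[i] = merged[neppV + i];
  }
  printf( "out0[0]=%f out1[0]=%f\n", out0[0], out1[0] );
  return 0;
}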
- ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 7 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif -#endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -688,7 +511,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif }
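fpeEnable() is now compiled only under MGONGPUCPP_DEBUG. On Linux/glibc, SIGFPE trapping of this kind is typically implemented via feenableexcept, a GNU extension; the following is a sketch of that mechanism under this assumption, not the plugin's actual implementation:

#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <fenv.h> // feenableexcept is a glibc extension (not portable)
#include <cstdio>
int main()
{
  // Raise SIGFPE on division by zero, invalid operations and overflow
  feenableexcept( FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW );
  printf( "FPE traps enabled\n" );
  return 0;
}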
//-------------------------------------------------------------------------- @@ -723,6 +550,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -765,6 +596,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -867,26 +702,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -894,25 +729,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt < maxtry0) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt < maxtry0) + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( 
"select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -1048,22 +1087,16 @@ namespace mg5amcCpu // These variables are not used anywhere else in the code and their scope is limited to this sanity check { // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) - constexpr int nprocesses = 2; + constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1075,17 +1108,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 
0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1111,93 +1147,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) 
Then, In multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? 
ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1239,7 +1245,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1262,7 +1268,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1271,25 +1277,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1299,8 +1311,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1316,11 +1330,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1422,14 +1437,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h index 08510dfc85..c26e439a36 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO 
development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include <vector> @@ -80,17 +81,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 7; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 6; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 9; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z)
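The new header constants make the temporary-buffer sizes quoted in the signatures below reproducible: allJamps holds ncolor*2 fptypes per event and allWfs holds nwf*nw6*2 fptypes per event, replicated per good helicity in the ghel super-buffers. A standalone sketch of the arithmetic, where nw6=6, nx2=2, nGoodHel and nevt are assumed values for illustration only:

#include <cstdio>
int main()
{
  constexpr int ncolor = 6, nwf = 9; // as in CPPProcess.h above
  constexpr int nw6 = 6, nx2 = 2;    // assumed: wavefunction components and (re,im) parts
  const int nGoodHel = 16, nevt = 1024; // assumed runtime values
  const size_t jampsPerHel = (size_t)ncolor * nx2 * nevt;  // one allJamps slice: jamp[ncolor*2*nevt]
  const size_t wfsPerHel = (size_t)nwf * nw6 * nx2 * nevt; // one allWfs slice: wf[nwf*nw6*2*nevt]
  printf( "ghelAllJamps fptypes: %zu\n", nGoodHel * jampsPerHel );
  printf( "ghelAllWfs   fptypes: %zu\n", nGoodHel * wfsPerHel );
  return 0;
}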
// Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 6; // CPPProcess::npar (nexternal was nioparticles) //static const int nwavefuncs = 6; // (?!?! this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 7; //static const int ncomb = 64; // CPPProcess::ncomb @@ -127,23 +128,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -157,34 +161,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig.f index bb9d2c55fb..6aea556b29 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig1.f index b76b7c4456..d676a45fad 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -142,7 +142,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF D1=PDG2PDF(LPP(IB(1)),1, IB(1),XBK(IB(1)), QSCALE) U1=PDG2PDF(LPP(IB(1)),2, IB(1),XBK(IB(1)), QSCALE) @@ -151,7 +151,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF D2=PDG2PDF(LPP(IB(2)),1, IB(2),XBK(IB(2)), QSCALE) S2=PDG2PDF(LPP(IB(2)),3, IB(2),XBK(IB(2)), QSCALE) @@ -243,7 +243,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -321,6 +321,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -406,20 +410,20 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) D1(IVEC)=PDG2PDF(LPP(IB(1)),1, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) U1(IVEC)=PDG2PDF(LPP(IB(1)),2, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) C1(IVEC)=PDG2PDF(LPP(IB(1)),4, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) D2(IVEC)=PDG2PDF(LPP(IB(2)),1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) S2(IVEC)=PDG2PDF(LPP(IB(2)),3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) C2(IVEC)=PDG2PDF(LPP(IB(2)),4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -555,6 +559,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. 
IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/color_sum.cc new file mode 100644 index 0000000000..087618686d --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/color_sum.cc @@ -0,0 +1,387 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6] + + // The color matrix (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 27, 9, 9, 3, 3, 9 }, + { 9, 27, 3, 9, 9, 3 }, + { 9, 3, 27, 9, 9, 3 }, + { 3, 9, 9, 27, 3, 9 }, + { 3, 9, 9, 3, 27, 9 }, + { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template<typename T> + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + 
// Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + 
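A note for readers of this patch: the triangular trick in color_sum_cpu above can be checked numerically in a few lines of standalone C++. The sketch below is illustrative only (it uses plain std::complex<double> scalars instead of the plugin's fptype2_sv vector types; the ncolor=6 matrix and unit denominators are the ones quoted in this file). It verifies that the upper-triangle loop over a pre-doubled, pre-normalized real symmetric color matrix reproduces the full quadratic form conj(J)*C*J:

// Standalone check (not part of the patch): triangular vs naive color sum.
#include <cassert>
#include <cmath>
#include <complex>
#include <cstdio>
int main()
{
  constexpr int ncolor = 6;
  constexpr double denom[ncolor] = { 1, 1, 1, 1, 1, 1 };
  constexpr double cf[ncolor][ncolor] = {
    { 27, 9, 9, 3, 3, 9 }, { 9, 27, 3, 9, 9, 3 }, { 9, 3, 27, 9, 9, 3 },
    { 3, 9, 9, 27, 3, 9 }, { 3, 9, 9, 3, 27, 9 }, { 9, 3, 3, 9, 9, 27 } };
  const std::complex<double> jamp[ncolor] = { { 0.3, -1.1 }, { 0.7, 0.2 }, { -0.5, 0.9 }, { 1.2, 0.4 }, { -0.8, -0.6 }, { 0.1, 1.3 } };
  // Naive: ME = sum_ij conj(jamp[i]) * (cf[i][j]/denom[i]) * jamp[j] (real because cf is real symmetric)
  std::complex<double> naive = 0;
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ )
      naive += std::conj( jamp[i] ) * ( cf[i][j] / denom[i] ) * jamp[j];
  // Triangular: diagonal as-is, off-diagonal pre-doubled, only real accumulators needed
  double tri = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztempR = cf[i][i] / denom[i] * jamp[i].real();
    double ztempI = cf[i][i] / denom[i] * jamp[i].imag();
    for( int j = i + 1; j < ncolor; j++ )
    {
      ztempR += 2 * cf[i][j] / denom[i] * jamp[j].real();
      ztempI += 2 * cf[i][j] / denom[i] * jamp[j].imag();
    }
    tri += jamp[i].real() * ztempR + jamp[i].imag() * ztempI;
  }
  assert( std::abs( naive.imag() ) < 1e-12 );      // imaginary parts cancel pairwise (AMB = BMA)
  assert( std::abs( naive.real() - tri ) < 1e-9 ); // AMA + BMB equals the full quadratic form
  std::printf( "|M|^2 contribution = %f (both methods agree)\n", tri );
  return 0;
}

The cancellation of the imaginary parts is exactly why the production code only needs the two real accumulators ztempR_sv/ztempI_sv per color.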
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  __global__ void
+  color_sum_kernel( fptype* allMEs,          // output: allMEs[nevt], add |M|^2 for this specific helicity
+                    const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity
+  {
+    using J_ACCESS = DeviceAccessJamp;
+    fptype jampR[ncolor];
+    fptype jampI[ncolor];
+    for( int icol = 0; icol < ncolor; icol++ )
+    {
+      cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol );
+      jampR[icol] = jamp.real();
+      jampI[icol] = jamp.imag();
+    }
+    // Loop over icol
+    fptype deltaMEs = { 0 };
+    for( int icol = 0; icol < ncolor; icol++ )
+    {
+      fptype2 ztempR = { 0 };
+      fptype2 ztempI = { 0 };
+      // Loop over jcol
+      for( int jcol = 0; jcol < ncolor; jcol++ )
+      {
+        fptype2 jampRj = jampR[jcol];
+        fptype2 jampIj = jampI[jcol];
+        ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix
+        ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix
+      }
+      deltaMEs += ztempR * jampR[icol];
+      deltaMEs += ztempI * jampI[icol];
+    }
+    // *** STORE THE RESULTS ***
+    using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events
+    // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
+    E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+#ifndef MGONGPU_HAS_NO_BLAS
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+  __global__ void
+  convertD2F_Jamps( fptype2* allJampsFpt2,   // output: jamp[ncolor*2*nevt] for one specific helicity
+                    const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity
+  {
+    const int nevt = gridDim.x * blockDim.x;
+    const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+    // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied!
+    // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here
+    for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ )
+      for( int icol = 0; icol < ncolor; icol++ )
+        allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding
+  }
+#endif
+#endif
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+#ifndef MGONGPU_HAS_NO_BLAS
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+  __global__ void
+  convertF2D_MEs( fptype* allMEs,             // output: allMEs[nevt] for one specific helicity
+                  const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity
+  {
+    const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+    allMEs[ievt] = allMEsFpt2[ievt];
+  }
+#endif
+#endif
+#endif
+
+  //--------------------------------------------------------------------------
+
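An aside on the BLAS strategy: the color_sum_blas routine that follows replaces the per-thread color loop with two BLAS calls, a gemm that forms Ztemp = C * J for all events at once, then a strided-batched gemm that reduces each event's column to the dot product J . Ztemp. The standalone sketch below emulates the same two-step algebra with plain C++ loops (illustrative only; tiny hypothetical dimensions, no cuBLAS, one Re or Im component):

// Plain-loop emulation (not part of the patch) of the two-step BLAS color sum:
// Step 1: Ztemp[icol][ievt] = sum_jcol C[icol][jcol] * J[jcol][ievt]  (one gemm per Re/Im part)
// Step 2: ME[ievt] += sum_icol J[icol][ievt] * Ztemp[icol][ievt]      (batched 1x1 "dot" gemms)
#include <cstdio>
#include <vector>
int main()
{
  const int ncolor = 2, nevt = 3;                      // tiny hypothetical dimensions
  const double C[2][2] = { { 3, 1 }, { 1, 3 } };       // real symmetric "color matrix"
  const double J[2][3] = { { 1, 2, 3 }, { 4, 5, 6 } }; // one jamp component per color and event
  std::vector<double> Ztemp( ncolor * nevt, 0 ), ME( nevt, 0 );
  for( int i = 0; i < ncolor; i++ ) // Step 1: gemm
    for( int e = 0; e < nevt; e++ )
      for( int j = 0; j < ncolor; j++ )
        Ztemp[i * nevt + e] += C[i][j] * J[j][e];
  for( int e = 0; e < nevt; e++ )   // Step 2: batched dot products
    for( int i = 0; i < ncolor; i++ )
      ME[e] += J[i][e] * Ztemp[i * nevt + e];
  for( int e = 0; e < nevt; e++ ) std::printf( "ME[%d] = %g\n", e, ME[e] );
  return 0;
}

In the real routine this is done once for the real parts and once for the imaginary parts, and the batched gemm uses beta=1 so each call accumulates into the running per-event |M|^2.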
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#ifndef MGONGPU_HAS_NO_BLAS
+  void
+  color_sum_blas( fptype* allMEs,         // output: allMEs[nevt], add |M|^2 for this specific helicity
+                  const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity
+                  fptype2* allBlasTmp,    // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+                  gpuStream_t stream,     // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m)
+#else
+                  gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m)
+#endif
+                  gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+                  const int gpublocks,    // input: cuda gpublocks
+                  const int gputhreads )  // input: cuda gputhreads
+  {
+    const int nevt = gpublocks * gputhreads;
+
+    // Get the address associated with the normalized color matrix in device memory
+    static fptype2* devNormColMat = nullptr;
+    if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 );
+
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+    // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity
+    fptype2* allZtempBoth = allBlasTmp;                                  // start of first fptype2[ncolor*2*nevt] buffer
+    fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt;   // start of second fptype2[ncolor*2*nevt] buffer
+    fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer
+    // Convert jamps from double to float
+    gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps );
+    // Real and imaginary components
+    const fptype2* allJampsReal = allJampsFpt2;
+    const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt;
+#else
+    static_assert( std::is_same<fptype, fptype2>::value ); // sanity check
+    // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer
+    fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer
+    fptype2* allMEsFpt2 = allMEs;
+    // Real and imaginary components
+    const fptype2* allJampsReal = allJamps;                 // this is not a cast (the two types are identical)
+    const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical)
+#endif
+    // Real and imaginary components
+    fptype2* allZtempReal = allZtempBoth;
+    fptype2* allZtempImag = allZtempBoth + ncolor * nevt;
+
+    // Note, new striding for cuBLAS from DeviceAccessJamp:
+    // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1"
+    // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1"
+
+    // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag
+    // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp
+    fptype2 alpha1 = 1;
+    fptype2 beta1 = 0;
+    const int ncolorM = ncolor;
+    const int nevtN = nevt;
+    const int ncolorK = ncolor;
+    checkGpuBlas( gpuBlasTgemm( *pBlasHandle,
+                                GPUBLAS_OP_N, // do not transpose ColMat
+                                GPUBLAS_OP_T, // transpose JampsV (new1)
+                                ncolorM, nevtN, ncolorK,
+                                &alpha1,
+                                devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK
+                                allJampsReal, nevtN,    // JampsV is nevtN x ncolorK (new1)
+                                &beta1,
+                                allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN
+    checkGpuBlas( gpuBlasTgemm( *pBlasHandle,
+                                GPUBLAS_OP_N, // do not transpose ColMat
+                                GPUBLAS_OP_T, // transpose JampsV (new1)
+                                ncolorM, nevtN, ncolorK,
+                                &alpha1,
+                                devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK
+                                allJampsImag, nevtN,    // JampsV is nevtN x ncolorK (new1)
+                                &beta1,
+                                allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN
+
+    // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt]
+    // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME
+    // Use cublasSgemmStridedBatched to perform these batched dot products in one call
+    fptype2 alpha2 = 1;
+    fptype2 beta2 = 1;
+    checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle,
+                                              GPUBLAS_OP_N, // do not transpose JampsV (new1)
+                                              GPUBLAS_OP_N, // do not transpose Tmp
+                                              1, 1, ncolor, // result is 1x1 (dot product)
+                                              &alpha2,
+                                              allJampsReal, nevt, 1,        // allJamps is nevt x ncolor, stride 1 for each ievt column (new1)
+                                              allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column
+                                              &beta2,
+                                              allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt)
+                                              nevt ) );         // there are nevt "batches"
+    checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle,
+                                              GPUBLAS_OP_N, // do not transpose JampsV (new1)
+                                              GPUBLAS_OP_N, // do not transpose Tmp
+                                              1, 1, ncolor, // result is 1x1 (dot product)
+                                              &alpha2,
+                                              allJampsImag, nevt, 1,        // allJamps is nevt x ncolor, stride 1 for each ievt column (new1)
+                                              allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column
+                                              &beta2,
+                                              allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt)
+                                              nevt ) );         // there are nevt "batches"
+
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+    // Convert MEs from float to double
+    gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 );
+#endif
+  }
+#endif /* clang-format on */
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  void
+  color_sum_gpu( fptype* allMEs,               // output: allMEs[nevt], add |M|^2 for this specific helicity
+                 const fptype* allJamps,       // input: jamp[ncolor*2*nevt] for one specific helicity
+                 fptype2* allBlasTmp,          // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity
+                 gpuStream_t stream,           // input: cuda stream (nullptr indicates the default stream)
+                 gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+                 const int gpublocks,          // input: cuda gpublocks
+                 const int gputhreads )        // input: cuda gputhreads
+  {
+#ifdef MGONGPU_HAS_NO_BLAS
+    assert( allBlasTmp == nullptr );  // sanity check for HASBLAS=hasNoBlas
+    assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas
+#endif
+    if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
+    {
+      assert( allBlasTmp == nullptr );
+      gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps );
+    }
+#ifndef MGONGPU_HAS_NO_BLAS
+    else
+    {
+      assert( allBlasTmp != nullptr );
+      color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads );
+    }
+#endif
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+} // end namespace
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/color_sum.h
new file mode 120000
index 0000000000..24b0157011
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/color_sum.h
@@ -0,0 +1 @@
+../color_sum.h
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/diagram_boilerplate.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/diagram_boilerplate.h
new file mode 120000
index 0000000000..e657b15c20
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/diagram_boilerplate.h
@@ -0,0 +1 @@
+../diagram_boilerplate.h
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/diagrams.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/diagrams.h
new file mode 100644
index 0000000000..9f93aa2532
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/diagrams.h
@@ -0,0 +1,247 @@
+// Copyright (C) 2020-2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin.
+
+/* clang-format off */
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators,           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+            const fptype* momenta,          // input: momenta[npar*4*nevtORneppV]
+            const int ihel )                // input: helicity (0 to ncomb)
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+#ifdef MGONGPUCPP_GPUIMPL
+    using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events
+#else
+    using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events
+#endif
+    // *** DIAGRAM 1 OF 7 ***
+    // Wavefunction(s) for diagram number 1
+    ixxxxx( momenta, 0., cHel[ihel][0], +1, w_fp[0], 0 );
+    ixxxxx( momenta, 0., cHel[ihel][1], +1, w_fp[1], 1 );
+    oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 );
+    ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 );
+    oxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 );
+    oxxxxx( momenta, 0., cHel[ihel][5], +1, w_fp[5], 5 );
+    FFV1P0_3( w_fp[0], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[6] );
+    FFV1P0_3( w_fp[1], w_fp[5], COUPs[1], 1.0, 0., 0., w_fp[7] );
+    FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] );
+    // Amplitude(s) for diagram number 1
+    FFV1_0( w_fp[3], w_fp[8], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 4. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 12. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 12. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 36. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram2( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 2 OF 7 ***
+    // Wavefunction(s) for diagram number 2
+    FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] );
+    // Amplitude(s) for diagram number 2
+    FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 12. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 12. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 4. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 36. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram3( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 3 OF 7 ***
+    // Wavefunction(s) for diagram number 3
+    FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[8] );
+    // Amplitude(s) for diagram number 3
+    VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 4. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram4( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 4 OF 7 ***
+    // Wavefunction(s) for diagram number 4
+    FFV1_2( w_fp[1], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[3] );
+    // Amplitude(s) for diagram number 4
+    FFV1_0( w_fp[3], w_fp[5], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 4. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 12. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 36. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 12. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram5( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 5 OF 7 ***
+    // Wavefunction(s) for diagram number 5
+    FFV1_1( w_fp[5], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[3] );
+    // Amplitude(s) for diagram number 5
+    FFV1_0( w_fp[1], w_fp[3], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 12. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 4. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 36. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 12. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram6( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 6 OF 7 ***
+    // Wavefunction(s) for diagram number 6
+    FFV1_2( w_fp[0], w_fp[7], COUPs[1], 1.0, 0., 0., w_fp[3] );
+    // Amplitude(s) for diagram number 6
+    FFV1_0( w_fp[3], w_fp[4], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 12. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 4. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 36. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 12. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram7( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 7 OF 7 ***
+    // Wavefunction(s) for diagram number 7
+    FFV1_2( w_fp[0], w_fp[8], COUPs[1], 1.0, 0., 0., w_fp[3] );
+    // Amplitude(s) for diagram number 7
+    FFV1_0( w_fp[3], w_fp[4], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 4. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 12. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 36. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 12. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+/* clang-format on */
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/driver.f
index f7f23196eb..fc9392238e 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/driver.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/driver.f
@@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened)
       fopened=.false.
       tempname=filename
       fine=index(tempname,' ')
-c     fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)"
       if(fine.eq.0) fine=len(tempname)
       open(unit=lun,file=tempname,status='old',ERR=20)
       fopened=.true.
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/matrix1.f
index bfe665d186..67afdb3cae 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/matrix1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/matrix1.f
@@ -1,7 +1,7 @@
       SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
     $ ICOL)
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+C     Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
@@ -360,7 +360,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
       REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+C     Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
@@ -408,7 +408,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
 C
       INTEGER I,J,M,N
       COMPLEX*16 ZTEMP, TMP_JAMP(8)
-      REAL*8 CF(NCOLOR,NCOLOR)
+      INTEGER CF(NCOLOR*(NCOLOR+1)/2)
+      INTEGER DENOM, CF_INDEX
       COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO)
       COMPLEX*16 W(6,NWAVEFUNCS)
C     Needed for v4 models
@@ -451,39 +452,32 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
 C
 C     COLOR DATA
 C
-      DATA (CF(I, 1),I= 1, 6) /2.700000000000000D+01
-     $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D
-     $ +00,3.000000000000000D+00,9.000000000000000D+00/
+      DATA DENOM/1/
+      DATA (CF(I),I= 1, 6) /27,18,18,6,6,18/
 C     1 T(3,1) T(5,2) T(6,4)
-      DATA (CF(I, 2),I= 1, 6) /9.000000000000000D+00
-     $ ,2.700000000000000D+01,3.000000000000000D+00,9.000000000000000D
-     $ +00,9.000000000000000D+00,3.000000000000000D+00/
+      DATA (CF(I),I= 7, 11) /27,6,18,18,6/
 C     1 T(3,1) T(5,4) T(6,2)
-      DATA (CF(I, 3),I= 1, 6) /9.000000000000000D+00
-     $ ,3.000000000000000D+00,2.700000000000000D+01,9.000000000000000D
-     $ +00,9.000000000000000D+00,3.000000000000000D+00/
+      DATA (CF(I),I= 12, 15) /27,18,18,6/
 C     1 T(3,2) T(5,1) T(6,4)
-      DATA (CF(I, 4),I= 1, 6) /3.000000000000000D+00
-     $ ,9.000000000000000D+00,9.000000000000000D+00,2.700000000000000D
-     $ +01,3.000000000000000D+00,9.000000000000000D+00/
+      DATA (CF(I),I= 16, 18) /27,6,18/
 C     1 T(3,2) T(5,4) T(6,1)
-      DATA (CF(I, 5),I= 1, 6) /3.000000000000000D+00
-     $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D
-     $ +00,2.700000000000000D+01,9.000000000000000D+00/
+      DATA (CF(I),I= 19, 20) /27,18/
 C     1 T(3,4) T(5,1) T(6,2)
-      DATA (CF(I, 6),I= 1, 6) /9.000000000000000D+00
-     $ ,3.000000000000000D+00,3.000000000000000D+00,9.000000000000000D
-     $ +00,9.000000000000000D+00,2.700000000000000D+01/
+      DATA (CF(I),I= 21, 21) /27/
 C     1 T(3,4) T(5,2) T(6,1)
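A remark on the new Fortran color data above: the old NCOLOR x NCOLOR real matrix is replaced by a packed row-major upper triangle of integers, with diagonal entries kept as-is, off-diagonal entries pre-doubled (row 1 of the old matrix, 27 9 9 3 3 9, becomes 27 18 18 6 6 18), and a common DENOM factored out and divided once at the end. The short C++ sketch below (illustrative only, with a hypothetical 3x3 symmetric matrix) mirrors the CF_INDEX walk used by the DO J = I, NCOLOR loop in the next hunk and checks it against the full quadratic form:

// Packed upper-triangle color sum (not part of the patch): mirrors the Fortran CF_INDEX loop.
#include <cassert>
int main()
{
  const int ncolor = 3; // hypothetical small example
  const double cf[3][3] = { { 4, 1, 2 }, { 1, 4, 3 }, { 2, 3, 4 } }; // symmetric
  const double jamp[3] = { 0.5, -1.0, 2.0 };                         // real jamps for simplicity
  // Pack: diagonal as-is, off-diagonal doubled, row-major upper triangle (as in the DATA statements)
  double packed[6];
  int k = 0;
  for( int i = 0; i < ncolor; i++ )
    for( int j = i; j < ncolor; j++ )
      packed[k++] = ( i == j ? cf[i][j] : 2 * cf[i][j] );
  // Triangular sum with a running index, as in the new Fortran loop (DENOM=1 here; the
  // real code divides MATRIX1 by DENOM once after the loops)
  double tri = 0;
  k = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztemp = 0;
    for( int j = i; j < ncolor; j++ ) ztemp += packed[k++] * jamp[j];
    tri += ztemp * jamp[i];
  }
  // Reference: full symmetric quadratic form
  double full = 0;
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ ) full += jamp[i] * cf[i][j] * jamp[j];
  assert( tri == full ); // exact here: all values are exactly representable in binary
  return 0;
}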
C     ----------
C     BEGIN CODE
C     ----------
       IF (FIRST) THEN
         FIRST=.FALSE.
-        IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO
-    $    *SMALL_WIDTH_TREATMENT)), ZERO)
-        IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT
-    $    *SMALL_WIDTH_TREATMENT)), MDL_WT)
+        FK_ZERO = 0D0
+        IF(MDL_WT.NE.0D0) THEN
+          FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT
+    $      *SMALL_WIDTH_TREATMENT)), MDL_WT)
+        ELSE
+          FK_MDL_WT = 0D0
+        ENDIF
+
         IF(INIT_MODE) THEN
           ZEROAMP_1(:,:) = .TRUE.
@@ -553,10 +547,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
       MATRIX1 = 0.D0
       DO M = 1, NAMPSO
+        CF_INDEX = 0
         DO I = 1, NCOLOR
           ZTEMP = (0.D0,0.D0)
-          DO J = 1, NCOLOR
-            ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M)
+          DO J = I, NCOLOR
+            CF_INDEX = CF_INDEX + 1
+            ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M)
           ENDDO
           DO N = 1, NAMPSO
@@ -565,6 +561,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
         ENDDO
       ENDDO
+      MATRIX1 = MATRIX1/DENOM
       IF(SDE_STRAT.EQ.1)THEN
         AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1))
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc
index 66e4b80f71..ee32f26811 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc
@@ -1,13 +1,13 @@
 // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors.
 // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend.
 //==========================================================================
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
@@ -16,6 +16,7 @@
 #include "mgOnGpuConfig.h"
 
+#include "GpuRuntime.h"
 #include "HelAmps_sm.h"
 #include "MemoryAccessAmplitudes.h"
 #include "MemoryAccessChannelIds.h"
@@ -25,6 +26,7 @@
 #include "MemoryAccessMatrixElements.h"
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
+#include "color_sum.h"
 
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -107,20 +109,16 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
-  constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors)
-  constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu-
-  constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar)
-
-  // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)]
-  //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z)
+  constexpr int nw6 = CPPProcess::nw6;       // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors)
+  constexpr int npar = CPPProcess::npar;     // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu-
+  constexpr int ncomb = CPPProcess::ncomb;   // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar)
+  constexpr int nwf = CPPProcess::nwf;       // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z)
+  constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors
 
   using Parameters_sm_dependentCouplings::ndcoup;   // #couplings that vary event by event (depend on running alphas QCD)
   using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD)
 
-  // The number of colors
-  constexpr int ncolor = 6;
-
-  // The number of SIMD vectors of events processed by calculate_wavefunction
+  // The number of SIMD vectors of events processed by calculate_jamps
 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
   constexpr int nParity = 2;
 #else
@@ -179,57 +177,112 @@ namespace mg5amcCpu
   // Helicity combinations (and filtering of "good" helicity combinations)
 #ifdef MGONGPUCPP_GPUIMPL
   __device__ __constant__ short cHel[ncomb][npar];
-  __device__ __constant__ int cNGoodHel;
-  __device__ __constant__ int cGoodHel[ncomb];
+  __device__ __constant__ int dcNGoodHel;
+  __device__ __constant__ int dcGoodHel[ncomb];
 #else
   static short cHel[ncomb][npar];
+#endif
   static int cNGoodHel;
   static int cGoodHel[ncomb];
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  class DeviceAccessJamp2
+  {
+  public:
+    static __device__ inline fptype&
+    kernelAccessIcol( fptype* buffer, const int icol )
+    {
+      const int nevt = gridDim.x * blockDim.x;
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      return buffer[icol * nevt + ievt];
+    }
+    static __device__ inline const fptype&
+    kernelAccessIcolConst( const fptype* buffer, const int icol )
+    {
+      const int nevt = gridDim.x * blockDim.x;
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      return buffer[icol * nevt + ievt];
+    }
+  };
 #endif
 
   //--------------------------------------------------------------------------
 
-  // Evaluate |M|^2 for each subprocess
-  // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s)
-  // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities)
-  // In CUDA, this device function computes the ME for a single event
-  // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2)
-  // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 ***
-  __device__ INLINE void /* clang-format off */
-  calculate_wavefunctions( int ihel,
-                           const fptype* allmomenta,     // input: momenta[nevt*npar*4]
-                           const fptype* allcouplings,   // input: couplings[nevt*ndcoup*2]
-                           fptype* allMEs,               // output: allMEs[nevt], |M|^2 running_sum_over_helicities
+#ifdef MGONGPUCPP_GPUIMPL
+  __device__ INLINE unsigned int
+  gpu_channelId( const unsigned int* allChannelIds )
+  {
+    unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                           const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector
-                           fptype* allNumerators,        // output: multichannel numerators[nevt], running_sum_over_helicities
-                           fptype* allDenominators,      // output: multichannel denominators[nevt], running_sum_over_helicities
+    using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the current event (CUDA)
+    if( allChannelIds != nullptr )
+    {
+      const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds)
+      const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+      // NB: channelIds_sv is a scalar in CUDA
+      channelId = channelIds_sv;
+      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+    }
 #endif
-                           fptype_sv* jamp2_sv           // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled)
-#ifndef MGONGPUCPP_GPUIMPL
-                           , const int ievt00            // input: first event number in current C++ event page (for CUDA, ievt depends on threadid)
+    return channelId;
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+#include "diagrams.h"
+
+  //--------------------------------------------------------------------------
+
+  // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams
+  // Also compute running sums over helicities adding jamp2, numerator, denominator
+  // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel)
+  // In CUDA, this function processes a single event
+  // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function)
+  // ** NB2: NEW Nov2024! In CUDA this now takes a channelId array as input (it used to take a scalar channelId as input)
+  // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2)
+  // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+  INLINE void
+  calculate_jamps( int ihel,
+                   const fptype* allmomenta,          // input: momenta[nevt*npar*4]
+                   const fptype* allcouplings,        // input: couplings[nevt*ndcoup*2]
+                   fptype* allJamps,                  // output: jamp[ncolor*2*nevt] for this helicity
+                   fptype* allWfs,                    // output: wf[nwf*nw6*2*nevt]
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+                   const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911)
+                   fptype* allNumerators,             // input/output: multichannel numerators[nevt], add helicity ihel
+                   fptype* allDenominators,           // input/output: multichannel denominators[nevt], add helicity ihel
+#endif
+                   gpuStream_t gpustream,             // input: cuda stream for this helicity
+                   const int gpublocks,               // input: cuda gpublocks
+                   const int gputhreads )             // input: cuda gputhreads
+#else
+  INLINE void
+  calculate_jamps( int ihel,
+                   const fptype* allmomenta,     // input: momenta[nevt*npar*4]
+                   const fptype* allcouplings,   // input: couplings[nevt*ndcoup*2]
+                   cxtype_sv* jamp_sv,           // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+                   const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00
+                   fptype* allNumerators,        // input/output: multichannel numerators[nevt], add helicity ihel
+                   fptype* allDenominators,      // input/output: multichannel denominators[nevt], add helicity ihel
+#endif
+                   const int ievt00 )            // input: first event number in current C++ event page (for CUDA, ievt depends on threadid)
 #endif
-  ) //ALWAYS_INLINE // attributes are not permitted in a function definition
   {
 #ifdef MGONGPUCPP_GPUIMPL
-    using namespace mg5amcGpu;
-    using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events
-    using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events
-    using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event
-    using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event
     using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events
-    using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events
    using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events
 #endif
 #else
-    using namespace mg5amcCpu;
     using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events
-    using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events
-    using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event
-    using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event
     using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events
     using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
@@ -237,377 +290,147 @@ namespace mg5amcCpu
     using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events
 #endif
 #endif /* clang-format on */
-    mgDebug( 0, __FUNCTION__ );
-    //bool debug = true;
-#ifndef MGONGPUCPP_GPUIMPL
-    //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831
-    //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 );
-#endif
-    //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel );
-
-    // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here
-    // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result...
-    static const int nwf = 9; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z)
+    // ----------------------------
+    // --- WAVEFUNCTION BUFFERS ---
+    // ----------------------------
+#ifndef MGONGPUCPP_GPUIMPL
     // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV)
     // [NB these variables are reused several times (and re-initialised each time) within the same event or event page]
-    // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need
-    // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)!
-    //MemoryBufferWavefunctions w_buffer[nwf]{ neppV };
+    // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code
     cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles)
-    cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram
-
-    // Proof of concept for using fptype* in the interface
-    fptype* w_fp[nwf];
-    for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] );
-    fptype* amp_fp;
-    amp_fp = reinterpret_cast<fptype*>( amp_sv );
-
-    // Local variables for the given CUDA event (ievt) or C++ event page (ipagV)
-    // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination]
-    cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!)
+    fptype* wfs = reinterpret_cast<fptype*>( w_sv );
+#else
+    // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt)
+    // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting
+    fptype* wfs = allWfs;
+#endif
 
     // === Calculate wavefunctions and amplitudes for all diagrams in all processes ===
     // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ ===
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-    // Mixed fptypes #537: float for color algebra and double elsewhere
-    // Delay color algebra and ME updates (only on even pages)
-    cxtype_sv jamp_sv_previous[ncolor] = {};
-    fptype* MEs_previous = 0;
-#endif
+
+    // *****************************
+    // *** START LOOP ON IPARITY ***
+    // *****************************
     for( int iParity = 0; iParity < nParity; ++iParity )
-    { // START LOOP ON IPARITY
+    {
 #ifndef MGONGPUCPP_GPUIMPL
       const int ievt0 = ievt00 + iParity * neppV;
 #endif
-      //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823)
-      constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823)
-      const fptype* allCOUPs[nxcoup];
-#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
-#pragma nv_diagnostic push
-#pragma nv_diag_suppress 186 // e.g. <<pointless comparison of unsigned integer with zero>>
-#endif
-      for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
-        allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event
-      //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823
-      for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823
-        allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events
+
+      // -----------------
+      // --- COUPLINGS ---
+      // -----------------
 #ifdef MGONGPUCPP_GPUIMPL
-#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
-#pragma nv_diagnostic pop
-#endif
-      // CUDA kernels take input/output buffers with momenta/MEs for all events
-      const fptype* momenta = allmomenta;
-      const fptype* COUPs[nxcoup];
-      for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
-      fptype* MEs = allMEs;
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-      fptype* numerators = allNumerators;
-      fptype* denominators = allDenominators;
-#endif
+      // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events
+      const fptype* couplings = allcouplings;
 #else
-      // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page)
-      const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 );
+      // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector
+      constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup)
+      const fptype* allCOUPs[nxcoup];
       const fptype* COUPs[nxcoup];
+      // Dependent couplings, vary event-by-event
       for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
-        COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event
-      //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823
-      for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823
-        COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events
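A brief aside on the refactored couplings block in this hunk: on the C++ path it builds a per-SIMD-vector table of nxcoup pointers, where the first ndcoup entries advance with the event record (running-alphas-dependent couplings) and the remaining nIPC entries always point at the same fixed buffer. A minimal standalone sketch of this two-tier pointer table (illustrative only; hypothetical sizes and buffers, not the plugin's accessor classes):

#include <cstdio>
int main()
{
  const int ndcoup = 2, nicoup = 1;                           // hypothetical counts
  double depCoups[2][4] = { { 1, 2, 3, 4 }, { 5, 6, 7, 8 } }; // one value per event, per dependent coupling
  double indCoups[1] = { 42 };                                // fixed for all events
  const int ievt0 = 2;                                        // first event of the current SIMD page
  const double* COUPs[ndcoup + nicoup];
  for( int i = 0; i < ndcoup; i++ ) COUPs[i] = &depCoups[i][ievt0];   // strides with the event
  for( int i = 0; i < nicoup; i++ ) COUPs[ndcoup + i] = &indCoups[i]; // shared, no stride
  std::printf( "dep0=%g dep1=%g ind0=%g\n", *COUPs[0], *COUPs[1], *COUPs[ndcoup] );
  return 0;
}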
E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 7 *** - - // Wavefunction(s) for diagram number 1 - ixxxxx( momenta, 0., cHel[ihel][0], +1, w_fp[0], 0 ); - - oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - oxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); - - ixxxxx( momenta, 0., cHel[ihel][5], -1, w_fp[5], 5 ); - - FFV1P0_3( w_fp[0], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[6] ); - FFV1P0_3( w_fp[5], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[7] ); - FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[3], w_fp[8], w_fp[7], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= 1. / 4. * amp_sv[0]; - jamp_sv[3] += 1. / 12. * amp_sv[0]; - jamp_sv[4] += 1. / 12. * amp_sv[0]; - jamp_sv[5] -= 1. / 36. * amp_sv[0]; - - // *** DIAGRAM 2 OF 7 *** - - // Wavefunction(s) for diagram number 2 - FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= 1. / 4. * amp_sv[0]; - jamp_sv[3] += 1. / 12. * amp_sv[0]; - jamp_sv[4] += 1. / 12. * amp_sv[0]; - jamp_sv[5] -= 1. / 36. * amp_sv[0]; - - // *** DIAGRAM 3 OF 7 *** - - // Wavefunction(s) for diagram number 3 - FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 3 - VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] += 1. / 4. 
* cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 4 OF 7 *** - - // Wavefunction(s) for diagram number 4 - FFV1_2( w_fp[5], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[3] ); - - // Amplitude(s) for diagram number 4 - FFV1_0( w_fp[3], w_fp[1], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - jamp_sv[0] += 1. / 12. * amp_sv[0]; - jamp_sv[2] -= 1. / 4. * amp_sv[0]; - jamp_sv[3] += 1. / 12. * amp_sv[0]; - jamp_sv[5] -= 1. / 36. * amp_sv[0]; - - // *** DIAGRAM 5 OF 7 *** - - // Wavefunction(s) for diagram number 5 - FFV1_1( w_fp[1], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[3] ); - // Amplitude(s) for diagram number 5 - FFV1_0( w_fp[5], w_fp[3], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - jamp_sv[0] += 1. / 12. * amp_sv[0]; - jamp_sv[1] -= 1. / 4. * amp_sv[0]; - jamp_sv[3] += 1. / 12. * amp_sv[0]; - jamp_sv[5] -= 1. / 36. * amp_sv[0]; - - // *** DIAGRAM 6 OF 7 *** - - // Wavefunction(s) for diagram number 6 - FFV1_2( w_fp[0], w_fp[7], COUPs[1], 1.0, 0., 0., w_fp[3] ); - - // Amplitude(s) for diagram number 6 - FFV1_0( w_fp[3], w_fp[4], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[0] += 1. / 12. * amp_sv[0]; - jamp_sv[1] -= 1. / 4. * amp_sv[0]; - jamp_sv[4] += 1. / 12. * amp_sv[0]; - jamp_sv[5] -= 1. / 36. 
* amp_sv[0]; - - // *** DIAGRAM 7 OF 7 *** - // Wavefunction(s) for diagram number 7 - FFV1_2( w_fp[0], w_fp[8], COUPs[1], 1.0, 0., 0., w_fp[3] ); - - // Amplitude(s) for diagram number 7 - FFV1_0( w_fp[3], w_fp[4], w_fp[7], COUPs[1], 1.0, &_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 12. * amp_sv[0]; - jamp_sv[2] -= 1. / 4. * amp_sv[0]; - jamp_sv[4] += 1. / 12. * amp_sv[0]; - jamp_sv[5] -= 1. / 36. * amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_2_ucx_ttxucx()?) - - // The color denominators (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6] - - // The color matrix (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 27, 9, 9, 3, 3, 9 }, - { 9, 27, 3, 9, 9, 3 }, - { 9, 3, 27, 9, 9, 3 }, - { 3, 9, 9, 27, 3, 9 }, - { 3, 9, 9, 3, 27, 9 }, - { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) 
in speed here as we only loop over the upper-triangular part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX (including channelIDs, numerators and denominators) is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol]
); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX (including channelIDs, numerators and denominators) is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 7 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif -#endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -694,7 +517,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG +
fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -729,6 +556,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -771,6 +602,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -873,26 +708,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -900,25 +735,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf(
"select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <=
ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -1054,22 +1093,16 @@ namespace mg5amcCpu // These variables are not used anywhere else in the code and their scope is limited to this sanity check { // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) - constexpr int nprocesses = 2; + constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1081,17 +1114,20 @@ #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] =
0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1117,93 +1153,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) 
Then, in multichannel mode, also compute the running sums over helicities of the squared jamps (jamp2s) within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ?
ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1245,7 +1251,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1268,7 +1274,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1277,25 +1283,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1305,8 +1317,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1322,11 +1336,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1428,14 +1443,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h index 04b9f5bcb1..75c705a855 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO 
development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include <vector> @@ -86,17 +87,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 7; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 6; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 9; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 6; // CPPProcess::npar (nexternal was nioparticles) //static const int nwavefuncs = 6; // (?!?!
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 7; //static const int ncomb = 64; // CPPProcess::ncomb @@ -133,23 +134,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -163,34 +167,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig.f index 5046df7e56..b693098acb 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig1.f index 848991a32a..87348bace0 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -148,7 +148,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF D1=PDG2PDF(LPP(IB(1)),1, IB(1),XBK(IB(1)), QSCALE) U1=PDG2PDF(LPP(IB(1)),2, IB(1),XBK(IB(1)), QSCALE) @@ -158,7 +158,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) @@ -269,7 +269,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -353,6 +353,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -438,24 +442,24 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) D1(IVEC)=PDG2PDF(LPP(IB(1)),1, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) U1(IVEC)=PDG2PDF(LPP(IB(1)),2, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) S1(IVEC)=PDG2PDF(LPP(IB(1)),3, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) C1(IVEC)=PDG2PDF(LPP(IB(1)),4, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) UX2(IVEC)=PDG2PDF(LPP(IB(2)),-2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -627,6 +631,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. 
IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/color_sum.cc new file mode 100644 index 0000000000..087618686d --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/color_sum.cc @@ -0,0 +1,387 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6] + + // The color matrix (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 27, 9, 9, 3, 3, 9 }, + { 9, 27, 3, 9, 9, 3 }, + { 9, 3, 27, 9, 9, 3 }, + { 3, 9, 9, 27, 3, 9 }, + { 3, 9, 9, 3, 27, 9 }, + { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template<typename T> + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ )
+ { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA + iAMB - iBMA + BMB = AMA + BMB (the cross terms cancel because M is symmetric) + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the upper-triangular part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + 
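// [Editor's aside: an illustrative sketch, not part of the generated color_sum.cc in this diff.]
// The triangular color sum in color_sum_cpu above relies on two properties of the color
// matrix M (see #475): M is real, so for jamps J = A + iB the quadratic form J^dag M J
// reduces to A^T M A + B^T M B, and M is symmetric, so each off-diagonal pair (icol,jcol)
// and (jcol,icol) can be folded into a single upper-triangular term with a factor 2 that
// TriangularNormalizedColorMatrix precomputes at compile time together with /colorDenom.
// A minimal standalone check of that equivalence, assuming plain doubles and a
// hypothetical 2x2 symmetric matrix (all names and values below are illustrative only):
//
//   #include <cassert>
//   #include <cmath>
//   int main()
//   {
//     const double M[2][2] = { { 27, 9 }, { 9, 27 } }; // a real symmetric "color matrix"
//     const double A[2] = { 0.25, -1.5 };              // Re(jamp) per color
//     const double B[2] = { 0.75, 0.5 };               // Im(jamp) per color
//     double full = 0;                                 // full quadratic form A^T M A + B^T M B
//     for( int i = 0; i < 2; i++ )
//       for( int j = 0; j < 2; j++ )
//         full += M[i][j] * ( A[i] * A[j] + B[i] * B[j] );
//     double tri = 0;                                  // upper-triangular rewrite with factor 2
//     for( int i = 0; i < 2; i++ )
//     {
//       double ztempR = M[i][i] * A[i], ztempI = M[i][i] * B[i]; // diagonal term
//       for( int j = i + 1; j < 2; j++ )
//       {
//         ztempR += 2 * M[i][j] * A[j]; // fold M[j][i] into M[i][j]: M is symmetric
//         ztempI += 2 * M[i][j] * B[j];
//       }
//       tri += A[i] * ztempR + B[i] * ztempI;
//     }
//     assert( std::abs( full - tri ) < 1e-12 ); // same |M|^2 contribution either way
//     return 0;
//   }
//
// The rewrite visits only ncolor*(ncolor+1)/2 of the ncolor^2 matrix entries, which is
// why the speedup is real but (as the comment above notes) less than a factor 2: the
// diagonal terms are always computed in full.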
//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
However, the same striding as in compute_jamps and in the cuBLAS color sum is used here, just in case this is better for performance
+ for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ )
+ for( int icol = 0; icol < ncolor; icol++ )
+ allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding
+ }
+#endif
+#endif
+#endif
+
+ //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+#ifndef MGONGPU_HAS_NO_BLAS
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+ __global__ void
+ convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity
+ const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity
+ {
+ const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+ allMEs[ievt] = allMEsFpt2[ievt];
+ }
+#endif
+#endif
+#endif
+
+ //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#ifndef MGONGPU_HAS_NO_BLAS
+ void
+ color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity
+ const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity
+ fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+ gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m)
+#else
+ gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m)
+#endif
+ gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+ const int gpublocks, // input: cuda gpublocks
+ const int gputhreads ) // input: cuda gputhreads
+ {
+ const int nevt = gpublocks * gputhreads;
+
+ // Get the address associated with the normalized color matrix in device memory
+ static fptype2* devNormColMat = nullptr;
+ if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 );
+
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+ // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity
+ fptype2* allZtempBoth = allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer
+ fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer
+ fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer
+ // Convert jamps from double to float
+ gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps );
+ // Real and imaginary components
+ const fptype2* allJampsReal = allJampsFpt2;
+ const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt;
+#else
+ static_assert( std::is_same<fptype, fptype2>::value ); // sanity check
+ // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer
+ fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer
+ fptype2* allMEsFpt2 = allMEs;
+ // Real and imaginary components
+ const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical)
+ const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical)
+#endif
+ // Real and imaginary components
+ fptype2* allZtempReal = allZtempBoth;
+ fptype2* allZtempImag = allZtempBoth + ncolor * nevt;
+
+ // Note, new striding for cuBLAS from
DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/diagram_boilerplate.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/diagrams.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/diagrams.h new file mode 100644 index 0000000000..3e70524053 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/diagrams.h @@ -0,0 +1,247 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
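As an illustration of the two-step BLAS color sum in color_sum_blas above (Step 1: Ztemp = ColorMatrix * Jamps via one gemm; Step 2: one 1x1 gemm per event, i.e. a batched dot product), here is a standalone cuBLAS sketch. The toy sizes and values are invented, plain cublasSgemm* calls stand in for the gpuBlasTgemm* wrappers, and only the real part is computed (the imaginary part is handled identically above):

// Standalone cuBLAS sketch of the two-step color sum (toy 2x2 matrix, 4 events, real part only).
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <cassert>
#include <cmath>
#include <cstdio>
#include <vector>
int main()
{
  const int ncolor = 2, nevt = 4;
  const float M[ncolor * ncolor] = { 27, 9, 9, 27 }; // toy symmetric color matrix (column-major)
  std::vector<float> J( ncolor * nevt );             // J[icol * nevt + ievt], i.e. the "new1" striding
  for( int i = 0; i < ncolor * nevt; i++ ) J[i] = 0.1f * ( i + 1 );
  float *dM, *dJ, *dZ, *dME;
  cudaMalloc( &dM, sizeof( M ) );
  cudaMalloc( &dJ, J.size() * sizeof( float ) );
  cudaMalloc( &dZ, ncolor * nevt * sizeof( float ) );
  cudaMalloc( &dME, nevt * sizeof( float ) );
  cudaMemcpy( dM, M, sizeof( M ), cudaMemcpyHostToDevice );
  cudaMemcpy( dJ, J.data(), J.size() * sizeof( float ), cudaMemcpyHostToDevice );
  cudaMemset( dME, 0, nevt * sizeof( float ) ); // zero MEs so that beta=1 below just accumulates
  cublasHandle_t h;
  cublasCreate( &h );
  const float one = 1, zero = 0;
  // Step 1: Z(ncolor x nevt) = M(ncolor x ncolor) * J^T (J is stored nevt x ncolor, column-major)
  cublasSgemm( h, CUBLAS_OP_N, CUBLAS_OP_T, ncolor, nevt, ncolor, &one, dM, ncolor, dJ, nevt, &zero, dZ, ncolor );
  // Step 2: for each event e, ME[e] += J(:,e) dot Z(:,e), as nevt batched 1x1 gemms
  cublasSgemmStridedBatched( h, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, ncolor, &one,
                             dJ, nevt, 1,        // A_e: the 1 x ncolor "row" of J for event e (stride 1)
                             dZ, ncolor, ncolor, // B_e: the ncolor x 1 column Z(:,e) (stride ncolor)
                             &one, dME, 1, 1, nevt );
  std::vector<float> me( nevt );
  cudaMemcpy( me.data(), dME, nevt * sizeof( float ), cudaMemcpyDeviceToHost );
  for( int e = 0; e < nevt; e++ ) // cross-check against a plain CPU loop
  {
    float ref = 0;
    for( int i = 0; i < ncolor; i++ )
      for( int j = 0; j < ncolor; j++ )
        ref += J[i * nevt + e] * M[i + j * ncolor] * J[j * nevt + e];
    assert( std::abs( me[e] - ref ) < 1e-3f );
    printf( "event %d: ME=%f\n", e, me[e] );
  }
  cublasDestroy( h );
  cudaFree( dM ); cudaFree( dJ ); cudaFree( dZ ); cudaFree( dME );
  return 0;
}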
+
+/* clang-format off */
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ const fptype* momenta, // input: momenta[npar*4*nevtORneppV]
+ const int ihel ) // input: helicity (0 to ncomb)
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+#ifdef MGONGPUCPP_GPUIMPL
+ using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events
+#else
+ using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events
+#endif
+ // *** DIAGRAM 1 OF 7 ***
+ // Wavefunction(s) for diagram number 1
+ ixxxxx( momenta, 0., cHel[ihel][0], +1, w_fp[0], 0 );
+ oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 );
+ oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 );
+ ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 );
+ oxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 );
+ ixxxxx( momenta, 0., cHel[ihel][5], -1, w_fp[5], 5 );
+ FFV1P0_3( w_fp[0], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[6] );
+ FFV1P0_3( w_fp[5], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[7] );
+ FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] );
+ // Amplitude(s) for diagram number 1
+ FFV1_0( w_fp[3], w_fp[8], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 4. * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 12. * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 12. * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 36.
* amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 2 OF 7 ***
+ // Wavefunction(s) for diagram number 2
+ FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] );
+ // Amplitude(s) for diagram number 2
+ FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 4. * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 12. * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 12. * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 36. * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 3 OF 7 ***
+ // Wavefunction(s) for diagram number 3
+ FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[8] );
+ // Amplitude(s) for diagram number 3
+ VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 4. * cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 4.
* cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 4 OF 7 ***
+ // Wavefunction(s) for diagram number 4
+ FFV1_2( w_fp[5], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[3] );
+ // Amplitude(s) for diagram number 4
+ FFV1_0( w_fp[3], w_fp[1], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 12. * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 4. * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 12. * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 36. * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 5 OF 7 ***
+ // Wavefunction(s) for diagram number 5
+ FFV1_1( w_fp[1], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[3] );
+ // Amplitude(s) for diagram number 5
+ FFV1_0( w_fp[5], w_fp[3], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 12. * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 4. * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 12. * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 36.
* amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 6 OF 7 ***
+ // Wavefunction(s) for diagram number 6
+ FFV1_2( w_fp[0], w_fp[7], COUPs[1], 1.0, 0., 0., w_fp[3] );
+ // Amplitude(s) for diagram number 6
+ FFV1_0( w_fp[3], w_fp[4], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 12. * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 4. * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 12. * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 36. * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram7( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 7 OF 7 ***
+ // Wavefunction(s) for diagram number 7
+ FFV1_2( w_fp[0], w_fp[8], COUPs[1], 1.0, 0., 0., w_fp[3] );
+ // Amplitude(s) for diagram number 7
+ FFV1_0( w_fp[3], w_fp[4], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 12. * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 4. * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 12. * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 36.
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/driver.f index f7f23196eb..fc9392238e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/matrix1.f index 5dcb5155f3..210248dac7 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -366,7 +366,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -420,7 +420,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(8) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -463,39 +464,32 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /2.700000000000000D+01 - $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D - $ +00,3.000000000000000D+00,9.000000000000000D+00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 6) /27,18,18,6,6,18/ C 1 T(2,1) T(3,4) T(5,6) - DATA (CF(I, 2),I= 1, 6) /9.000000000000000D+00 - $ ,2.700000000000000D+01,3.000000000000000D+00,9.000000000000000D - $ +00,9.000000000000000D+00,3.000000000000000D+00/ + DATA (CF(I),I= 7, 11) /27,6,18,18,6/ C 1 T(2,1) T(3,6) T(5,4) - DATA (CF(I, 3),I= 1, 6) /9.000000000000000D+00 - $ ,3.000000000000000D+00,2.700000000000000D+01,9.000000000000000D - $ +00,9.000000000000000D+00,3.000000000000000D+00/ + DATA (CF(I),I= 12, 15) /27,18,18,6/ C 1 T(2,4) T(3,1) T(5,6) - DATA (CF(I, 4),I= 1, 6) /3.000000000000000D+00 - $ ,9.000000000000000D+00,9.000000000000000D+00,2.700000000000000D - $ +01,3.000000000000000D+00,9.000000000000000D+00/ + DATA (CF(I),I= 16, 18) /27,6,18/ C 1 T(2,4) T(3,6) T(5,1) - DATA (CF(I, 5),I= 1, 6) /3.000000000000000D+00 - $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D - $ +00,2.700000000000000D+01,9.000000000000000D+00/ + DATA (CF(I),I= 19, 20) /27,18/ C 1 T(2,6) T(3,1) T(5,4) - DATA (CF(I, 6),I= 1, 6) /9.000000000000000D+00 - $ ,3.000000000000000D+00,3.000000000000000D+00,9.000000000000000D - $ +00,9.000000000000000D+00,2.700000000000000D+01/ + DATA (CF(I),I= 21, 21) /27/ C 1 
T(2,6) T(3,4) T(5,1) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -565,10 +559,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -577,6 +573,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc index 8d266e82b7..50c33f72e9 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,20 +101,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 6; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,57 +169,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: 
couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! 
in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using 
CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
@@ -229,489 +282,161 @@ namespace mg5amcCpu
 using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events
#endif
#endif /* clang-format on */
- mgDebug( 0, __FUNCTION__ );
- //bool debug = true;
-#ifndef MGONGPUCPP_GPUIMPL
- //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831
- //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 );
-#endif
- //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel );
-
- // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here
- // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result...
- static const int nwf = 11; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z)
+ // ----------------------------
+ // --- WAVEFUNCTION BUFFERS ---
+ // ----------------------------
+#ifndef MGONGPUCPP_GPUIMPL
 // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV)
 // [NB these variables are reused several times (and re-initialised each time) within the same event or event page]
- // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need
- // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)!
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV };
+ // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code
 cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles)
- cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram
-
- // Proof of concept for using fptype* in the interface
- fptype* w_fp[nwf];
- for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] );
- fptype* amp_fp;
- amp_fp = reinterpret_cast<fptype*>( amp_sv );
-
- // Local variables for the given CUDA event (ievt) or C++ event page (ipagV)
- // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination]
- cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!)
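The "= {}" note in the removed jamp_sv declaration above is worth a one-line illustration: for a type with a trivial default constructor (as the plugin's scalar cxtype is, per that comment), a local array is NOT zeroed unless it is explicitly aggregate-initialized. A minimal standalone sketch, with an invented cxsimple type standing in for cxtype:

// Standalone sketch of why "= {}" matters (cxsimple is a hypothetical stand-in for the scalar cxtype).
#include <cstdio>
struct cxsimple { double r, i; }; // trivial default constructor: members are left uninitialized
int main()
{
  cxsimple jampA[3];      // default-initialized: r and i hold indeterminate values (reading them is UB)
  (void)jampA;
  cxsimple jampB[3] = {}; // aggregate-initialized: every member is zeroed, safe to accumulate into
  for( int k = 0; k < 3; k++ ) printf( "jampB[%d] = (%f, %f)\n", k, jampB[k].r, jampB[k].i );
  return 0;
}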
+ fptype* wfs = reinterpret_cast<fptype*>( w_sv );
+#else
+ // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt)
+ // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting
+ fptype* wfs = allWfs;
+#endif
 // === Calculate wavefunctions and amplitudes for all diagrams in all processes ===
 // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ ===
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
- // Mixed fptypes #537: float for color algebra and double elsewhere
- // Delay color algebra and ME updates (only on even pages)
- cxtype_sv jamp_sv_previous[ncolor] = {};
- fptype* MEs_previous = 0;
-#endif
+
+ // *****************************
+ // *** START LOOP ON IPARITY ***
+ // *****************************
 for( int iParity = 0; iParity < nParity; ++iParity )
- { // START LOOP ON IPARITY
+ {
#ifndef MGONGPUCPP_GPUIMPL
 const int ievt0 = ievt00 + iParity * neppV;
#endif
- //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823)
- constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823)
- const fptype* allCOUPs[nxcoup];
-#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
-#pragma nv_diagnostic push
-#pragma nv_diag_suppress 186 // e.g. <<pointless comparison of unsigned integer with zero>>
-#endif
- for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
- allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event
- //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823
- for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823
- allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events
+
+ // -----------------
+ // --- COUPLINGS ---
+ // -----------------
#ifdef MGONGPUCPP_GPUIMPL
-#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
-#pragma nv_diagnostic pop
-#endif
- // CUDA kernels take input/output buffers with momenta/MEs for all events
- const fptype* momenta = allmomenta;
- const fptype* COUPs[nxcoup];
- for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
- fptype* MEs = allMEs;
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- fptype* numerators = allNumerators;
- fptype* denominators = allDenominators;
-#endif
+ // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events
+ const fptype* couplings = allcouplings;
#else
- // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page)
- const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 );
+ // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector
+ constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup)
+ const fptype* allCOUPs[nxcoup];
 const fptype* COUPs[nxcoup];
+ // Dependent couplings, vary event-by-event
 for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
- COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event
- //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823
- for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823
- COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events
 fptype* MEs =
E_ACCESS::ieventAccessRecord( allMEs, ievt0 );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
- fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-#endif
-#endif
-
- // Reset color flows (reset jamp_sv) at the beginning of a new event or event page
- for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); }
-
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
- fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
- fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-#endif
-
- // *** DIAGRAM 1 OF 14 ***
-
- // Wavefunction(s) for diagram number 1
- ixxxxx( momenta, 0., cHel[ihel][0], +1, w_fp[0], 0 );
-
- ixxxxx( momenta, 0., cHel[ihel][1], +1, w_fp[1], 1 );
-
- oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 );
-
- ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 );
-
- oxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 );
-
- oxxxxx( momenta, 0., cHel[ihel][5], +1, w_fp[5], 5 );
-
- FFV1P0_3( w_fp[0], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[6] );
- FFV1P0_3( w_fp[1], w_fp[5], COUPs[1], 1.0, 0., 0., w_fp[7] );
- FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] );
-
- // Amplitude(s) for diagram number 1
- FFV1_0( w_fp[3], w_fp[8], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += 1. / 4. * amp_sv[0];
- jamp_sv[1] -= 1. / 12. * amp_sv[0];
- jamp_sv[2] -= 1. / 12. * amp_sv[0];
- jamp_sv[4] += 1. / 36. * amp_sv[0];
-
- // *** DIAGRAM 2 OF 14 ***
-
- // Wavefunction(s) for diagram number 2
- FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] );
-
- // Amplitude(s) for diagram number 2
- FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] -= 1. / 12. * amp_sv[0];
- jamp_sv[2] -= 1. / 12. * amp_sv[0];
- jamp_sv[3] += 1. / 4. * amp_sv[0];
- jamp_sv[4] += 1. / 36. * amp_sv[0];
-
- // *** DIAGRAM 3 OF 14 ***
-
- // Wavefunction(s) for diagram number 3
- FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[8] );
-
- // Amplitude(s) for diagram number 3
- VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] -= 1. / 4. * cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[3] += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 4 OF 14 ***
-
- // Wavefunction(s) for diagram number 4
- FFV1_2( w_fp[1], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[9] );
-
- // Amplitude(s) for diagram number 4
- FFV1_0( w_fp[9], w_fp[5], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += 1. / 4. * amp_sv[0];
- jamp_sv[2] -= 1. / 12. * amp_sv[0];
- jamp_sv[4] += 1. / 36. * amp_sv[0];
- jamp_sv[5] -= 1. / 12.
* amp_sv[0];
-
- // *** DIAGRAM 5 OF 14 ***
-
- // Wavefunction(s) for diagram number 5
- FFV1_1( w_fp[5], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[9] );
-
- // Amplitude(s) for diagram number 5
- FFV1_0( w_fp[1], w_fp[9], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[2] -= 1. / 12. * amp_sv[0];
- jamp_sv[3] += 1. / 4. * amp_sv[0];
- jamp_sv[4] += 1. / 36. * amp_sv[0];
- jamp_sv[5] -= 1. / 12. * amp_sv[0];
-
- // *** DIAGRAM 6 OF 14 ***
-
- // Wavefunction(s) for diagram number 6
- FFV1P0_3( w_fp[0], w_fp[5], COUPs[1], 1.0, 0., 0., w_fp[9] );
- FFV1P0_3( w_fp[1], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[6] );
- FFV1_1( w_fp[2], w_fp[9], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[10] );
-
- // Amplitude(s) for diagram number 6
- FFV1_0( w_fp[3], w_fp[10], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += 1. / 12. * amp_sv[0];
- jamp_sv[1] -= 1. / 4. * amp_sv[0];
- jamp_sv[3] += 1. / 12. * amp_sv[0];
- jamp_sv[5] -= 1. / 36. * amp_sv[0];
-
- // *** DIAGRAM 7 OF 14 ***
-
- // Wavefunction(s) for diagram number 7
- FFV1_2( w_fp[3], w_fp[9], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[10] );
-
- // Amplitude(s) for diagram number 7
- FFV1_0( w_fp[10], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += 1. / 12. * amp_sv[0];
- jamp_sv[2] -= 1. / 4. * amp_sv[0];
- jamp_sv[3] += 1. / 12. * amp_sv[0];
- jamp_sv[5] -= 1. / 36. * amp_sv[0];
-
- // *** DIAGRAM 8 OF 14 ***
-
- // Wavefunction(s) for diagram number 8
- // (none)
-
- // Amplitude(s) for diagram number 8
- VVV1_0( w_fp[9], w_fp[6], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0];
- jamp_sv[2] -= 1. / 4. * cxtype( 0, 1 ) * amp_sv[0];
-
- // *** DIAGRAM 9 OF 14 ***
-
- // Wavefunction(s) for diagram number 9
- FFV1_2( w_fp[1], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[10] );
-
- // Amplitude(s) for diagram number 9
- FFV1_0( w_fp[10], w_fp[4], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[1] -= 1. / 4. * amp_sv[0];
- jamp_sv[3] += 1. / 12. * amp_sv[0];
- jamp_sv[4] += 1. / 12. * amp_sv[0];
- jamp_sv[5] -= 1. / 36. * amp_sv[0];
-
- // *** DIAGRAM 10 OF 14 ***
-
- // Wavefunction(s) for diagram number 10
- FFV1_1( w_fp[4], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[10] );
-
- // Amplitude(s) for diagram number 10
- FFV1_0( w_fp[1], w_fp[10], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[2] -= 1. / 4. * amp_sv[0];
- jamp_sv[3] += 1. / 12. * amp_sv[0];
- jamp_sv[4] += 1. / 12. * amp_sv[0];
- jamp_sv[5] -= 1. / 36.
* amp_sv[0];
-
- // *** DIAGRAM 11 OF 14 ***
-
- // Wavefunction(s) for diagram number 11
- FFV1_2( w_fp[0], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[10] );
-
- // Amplitude(s) for diagram number 11
- FFV1_0( w_fp[10], w_fp[5], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+ allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup );
+ // Independent couplings, fixed for all events
+ for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup)
+ allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup );
+ // Dependent couplings, vary event-by-event
+ for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
+ COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 );
+ // Independent couplings, fixed for all events
+ for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup)
+ COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup];
#endif
- jamp_sv[0] += 1. / 12. * amp_sv[0];
- jamp_sv[2] -= 1. / 4. * amp_sv[0];
- jamp_sv[4] += 1. / 12. * amp_sv[0];
- jamp_sv[5] -= 1. / 36. * amp_sv[0];
- // *** DIAGRAM 12 OF 14 ***
-
- // Wavefunction(s) for diagram number 12
- FFV1_2( w_fp[0], w_fp[8], COUPs[1], 1.0, 0., 0., w_fp[10] );
-
- // Amplitude(s) for diagram number 12
- FFV1_0( w_fp[10], w_fp[5], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+ // ---------------
+ // --- MOMENTA ---
+ // ---------------
+#ifdef MGONGPUCPP_GPUIMPL
+ // CUDA diagram kernels take input/output buffers with momenta for all events
+ const fptype* momenta = allmomenta;
+#else
+ // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector
+ const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 );
#endif
- jamp_sv[0] += 1. / 12. * amp_sv[0];
- jamp_sv[1] -= 1. / 4. * amp_sv[0];
- jamp_sv[4] += 1. / 12. * amp_sv[0];
- jamp_sv[5] -= 1. / 36. * amp_sv[0];
-
- // *** DIAGRAM 13 OF 14 ***
-
- // Wavefunction(s) for diagram number 13
- FFV1_2( w_fp[0], w_fp[7], COUPs[1], 1.0, 0., 0., w_fp[6] );
-
- // Amplitude(s) for diagram number 13
- FFV1_0( w_fp[6], w_fp[4], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+ // -------------
+ // --- JAMPS ---
+ // -------------
+ // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel)
+#ifdef MGONGPUCPP_GPUIMPL
+ // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument
+ // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol])
+ fptype* jamps = allJamps;
+#else
+ // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument
+ // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol])
+ fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) );
#endif
- jamp_sv[1] -= 1. / 12. * amp_sv[0];
- jamp_sv[3] += 1. / 4. * amp_sv[0];
- jamp_sv[4] += 1. / 36. * amp_sv[0];
- jamp_sv[5] -= 1. / 12.
* amp_sv[0];
-
- // *** DIAGRAM 14 OF 14 ***
- // Wavefunction(s) for diagram number 14
- // (none)
-
- // Amplitude(s) for diagram number 14
- FFV1_0( w_fp[10], w_fp[4], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
+ // ------------------
+ // --- CHANNELIDS ---
+ // ------------------
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] );
- if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
- jamp_sv[0] += 1. / 4. * amp_sv[0];
- jamp_sv[1] -= 1. / 12. * amp_sv[0];
- jamp_sv[4] += 1. / 36. * amp_sv[0];
- jamp_sv[5] -= 1. / 12. * amp_sv[0];
-
- // *** COLOR CHOICE BELOW ***
- // Store the leading color flows for choice of color
- if( jamp2_sv ) // disable color choice if nullptr
- for( int icol = 0; icol < ncolor; icol++ )
- jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831
-
- // *** COLOR MATRIX BELOW ***
- // (This method used to be called CPPProcess::matrix_2_uu_ttxuu()?)
-
- // The color denominators (initialize all array elements, with ncolor=6)
- // [NB do keep 'static' for these constexpr arrays, see issue #283]
- static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6]
-
- // The color matrix (initialize all array elements, with ncolor=6)
- // [NB do keep 'static' for these constexpr arrays, see issue #283]
- static constexpr fptype2 cf[ncolor][ncolor] = {
- { 27, 9, 9, 3, 3, 9 },
- { 9, 27, 3, 9, 9, 3 },
- { 9, 3, 27, 9, 9, 3 },
- { 3, 9, 9, 27, 3, 9 },
- { 3, 9, 9, 3, 27, 9 },
- { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6]
-
-#ifndef MGONGPUCPP_GPUIMPL
- // Pre-compute a constexpr triangular color matrix properly normalized #475
- struct TriangularNormalizedColorMatrix
- {
- // See https://stackoverflow.com/a/34465458
- __host__ __device__ constexpr TriangularNormalizedColorMatrix()
- : value()
- {
- for( int icol = 0; icol < ncolor; icol++ )
- {
- // Diagonal terms
- value[icol][icol] = cf[icol][icol] / denom[icol];
- // Off-diagonal terms
- for( int jcol = icol + 1; jcol < ncolor; jcol++ )
- value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol];
- }
- }
- fptype2 value[ncolor][ncolor];
- };
- static constexpr auto cf2 = TriangularNormalizedColorMatrix();
-#endif
-
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
- if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages
- {
- // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV
- for( int icol = 0; icol < ncolor; icol++ )
- jamp_sv_previous[icol] = jamp_sv[icol];
- MEs_previous = MEs;
- continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages
- }
- fptype_sv deltaMEs_previous = { 0 };
-#endif
-
- // Sum and square the color flows to get the matrix element
- // (compute |M|^2 by squaring |M|, taking into account colours)
- // Sum and square the color flows to get the matrix element
- // (compute |M|^2 by squaring |M|, taking into account colours)
- fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes
-
- // Use the property that M is a real matrix (see #475):
- // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB
- // In addition, on C++ use the property that M is symmetric (see #475),
- // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time:
- // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix.
- // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv +=
cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are all nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 14 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram8, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram9, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram10, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram11, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram12, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram13, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram14, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs,
jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram8( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram9( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram10( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram11( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram12( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram13( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram14( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif -#endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -798,7 +523,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -833,6 +562,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -875,6 +608,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -977,26 +714,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
@@ -1004,25 +741,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) [...] (nevt >= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) [...] + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream!
+ atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <=
ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -1160,20 +1101,14 @@ namespace mg5amcCpu // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 72 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1185,17 +1120,20 @@ #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_jamps) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) );
+ gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1221,93 +1159,63 @@ #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamps for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) Then, in multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream + for( int ighel = 0;
ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ?
ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1349,7 +1257,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1372,7 +1280,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1381,25 +1289,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1409,8 +1323,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1426,11 +1342,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1532,14 +1449,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h index fd123d932d..8b71fbebc8 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO 
development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include <vector> @@ -78,17 +79,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 14; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 6; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 11; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 6; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?!
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 14; //static const int ncomb = 64; // CPPProcess::ncomb @@ -125,23 +126,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -155,34 +159,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig.f index 77164138e6..87bbc98a81 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig1.f index f03c7f3b0c..8712e90238 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,7 +140,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF D1=PDG2PDF(LPP(IB(1)),1, IB(1),XBK(IB(1)), QSCALE) U1=PDG2PDF(LPP(IB(1)),2, IB(1),XBK(IB(1)), QSCALE) @@ -150,7 +150,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF D2=PDG2PDF(LPP(IB(2)),1, IB(2),XBK(IB(2)), QSCALE) U2=PDG2PDF(LPP(IB(2)),2, IB(2),XBK(IB(2)), QSCALE) @@ -237,7 +237,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -313,6 +313,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -398,24 +402,24 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) D1(IVEC)=PDG2PDF(LPP(IB(1)),1, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) U1(IVEC)=PDG2PDF(LPP(IB(1)),2, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) S1(IVEC)=PDG2PDF(LPP(IB(1)),3, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) C1(IVEC)=PDG2PDF(LPP(IB(1)),4, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) D2(IVEC)=PDG2PDF(LPP(IB(2)),1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) U2(IVEC)=PDG2PDF(LPP(IB(2)),2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) S2(IVEC)=PDG2PDF(LPP(IB(2)),3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) C2(IVEC)=PDG2PDF(LPP(IB(2)),4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -539,6 +543,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. 
IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/color_sum.cc new file mode 100644 index 0000000000..087618686d --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/color_sum.cc @@ -0,0 +1,387 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6] + + // The color matrix (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 27, 9, 9, 3, 3, 9 }, + { 9, 27, 3, 9, 9, 3 }, + { 9, 3, 27, 9, 9, 3 }, + { 3, 9, 9, 27, 3, 9 }, + { 3, 9, 9, 3, 27, 9 }, + { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template<typename T> + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { +
// Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + 
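//--------------------------------------------------------------------------
// [Editorial aside: a minimal, self-contained sketch of the #475 triangular
//  color-sum trick implemented by color_sum_cpu above; this block is NOT part
//  of the generated diff. Since the color matrix M is real and symmetric, the
//  quadratic form (A-iB)M(A+iB) reduces to AMA + BMB, and the off-diagonal
//  factor 2 and the 1/denom normalization can be folded into the matrix once,
//  so only the upper triangle is visited. The 2x2 matrix, the equal
//  denominators and the jamp values below are arbitrary assumptions chosen
//  only to check the algebra (the sketch assumes equal denominators, as in
//  the all-ones colorDenom of this process).]
#include <cassert>
#include <cmath>
#include <complex>
int main()
{
  constexpr int ncolor = 2;
  constexpr double denom[ncolor] = { 3, 3 };
  constexpr double cf[ncolor][ncolor] = { { 16, -2 }, { -2, 16 } };
  const std::complex<double> jamp[ncolor] = { { 1, 2 }, { 3, -1 } };
  // Full quadratic form: me = sum_ij cf[i][j]/denom[i] * Re( conj(jamp[i]) * jamp[j] )
  double meFull = 0;
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ )
      meFull += cf[i][j] / denom[i] * ( jamp[i].real() * jamp[j].real() + jamp[i].imag() * jamp[j].imag() );
  // Triangular form: visit the diagonal once and each off-diagonal pair once, with a pre-doubled coefficient
  double meTri = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztempR = cf[i][i] / denom[i] * jamp[i].real();
    double ztempI = cf[i][i] / denom[i] * jamp[i].imag();
    for( int j = i + 1; j < ncolor; j++ )
    {
      ztempR += 2 * cf[i][j] / denom[i] * jamp[j].real();
      ztempI += 2 * cf[i][j] / denom[i] * jamp[j].imag();
    }
    meTri += ztempR * jamp[i].real() + ztempI * jamp[i].imag();
  }
  // Both forms agree: the triangular loop does roughly half the multiplications
  assert( std::abs( meFull - meTri ) < 1e-12 );
  return 0;
}
//--------------------------------------------------------------------------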
//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from 
DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/diagram_boilerplate.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/diagrams.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/diagrams.h new file mode 100644 index 0000000000..9f38cec61a --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/diagrams.h @@ -0,0 +1,471 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 14 *** + // Wavefunction(s) for diagram number 1 + ixxxxx( momenta, 0., cHel[ihel][0], +1, w_fp[0], 0 ); + ixxxxx( momenta, 0., cHel[ihel][1], +1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + oxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); + oxxxxx( momenta, 0., cHel[ihel][5], +1, w_fp[5], 5 ); + FFV1P0_3( w_fp[0], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[6] ); + FFV1P0_3( w_fp[1], w_fp[5], COUPs[1], 1.0, 0., 0., w_fp[7] ); + FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); + // Amplitude(s) for diagram number 1 + FFV1_0( w_fp[3], w_fp[8], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 36. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 14 *** + // Wavefunction(s) for diagram number 2 + FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); + // Amplitude(s) for diagram number 2 + FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 36. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 14 *** + // Wavefunction(s) for diagram number 3 + FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 3 + VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 4. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 14 *** + // Wavefunction(s) for diagram number 4 + FFV1_2( w_fp[1], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[9] ); + // Amplitude(s) for diagram number 4 + FFV1_0( w_fp[9], w_fp[5], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 36. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 12. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 14 *** + // Wavefunction(s) for diagram number 5 + FFV1_1( w_fp[5], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[9] ); + // Amplitude(s) for diagram number 5 + FFV1_0( w_fp[1], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 36. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 12. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 6 OF 14 *** + // Wavefunction(s) for diagram number 6 + FFV1P0_3( w_fp[0], w_fp[5], COUPs[1], 1.0, 0., 0., w_fp[9] ); + FFV1P0_3( w_fp[1], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[6] ); + FFV1_1( w_fp[2], w_fp[9], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[10] ); + // Amplitude(s) for diagram number 6 + FFV1_0( w_fp[3], w_fp[10], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 36. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram7( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 7 OF 14 *** + // Wavefunction(s) for diagram number 7 + FFV1_2( w_fp[3], w_fp[9], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[10] ); + // Amplitude(s) for diagram number 7 + FFV1_0( w_fp[10], w_fp[2], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 4. 
* amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 36. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram8( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 8 OF 14 *** + // Wavefunction(s) for diagram number 8 + // (none) + // Amplitude(s) for diagram number 8 + VVV1_0( w_fp[9], w_fp[6], w_fp[8], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram9( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 9 OF 14 *** + // Wavefunction(s) for diagram number 9 + FFV1_2( w_fp[1], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 9 + FFV1_0( w_fp[10], w_fp[4], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 36. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram10( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 10 OF 14 *** + // Wavefunction(s) for diagram number 10 + FFV1_1( w_fp[4], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 10 + FFV1_0( w_fp[1], w_fp[10], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 36. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram11( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 11 OF 14 *** + // Wavefunction(s) for diagram number 11 + FFV1_2( w_fp[0], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 11 + FFV1_0( w_fp[10], w_fp[5], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 36. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram12( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 12 OF 14 *** + // Wavefunction(s) for diagram number 12 + FFV1_2( w_fp[0], w_fp[8], COUPs[1], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 12 + FFV1_0( w_fp[10], w_fp[5], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 36. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram13( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 13 OF 14 *** + // Wavefunction(s) for diagram number 13 + FFV1_2( w_fp[0], w_fp[7], COUPs[1], 1.0, 0., 0., w_fp[6] ); + // Amplitude(s) for diagram number 13 + FFV1_0( w_fp[6], w_fp[4], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 36. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 12. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram14( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 14 OF 14 *** + // Wavefunction(s) for diagram number 14 + // (none) + // Amplitude(s) for diagram number 14 + FFV1_0( w_fp[10], w_fp[4], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 36. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 12. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/driver.f index f7f23196eb..fc9392238e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/matrix1.f index 8b80833180..ca1ea52d2b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -358,7 +358,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -404,7 +404,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(16) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -447,39 +448,32 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /2.700000000000000D+01 - $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D - $ +00,3.000000000000000D+00,9.000000000000000D+00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 6) /27,18,18,6,6,18/ C 1 T(3,1) T(5,2) T(6,4) - DATA (CF(I, 2),I= 1, 6) /9.000000000000000D+00 - $ ,2.700000000000000D+01,3.000000000000000D+00,9.000000000000000D - $ +00,9.000000000000000D+00,3.000000000000000D+00/ + DATA (CF(I),I= 7, 11) /27,6,18,18,6/ C 1 T(3,1) T(5,4) T(6,2) - DATA (CF(I, 3),I= 1, 6) /9.000000000000000D+00 - $ ,3.000000000000000D+00,2.700000000000000D+01,9.000000000000000D - $ +00,9.000000000000000D+00,3.000000000000000D+00/ + DATA (CF(I),I= 12, 15) /27,18,18,6/ C 1 T(3,2) T(5,1) T(6,4) - DATA (CF(I, 4),I= 1, 6) /3.000000000000000D+00 - $ ,9.000000000000000D+00,9.000000000000000D+00,2.700000000000000D - $ +01,3.000000000000000D+00,9.000000000000000D+00/ + DATA (CF(I),I= 16, 18) /27,6,18/ C 1 T(3,2) T(5,4) T(6,1) - DATA (CF(I, 5),I= 1, 6) /3.000000000000000D+00 - $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D - $ +00,2.700000000000000D+01,9.000000000000000D+00/ + DATA (CF(I),I= 19, 20) /27,18/ C 1 T(3,4) T(5,1) T(6,2) - DATA (CF(I, 6),I= 1, 6) /9.000000000000000D+00 - $ ,3.000000000000000D+00,3.000000000000000D+00,9.000000000000000D - $ +00,9.000000000000000D+00,2.700000000000000D+01/ + DATA (CF(I),I= 21, 21) /27/ C 1 T(3,4) T(5,2) T(6,1) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -585,10 +579,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -597,6 +593,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc index 1b918bae84..a00dd1fdde 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. 
+// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -107,20 +109,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 6; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -179,57 +177,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the 
current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - 
using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -237,377 +290,147 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 9; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g. 
<> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 7 *** - - // Wavefunction(s) for diagram number 1 - ixxxxx( momenta, 0., cHel[ihel][0], +1, w_fp[0], 0 ); - - oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - oxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); - - ixxxxx( momenta, 0., cHel[ihel][5], -1, w_fp[5], 5 ); - - FFV1P0_3( w_fp[0], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[6] ); - FFV1P0_3( w_fp[5], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[7] ); - FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[3], 
w_fp[8], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 36. * amp_sv[0]; - jamp_sv[1] -= 1. / 12. * amp_sv[0]; - jamp_sv[2] -= 1. / 12. * amp_sv[0]; - jamp_sv[4] += 1. / 4. * amp_sv[0]; - - // *** DIAGRAM 2 OF 7 *** - - // Wavefunction(s) for diagram number 2 - FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 36. * amp_sv[0]; - jamp_sv[1] -= 1. / 12. * amp_sv[0]; - jamp_sv[2] -= 1. / 12. * amp_sv[0]; - jamp_sv[3] += 1. / 4. * amp_sv[0]; - - // *** DIAGRAM 3 OF 7 *** - - // Wavefunction(s) for diagram number 3 - FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 3 - VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] -= 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 4 OF 7 *** - - // Wavefunction(s) for diagram number 4 - FFV1_2( w_fp[5], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[3] ); - - // Amplitude(s) for diagram number 4 - FFV1_0( w_fp[3], w_fp[4], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - jamp_sv[0] += 1. / 36. * amp_sv[0]; - jamp_sv[1] -= 1. / 12. * amp_sv[0]; - jamp_sv[4] += 1. / 4. * amp_sv[0]; - jamp_sv[5] -= 1. / 12. * amp_sv[0]; - - // *** DIAGRAM 5 OF 7 *** - - // Wavefunction(s) for diagram number 5 - FFV1_1( w_fp[4], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[3] ); - // Amplitude(s) for diagram number 5 - FFV1_0( w_fp[5], w_fp[3], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif
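[Note] M_ACCESS::ieventAccessRecordConst above returns a pointer to the record (the SIMD event page) that contains event ievt0 inside the full momenta buffer, so the C++ diagram kernels only ever see one page. A hedged sketch of the underlying pointer arithmetic, assuming a hypothetical AOSOA layout momenta[npagV][npar][np4][neppV] with illustrative sizes (the real accessor lives in MemoryAccessMomenta.h and differs in detail):

  constexpr int npar = 6, np4 = 4, neppV = 4; // assumed sizes, for illustration only
  inline const double* ieventAccessRecordConstSketch( const double* buffer, int ievt0 )
  {
    const int ipagV = ievt0 / neppV;            // index of the SIMD page containing ievt0
    return buffer + ipagV * npar * np4 * neppV; // first fptype of that page
  }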
- jamp_sv[0] += 1. / 36. * amp_sv[0]; - jamp_sv[1] -= 1. / 12. * amp_sv[0]; - jamp_sv[3] += 1. / 4. * amp_sv[0]; - jamp_sv[5] -= 1. / 12. * amp_sv[0]; - - // *** DIAGRAM 6 OF 7 *** - - // Wavefunction(s) for diagram number 6 - FFV1_2( w_fp[0], w_fp[7], COUPs[1], 1.0, 0., 0., w_fp[3] ); - - // Amplitude(s) for diagram number 6 - FFV1_0( w_fp[3], w_fp[1], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[0] += 1. / 36. * amp_sv[0]; - jamp_sv[2] -= 1. / 12. * amp_sv[0]; - jamp_sv[3] += 1. / 4. * amp_sv[0]; - jamp_sv[5] -= 1. / 12. * amp_sv[0]; - - // *** DIAGRAM 7 OF 7 *** - // Wavefunction(s) for diagram number 7 - FFV1_2( w_fp[0], w_fp[8], COUPs[1], 1.0, 0., 0., w_fp[3] ); - - // Amplitude(s) for diagram number 7 - FFV1_0( w_fp[3], w_fp[1], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 36. * amp_sv[0]; - jamp_sv[2] -= 1. / 12. * amp_sv[0]; - jamp_sv[4] += 1. / 4. * amp_sv[0]; - jamp_sv[5] -= 1. / 12. * amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_2_uux_ttxccx()?)
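[Note] The '-' lines below (now superseded by color_sum.cc) implement the standard color sum over leading-color flows. With partial amplitudes $J_i$ (jamp_sv), color matrix $\mathrm{cf}$ and denominators $\mathrm{denom}$ as defined in the code:

$$ \Delta|M|^2 \;=\; \sum_{i=1}^{n_{\mathrm{color}}} \frac{1}{\mathrm{denom}_i}\,\mathrm{Re}\Big( J_i^{*} \sum_{j=1}^{n_{\mathrm{color}}} \mathrm{cf}_{ij}\, J_j \Big) $$

Because cf is real and symmetric, this equals the diagonal terms plus twice the upper triangle, which is exactly what the constexpr TriangularNormalizedColorMatrix below precomputes at compile time (#475).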
- - // The color denominators (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6] - - // The color matrix (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 27, 9, 9, 3, 3, 9 }, - { 9, 27, 3, 9, 9, 3 }, - { 9, 3, 27, 9, 9, 3 }, - { 3, 9, 9, 27, 3, 9 }, - { 3, 9, 9, 3, 27, 9 }, - { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif
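[Note] In the mixed-precision branch above, fpvmerge packs the two double-precision SIMD pages (even and odd iParity) into one single-precision vector of twice the width, so the color algebra runs once in float for both pages (#537). A rough sketch of the idea using GCC vector extensions and an assumed neppV=4 (the real fpvmerge, fpvsplit0 and fpvsplit1 are defined in mgOnGpuVectors.h; this sketch is illustrative only):

  typedef double double4 __attribute__( ( vector_size( 32 ) ) ); // one double page (neppV=4)
  typedef float float8 __attribute__( ( vector_size( 32 ) ) );   // two pages merged in float
  inline float8 fpvmergeSketch( const double4& even, const double4& odd )
  {
    float8 out;
    for( int i = 0; i < 4; i++ ) out[i] = (float)even[i];    // even page in the low half
    for( int i = 0; i < 4; i++ ) out[4 + i] = (float)odd[i]; // odd page in the high half
    return out;
  }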
- for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ -
// NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 7 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif -#endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -694,7 +517,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -729,6 +556,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -771,6 
+602,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -873,26 +708,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -900,25 +735,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype*
colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //--------------------------------------------------------------------------
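[Note] The color choice in select_col above is inverse-CDF sampling: targetamp accumulates the icolamp-masked jamp2 weights into a running sum, and the first color whose normalized running sum exceeds the random number is selected. The same logic in a self-contained form (pickIndex is a hypothetical helper, not part of this diff):

  #include <cstdio>
  int pickIndex( const double* weights, int n, double rnd ) // rnd in [0,1)
  {
    double total = 0;
    for( int i = 0; i < n; i++ ) total += weights[i];
    double cumul = 0;
    for( int i = 0; i < n; i++ )
    {
      cumul += weights[i];                // running sum, like targetamp[icolC]
      if( rnd < cumul / total ) return i; // first bin whose CDF exceeds rnd
    }
    return n - 1;                         // guard against rounding at the upper edge
  }
  int main()
  {
    const double w[3] = { 1., 2., 1. };
    printf( "%d\n", pickIndex( w, 3, 0.5 ) ); // prints 1 (the heaviest bin)
    return 0;
  }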
// Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -1054,22 +1093,16 @@ namespace mg5amcCpu // These variable are not used anywhere else in the code and their scope is limited to this sanity check { // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) - constexpr int nprocesses = 2; + constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1081,17 +1114,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] =
0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1117,93 +1153,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) 
Then, in multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ?
ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1245,7 +1251,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1268,7 +1274,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1277,25 +1283,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1305,8 +1317,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1322,11 +1336,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1428,14 +1443,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h index 87faf25dfb..17302e0d54 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO 
development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include <vector> @@ -86,17 +87,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 7; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 6; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 9; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 6; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 7; //static const int ncomb = 64; // CPPProcess::ncomb
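[Note] As a cross-check of the constants above: this process has npar = 6 external fermions (u u~ -> t t~ c c~), each with two helicity states, so the helicity combination count quoted in the header follows as

$$ n_{\mathrm{comb}} = 2^{n_{\mathrm{par}}} = 2^{6} = 64. $$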
@@ -133,23 +134,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
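[Note] The ghelAll* arguments declared below are 'super-buffers': one contiguous per-event buffer per good helicity, concatenated and indexed by ighel with a fixed stride, which is how CPPProcess.cc slices ghelAllMEs, ghelAllJamps and ghelAllWfs. A minimal sketch of the slicing, using double in place of fptype (illustrative only):

  inline double* superBufferSlice( double* ghelBuffer, int ighel, long stridePerHel )
  {
    // e.g. stridePerHel = nevt for ghelAllMEs, ncolor*2*nevt for ghelAllJamps,
    // and nwf*nw6*2*nevt for ghelAllWfs: slice ighel is one full per-event buffer
    return ghelBuffer + ighel * stridePerHel;
  }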
@@ -163,34 +167,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig.f index e3f26606a1..41ac73e027 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig1.f index 74f009d272..92abff14ad 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -148,7 +148,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF D1=PDG2PDF(LPP(IB(1)),1, IB(1),XBK(IB(1)), QSCALE) U1=PDG2PDF(LPP(IB(1)),2, IB(1),XBK(IB(1)), QSCALE) @@ -158,7 +158,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) @@ -269,7 +269,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -353,6 +353,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -438,24 +442,24 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) D1(IVEC)=PDG2PDF(LPP(IB(1)),1, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) U1(IVEC)=PDG2PDF(LPP(IB(1)),2, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) S1(IVEC)=PDG2PDF(LPP(IB(1)),3, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) C1(IVEC)=PDG2PDF(LPP(IB(1)),4, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) UX2(IVEC)=PDG2PDF(LPP(IB(2)),-2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -627,6 +631,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. 
IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/color_sum.cc new file mode 100644 index 0000000000..087618686d --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/color_sum.cc @@ -0,0 +1,387 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6] + + // The color matrix (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 27, 9, 9, 3, 3, 9 }, + { 9, 27, 3, 9, 9, 3 }, + { 9, 3, 27, 9, 9, 3 }, + { 3, 9, 9, 27, 3, 9 }, + { 3, 9, 9, 3, 27, 9 }, + { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template<typename T> + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) 
+ { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain some speed here (though not a factor 2) as we only loop over the upper-triangular part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + 
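For reference, the triangular color sum in color_sum_cpu above reduces to the following minimal scalar sketch (no SIMD vectors, none of the plugin's fptype/cxtype machinery; the colorSum name and its arguments are illustrative only, not part of the patch). It assumes, as holds for the generated color matrices, that the color matrix is real and that cf[i][j]/denom[i] is symmetric, so each off-diagonal term can be counted twice from the upper triangle:

#include <array>
#include <complex>

// Illustrative stand-alone upper-triangular color sum:
// |M|^2 = sum_ij jamp_i^* ( cf_ij / denom_i ) jamp_j, for a real matrix with cf/denom symmetric.
// Writing jamp = A + iB, the cross terms cancel and the result is AMA + BMB (see #475).
template<int N>
double colorSum( const std::array<std::complex<double>, N>& jamp,
                 const double ( &cf )[N][N],   // color matrix (real)
                 const double ( &denom )[N] )  // per-row denominators
{
  double me2 = 0;
  for( int i = 0; i < N; i++ )
  {
    // Diagonal term, then doubled off-diagonal terms from the upper triangle only
    double ztempR = cf[i][i] / denom[i] * jamp[i].real();
    double ztempI = cf[i][i] / denom[i] * jamp[i].imag();
    for( int j = i + 1; j < N; j++ )
    {
      ztempR += 2 * cf[i][j] / denom[i] * jamp[j].real();
      ztempI += 2 * cf[i][j] / denom[i] * jamp[j].imag();
    }
    me2 += jamp[i].real() * ztempR + jamp[i].imag() * ztempI; // AMA + BMB
  }
  return me2;
}

In the plugin itself the factor 2 and the division by colorDenom are folded into the constexpr TriangularNormalizedColorMatrix at compile time, so the runtime loops only perform multiply-adds.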
//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
However, in case this is better for performance, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from 
DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/diagram_boilerplate.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/diagrams.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/diagrams.h new file mode 100644 index 0000000000..57e2446ba9 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/diagrams.h @@ -0,0 +1,247 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
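For reference, before the generated diagram kernels that follow, the two cuBLAS calls in color_sum_blas above amount to the minimal sketch below, shown for the real components only (the imaginary components are handled identically and accumulate into the same MEs). The cublasSgemm/cublasSgemmStridedBatched names are the plain single-precision cuBLAS entry points onto which the gpuBlasTgemm wrappers are assumed to map; the d_* device pointers are illustrative only, not part of the patch:

#include <cublas_v2.h>

// Step 1: Ztemp (ncolor x nevt) = NormColMat (ncolor x ncolor) * Jamps^T (ncolor x nevt),
// where Jamps uses the "new1" event-major striding: element (ievt,icol) at icol*nevt+ievt.
// Step 2: nevt batched 1x1 GEMMs, i.e. one dot product Jamps(ievt,:) . Ztemp(:,ievt) per event,
// accumulated (beta=1) into the running |M|^2 sums.
void colorSumBlasReal( cublasHandle_t handle, int ncolor, int nevt,
                       const float* d_colmat,    // [ncolor*ncolor] normalized color matrix (column-major)
                       const float* d_jampsReal, // [ncolor*nevt] real parts of the jamps
                       float* d_ztemp,           // [ncolor*nevt] scratch buffer
                       float* d_mes )            // [nevt] running sums of |M|^2 over helicities
{
  const float one = 1.f, zero = 0.f;
  cublasSgemm( handle, CUBLAS_OP_N, CUBLAS_OP_T,
               ncolor, nevt, ncolor,
               &one, d_colmat, ncolor, d_jampsReal, nevt,
               &zero, d_ztemp, ncolor );
  cublasSgemmStridedBatched( handle, CUBLAS_OP_N, CUBLAS_OP_N,
                             1, 1, ncolor,
                             &one,
                             d_jampsReal, nevt, 1,    // batch ievt reads row ievt of Jamps
                             d_ztemp, ncolor, ncolor, // batch ievt reads column ievt of Ztemp
                             &one,                    // beta=1: accumulate into the running sum
                             d_mes, 1, 1,
                             nevt );
}

Expressing the per-event dot products as a strided-batched GEMM of 1x1 results avoids a separate reduction kernel and keeps the whole color sum inside cuBLAS on the stream bound to this helicity's handle.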
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 7 *** + // Wavefunction(s) for diagram number 1 + ixxxxx( momenta, 0., cHel[ihel][0], +1, w_fp[0], 0 ); + oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + oxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); + ixxxxx( momenta, 0., cHel[ihel][5], -1, w_fp[5], 5 ); + FFV1P0_3( w_fp[0], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[6] ); + FFV1P0_3( w_fp[5], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[7] ); + FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); + // Amplitude(s) for diagram number 1 + FFV1_0( w_fp[3], w_fp[8], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 36. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 4. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 7 *** + // Wavefunction(s) for diagram number 2 + FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); + // Amplitude(s) for diagram number 2 + FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 36. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 4. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 7 *** + // Wavefunction(s) for diagram number 3 + FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 3 + VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 4. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 7 *** + // Wavefunction(s) for diagram number 4 + FFV1_2( w_fp[5], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[3] ); + // Amplitude(s) for diagram number 4 + FFV1_0( w_fp[3], w_fp[4], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 36. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 12. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 7 *** + // Wavefunction(s) for diagram number 5 + FFV1_1( w_fp[4], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[3] ); + // Amplitude(s) for diagram number 5 + FFV1_0( w_fp[5], w_fp[3], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 36. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 12. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 6 OF 7 *** + // Wavefunction(s) for diagram number 6 + FFV1_2( w_fp[0], w_fp[7], COUPs[1], 1.0, 0., 0., w_fp[3] ); + // Amplitude(s) for diagram number 6 + FFV1_0( w_fp[3], w_fp[1], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 36. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 12. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram7( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 7 OF 7 *** + // Wavefunction(s) for diagram number 7 + FFV1_2( w_fp[0], w_fp[8], COUPs[1], 1.0, 0., 0., w_fp[3] ); + // Amplitude(s) for diagram number 7 + FFV1_0( w_fp[3], w_fp[1], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 36. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 12. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/driver.f index f7f23196eb..fc9392238e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/matrix1.f index 728711155f..6dffcf0951 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -366,7 +366,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -420,7 +420,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(8) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -463,39 +464,32 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /2.700000000000000D+01 - $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D - $ +00,3.000000000000000D+00,9.000000000000000D+00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 6) /27,18,18,6,6,18/ C 1 T(2,1) T(3,4) T(5,6) - DATA (CF(I, 2),I= 1, 6) /9.000000000000000D+00 - $ ,2.700000000000000D+01,3.000000000000000D+00,9.000000000000000D - $ +00,9.000000000000000D+00,3.000000000000000D+00/ + DATA (CF(I),I= 7, 11) /27,6,18,18,6/ C 1 T(2,1) T(3,6) T(5,4) - DATA (CF(I, 3),I= 1, 6) /9.000000000000000D+00 - $ ,3.000000000000000D+00,2.700000000000000D+01,9.000000000000000D - $ +00,9.000000000000000D+00,3.000000000000000D+00/ + DATA (CF(I),I= 12, 15) /27,18,18,6/ C 1 T(2,4) T(3,1) T(5,6) - DATA (CF(I, 4),I= 1, 6) /3.000000000000000D+00 - $ ,9.000000000000000D+00,9.000000000000000D+00,2.700000000000000D - $ +01,3.000000000000000D+00,9.000000000000000D+00/ + DATA (CF(I),I= 16, 18) /27,6,18/ C 1 T(2,4) T(3,6) T(5,1) - DATA (CF(I, 5),I= 1, 6) /3.000000000000000D+00 - $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D - $ +00,2.700000000000000D+01,9.000000000000000D+00/ + DATA (CF(I),I= 19, 20) /27,18/ C 1 T(2,6) T(3,1) T(5,4) - DATA (CF(I, 6),I= 1, 6) /9.000000000000000D+00 - $ ,3.000000000000000D+00,3.000000000000000D+00,9.000000000000000D - $ +00,9.000000000000000D+00,2.700000000000000D+01/ + DATA (CF(I),I= 21, 21) /27/ C 1 
T(2,6) T(3,4) T(5,1) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -565,10 +559,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -577,6 +573,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc index 1c575b7757..2863f773a3 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,20 +101,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 12; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,57 +169,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: 
couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! 
in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using 
CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -229,804 +282,205 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 15; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! - //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) 
+ fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g. <> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = 
E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 36 *** - - // Wavefunction(s) for diagram number 1 - ixxxxx( momenta, 0., cHel[ihel][0], +1, w_fp[0], 0 ); - - oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - vxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); - - vxxxxx( momenta, 0., cHel[ihel][5], +1, w_fp[5], 5 ); - - VVV1P0_1( w_fp[4], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[6] ); - FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[7] ); - FFV1_1( w_fp[1], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[0], w_fp[8], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 2 OF 36 *** - - // Wavefunction(s) for diagram number 2 - FFV1_2( w_fp[0], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[8], w_fp[1], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 3 OF 36 *** - - // Wavefunction(s) for diagram number 3 - FFV1P0_3( w_fp[0], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 3 - VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= 1. / 2. * amp_sv[0]; - jamp_sv[3] += 1. / 2. * amp_sv[0]; - jamp_sv[4] += 1. / 2. * amp_sv[0]; - jamp_sv[5] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 4 OF 36 *** - - // Wavefunction(s) for diagram number 4 - FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); - - // Amplitude(s) for diagram number 4 - FFV1_0( w_fp[3], w_fp[9], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 5 OF 36 *** - - // Wavefunction(s) for diagram number 5 - FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); - - // Amplitude(s) for diagram number 5 - FFV1_0( w_fp[9], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 6 OF 36 *** - - // Wavefunction(s) for diagram number 6 - FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); - FFV1_2( w_fp[3], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[6] ); - - // Amplitude(s) for diagram number 6 - FFV1_0( w_fp[6], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 6. * amp_sv[0]; - jamp_sv[10] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 7 OF 36 *** - - // Wavefunction(s) for diagram number 7 - FFV1_1( w_fp[1], w_fp[5], COUPs[1], 1.0, 0., 0., w_fp[10] ); - FFV1P0_3( w_fp[3], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[11] ); - - // Amplitude(s) for diagram number 7 - FFV1_0( w_fp[0], w_fp[10], w_fp[11], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[10] -= 1. / 2. * amp_sv[0]; - jamp_sv[11] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 8 OF 36 *** - - // Wavefunction(s) for diagram number 8 - FFV1_2( w_fp[0], w_fp[5], COUPs[1], 1.0, 0., 0., w_fp[12] ); - - // Amplitude(s) for diagram number 8 - FFV1_0( w_fp[12], w_fp[1], w_fp[11], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= 1. / 2. * amp_sv[0]; - jamp_sv[11] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 9 OF 36 *** - - // Wavefunction(s) for diagram number 9 - FFV1_1( w_fp[9], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); - - // Amplitude(s) for diagram number 9 - FFV1_0( w_fp[3], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 6. * amp_sv[0]; - jamp_sv[2] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 10 OF 36 *** - - // Wavefunction(s) for diagram number 10 - // (none) - - // Amplitude(s) for diagram number 10 - VVV1_0( w_fp[5], w_fp[8], w_fp[11], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 11 OF 36 *** - - // Wavefunction(s) for diagram number 11 - FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); - FFV1_1( w_fp[2], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); - - // Amplitude(s) for diagram number 11 - FFV1_0( w_fp[11], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += 1. / 6. * amp_sv[0]; - jamp_sv[9] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 12 OF 36 *** - - // Wavefunction(s) for diagram number 12 - FFV1P0_3( w_fp[11], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[9] ); - - // Amplitude(s) for diagram number 12 - FFV1_0( w_fp[0], w_fp[10], w_fp[9], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[5] -= 1. / 2. * amp_sv[0]; - jamp_sv[11] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 13 OF 36 *** - - // Wavefunction(s) for diagram number 13 - // (none) - - // Amplitude(s) for diagram number 13 - FFV1_0( w_fp[12], w_fp[1], w_fp[9], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[9] -= 1. / 2. * amp_sv[0]; - jamp_sv[11] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 14 OF 36 *** - - // Wavefunction(s) for diagram number 14 - FFV1_2( w_fp[11], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] ); - - // Amplitude(s) for diagram number 14 - FFV1_0( w_fp[14], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += 1. / 6. * amp_sv[0]; - jamp_sv[5] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 15 OF 36 *** - - // Wavefunction(s) for diagram number 15 - // (none) - - // Amplitude(s) for diagram number 15 - VVV1_0( w_fp[5], w_fp[8], w_fp[9], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[5] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 16 OF 36 *** - - // Wavefunction(s) for diagram number 16 - FFV1_1( w_fp[1], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[9] ); - FFV1P0_3( w_fp[0], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[14] ); - - // Amplitude(s) for diagram number 16 - FFV1_0( w_fp[3], w_fp[13], w_fp[14], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[8] += 1. / 6. * amp_sv[0]; - jamp_sv[9] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 17 OF 36 *** - - // Wavefunction(s) for diagram number 17 - // (none) - - // Amplitude(s) for diagram number 17 - FFV1_0( w_fp[6], w_fp[2], w_fp[14], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 17 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] -= 1. / 2. * amp_sv[0]; - jamp_sv[8] += 1. / 6. 
* amp_sv[0]; - - // *** DIAGRAM 18 OF 36 *** - - // Wavefunction(s) for diagram number 18 - // (none) - - // Amplitude(s) for diagram number 18 - FFV1_0( w_fp[12], w_fp[9], w_fp[7], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[6] += 1. / 6. * amp_sv[0]; - jamp_sv[9] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 19 OF 36 *** - - // Wavefunction(s) for diagram number 19 - FFV1_1( w_fp[9], w_fp[5], COUPs[1], 1.0, 0., 0., w_fp[11] ); - - // Amplitude(s) for diagram number 19 - FFV1_0( w_fp[0], w_fp[11], w_fp[7], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] -= 1. / 2. * amp_sv[0]; - jamp_sv[6] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 20 OF 36 *** - - // Wavefunction(s) for diagram number 20 - // (none) - - // Amplitude(s) for diagram number 20 - VVV1_0( w_fp[5], w_fp[7], w_fp[14], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 21 OF 36 *** - - // Wavefunction(s) for diagram number 21 - FFV1_2( w_fp[0], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[14] ); - FFV1P0_3( w_fp[14], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[11] ); - - // Amplitude(s) for diagram number 21 - FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 21 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] -= 1. / 2. * amp_sv[0]; - jamp_sv[8] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 22 OF 36 *** - - // Wavefunction(s) for diagram number 22 - // (none) - - // Amplitude(s) for diagram number 22 - FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[8] += 1. / 6. * amp_sv[0]; - jamp_sv[10] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 23 OF 36 *** - - // Wavefunction(s) for diagram number 23 - // (none) - - // Amplitude(s) for diagram number 23 - FFV1_0( w_fp[14], w_fp[10], w_fp[7], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[7] += 1. / 6. * amp_sv[0]; - jamp_sv[10] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 24 OF 36 *** - - // Wavefunction(s) for diagram number 24 - FFV1_2( w_fp[14], w_fp[5], COUPs[1], 1.0, 0., 0., w_fp[9] ); - - // Amplitude(s) for diagram number 24 - FFV1_0( w_fp[9], w_fp[1], w_fp[7], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] -= 1. / 2. * amp_sv[0]; - jamp_sv[7] += 1. / 6. 
* amp_sv[0]; - - // *** DIAGRAM 25 OF 36 *** - - // Wavefunction(s) for diagram number 25 - // (none) - - // Amplitude(s) for diagram number 25 - VVV1_0( w_fp[5], w_fp[7], w_fp[11], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 25 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 26 OF 36 *** - - // Wavefunction(s) for diagram number 26 - FFV1_1( w_fp[13], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); - - // Amplitude(s) for diagram number 26 - FFV1_0( w_fp[3], w_fp[11], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += 1. / 6. * amp_sv[0]; - jamp_sv[3] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 27 OF 36 *** - - // Wavefunction(s) for diagram number 27 - VVV1P0_1( w_fp[4], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[11] ); - - // Amplitude(s) for diagram number 27 - FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 28 OF 36 *** - - // Wavefunction(s) for diagram number 28 - FFV1_2( w_fp[6], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); - - // Amplitude(s) for diagram number 28 - FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 6. * amp_sv[0]; - jamp_sv[4] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 29 OF 36 *** - - // Wavefunction(s) for diagram number 29 - // (none) - - // Amplitude(s) for diagram number 29 - FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 30 OF 36 *** - - // Wavefunction(s) for diagram number 30 - FFV1_1( w_fp[10], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[6] ); - - // Amplitude(s) for diagram number 30 - FFV1_0( w_fp[0], w_fp[6], w_fp[7], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[5] -= 1. / 2. * amp_sv[0]; - jamp_sv[7] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 31 OF 36 *** - - // Wavefunction(s) for diagram number 31 - VVV1P0_1( w_fp[4], w_fp[7], COUPs[0], 1.0, 0., 0., w_fp[6] ); - - // Amplitude(s) for diagram number 31 - FFV1_0( w_fp[0], w_fp[10], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[5] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 32 OF 36 *** - - // Wavefunction(s) for diagram number 32 - FFV1_2( w_fp[12], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 32 - FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 32 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= 1. / 2. * amp_sv[0]; - jamp_sv[6] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 33 OF 36 *** - - // Wavefunction(s) for diagram number 33 - // (none) - - // Amplitude(s) for diagram number 33 - FFV1_0( w_fp[12], w_fp[1], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 34 OF 36 *** - - // Wavefunction(s) for diagram number 34 - // (none) - - // Amplitude(s) for diagram number 34 - VVVV1_0( w_fp[4], w_fp[5], w_fp[7], w_fp[8], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[2] -= 1. / 2. * amp_sv[0]; - jamp_sv[3] += 1. / 2. * amp_sv[0]; - jamp_sv[4] += 1. / 2. * amp_sv[0]; - jamp_sv[5] -= 1. / 2. * amp_sv[0]; - VVVV3_0( w_fp[4], w_fp[5], w_fp[7], w_fp[8], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[2] -= 1. / 2. * amp_sv[0]; - jamp_sv[5] -= 1. / 2. * amp_sv[0]; - jamp_sv[9] += 1. / 2. * amp_sv[0]; - jamp_sv[10] += 1. / 2. * amp_sv[0]; - VVVV4_0( w_fp[4], w_fp[5], w_fp[7], w_fp[8], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[3] -= 1. / 2. * amp_sv[0]; - jamp_sv[4] -= 1. / 2. * amp_sv[0]; - jamp_sv[9] += 1. / 2. * amp_sv[0]; - jamp_sv[10] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 35 OF 36 *** - - // Wavefunction(s) for diagram number 35 - // (none) - - // Amplitude(s) for diagram number 35 - VVV1_0( w_fp[5], w_fp[8], w_fp[6], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= 1. / 2. * amp_sv[0]; - jamp_sv[5] -= 1. / 2. * amp_sv[0]; - jamp_sv[9] += 1. / 2. * amp_sv[0]; - jamp_sv[10] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 36 OF 36 *** - - // Wavefunction(s) for diagram number 36 - // (none) - - // Amplitude(s) for diagram number 36 - VVV1_0( w_fp[5], w_fp[7], w_fp[11], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] -= 1. / 2. * amp_sv[0]; - jamp_sv[4] -= 1. / 2. * amp_sv[0]; - jamp_sv[9] += 1. / 2. * amp_sv[0]; - jamp_sv[10] += 1. / 2. * amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_2_uux_ttxgg()?) 
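For reference, every removed diagram block above repeats the same multichannel bookkeeping. A minimal sketch (illustrative names, scalar doubles instead of fptype_sv, not the generated code) of that pattern and of how the single-diagram enhancement weight is formed:

// Toy only: per-diagram multichannel bookkeeping.
// For diagram idiag, |amp|^2 enters the numerator only when this event's
// channelId equals idiag, and enters the denominator for every diagram;
// channelId == 0 disables the single-diagram enhancement altogether.
void accumulateChannelWeights( unsigned int channelId, unsigned int idiag, double amp2,
                               double& numerator, double& denominator )
{
  if( channelId == idiag ) numerator += amp2;
  if( channelId != 0 ) denominator += amp2;
}
// After the loop over diagrams (and helicities), the matrix element is
// reweighted as ME *= numerator / denominator, cf. normalise_output further below.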
- - // The color denominators (initialize all array elements, with ncolor=12) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }; // 1-D array[12] - - // The color matrix (initialize all array elements, with ncolor=12) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 48, -6, 16, -2, 16, -2, 6, 6, 0, -2, 16, 0 }, - { -6, 48, -2, 16, -2, 16, 6, 6, 0, 16, -2, 0 }, - { 16, -2, 48, -6, 6, 6, 16, -2, -2, 0, 0, 16 }, - { -2, 16, -6, 48, 6, 6, -2, 16, 16, 0, 0, -2 }, - { 16, -2, 6, 6, 48, -6, 16, -2, 16, 0, 0, -2 }, - { -2, 16, 6, 6, -6, 48, -2, 16, -2, 0, 0, 16 }, - { 6, 6, 16, -2, 16, -2, 48, -6, 0, 16, -2, 0 }, - { 6, 6, -2, 16, -2, 16, -6, 48, 0, -2, 16, 0 }, - { 0, 0, -2, 16, 16, -2, 0, 0, 48, 16, 16, 6 }, - { -2, 16, 0, 0, 0, 0, 16, -2, 16, 48, 6, 16 }, - { 16, -2, 0, 0, 0, 0, -2, 16, 16, 6, 48, 16 }, - { 0, 0, 16, -2, -2, 16, 0, 0, 6, 16, 16, 48 } }; // 2-D array[12][12] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! 
skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... 
icol=%d\n", icol );
-#ifndef MGONGPUCPP_GPUIMPL
- // === C++ START ===
- // Diagonal terms
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
- fptype2_sv& jampRi_sv = jampR_sv[icol];
- fptype2_sv& jampIi_sv = jampI_sv[icol];
+ // ---------------
+ // --- MOMENTA ---
+ // ---------------
+#ifdef MGONGPUCPP_GPUIMPL
+ // CUDA diagram kernels take input/output buffers with momenta for all events
+ const fptype* momenta = allmomenta;
#else
- fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) );
- fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) );
+ // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector
+ const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 );
#endif
- fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv;
- fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv;
- // Off-diagonal terms
- for( int jcol = icol + 1; jcol < ncolor; jcol++ )
- {
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
- fptype2_sv& jampRj_sv = jampR_sv[jcol];
- fptype2_sv& jampIj_sv = jampI_sv[jcol];
+ // -------------
+ // --- JAMPS ---
+ // -------------
+ // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel)
+#ifdef MGONGPUCPP_GPUIMPL
+ // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument
+ // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol])
+ fptype* jamps = allJamps;
#else
- fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) );
- fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) );
+ // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument
+ // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol])
+ fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) );
#endif
- ztempR_sv += cf2.value[icol][jcol] * jampRj_sv;
- ztempI_sv += cf2.value[icol][jcol] * jampIj_sv;
- }
- fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
- deltaMEs_previous += fpvsplit0( deltaMEs2 );
- deltaMEs += fpvsplit1( deltaMEs2 );
+
+ // ------------------
+ // --- CHANNELIDS ---
+ // ------------------
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+#ifdef MGONGPUCPP_GPUIMPL
+ // CUDA diagram kernels take input/output buffers with channelIDs for all events
+ const unsigned int* channelIds = allChannelIds;
#else
- deltaMEs += deltaMEs2;
+ // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector
+ const unsigned int* channelIds = &channelId;
#endif
- // === C++ END ===
#else
- // === CUDA START ===
- fptype2_sv ztempR_sv = { 0 };
- fptype2_sv ztempI_sv = { 0 };
- for( int jcol = 0; jcol < ncolor; jcol++ )
- {
- fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] );
- fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] );
- ztempR_sv += cf[icol][jcol] * jampRj_sv;
- ztempI_sv += cf[icol][jcol] * jampIj_sv;
- }
- deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol];
- // === CUDA END ===
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+ const unsigned int* channelIds = nullptr;
#endif
- }
- // *** STORE THE RESULTS ***
-
- // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s)
- fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
- MEs_sv += deltaMEs; // fix #435
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
- fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous );
- MEs_sv_previous += deltaMEs_previous;
-#endif
- /*
+ // -------------------------------
+ // --- NUMERATORS/DENOMINATORS ---
+ // -------------------------------
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
#ifdef MGONGPUCPP_GPUIMPL
- if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv );
+ // CUDA diagram kernels take input/output buffers with numerators/denominators for all events
+ fptype* numerators = allNumerators;
+ fptype* denominators = allDenominators;
#else
-#ifdef MGONGPU_CPPSIMD
- if( cNGoodHel > 0 )
- for( int ieppV = 0; ieppV < neppV; ieppV++ )
- printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] );
+ // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector
+ fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
+ fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
+#endif
#else
- if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv );
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+ fptype* numerators =
nullptr; + fptype* denominators = nullptr; #endif + + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ + + // *** DIAGRAMS 1 TO 36 *** +#ifdef MGONGPUCPP_GPUIMPL + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram8, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram9, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram10, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram11, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram12, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram13, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram14, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram15, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram16, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram17, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram18, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram19, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram20, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram21, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram22, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram23, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram24, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram25, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + 
gpuLaunchKernelStream( diagram26, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram27, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram28, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram29, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram30, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram31, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram32, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram33, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram34, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram35, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram36, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); +#else + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram8( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram9( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram10( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram11( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram12( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram13( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram14( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram15( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram16( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram17( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram18( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram19( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram20( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram21( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram22( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram23( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram24( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram25( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram26( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram27( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram28( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram29( wfs, jamps, channelIds, COUPs, numerators, 
denominators );
+ diagram30( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram31( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram32( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram33( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram34( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram35( wfs, jamps, channelIds, COUPs, numerators, denominators );
+ diagram36( wfs, jamps, channelIds, COUPs, numerators, denominators );
#endif
- */
- } // END LOOP ON IPARITY
- mgDebug( 1, __FUNCTION__ );
+ }
+ // *****************************
+ // *** END LOOP ON IPARITY ***
+ // *****************************
+ return;
  }

@@ -1113,7 +567,11 @@ namespace mg5amcCpu
#else
  memcpy( cHel, tHel, ncomb * npar * sizeof( short ) );
#endif
- fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions
+
+ // Enable SIGFPE traps for Floating Point Exceptions
+#ifdef MGONGPUCPP_DEBUG
+ fpeEnable();
+#endif
  }

  //--------------------------------------------------------------------------

@@ -1148,6 +606,10 @@ namespace mg5amcCpu
  m_masses.push_back( m_pars->mdl_MT );
  m_masses.push_back( m_pars->ZERO );
  m_masses.push_back( m_pars->ZERO );
+#ifdef MGONGPUCPP_GPUIMPL
+ // Create the normalized color matrix in device memory
+ createNormalizedColorMatrix();
+#endif
  // Read physics parameters like masses and couplings from user configuration files (static: initialize once)
  // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory
  const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT };
@@ -1190,6 +652,10 @@ namespace mg5amcCpu
  m_masses.push_back( Parameters_sm::mdl_MT );
  m_masses.push_back( Parameters_sm::ZERO );
  m_masses.push_back( Parameters_sm::ZERO );
+#ifdef MGONGPUCPP_GPUIMPL
+ // Create the normalized color matrix in device memory
+ createNormalizedColorMatrix();
+#endif
  }
#endif

@@ -1292,26 +758,26 @@ namespace mg5amcCpu
#ifdef MGONGPUCPP_GPUIMPL
  using namespace mg5amcGpu;
  using G_ACCESS = DeviceAccessGs;
- using C_ACCESS = DeviceAccessCouplings;
- G2COUP( allgs, allcouplings, bsmIndepParam );
+ using CD_ACCESS = DeviceAccessCouplings;
+ G2COUP( allgs, allcouplings, bsmIndepParam );
#else
  using namespace mg5amcCpu;
  using G_ACCESS = HostAccessGs;
- using C_ACCESS = HostAccessCouplings;
+ using CD_ACCESS = HostAccessCouplings;
  for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV )
  {
  const int ievt0 = ipagV * neppV;
  const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 );
  fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 );
- G2COUP( gs, couplings, bsmIndepParam );
+ G2COUP( gs, couplings, bsmIndepParam );
  }
#endif
  }

  //--------------------------------------------------------------------------

-#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
- __global__ void
+#ifdef MGONGPUCPP_GPUIMPL
+ void /* clang-format off */
  sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4]
  const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
  fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
  fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities
  fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
#endif
- bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation)
- { /* clang-format on */
- const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
+ fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop)
+ fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt]
+ bool* isGoodHel, // output: isGoodHel[ncomb] - host array
+ const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+ { /* clang-format on */
+ const int maxtry0 = 16;
+ fptype hstMEs[maxtry0];
+ const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0)
...
  // Loop over only nevt events if nevt is < 16 (note that nevt is always >= neppV)
  assert( nevt >= neppV );
  const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0)
...
+#ifdef MGONGPUCPP_GPUIMPL
+ __global__ void
+ add_and_select_hel( int* allselhel, // output: helicity selection[nevt]
+                     const fptype* allrndhel, // input: random numbers[nevt] for helicity selection
+                     fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+                     fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
+                     const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+ {
+ const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
+ for( int ighel = 0; ighel < dcNGoodHel; ighel++ )
+ {
+ allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt];
+ ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection
+ }
+ // Event-by-event random choice of helicity #403
+ //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] );
+ for( int ighel = 0; ighel < dcNGoodHel; ighel++ )
+ {
+ if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) )
+ {
+ const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1]
+ allselhel[ievt] = ihelF;
+ //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF );
+ break;
+ }
+ }
+ return;
+ }
+#endif
+
+ //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ __global__ void
+ update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity
+                fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable)
+ {
+ using J_ACCESS = DeviceAccessJamp;
+ using J2_ACCESS = DeviceAccessJamp2;
+ for( int icol = 0; icol < ncolor; icol++ )
+ // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream!
+ atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) );
+ }
+#endif
+#endif
+
+ //--------------------------------------------------------------------------
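The atomicAdd in update_jamp2s above is what makes the per-helicity streams safe to run concurrently, since several helicity streams accumulate into the same colAllJamp2s element. A self-contained toy (not mg5amc code) of the same pattern:

#include <cuda_runtime.h>
// Toy: kernels launched on different streams all accumulate into the same
// output buffer; atomicAdd avoids lost updates when two streams perform the
// concurrent read-modify-write on the same element.
__global__ void addJamp2Toy( const float* jamp2OneHel, float* jamp2Sum, int n )
{
  const int i = blockDim.x * blockIdx.x + threadIdx.x;
  if( i < n ) atomicAdd( &jamp2Sum[i], jamp2OneHel[i] ); // safe across concurrent streams
}
// Usage sketch: launch addJamp2Toy once per helicity, each on its own stream,
// with the same jamp2Sum pointer; a plain "+=" here would be a data race.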
+#ifdef MGONGPUCPP_GPUIMPL
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ __global__ void
+ select_col( int* allselcol, // output: color selection[nevt]
+             const fptype* allrndcol, // input: random numbers[nevt] for color selection
+             const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911)
+             const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled)
+             const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+ {
+ const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread)
+ // SCALAR channelId for the current event (CUDA)
+ unsigned int channelId = gpu_channelId( allChannelIds );
+ // Event-by-event random choice of color #402
+ if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
+ {
+ if( channelId > mgOnGpu::nchannels )
+ {
+ printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
+ assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
+ }
+ // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...)
+ fptype_sv jamp2_sv[ncolor] = { 0 };
+ assert( allJamp2s != nullptr ); // sanity check
+ using J2_ACCESS = DeviceAccessJamp2;
+ for( int icolC = 0; icolC < ncolor; icolC++ )
+ jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC );
+ // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
+ // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
+ const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
+ if( iconfig <= 0 )
+ {
+ printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
+ assert( iconfig > 0 ); // SANITY CHECK #917
+ }
+ else if( iconfig > (int)mgOnGpu::nconfigSDE )
+ {
+ printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE );
+ assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
+ }
+ fptype targetamp[ncolor] = { 0 };
+ // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
+ for( int icolC = 0; icolC < ncolor; icolC++ )
+ {
+ if( icolC == 0 )
+ targetamp[icolC] = 0;
+ else
+ targetamp[icolC] = targetamp[icolC - 1];
+ // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1)
+ if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC];
+ }
+ //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
+ for( int icolC = 0; icolC < ncolor; icolC++ )
+ {
+ if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) )
+ {
+ allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
+ //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 );
+ break;
+ }
+ }
+ }
+ else
+ {
+ allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931)
+ }
+ return;
+ }
+#endif
+#endif
+
+ //--------------------------------------------------------------------------

  // Evaluate |M|^2, part independent of incoming flavour
- __global__ void /* clang-format off */
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+ void
  sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4]
  const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
  const fptype* allrndhel, // input: random numbers[nevt] for helicity selection
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
  const fptype* allrndcol, // input: random numbers[nevt] for color selection
+ const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+#endif
  fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
+ int* allselhel, // output: helicity selection[nevt]
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ int* allselcol, // output: color selection[nevt]
+ fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
+ fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+ fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+#endif
+ fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+ fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+ fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+ fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+ gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null)
+ gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
+ const int gpublocks, // input: cuda gpublocks
+ const int gputhreads ) // input: cuda gputhreads
+#else
+ void
+ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4]
+ const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
+ const fptype* allrndhel, // input: random numbers[nevt] for helicity selection
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
- fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities
- fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
+ const fptype* allrndcol, // input: random numbers[nevt] for color selection
+ const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
#endif
+ fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
  int* allselhel, // output: helicity selection[nevt]
- int* allselcol // output: helicity selection[nevt]
-#ifndef MGONGPUCPP_GPUIMPL
- , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ int* allselcol, // output: color selection[nevt]
+ fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities
+ fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities
#endif
- ) /* clang-format on */
+ const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+ )
+#endif /* clang-format on */
  {
  mgDebugInitialise();

@@ -1473,22 +1143,16 @@ namespace mg5amcCpu
  // These variables are not used anywhere else in the code and their scope is limited to this sanity check
  {
  // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754)
- constexpr int nprocesses = 2;
+ constexpr int nprocesses = 1;
  static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" );
- constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter
+ constexpr int process_id = 1; // code generation source: standalone_cudacpp
  static_assert( process_id == 1, "Assume process_id == 1" );
  }

  // Denominators: spins, colors and identical particles
  constexpr int helcolDenominators[1] = { 72 }; // assume nprocesses == 1 (#272 and #343)

-#ifdef MGONGPUCPP_GPUIMPL
- // Remember: in CUDA this is a kernel for one event, in c++ this processes n events
- const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events
-#endif
-#else
+#ifndef MGONGPUCPP_GPUIMPL
  //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS]
  //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS]
  using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events
@@ -1500,17 +1164,20 @@ namespace mg5amcCpu
#endif

  // Start sigmaKin_lines
- #include "GpuAbstraction.h"

- // === PART 0 - INITIALISATION (before calculate_wavefunctions) ===
+ // === PART 0 - INITIALISATION (before calculate_jamps) ===
  // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event
#ifdef MGONGPUCPP_GPUIMPL
- allMEs[ievt] = 0;
+ const int nevt = gpublocks * gputhreads;
+ gpuMemset( allMEs, 0, nevt * sizeof( fptype ) );
+ gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) );
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- allNumerators[ievt] = 0;
- allDenominators[ievt] = 0;
+ gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) );
+ gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) );
+ gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) );
#endif
+ gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) );
#else
  const int npagV = nevt / neppV;
  for( int ipagV = 0; ipagV < npagV; ++ipagV )
@@ -1536,93 +1203,63 @@ namespace mg5amcCpu
#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++

  // *** START OF PART 1a - CUDA (one event per GPU thread) ***
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++)
- // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page
- unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
- if( allChannelIds != nullptr )
- {
- const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds)
- const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
- // NB: channelIds_sv is a scalar in CUDA
- channelId = channelIds_sv;
- assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
- }
-#endif
- // Running sum of partial amplitudes squared for event by event color selection (#402)
- // (for the single event processed in calculate_wavefunctions)
- fptype_sv jamp2_sv[nParity * ncolor] = { 0 };
- fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event)
+ // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream)
+ // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamps for each helicity
+ // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s
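A self-contained toy (not mg5amc code) of the one-stream-per-helicity launch pattern described in the comments above; kernels queued on the same stream still execute in order, while kernels on different streams may overlap:

#include <cuda_runtime.h>
// Toy: one stream per "helicity"; each stream gets its own slice of a
// super-buffer, so the per-helicity work is independent and can overlap.
__global__ void workToy( float* buf, int n )
{
  const int i = blockDim.x * blockIdx.x + threadIdx.x;
  if( i < n ) buf[i] += 1.f;
}
int main()
{
  const int nhel = 4, n = 1024; // illustrative sizes
  float* superBuf = nullptr;
  cudaMalloc( &superBuf, nhel * n * sizeof( float ) );
  cudaMemset( superBuf, 0, nhel * n * sizeof( float ) );
  cudaStream_t streams[nhel];
  for( int ih = 0; ih < nhel; ih++ )
  {
    cudaStreamCreate( &streams[ih] );
    workToy<<<n / 256, 256, 0, streams[ih]>>>( superBuf + ih * n, n ); // may overlap across streams
  }
  cudaDeviceSynchronize(); // wait for all streams, cf. the gpuDeviceSynchronize below
  for( int ih = 0; ih < nhel; ih++ ) cudaStreamDestroy( streams[ih] );
  cudaFree( superBuf );
  return 0;
}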
  for( int ighel = 0; ighel < cNGoodHel; ighel++ )
  {
  const int ihel = cGoodHel[ighel];
+ fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2;
+ fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2;
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv );
+ fptype* hAllNumerators = ghelAllNumerators + ighel * nevt;
+ fptype* hAllDenominators = ghelAllDenominators + ighel * nevt;
+ calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads );
#else
- calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv );
+ calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads );
#endif
- MEs_ighel[ighel] = allMEs[ievt];
- }
- // Event-by-event random choice of helicity #403
- //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] );
- for( int ighel = 0; ighel < cNGoodHel; ighel++ )
- {
- if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) )
- {
- const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1]
- allselhel[ievt] = ihelF;
- //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF );
- break;
- }
- }
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- // Event-by-event random choice of color #402
- if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783)
+ // (1b) Then, in multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream
+ for( int ighel = 0; ighel < cNGoodHel; ighel++ )
  {
- if( channelId > mgOnGpu::nchannels )
- {
- printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels );
- assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910
- }
- // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig)
- // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853)
- if( iconfig <= 0 )
- {
- printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId );
- assert( iconfig > 0 ); // SANITY CHECK #917
- }
- else if( iconfig > (int)mgOnGpu::nconfigSDE )
- {
- printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE );
- assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917
- }
- fptype targetamp[ncolor] = { 0 };
- // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1]
- for( int icolC = 0; icolC < ncolor; icolC++ )
- {
- if( icolC == 0 )
- targetamp[icolC] = 0;
- else
- targetamp[icolC] = targetamp[icolC - 1];
- // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1)
- if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC];
- }
- //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] );
- for( int icolC = 0; icolC < ncolor; icolC++ )
- {
- if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) )
- {
- allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1]
- //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 );
- break;
- }
- }
+ fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2;
+ gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s );
  }
+#endif
+ // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps
+ if( !ghelBlasHandles )
+ assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
+ else
+ assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu)
+ for( int ighel = 0; ighel < cNGoodHel; ighel++ )
+ {
+ fptype* hAllMEs = ghelAllMEs + ighel * nevt;
+ fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2;
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+ fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr );
+ if( hAllBlasTmp )
+ gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0)
#else
+ fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr );
+ if( hAllBlasTmp )
+ gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...)
+#endif
+#ifndef MGONGPU_HAS_NO_BLAS
+ gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ?
&( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1664,7 +1301,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1687,7 +1324,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1696,25 +1333,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1724,8 +1367,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1741,11 +1386,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1847,14 +1493,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h index 0689624568..8906bee944 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO 
development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -78,17 +79,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 36; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 12; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 15; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 6; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 38; //static const int ncomb = 64; // CPPProcess::ncomb @@ -125,23 +126,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -155,34 +159,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators 
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig.f index 5787ba42b2..9f0a834688 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig1.f index 75d947b792..02fad5c3ba 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,7 +140,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF D1=PDG2PDF(LPP(IB(1)),1, IB(1),XBK(IB(1)), QSCALE) U1=PDG2PDF(LPP(IB(1)),2, IB(1),XBK(IB(1)), QSCALE) @@ -150,7 +150,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) @@ -237,7 +237,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -313,6 +313,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -398,24 +402,24 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) D1(IVEC)=PDG2PDF(LPP(IB(1)),1, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) U1(IVEC)=PDG2PDF(LPP(IB(1)),2, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) S1(IVEC)=PDG2PDF(LPP(IB(1)),3, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) C1(IVEC)=PDG2PDF(LPP(IB(1)),4, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) UX2(IVEC)=PDG2PDF(LPP(IB(2)),-2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -539,6 +543,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. 
IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/color_sum.cc new file mode 100644 index 0000000000..ffbf0d5f94 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/color_sum.cc @@ -0,0 +1,393 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=12) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }; // 1-D array[12] + + // The color matrix (initialize all array elements, with ncolor=12) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 48, -6, 16, -2, 16, -2, 6, 6, 0, -2, 16, 0 }, + { -6, 48, -2, 16, -2, 16, 6, 6, 0, 16, -2, 0 }, + { 16, -2, 48, -6, 6, 6, 16, -2, -2, 0, 0, 16 }, + { -2, 16, -6, 48, 6, 6, -2, 16, 16, 0, 0, -2 }, + { 16, -2, 6, 6, 48, -6, 16, -2, 16, 0, 0, -2 }, + { -2, 16, 6, 6, -6, 48, -2, 16, -2, 0, 0, 16 }, + { 6, 6, 16, -2, 16, -2, 48, -6, 0, 16, -2, 0 }, + { 6, 6, -2, 16, -2, 16, -6, 48, 0, -2, 16, 0 }, + { 0, 0, -2, 16, 16, -2, 0, 0, 48, 16, 16, 6 }, + { -2, 16, 0, 0, 0, 0, 16, -2, 16, 48, 6, 16 }, + { 16, -2, 0, 0, 0, 0, -2, 16, 16, 6, 48, 16 }, + { 0, 0, 16, -2, -2, 16, 0, 0, 6, 16, 16, 48 } }; // 2-D array[12][12] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) 
or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, in C++ we use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain in speed here (though not a full factor of 2) as we only loop over the upper triangular part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the 
running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from 
DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/diagram_boilerplate.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/diagrams.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/diagrams.h new file mode 100644 index 0000000000..0dd99001f6 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/diagrams.h @@ -0,0 +1,1132 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
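// [Note] For reference before the per-diagram kernels below: the three color-sum
// implementations above (color_sum_cpu, color_sum_kernel and color_sum_blas) all
// compute the same contraction for one helicity, |M|^2 += sum_ij jamp_i^* *
// ( colorMatrix_ij / colorDenom_i ) * jamp_j. A minimal standalone sketch in plain
// C++ (illustrative only: std::complex replaces the plugin's cxtype types, and
// colorSumReference is a hypothetical name, not part of the generated code):
#include <complex>
double colorSumReference( const std::complex<double>* jamp, // [ncolor] color-ordered partial amplitudes
                          const double* cf,                 // [ncolor*ncolor] color matrix, row-major
                          const double* denom,              // [ncolor] per-row denominators
                          int ncolor )
{
  // Since cf is real and symmetric, the quadratic form (A-iB) cf (A+iB) reduces
  // to A*cf*A + B*cf*B over the real (A) and imaginary (B) parts of the jamps (#475)
  double me2 = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztempR = 0, ztempI = 0; // i-th component of ( cf / denom ) * jamp
    for( int j = 0; j < ncolor; j++ )
    {
      ztempR += cf[i * ncolor + j] / denom[i] * jamp[j].real();
      ztempI += cf[i * ncolor + j] / denom[i] * jamp[j].imag();
    }
    me2 += jamp[i].real() * ztempR + jamp[i].imag() * ztempI;
  }
  return me2;
}
// The BLAS path factorizes exactly this computation: Step 1 is one gemm per
// real/imaginary part ( ztemp = normalized color matrix times jamps, for all events
// at once ), Step 2 is a batch of per-event dot products jamp . ztemp via
// gemmStridedBatched with beta=1, accumulating into the running sum over helicities.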
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb-1) + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 36 *** + // Wavefunction(s) for diagram number 1 + ixxxxx( momenta, 0., cHel[ihel][0], +1, w_fp[0], 0 ); + oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + vxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); + vxxxxx( momenta, 0., cHel[ihel][5], +1, w_fp[5], 5 ); + VVV1P0_1( w_fp[4], w_fp[5], COUPs[0], 1.0, 0., 0., w_fp[6] ); + FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[7] ); + FFV1_1( w_fp[1], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 1 + FFV1_0( w_fp[0], w_fp[8], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + }
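// [Note] diagram_boilerplate.h is a symlink whose target is not included in this diff.
// Judging from the names used in the diagram kernels (an assumption, not verified here),
// it presumably:
// - defines J_ACCESS (DeviceAccessJamp on GPU, HostAccessJamp in C++) for the
//   kernelAccessIcol( jamps, icol ) updates;
// - declares amp_sv, the amplitude of the current diagram, and _fp, a raw fptype*
//   alias of amp_sv that the FFV*/VVV* helas calls fill through &_fp[0];
// - maps the w_fp wavefunction slots (nwf=15 for this process) onto the wfs buffer;
// - under MGONGPU_SUPPORTS_MULTICHANNEL, derives the scalar channelId from channelIds
//   and binds numerators_sv/denominators_sv to the numerators/denominators buffers;
//   otherwise it asserts that channelIds, numerators and denominators are all nullptr,
//   as noted in the comment at the top of each kernel.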
+ + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 36 *** + // Wavefunction(s) for diagram number 2 + FFV1_2( w_fp[0], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 2 + FFV1_0( w_fp[8], w_fp[1], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 36 *** + // Wavefunction(s) for diagram number 3 + FFV1P0_3( w_fp[0], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 3 + VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * amp_sv[0]; + }
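// [Note] The two multichannel lines repeated in every diagram kernel implement the
// single-diagram enhancement (SDE): per event, denominators_sv accumulates the
// squared amplitude |amp|^2 of every diagram, while numerators_sv accumulates it
// only when channelId selects this specific diagram; after the sum over helicities,
// normalise_output rescales the ME by allNumerators[ievt] / allDenominators[ievt]
// (and skips the rescaling when allChannelIds is nullptr, see #892/#899).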
+ + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 36 *** + // Wavefunction(s) for diagram number 4 + FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); + // Amplitude(s) for diagram number 4 + FFV1_0( w_fp[3], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 36 *** + // Wavefunction(s) for diagram number 5 + FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); + // Amplitude(s) for diagram number 5 + FFV1_0( w_fp[9], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. 
* cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 6 OF 36 *** + // Wavefunction(s) for diagram number 6 + FFV1_1( w_fp[2], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); + FFV1_2( w_fp[3], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[6] ); + // Amplitude(s) for diagram number 6 + FFV1_0( w_fp[6], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram7( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 7 OF 36 *** + // Wavefunction(s) for diagram number 7 + FFV1_1( w_fp[1], w_fp[5], COUPs[1], 1.0, 0., 0., w_fp[10] ); + FFV1P0_3( w_fp[3], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[11] ); + // Amplitude(s) for diagram number 7 + FFV1_0( w_fp[0], w_fp[10], w_fp[11], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += 1. / 6. * amp_sv[0]; + }
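// [Note] The w_fp slots are a scratch pad shared across the diagram kernels of one
// helicity: for example w_fp[8] is written by FFV1_1 in diagram 1, overwritten by
// FFV1_2 in diagram 2 and by FFV1P0_3 in diagram 3, then read again by diagrams 4-6
// and 9. The diagram kernels for a given helicity must therefore execute in order;
// on GPU this is presumably guaranteed by launching them all on that helicity's
// dedicated stream (ghelStreams[ighel]), which serializes kernels within one
// helicity while different helicities can still run concurrently.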
+ + //-------------------------------------------------------------------------- + + __global__ void + diagram8( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 8 OF 36 *** + // Wavefunction(s) for diagram number 8 + FFV1_2( w_fp[0], w_fp[5], COUPs[1], 1.0, 0., 0., w_fp[12] ); + // Amplitude(s) for diagram number 8 + FFV1_0( w_fp[12], w_fp[1], w_fp[11], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram9( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 9 OF 36 *** + // Wavefunction(s) for diagram number 9 + FFV1_1( w_fp[9], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); + // Amplitude(s) for diagram number 9 + FFV1_0( w_fp[3], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram10( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 10 OF 36 *** + // Wavefunction(s) for diagram number 10 + // (none) + // Amplitude(s) for diagram number 10 + VVV1_0( w_fp[5], w_fp[8], w_fp[11], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram11( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 11 OF 36 *** + // Wavefunction(s) for diagram number 11 + FFV1_2( w_fp[3], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); + FFV1_1( w_fp[2], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); + // Amplitude(s) for diagram number 11 + FFV1_0( w_fp[11], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram12( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 12 OF 36 *** + // Wavefunction(s) for diagram number 12 + FFV1P0_3( w_fp[11], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[9] ); + // Amplitude(s) for diagram number 12 + FFV1_0( w_fp[0], w_fp[10], w_fp[9], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram13( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 13 OF 36 *** + // Wavefunction(s) for diagram number 13 + // (none) + // Amplitude(s) for diagram number 13 + FFV1_0( w_fp[12], w_fp[1], w_fp[9], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram14( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 14 OF 36 *** + // Wavefunction(s) for diagram number 14 + FFV1_2( w_fp[11], w_fp[5], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] ); + // Amplitude(s) for diagram number 14 + FFV1_0( w_fp[14], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram15( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 15 OF 36 *** + // Wavefunction(s) for diagram number 15 + // (none) + // Amplitude(s) for diagram number 15 + VVV1_0( w_fp[5], w_fp[8], w_fp[9], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram16( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 16 OF 36 *** + // Wavefunction(s) for diagram number 16 + FFV1_1( w_fp[1], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[9] ); + FFV1P0_3( w_fp[0], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[14] ); + // Amplitude(s) for diagram number 16 + FFV1_0( w_fp[3], w_fp[13], w_fp[14], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram17( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 17 OF 36 *** + // Wavefunction(s) for diagram number 17 + // (none) + // Amplitude(s) for diagram number 17 + FFV1_0( w_fp[6], w_fp[2], w_fp[14], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 17 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram18( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 18 OF 36 *** + // Wavefunction(s) for diagram number 18 + // (none) + // Amplitude(s) for diagram number 18 + FFV1_0( w_fp[12], w_fp[9], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram19( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 19 OF 36 *** + // Wavefunction(s) for diagram number 19 + FFV1_1( w_fp[9], w_fp[5], COUPs[1], 1.0, 0., 0., w_fp[11] ); + // Amplitude(s) for diagram number 19 + FFV1_0( w_fp[0], w_fp[11], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram20( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 20 OF 36 *** + // Wavefunction(s) for diagram number 20 + // (none) + // Amplitude(s) for diagram number 20 + VVV1_0( w_fp[5], w_fp[7], w_fp[14], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram21( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 21 OF 36 *** + // Wavefunction(s) for diagram number 21 + FFV1_2( w_fp[0], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[14] ); + FFV1P0_3( w_fp[14], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[11] ); + // Amplitude(s) for diagram number 21 + FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 21 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram22( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 22 OF 36 *** + // Wavefunction(s) for diagram number 22 + // (none) + // Amplitude(s) for diagram number 22 + FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram23( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 23 OF 36 *** + // Wavefunction(s) for diagram number 23 + // (none) + // Amplitude(s) for diagram number 23 + FFV1_0( w_fp[14], w_fp[10], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram24( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 24 OF 36 *** + // Wavefunction(s) for diagram number 24 + FFV1_2( w_fp[14], w_fp[5], COUPs[1], 1.0, 0., 0., w_fp[9] ); + // Amplitude(s) for diagram number 24 + FFV1_0( w_fp[9], w_fp[1], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram25( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 25 OF 36 *** + // Wavefunction(s) for diagram number 25 + // (none) + // Amplitude(s) for diagram number 25 + VVV1_0( w_fp[5], w_fp[7], w_fp[11], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 25 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram26( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 26 OF 36 *** + // Wavefunction(s) for diagram number 26 + FFV1_1( w_fp[13], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[11] ); + // Amplitude(s) for diagram number 26 + FFV1_0( w_fp[3], w_fp[11], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram27( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 27 OF 36 *** + // Wavefunction(s) for diagram number 27 + VVV1P0_1( w_fp[4], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[11] ); + // Amplitude(s) for diagram number 27 + FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram28( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 28 OF 36 *** + // Wavefunction(s) for diagram number 28 + FFV1_2( w_fp[6], w_fp[4], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); + // Amplitude(s) for diagram number 28 + FFV1_0( w_fp[13], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram29( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 29 OF 36 *** + // Wavefunction(s) for diagram number 29 + // (none) + // Amplitude(s) for diagram number 29 + FFV1_0( w_fp[6], w_fp[2], w_fp[11], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram30( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 30 OF 36 *** + // Wavefunction(s) for diagram number 30 + FFV1_1( w_fp[10], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[6] ); + // Amplitude(s) for diagram number 30 + FFV1_0( w_fp[0], w_fp[6], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram31( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 31 OF 36 *** + // Wavefunction(s) for diagram number 31 + VVV1P0_1( w_fp[4], w_fp[7], COUPs[0], 1.0, 0., 0., w_fp[6] ); + // Amplitude(s) for diagram number 31 + FFV1_0( w_fp[0], w_fp[10], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram32( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 32 OF 36 *** + // Wavefunction(s) for diagram number 32 + FFV1_2( w_fp[12], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 32 + FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 32 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram33( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 33 OF 36 *** + // Wavefunction(s) for diagram number 33 + // (none) + // Amplitude(s) for diagram number 33 + FFV1_0( w_fp[12], w_fp[1], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram34( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 34 OF 36 *** + // Wavefunction(s) for diagram number 34 + // (none) + // Amplitude(s) for diagram number 34 + VVVV1_0( w_fp[4], w_fp[5], w_fp[7], w_fp[8], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * amp_sv[0]; + VVVV3_0( w_fp[4], w_fp[5], w_fp[7], w_fp[8], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * amp_sv[0]; + VVVV4_0( w_fp[4], w_fp[5], w_fp[7], w_fp[8], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram35( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 35 OF 36 *** + // Wavefunction(s) for diagram number 35 + // (none) + // Amplitude(s) for diagram number 35 + VVV1_0( w_fp[5], w_fp[8], w_fp[6], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram36( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 36 OF 36 *** + // Wavefunction(s) for diagram number 36 + // (none) + // Amplitude(s) for diagram number 36 + VVV1_0( w_fp[5], w_fp[7], w_fp[11], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. 
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/driver.f
index f7f23196eb..fc9392238e 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/driver.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/driver.f
@@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened)
       fopened=.false.
       tempname=filename
       fine=index(tempname,' ')
-c     fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)"
       if(fine.eq.0) fine=len(tempname)
       open(unit=lun,file=tempname,status='old',ERR=20)
       fopened=.true.
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/matrix1.f
index 65c377ffc0..563e5bc0a0 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/matrix1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/matrix1.f
@@ -1,7 +1,7 @@
       SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
      $ ICOL)
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+C     Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
@@ -358,7 +358,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
       REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+C     Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
@@ -404,7 +404,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
 C
       INTEGER I,J,M,N
       COMPLEX*16 ZTEMP, TMP_JAMP(17)
-      REAL*8 CF(NCOLOR,NCOLOR)
+      INTEGER CF(NCOLOR*(NCOLOR+1)/2)
+      INTEGER DENOM, CF_INDEX
       COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO)
       COMPLEX*16 W(6,NWAVEFUNCS)
 C     Needed for v4 models
@@ -447,111 +448,44 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
 C
 C     COLOR DATA
 C
-      DATA (CF(I, 1),I= 1, 6) /1.600000000000000D+01,
-     $ -2.000000000000000D+00,5.333333333333333D+00,
-     $ -6.666666666666666D-01,5.333333333333333D+00,
-     $ -6.666666666666666D-01/
-      DATA (CF(I, 1),I= 7, 12) /2.000000000000000D+00
-     $ ,2.000000000000000D+00,0.000000000000000D+00,
-     $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D
-     $ +00/
+      DATA DENOM/3/
+      DATA (CF(I),I= 1, 12) /48,-12,32,-4,32,-4,12,12,0,-4,32,0/
 C     1 T(2,1) T(5,6,3,4)
-      DATA (CF(I, 2),I= 1, 6) /-2.000000000000000D+00
-     $ ,1.600000000000000D+01,-6.666666666666666D-01
-     $ ,5.333333333333333D+00,-6.666666666666666D-01
-     $ ,5.333333333333333D+00/
-      DATA (CF(I, 2),I= 7, 12) /2.000000000000000D+00
-     $ ,2.000000000000000D+00,0.000000000000000D+00,5.333333333333333D
-     $ +00,-6.666666666666666D-01,0.000000000000000D+00/
+      DATA (CF(I),I= 13, 23) /48,-4,32,-4,32,12,12,0,32,-4,0/
 C     1 T(2,1) T(6,5,3,4)
-      DATA (CF(I, 3),I= 1, 6) /5.333333333333333D+00,
-     $ -6.666666666666666D-01,1.600000000000000D+01,
-     $ -2.000000000000000D+00,2.000000000000000D+00,2.000000000000000D
-     $ +00/
-      DATA (CF(I, 3),I= 7, 12) /5.333333333333333D+00,
-     $ -6.666666666666666D-01,-6.666666666666666D-01
-     $ ,0.000000000000000D+00,0.000000000000000D+00,5.333333333333333D
-     $ +00/
+      DATA (CF(I),I= 24, 33) /48,-12,12,12,32,-4,-4,0,0,32/
 C     1 T(2,4) T(5,6,3,1)
-      DATA (CF(I, 4),I= 1, 6) /-6.666666666666666D-01
-     $ ,5.333333333333333D+00,-2.000000000000000D+00
-     $ ,1.600000000000000D+01,2.000000000000000D+00,2.000000000000000D
-     $ +00/
-      DATA (CF(I, 4),I= 7, 12) /-6.666666666666666D-01
-     $ ,5.333333333333333D+00,5.333333333333333D+00,0.000000000000000D
-     $ +00,0.000000000000000D+00,-6.666666666666666D-01/
+      DATA (CF(I),I= 34, 42) /48,12,12,-4,32,32,0,0,-4/
 C     1 T(2,4) T(6,5,3,1)
-      DATA (CF(I, 5),I= 1, 6) /5.333333333333333D+00,
-     $ -6.666666666666666D-01,2.000000000000000D+00,2.000000000000000D
-     $ +00,1.600000000000000D+01,-2.000000000000000D+00/
-      DATA (CF(I, 5),I= 7, 12) /5.333333333333333D+00,
-     $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D
-     $ +00,0.000000000000000D+00,-6.666666666666666D-01/
+      DATA (CF(I),I= 43, 50) /48,-12,32,-4,32,0,0,-4/
 C     1 T(3,1) T(5,6,2,4)
-      DATA (CF(I, 6),I= 1, 6) /-6.666666666666666D-01
-     $ ,5.333333333333333D+00,2.000000000000000D+00,2.000000000000000D
-     $ +00,-2.000000000000000D+00,1.600000000000000D+01/
-      DATA (CF(I, 6),I= 7, 12) /-6.666666666666666D-01
-     $ ,5.333333333333333D+00,-6.666666666666666D-01
-     $ ,0.000000000000000D+00,0.000000000000000D+00,5.333333333333333D
-     $ +00/
+      DATA (CF(I),I= 51, 57) /48,-4,32,-4,0,0,32/
 C     1 T(3,1) T(6,5,2,4)
-      DATA (CF(I, 7),I= 1, 6) /2.000000000000000D+00
-     $ ,2.000000000000000D+00,5.333333333333333D+00,
-     $ -6.666666666666666D-01,5.333333333333333D+00,
-     $ -6.666666666666666D-01/
-      DATA (CF(I, 7),I= 7, 12) /1.600000000000000D+01,
-     $ -2.000000000000000D+00,0.000000000000000D+00,5.333333333333333D
-     $ +00,-6.666666666666666D-01,0.000000000000000D+00/
+      DATA (CF(I),I= 58, 63) /48,-12,0,32,-4,0/
 C     1 T(3,4) T(5,6,2,1)
-      DATA (CF(I, 8),I= 1, 6) /2.000000000000000D+00
-     $ ,2.000000000000000D+00,-6.666666666666666D-01
-     $ ,5.333333333333333D+00,-6.666666666666666D-01
-     $ ,5.333333333333333D+00/
-      DATA (CF(I, 8),I= 7, 12) /-2.000000000000000D+00
-     $ ,1.600000000000000D+01,0.000000000000000D+00,
-     $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D
-     $ +00/
+      DATA (CF(I),I= 64, 68) /48,0,-4,32,0/
 C     1 T(3,4) T(6,5,2,1)
-      DATA (CF(I, 9),I= 1, 6) /0.000000000000000D+00
-     $ ,0.000000000000000D+00,-6.666666666666666D-01
-     $ ,5.333333333333333D+00,5.333333333333333D+00,
-     $ -6.666666666666666D-01/
-      DATA (CF(I, 9),I= 7, 12) /0.000000000000000D+00
-     $ ,0.000000000000000D+00,1.600000000000000D+01,5.333333333333333D
-     $ +00,5.333333333333333D+00,2.000000000000000D+00/
+      DATA (CF(I),I= 69, 72) /48,32,32,12/
 C     1 T(5,2,1) T(6,3,4)
-      DATA (CF(I, 10),I= 1, 6) /-6.666666666666666D-01
-     $ ,5.333333333333333D+00,0.000000000000000D+00,0.000000000000000D
-     $ +00,0.000000000000000D+00,0.000000000000000D+00/
-      DATA (CF(I, 10),I= 7, 12) /5.333333333333333D+00,
-     $ -6.666666666666666D-01,5.333333333333333D+00,1.600000000000000D
-     $ +01,2.000000000000000D+00,5.333333333333333D+00/
+      DATA (CF(I),I= 73, 75) /48,12,32/
 C     1 T(5,2,4) T(6,3,1)
-      DATA (CF(I, 11),I= 1, 6) /5.333333333333333D+00,
-     $ -6.666666666666666D-01,0.000000000000000D+00,0.000000000000000D
-     $ +00,0.000000000000000D+00,0.000000000000000D+00/
-      DATA (CF(I, 11),I= 7, 12) /-6.666666666666666D-01
-     $ ,5.333333333333333D+00,5.333333333333333D+00,2.000000000000000D
-     $ +00,1.600000000000000D+01,5.333333333333333D+00/
+      DATA (CF(I),I= 76, 77) /48,32/
 C     1 T(5,3,1) T(6,2,4)
-      DATA (CF(I, 12),I= 1, 6) /0.000000000000000D+00
-     $ ,0.000000000000000D+00,5.333333333333333D+00,
-     $ -6.666666666666666D-01,-6.666666666666666D-01
-     $ ,5.333333333333333D+00/
-      DATA (CF(I, 12),I= 7, 12) /0.000000000000000D+00
-     $ ,0.000000000000000D+00,2.000000000000000D+00,5.333333333333333D
-     $ +00,5.333333333333333D+00,1.600000000000000D+01/
+      DATA (CF(I),I= 78, 78) /48/
 C     1 T(5,3,4) T(6,2,1)
 C     ----------
 C     BEGIN CODE
 C     ----------
       IF (FIRST) THEN
         FIRST=.FALSE.
-        IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO
-     $   *SMALL_WIDTH_TREATMENT)), ZERO)
-        IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT
-     $   *SMALL_WIDTH_TREATMENT)), MDL_WT)
+        FK_ZERO = 0D0
+        IF(MDL_WT.NE.0D0) THEN
+          FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT
+     $     *SMALL_WIDTH_TREATMENT)), MDL_WT)
+        ELSE
+          FK_MDL_WT = 0D0
+        ENDIF
+
         IF(INIT_MODE) THEN
           ZEROAMP_1(:,:) = .TRUE.
@@ -761,10 +695,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
       MATRIX1 = 0.D0
       DO M = 1, NAMPSO
+        CF_INDEX = 0
         DO I = 1, NCOLOR
           ZTEMP = (0.D0,0.D0)
-          DO J = 1, NCOLOR
-            ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M)
+          DO J = I, NCOLOR
+            CF_INDEX = CF_INDEX + 1
+            ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M)
           ENDDO
           DO N = 1, NAMPSO
@@ -773,6 +709,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
           ENDDO
         ENDDO
       ENDDO
+      MATRIX1 = MATRIX1/DENOM
 
       IF(SDE_STRAT.EQ.1)THEN
         AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1))
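The matrix1.f hunks above replace the dense REAL*8 color matrix by a packed integer form. CF is symmetric, so only the NCOLOR*(NCOLOR+1)/2 = 78 entries of its upper triangle are kept; each entry is scaled by the common denominator DENOM=3 so that it becomes an exact integer (old CF(1,1)=1.6D+01 becomes 48/3), and off-diagonal entries are additionally doubled (old CF(2,1)=-2D0 becomes -12/3) so that the new DO J = I, NCOLOR loop reproduces the full symmetric sum: only the real part of ZTEMP*DCONJG(JAMP(I,N)) survives in the REAL*8 MATRIX1, and the J<I and J>I contributions to that real part are equal. A small self-contained check of the packed indexing, with an illustrative helper name, is sketched below.

// Illustration only (not part of the patch): the 1-based packed index produced by the
// Fortran traversal "CF_INDEX = 0; DO I = 1, NCOLOR; DO J = I, NCOLOR; CF_INDEX = CF_INDEX + 1"
#include <cassert>
constexpr int NCOLOR = 12;
inline int packedIndex( int i, int j ) // requires 1 <= i <= j <= NCOLOR
{
  assert( 1 <= i && i <= j && j <= NCOLOR );
  return ( i - 1 ) * NCOLOR - ( i - 1 ) * ( i - 2 ) / 2 + ( j - i + 1 );
}
// Consistency with the DATA statements above:
// packedIndex(1,1) == 1, packedIndex(2,2) == 13, packedIndex(3,3) == 24, packedIndex(12,12) == 78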
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc
index e6d6423d5e..e0b9996ffc 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc
@@ -1,13 +1,13 @@
 // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors.
 // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend.
 //==========================================================================
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
@@ -16,6 +16,7 @@
 
 #include "mgOnGpuConfig.h"
 
+#include "GpuRuntime.h"
 #include "HelAmps_sm.h"
 #include "MemoryAccessAmplitudes.h"
 #include "MemoryAccessChannelIds.h"
@@ -25,6 +26,7 @@
 #include "MemoryAccessMatrixElements.h"
 #include "MemoryAccessMomenta.h"
 #include "MemoryAccessWavefunctions.h"
+#include "color_sum.h"
 
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 #include "MemoryAccessDenominators.h"
@@ -99,20 +101,16 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
-  constexpr int nw6 = CPPProcess::nw6;     // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors)
-  constexpr int npar = CPPProcess::npar;   // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu-
-  constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar)
-
-  // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)]
-  //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z)
+  constexpr int nw6 = CPPProcess::nw6;       // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors)
+  constexpr int npar = CPPProcess::npar;     // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu-
+  constexpr int ncomb = CPPProcess::ncomb;   // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar)
+  constexpr int nwf = CPPProcess::nwf;       // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z)
+  constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors
 
   using Parameters_sm_dependentCouplings::ndcoup;   // #couplings that vary event by event (depend on running alphas QCD)
   using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD)
 
-  // The number of colors
-  constexpr int ncolor = 6;
-
-  // The number of SIMD vectors of events processed by calculate_wavefunction
+  // The number of SIMD vectors of events processed by calculate_jamps
 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
   constexpr int nParity = 2;
 #else
@@ -171,57 +169,112 @@ namespace mg5amcCpu
   // Helicity combinations (and filtering of "good" helicity combinations)
 #ifdef MGONGPUCPP_GPUIMPL
   __device__ __constant__ short cHel[ncomb][npar];
-  __device__ __constant__ int cNGoodHel;
-  __device__ __constant__ int cGoodHel[ncomb];
+  __device__ __constant__ int dcNGoodHel;
+  __device__ __constant__ int dcGoodHel[ncomb];
 #else
   static short cHel[ncomb][npar];
+#endif
   static int cNGoodHel;
   static int cGoodHel[ncomb];
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  class DeviceAccessJamp2
+  {
+  public:
+    static __device__ inline fptype&
+    kernelAccessIcol( fptype* buffer, const int icol )
+    {
+      const int nevt = gridDim.x * blockDim.x;
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      return buffer[icol * nevt + ievt];
+    }
+    static __device__ inline const fptype&
+    kernelAccessIcolConst( const fptype* buffer, const int icol )
+    {
+      const int nevt = gridDim.x * blockDim.x;
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      return buffer[icol * nevt + ievt];
+    }
+  };
 #endif
 
   //--------------------------------------------------------------------------
 
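The new DeviceAccessJamp2 accessor pins down a structure-of-arrays layout for the jamps buffer: for a fixed color index icol the values of all nevt events are contiguous, so consecutive CUDA threads (consecutive ievt) touch consecutive addresses and their loads and stores coalesce. A host-side sketch of the same index arithmetic, with a hypothetical helper name and with nevt and ievt passed in explicitly instead of being derived from the CUDA grid:

// Illustration only: buffer[icol * nevt + ievt] (SoA) rather than buffer[ievt * ncolor + icol] (AoS)
inline fptype& hostAccessIcol( fptype* buffer, const int icol, const int ievt, const int nevt )
{
  return buffer[icol * nevt + ievt]; // same arithmetic as kernelAccessIcol, minus the gridDim/blockIdx bookkeeping
}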
-  // Evaluate |M|^2 for each subprocess
-  // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s)
-  // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities)
-  // In CUDA, this device function computes the ME for a single event
-  // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2)
-  // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 ***
-  __device__ INLINE void /* clang-format off */
-  calculate_wavefunctions( int ihel,
-                           const fptype* allmomenta,     // input: momenta[nevt*npar*4]
-                           const fptype* allcouplings,   // input: couplings[nevt*ndcoup*2]
-                           fptype* allMEs,               // output: allMEs[nevt], |M|^2 running_sum_over_helicities
+#ifdef MGONGPUCPP_GPUIMPL
+  __device__ INLINE unsigned int
+  gpu_channelId( const unsigned int* allChannelIds )
+  {
+    unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                           const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector
-                           fptype* allNumerators,        // output: multichannel numerators[nevt], running_sum_over_helicities
-                           fptype* allDenominators,      // output: multichannel denominators[nevt], running_sum_over_helicities
+    using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events
+    // SCALAR channelId for the current event (CUDA)
+    if( allChannelIds != nullptr )
+    {
+      const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds)
+      const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams)
+      // NB: channelIds_sv is a scalar in CUDA
+      channelId = channelIds_sv;
+      assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr)
+    }
 #endif
-                           fptype_sv* jamp2_sv           // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled)
-#ifndef MGONGPUCPP_GPUIMPL
-                           , const int ievt00            // input: first event number in current C++ event page (for CUDA, ievt depends on threadid)
+    return channelId;
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+#include "diagrams.h"
+
+  //--------------------------------------------------------------------------
+
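Taken together with the per-diagram lines if( channelId == N ) numerators_sv += cxabs2( amp_sv[0] ) and if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ), the scalar returned by gpu_channelId drives MadEvent's single-diagram-enhancement (SDE) weighting: every diagram contributes |amp|^2 to the denominator, while only the diagram matching the selected channel contributes to the numerator, so the matrix element can later be reweighted by |A_channel|^2 / sum_d |A_d|^2. A sketch of that final combination, with a hypothetical helper name, under the assumption that the ratio is applied downstream exactly as accumulated here:

// Illustration only: how the accumulated SDE numerator and denominator are typically folded into the ME
inline fptype sdeWeightedME( const fptype me, const fptype numerator, const fptype denominator )
{
  return denominator != 0 ? me * numerator / denominator : me; // guard against a vanishing denominator
}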
in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using 
CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -229,489 +282,161 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 11; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! - //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!)
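// Illustration (a sketch for this rewrite, not generated code): the jamp bookkeeping declared
// above amounts to the following, for a hypothetical process with two diagrams and two color
// flows, where c0/c1/c2 stand for rational color factors like the 1./36. or 1./12. seen below:
//   cxtype_sv jamp[2] = {};   // one partial amplitude per leading color flow
//   jamp[0] += c0 * amp;      // diagram 1 contributes to color flow 0 ...
//   jamp[1] -= c1 * amp;      // ... and, with a different coefficient, to color flow 1
//   jamp[1] += c2 * amp;      // diagram 2 recomputes amp, then contributes to flow 1 only
// so that each jamp[icol] accumulates the amplitudes of all diagrams in color flow icol.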
+ fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g. <<warning #186-D: pointless comparison of unsigned integer with zero>> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs =
E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 14 *** - - // Wavefunction(s) for diagram number 1 - ixxxxx( momenta, 0., cHel[ihel][0], +1, w_fp[0], 0 ); - - oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - oxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 ); - - ixxxxx( momenta, 0., cHel[ihel][5], -1, w_fp[5], 5 ); - - FFV1P0_3( w_fp[0], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[6] ); - FFV1P0_3( w_fp[5], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[7] ); - FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[3], w_fp[8], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 36. * amp_sv[0]; - jamp_sv[1] -= 1. / 12. * amp_sv[0]; - jamp_sv[2] -= 1. / 12. * amp_sv[0]; - jamp_sv[4] += 1. / 4. * amp_sv[0]; - - // *** DIAGRAM 2 OF 14 *** - - // Wavefunction(s) for diagram number 2 - FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 36. * amp_sv[0]; - jamp_sv[1] -= 1. / 12. * amp_sv[0]; - jamp_sv[2] -= 1. / 12. * amp_sv[0]; - jamp_sv[3] += 1. / 4. * amp_sv[0]; - - // *** DIAGRAM 3 OF 14 *** - - // Wavefunction(s) for diagram number 3 - FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 3 - VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] -= 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 4 OF 14 *** - - // Wavefunction(s) for diagram number 4 - FFV1_2( w_fp[5], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[9] ); - - // Amplitude(s) for diagram number 4 - FFV1_0( w_fp[9], w_fp[4], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 36. * amp_sv[0]; - jamp_sv[1] -= 1. / 12. * amp_sv[0]; - jamp_sv[4] += 1. / 4. * amp_sv[0]; - jamp_sv[5] -= 1. / 12.
* amp_sv[0]; - - // *** DIAGRAM 5 OF 14 *** - - // Wavefunction(s) for diagram number 5 - FFV1_1( w_fp[4], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[9] ); - - // Amplitude(s) for diagram number 5 - FFV1_0( w_fp[5], w_fp[9], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 36. * amp_sv[0]; - jamp_sv[1] -= 1. / 12. * amp_sv[0]; - jamp_sv[3] += 1. / 4. * amp_sv[0]; - jamp_sv[5] -= 1. / 12. * amp_sv[0]; - - // *** DIAGRAM 6 OF 14 *** - - // Wavefunction(s) for diagram number 6 - FFV1P0_3( w_fp[0], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[9] ); - FFV1P0_3( w_fp[5], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[6] ); - FFV1_1( w_fp[2], w_fp[9], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[10] ); - - // Amplitude(s) for diagram number 6 - FFV1_0( w_fp[3], w_fp[10], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= 1. / 4. * amp_sv[0]; - jamp_sv[3] += 1. / 12. * amp_sv[0]; - jamp_sv[4] += 1. / 12. * amp_sv[0]; - jamp_sv[5] -= 1. / 36. * amp_sv[0]; - - // *** DIAGRAM 7 OF 14 *** - - // Wavefunction(s) for diagram number 7 - FFV1_2( w_fp[3], w_fp[9], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[10] ); - - // Amplitude(s) for diagram number 7 - FFV1_0( w_fp[10], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= 1. / 4. * amp_sv[0]; - jamp_sv[3] += 1. / 12. * amp_sv[0]; - jamp_sv[4] += 1. / 12. * amp_sv[0]; - jamp_sv[5] -= 1. / 36. * amp_sv[0]; - - // *** DIAGRAM 8 OF 14 *** - - // Wavefunction(s) for diagram number 8 - // (none) - - // Amplitude(s) for diagram number 8 - VVV1_0( w_fp[9], w_fp[6], w_fp[8], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 9 OF 14 *** - - // Wavefunction(s) for diagram number 9 - FFV1_2( w_fp[5], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 9 - FFV1_0( w_fp[10], w_fp[1], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 12. * amp_sv[0]; - jamp_sv[2] -= 1. / 4. * amp_sv[0]; - jamp_sv[3] += 1. / 12. * amp_sv[0]; - jamp_sv[5] -= 1. / 36. * amp_sv[0]; - - // *** DIAGRAM 10 OF 14 *** - - // Wavefunction(s) for diagram number 10 - FFV1_1( w_fp[1], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 10 - FFV1_0( w_fp[5], w_fp[10], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 12. * amp_sv[0]; - jamp_sv[1] -= 1. / 4. * amp_sv[0]; - jamp_sv[3] += 1. / 12. * amp_sv[0]; - jamp_sv[5] -= 1. / 36.
* amp_sv[0]; - - // *** DIAGRAM 11 OF 14 *** - - // Wavefunction(s) for diagram number 11 - FFV1_2( w_fp[0], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 11 - FFV1_0( w_fp[10], w_fp[4], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - jamp_sv[0] += 1. / 12. * amp_sv[0]; - jamp_sv[1] -= 1. / 4. * amp_sv[0]; - jamp_sv[4] += 1. / 12. * amp_sv[0]; - jamp_sv[5] -= 1. / 36. * amp_sv[0]; - // *** DIAGRAM 12 OF 14 *** - - // Wavefunction(s) for diagram number 12 - FFV1_2( w_fp[0], w_fp[8], COUPs[1], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 12 - FFV1_0( w_fp[10], w_fp[4], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - jamp_sv[0] += 1. / 12. * amp_sv[0]; - jamp_sv[2] -= 1. / 4. * amp_sv[0]; - jamp_sv[4] += 1. / 12. * amp_sv[0]; - jamp_sv[5] -= 1. / 36. * amp_sv[0]; - - // *** DIAGRAM 13 OF 14 *** - - // Wavefunction(s) for diagram number 13 - FFV1_2( w_fp[0], w_fp[7], COUPs[1], 1.0, 0., 0., w_fp[6] ); - - // Amplitude(s) for diagram number 13 - FFV1_0( w_fp[6], w_fp[1], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[0] += 1. / 36. * amp_sv[0]; - jamp_sv[2] -= 1. / 12. * amp_sv[0]; - jamp_sv[3] += 1. / 4. * amp_sv[0]; - jamp_sv[5] -= 1. / 12.
* amp_sv[0]; - - // *** DIAGRAM 14 OF 14 *** - // Wavefunction(s) for diagram number 14 - // (none) - - // Amplitude(s) for diagram number 14 - FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 36. * amp_sv[0]; - jamp_sv[2] -= 1. / 12. * amp_sv[0]; - jamp_sv[4] += 1. / 4. * amp_sv[0]; - jamp_sv[5] -= 1. / 12. * amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_2_uux_ttxuux()?) - - // The color denominators (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6] - - // The color matrix (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 27, 9, 9, 3, 3, 9 }, - { 9, 27, 3, 9, 9, 3 }, - { 9, 3, 27, 9, 9, 3 }, - { 3, 9, 9, 27, 3, 9 }, - { 3, 9, 9, 3, 27, 9 }, - { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix.
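// As a concrete scalar illustration of the AMA + BMB folding described above (a sketch, not
// generated code: jampR/jampI stand for the real and imaginary parts A and B of the jamps):
//   fptype me2 = 0;
//   for( int icol = 0; icol < ncolor; icol++ )
//   {
//     fptype ztR = cf2.value[icol][icol] * jampR[icol]; // diagonal term, i.e. cf[i][i]/denom[i]
//     fptype ztI = cf2.value[icol][icol] * jampI[icol];
//     for( int jcol = icol + 1; jcol < ncolor; jcol++ ) // upper triangle only ...
//     {
//       ztR += cf2.value[icol][jcol] * jampR[jcol]; // ... using the precomputed 2*cf[i][j]/denom[i]
//       ztI += cf2.value[icol][jcol] * jampI[jcol];
//     }
//     me2 += jampR[icol] * ztR + jampI[icol] * ztI; // AMA + BMB (cross terms cancel for symmetric M)
//   }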
- Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv +=
cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIds, numerators and denominators is also used when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 14 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram8, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram9, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram10, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram11, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram12, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram13, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram14, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs,
jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram8( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram9( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram10( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram11( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram12( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram13( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram14( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif -#endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -798,7 +523,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -833,6 +562,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -875,6 +608,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -977,26 +714,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
@@ -1004,25 +741,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) + assert( nevt >= neppV ); + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream!
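// (Sketch of the hazard this avoids, not generated code: with a plain '+=', two helicity
// streams updating the same event/color slot could interleave as
//   fptype x = colAllJamp2s[idx]; // stream 1 reads x ... stream 2 reads the same stale x
//   colAllJamp2s[idx] = x + a;    // stream 1 writes x+a
//   colAllJamp2s[idx] = x + b;    // stream 2 writes x+b, silently dropping stream 1's |jamp|^2
// whereas atomicAdd performs the whole read-modify-write as one indivisible operation.)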
+ atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <=
ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -1158,22 +1099,16 @@ namespace mg5amcCpu // These variable are not used anywhere else in the code and their scope is limited to this sanity check { // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) - constexpr int nprocesses = 2; + constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1185,17 +1120,20 @@ #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] =
0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1221,93 +1159,63 @@ #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamps for each helicity + // In multichannel mode, also compute the running sums over helicities of the numerators and denominators (the squared jamp2s are summed in step 1b below) for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } - } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b)
Then, in multichannel mode, also compute the running sums over helicities of the squared jamp2s within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ?
ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1349,7 +1257,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1372,7 +1280,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1381,25 +1289,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1409,8 +1323,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1426,11 +1342,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1532,14 +1449,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h index de4fd12c37..515a957ce5 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO 
development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -78,17 +79,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 14; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 6; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 11; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 6; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here)
 //static const int namplitudes = 14;
 //static const int ncomb = 64; // CPPProcess::ncomb
@@ -125,23 +126,26 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
 #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
- __global__ void
+ void
 sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4]
 const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
 fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities
- fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
+ fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities
+ fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities
#endif
- bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation)
+ fptype* allJamps, // output: jamp[ncolor*2*nevt]
+ fptype* allWfs, // output: wf[nwf*nw6*2*nevt]
+ bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation)
+ const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
#else
- __global__ void
+ void
 sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4]
 const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
 fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities
- fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
+ fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities
+ fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities
#endif
 bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation)
 const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
@@ -155,34 +159,46 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
 #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
- __global__ void
+ void
 sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4]
 const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
 const fptype* allrndhel, // input: random numbers[nevt] for helicity selection
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
 const fptype* allrndcol, // input: random numbers[nevt] for color selection
+ const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
+#endif
 fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
+ int* allselhel, // output: helicity selection[nevt]
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
- const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911)
- fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities
- fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
+ int* allselcol, // output: color selection[nevt]
+ fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities
+ fptype* ghelAllNumerators, // tmp: allNumerators
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig.f index 639c7207e3..806033a9ec 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig1.f index 8fc5eeb386..e841acfd24 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,7 +140,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF D1=PDG2PDF(LPP(IB(1)),1, IB(1),XBK(IB(1)), QSCALE) U1=PDG2PDF(LPP(IB(1)),2, IB(1),XBK(IB(1)), QSCALE) @@ -150,7 +150,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) @@ -237,7 +237,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -313,6 +313,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -398,24 +402,24 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) D1(IVEC)=PDG2PDF(LPP(IB(1)),1, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) U1(IVEC)=PDG2PDF(LPP(IB(1)),2, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) S1(IVEC)=PDG2PDF(LPP(IB(1)),3, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) C1(IVEC)=PDG2PDF(LPP(IB(1)),4, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) UX2(IVEC)=PDG2PDF(LPP(IB(2)),-2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -539,6 +543,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. 
IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/color_sum.cc new file mode 100644 index 0000000000..087618686d --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/color_sum.cc @@ -0,0 +1,387 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6] + + // The color matrix (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 27, 9, 9, 3, 3, 9 }, + { 9, 27, 3, 9, 9, 3 }, + { 9, 3, 27, 9, 9, 3 }, + { 3, 9, 9, 27, 3, 9 }, + { 3, 9, 9, 3, 27, 9 }, + { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) 
+ {
+ // Diagonal terms
+ value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol];
+ // Off-diagonal terms
+ for( int jcol = icol + 1; jcol < ncolor; jcol++ )
+ value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol];
+ }
+ }
+ fptype2 value[ncolor][ncolor];
+ };
+ static constexpr auto cf2 = TriangularNormalizedColorMatrix();
+ // Use the property that M is a real matrix (see #475):
+ // the quadratic form (A-iB)M(A+iB) expands to AMA + iAMB - iBMA + BMB = AMA + BMB (the cross terms cancel because M is real and symmetric)
+ // In addition, in C++ use the property that M is symmetric (see #475),
+ // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time:
+ // we gain some speed (though not a full factor of 2) as we only loop over the upper-triangular part of the matrix.
+ // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
+ fptype_sv deltaMEs = { 0 };
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+ fptype_sv deltaMEs_next = { 0 };
+ // Mixed mode: merge two neppV vectors into one neppV2 vector
+ fptype2_sv jampR_sv[ncolor];
+ fptype2_sv jampI_sv[ncolor];
+ for( int icol = 0; icol < ncolor; icol++ )
+ {
+ jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) );
+ jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) );
+ }
+#else
+ const cxtype_sv* jamp_sv = allJamp_sv;
+#endif
+ // Loop over icol
+ for( int icol = 0; icol < ncolor; icol++ )
+ {
+ // Diagonal terms
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+ fptype2_sv& jampRi_sv = jampR_sv[icol];
+ fptype2_sv& jampIi_sv = jampI_sv[icol];
+#else
+ fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) );
+ fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) );
+#endif
+ fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv;
+ fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv;
+ // Loop over jcol
+ for( int jcol = icol + 1; jcol < ncolor; jcol++ )
+ {
+ // Off-diagonal terms
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+ fptype2_sv& jampRj_sv = jampR_sv[jcol];
+ fptype2_sv& jampIj_sv = jampI_sv[jcol];
+#else
+ fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) );
+ fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) );
+#endif
+ ztempR_sv += cf2.value[icol][jcol] * jampRj_sv;
+ ztempI_sv += cf2.value[icol][jcol] * jampIj_sv;
+ }
+ fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+ deltaMEs += fpvsplit0( deltaMEs2 );
+ deltaMEs_next += fpvsplit1( deltaMEs2 );
+#else
+ deltaMEs += deltaMEs2;
+#endif
+ }
+ // *** STORE THE RESULTS ***
+ using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events
+ fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 );
+ // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
+ fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
+ MEs_sv += deltaMEs; // fix #435
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+ fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV );
+ fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next );
+ MEs_sv_next += deltaMEs_next;
+#endif
+ }
+#endif
+
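As a cross-check of the algebra in the comments above, here is a minimal standalone sketch (not part of the generated code; it assumes only the ncolor=6 colorMatrix and colorDenom values quoted earlier, plus arbitrary test amplitudes) showing that the diagonal-plus-doubled-upper-triangle sum in real arithmetic reproduces the full complex quadratic form jamp† (CF/denom) jamp:

  #include <cassert>
  #include <cmath>
  #include <complex>
  #include <cstdio>

  int main()
  {
    constexpr int ncolor = 6;
    constexpr double denom[ncolor] = { 1, 1, 1, 1, 1, 1 };
    constexpr double cf[ncolor][ncolor] = { { 27, 9, 9, 3, 3, 9 },
                                            { 9, 27, 3, 9, 9, 3 },
                                            { 9, 3, 27, 9, 9, 3 },
                                            { 3, 9, 9, 27, 3, 9 },
                                            { 3, 9, 9, 3, 27, 9 },
                                            { 9, 3, 3, 9, 9, 27 } };
    const std::complex<double> jamp[ncolor] = { { 0.1, -0.2 }, { 0.3, 0.4 }, { -0.5, 0.6 }, { 0.7, -0.8 }, { 0.9, 1.0 }, { -1.1, 1.2 } }; // arbitrary test amplitudes
    // Full quadratic form: |M|^2 = sum_ij conj(J_i) * ( CF_ij / denom_i ) * J_j (the imaginary part cancels as CF is real and symmetric)
    std::complex<double> meFull = 0;
    for( int i = 0; i < ncolor; i++ )
      for( int j = 0; j < ncolor; j++ )
        meFull += std::conj( jamp[i] ) * ( cf[i][j] / denom[i] ) * jamp[j];
    // Triangular form: diagonal terms once, off-diagonal terms doubled, real arithmetic only (as in color_sum_cpu above)
    double meTri = 0;
    for( int i = 0; i < ncolor; i++ )
    {
      double ztempR = cf[i][i] / denom[i] * jamp[i].real();
      double ztempI = cf[i][i] / denom[i] * jamp[i].imag();
      for( int j = i + 1; j < ncolor; j++ )
      {
        ztempR += 2 * cf[i][j] / denom[i] * jamp[j].real();
        ztempI += 2 * cf[i][j] / denom[i] * jamp[j].imag();
      }
      meTri += jamp[i].real() * ztempR + jamp[i].imag() * ztempI;
    }
    assert( std::abs( meFull.real() - meTri ) < 1e-10 ); // the two computations agree up to rounding
    printf( "full = %f, triangular = %f\n", meFull.real(), meTri );
    return 0;
  }

Because CF here is positive semi-definite with unit denominators, both loops accumulate the same 2*CF_ij*( Re J_i Re J_j + Im J_i Im J_j ) cross terms; the triangular variant simply hoists the factor 2 and the 1/denom normalization into the constexpr matrix.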
//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from 
DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/diagram_boilerplate.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/diagrams.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/diagrams.h new file mode 100644 index 0000000000..7419b50278 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/diagrams.h @@ -0,0 +1,471 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+
+/* clang-format off */
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ const fptype* momenta, // input: momenta[npar*4*nevtORneppV]
+ const int ihel ) // input: helicity (0 to ncomb)
+ {
+ // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+#ifdef MGONGPUCPP_GPUIMPL
+ using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events
+#else
+ using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events
+#endif
+ // *** DIAGRAM 1 OF 14 ***
+ // Wavefunction(s) for diagram number 1
+ ixxxxx( momenta, 0., cHel[ihel][0], +1, w_fp[0], 0 );
+ oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 );
+ oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 );
+ ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 );
+ oxxxxx( momenta, 0., cHel[ihel][4], +1, w_fp[4], 4 );
+ ixxxxx( momenta, 0., cHel[ihel][5], -1, w_fp[5], 5 );
+ FFV1P0_3( w_fp[0], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[6] );
+ FFV1P0_3( w_fp[5], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[7] );
+ FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] );
+ // Amplitude(s) for diagram number 1
+ FFV1_0( w_fp[3], w_fp[8], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] );
+ if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 36. * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 12. * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 12. * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 4.
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 14 *** + // Wavefunction(s) for diagram number 2 + FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); + // Amplitude(s) for diagram number 2 + FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 36. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 4. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 14 *** + // Wavefunction(s) for diagram number 3 + FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 3 + VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 4. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 14 *** + // Wavefunction(s) for diagram number 4 + FFV1_2( w_fp[5], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[9] ); + // Amplitude(s) for diagram number 4 + FFV1_0( w_fp[9], w_fp[4], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 36. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 12. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 14 *** + // Wavefunction(s) for diagram number 5 + FFV1_1( w_fp[4], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[9] ); + // Amplitude(s) for diagram number 5 + FFV1_0( w_fp[5], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 36. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 12. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 6 OF 14 *** + // Wavefunction(s) for diagram number 6 + FFV1P0_3( w_fp[0], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[9] ); + FFV1P0_3( w_fp[5], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[6] ); + FFV1_1( w_fp[2], w_fp[9], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[10] ); + // Amplitude(s) for diagram number 6 + FFV1_0( w_fp[3], w_fp[10], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 36. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram7( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 7 OF 14 *** + // Wavefunction(s) for diagram number 7 + FFV1_2( w_fp[3], w_fp[9], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[10] ); + // Amplitude(s) for diagram number 7 + FFV1_0( w_fp[10], w_fp[2], w_fp[6], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 12. 
* amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 36. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram8( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 8 OF 14 *** + // Wavefunction(s) for diagram number 8 + // (none) + // Amplitude(s) for diagram number 8 + VVV1_0( w_fp[9], w_fp[6], w_fp[8], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram9( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 9 OF 14 *** + // Wavefunction(s) for diagram number 9 + FFV1_2( w_fp[5], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 9 + FFV1_0( w_fp[10], w_fp[1], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 36. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram10( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 10 OF 14 *** + // Wavefunction(s) for diagram number 10 + FFV1_1( w_fp[1], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 10 + FFV1_0( w_fp[5], w_fp[10], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 36. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram11( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 11 OF 14 *** + // Wavefunction(s) for diagram number 11 + FFV1_2( w_fp[0], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 11 + FFV1_0( w_fp[10], w_fp[4], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 36.
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram12( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 12 OF 14 *** + // Wavefunction(s) for diagram number 12 + FFV1_2( w_fp[0], w_fp[8], COUPs[1], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 12 + FFV1_0( w_fp[10], w_fp[4], w_fp[6], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 36. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram13( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 13 OF 14 *** + // Wavefunction(s) for diagram number 13 + FFV1_2( w_fp[0], w_fp[7], COUPs[1], 1.0, 0., 0., w_fp[6] ); + // Amplitude(s) for diagram number 13 + FFV1_0( w_fp[6], w_fp[1], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 36. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 12.
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram14( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 14 OF 14 *** + // Wavefunction(s) for diagram number 14 + // (none) + // Amplitude(s) for diagram number 14 + FFV1_0( w_fp[10], w_fp[1], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 36. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 12. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/driver.f index f7f23196eb..fc9392238e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/matrix1.f index 9a6d844439..e4cc5c2814 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -358,7 +358,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -404,7 +404,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(16) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -447,39 +448,32 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /2.700000000000000D+01 - $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D - $ +00,3.000000000000000D+00,9.000000000000000D+00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 6) /27,18,18,6,6,18/ C 1 T(2,1) T(3,4) T(5,6) - DATA (CF(I, 2),I= 1, 6) /9.000000000000000D+00 - $ ,2.700000000000000D+01,3.000000000000000D+00,9.000000000000000D - $ +00,9.000000000000000D+00,3.000000000000000D+00/ + DATA (CF(I),I= 7, 11) /27,6,18,18,6/ C 1 T(2,1) T(3,6) T(5,4) - DATA (CF(I, 3),I= 1, 6) /9.000000000000000D+00 - $ ,3.000000000000000D+00,2.700000000000000D+01,9.000000000000000D - $ +00,9.000000000000000D+00,3.000000000000000D+00/ + DATA (CF(I),I= 12, 15) /27,18,18,6/ C 1 T(2,4) T(3,1) T(5,6) - DATA (CF(I, 4),I= 1, 6) /3.000000000000000D+00 - $ ,9.000000000000000D+00,9.000000000000000D+00,2.700000000000000D - $ +01,3.000000000000000D+00,9.000000000000000D+00/ + DATA (CF(I),I= 16, 18) /27,6,18/ C 1 T(2,4) T(3,6) T(5,1) - DATA (CF(I, 5),I= 1, 6) /3.000000000000000D+00 - $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D - $ +00,2.700000000000000D+01,9.000000000000000D+00/ + DATA (CF(I),I= 19, 20) /27,18/ C 1 T(2,6) T(3,1) T(5,4) - DATA (CF(I, 6),I= 1, 6) /9.000000000000000D+00 - $ ,3.000000000000000D+00,3.000000000000000D+00,9.000000000000000D - $ +00,9.000000000000000D+00,2.700000000000000D+01/ + DATA (CF(I),I= 21, 21) /27/ C 1 T(2,6) T(3,4) T(5,1) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -585,10 +579,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -597,6 +593,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc index bf560d981f..2cd230128e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. 
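The matrix1.f hunk above replaces the dense REAL*8 CF(NCOLOR,NCOLOR) color matrix with an integer upper-triangular array of NCOLOR*(NCOLOR+1)/2 entries plus a single common DENOM, and the inner loop now runs DO J = I, NCOLOR with a running CF_INDEX. The off-diagonal entries are stored pre-doubled (27,18,18,6,6,18 instead of the full row 27,9,9,3,3,9), which is why summing only j >= i and taking a real part still reproduces the full symmetric quadratic form. The following is a minimal C++ sketch of that storage trick, using the integer values from the diff; the function and variable names here are ours, not from the generated code:

// Triangular color sum sketch: off-diagonals pre-doubled, one division at the end.
#include <array>
#include <complex>

constexpr int ncolor = 6;
constexpr std::array<int, ncolor * ( ncolor + 1 ) / 2> cf = {
  27, 18, 18, 6, 6, 18, // row 1: diagonal 27, then 2*{9,9,3,3,9}
  27, 6, 18, 18, 6,     // row 2, from the diagonal onwards
  27, 18, 18, 6,
  27, 6, 18,
  27, 18,
  27 };
constexpr int denom = 1; // common denominator, applied once at the end

double colorSum( const std::complex<double> jamp[ncolor] )
{
  double me2 = 0;
  int idx = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    std::complex<double> ztemp = 0;
    for( int j = i; j < ncolor; j++ ) ztemp += double( cf[idx++] ) * jamp[j]; // triangular loop
    me2 += std::real( ztemp * std::conj( jamp[i] ) ); // cross terms pair up as 2*Re(...)
  }
  return me2 / denom;
}

Besides halving the number of multiply-adds, storing small integers also eliminates the long D+00 continuation lines of the old DATA statements, which is most of the textual shrinkage visible in the hunk.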
+// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -101,20 +103,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 6; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -173,57 +171,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the 
current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - 
using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -231,377 +284,147 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 9; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
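The buffer comments above and below are the crux of this refactoring: in C++ all diagrams still run in one function call, so wavefunctions can stay in local, trivial-access w_sv arrays, while in CUDA each diagramN is now a separate kernel launch, so intermediates must persist between launches in a global-memory super-buffer with the SOA layout that accessors like DeviceAccessJamp2 earlier in this diff expose (buffer[islot * nevt + ievt]). A toy CUDA sketch of that pattern, with purely illustrative names, not the generated ones:

// Two kernels communicating through a persistent SOA buffer, as the split
// diagram kernels do via the allWfs super-buffer (names here are hypothetical).
#include <cuda_runtime.h>

__device__ inline double& slotAccess( double* buffer, const int islot )
{
  const int nevt = gridDim.x * blockDim.x;
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  return buffer[islot * nevt + ievt]; // coalesced: consecutive threads, consecutive addresses
}

__global__ void fillSlot0( double* wfs ) { slotAccess( wfs, 0 ) = (double)threadIdx.x; }

__global__ void useSlot0( double* wfs, double* out )
{
  slotAccess( out, 0 ) = 2. * slotAccess( wfs, 0 ); // second launch sees what the first wrote
}

int main()
{
  const int blocks = 2, threads = 32, nslot = 3;
  double *wfs, *out;
  cudaMalloc( &wfs, nslot * blocks * threads * sizeof( double ) );
  cudaMalloc( &out, blocks * threads * sizeof( double ) );
  fillSlot0<<<blocks, threads>>>( wfs );
  useSlot0<<<blocks, threads>>>( wfs, out );
  cudaDeviceSynchronize();
  cudaFree( wfs );
  cudaFree( out );
  return 0;
}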
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g.
<> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 7 *** - - // Wavefunction(s) for diagram number 1 - oxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - ixxxxx( momenta, 0., cHel[ihel][4], -1, w_fp[4], 4 ); - - ixxxxx( momenta, 0., cHel[ihel][5], -1, w_fp[5], 5 ); - - FFV1P0_3( w_fp[4], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[6] ); - FFV1P0_3( w_fp[5], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[7] ); - FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[3], 
w_fp[8], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= 1. / 12. * amp_sv[0]; - jamp_sv[2] -= 1. / 12. * amp_sv[0]; - jamp_sv[3] += 1. / 36. * amp_sv[0]; - jamp_sv[4] += 1. / 4. * amp_sv[0]; - - // *** DIAGRAM 2 OF 7 *** - - // Wavefunction(s) for diagram number 2 - FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 4. * amp_sv[0]; - jamp_sv[1] -= 1. / 12. * amp_sv[0]; - jamp_sv[2] -= 1. / 12. * amp_sv[0]; - jamp_sv[3] += 1. / 36. * amp_sv[0]; - - // *** DIAGRAM 3 OF 7 *** - - // Wavefunction(s) for diagram number 3 - FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 3 - VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] -= 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 4 OF 7 *** - - // Wavefunction(s) for diagram number 4 - FFV1_2( w_fp[5], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[3] ); - - // Amplitude(s) for diagram number 4 - FFV1_0( w_fp[3], w_fp[1], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - jamp_sv[2] -= 1. / 12. * amp_sv[0]; - jamp_sv[3] += 1. / 36. * amp_sv[0]; - jamp_sv[4] += 1. / 4. * amp_sv[0]; - jamp_sv[5] -= 1. / 12. * amp_sv[0]; - - // *** DIAGRAM 5 OF 7 *** - - // Wavefunction(s) for diagram number 5 - FFV1_1( w_fp[1], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[3] ); - // Amplitude(s) for diagram number 5 - FFV1_0( w_fp[5], w_fp[3], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - jamp_sv[0] += 1. / 4. * amp_sv[0]; - jamp_sv[2] -= 1.
/ 12. * amp_sv[0]; - jamp_sv[3] += 1. / 36. * amp_sv[0]; - jamp_sv[5] -= 1. / 12. * amp_sv[0]; - - // *** DIAGRAM 6 OF 7 *** - - // Wavefunction(s) for diagram number 6 - FFV1_2( w_fp[4], w_fp[7], COUPs[1], 1.0, 0., 0., w_fp[3] ); - - // Amplitude(s) for diagram number 6 - FFV1_0( w_fp[3], w_fp[0], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[0] += 1. / 4. * amp_sv[0]; - jamp_sv[1] -= 1. / 12. * amp_sv[0]; - jamp_sv[3] += 1. / 36. * amp_sv[0]; - jamp_sv[5] -= 1. / 12. * amp_sv[0]; - - // *** DIAGRAM 7 OF 7 *** - // Wavefunction(s) for diagram number 7 - FFV1_2( w_fp[4], w_fp[8], COUPs[1], 1.0, 0., 0., w_fp[3] ); - - // Amplitude(s) for diagram number 7 - FFV1_0( w_fp[3], w_fp[0], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= 1. / 12. * amp_sv[0]; - jamp_sv[3] += 1. / 36. * amp_sv[0]; - jamp_sv[4] += 1. / 4. * amp_sv[0]; - jamp_sv[5] -= 1. / 12. * amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_2_uxcx_ttxuxcx()?)
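The removed '*** COLOR CHOICE BELOW ***' block above accumulates jamp2 += |jamp|^2 per color; the new select_col kernel later in this diff turns those running sums into an event-by-event color pick by cumulating the icolamp-allowed contributions and comparing a uniform random number against the cumulative fractions. A small standalone sketch of that selection logic, with hypothetical names rather than the generated API:

// Pick a color in the Fortran range [1,ncolor] from per-color |jamp|^2 sums.
#include <cassert>

int chooseColor( const double jamp2[], const bool allowed[], int ncolor, double rnd )
{
  assert( ncolor <= 16 );
  double target[16]; // cumulative sums (toy bound; the generated code sizes this exactly)
  double running = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    if( allowed[icol] ) running += jamp2[icol]; // only icolamp-allowed colors contribute
    target[icol] = running;
  }
  for( int icol = 0; icol < ncolor; icol++ )
    if( rnd < target[icol] / target[ncolor - 1] ) return icol + 1; // Fortran indexing [1,ncolor]
  return ncolor; // unreachable for rnd in [0,1) with a nonzero total
}

This mirrors the targetamp loop visible further below in select_col; the division by the last cumulative entry normalizes the sums into a discrete CDF without a separate pass.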
- - // The color denominators (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6] - - // The color matrix (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 27, 9, 9, 3, 3, 9 }, - { 9, 27, 3, 9, 9, 3 }, - { 9, 3, 27, 9, 9, 3 }, - { 3, 9, 9, 27, 3, 9 }, - { 3, 9, 9, 3, 27, 9 }, - { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... 
icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - 
// NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 7 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif -#endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -688,7 +511,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -723,6 +550,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -765,6 
+596,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -867,26 +702,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -894,25 +729,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) // Loop over only nevt events if nevt is < 16 (note that nevt is always >= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype*
colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <=
ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -1048,22 +1087,16 @@ namespace mg5amcCpu // These variable are not used anywhere else in the code and their scope is limited to this sanity check { // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) - constexpr int nprocesses = 2; + constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1075,17 +1108,20 @@ #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] =
0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1111,93 +1147,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamps for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators and denominators for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b)
Then, in multichannel mode, also compute the running sums over helicities of the squared jamps (jamp2s) within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set else + assert( ghelAllBlasTmp != nullptr ); // note: this branch should never be reached for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ?
ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1239,7 +1245,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1262,7 +1268,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1271,25 +1277,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1299,8 +1311,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1316,11 +1330,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1422,14 +1437,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h index 13a02cdb83..e99911c34f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The 
MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -80,17 +81,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 7; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 6; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 9; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 6; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 7; //static const int ncomb = 64; // CPPProcess::ncomb @@ -127,23 +128,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -157,34 +161,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig.f index bf9951e502..a781041f7d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig1.f index 24b0abb30c..d25e751436 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -142,7 +142,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF CX1=PDG2PDF(LPP(IB(1)),-4, IB(1),XBK(IB(1)), QSCALE) UX1=PDG2PDF(LPP(IB(1)),-2, IB(1),XBK(IB(1)), QSCALE) @@ -151,7 +151,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) @@ -243,7 +243,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -321,6 +321,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -406,20 +410,20 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) CX1(IVEC)=PDG2PDF(LPP(IB(1)),-4, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) UX1(IVEC)=PDG2PDF(LPP(IB(1)),-2, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) DX1(IVEC)=PDG2PDF(LPP(IB(1)),-1, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -555,6 +559,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. 
IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/color_sum.cc new file mode 100644 index 0000000000..087618686d --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/color_sum.cc @@ -0,0 +1,387 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6] + + // The color matrix (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 27, 9, 9, 3, 3, 9 }, + { 9, 27, 3, 9, 9, 3 }, + { 9, 3, 27, 9, 9, 3 }, + { 3, 9, 9, 27, 3, 9 }, + { 3, 9, 9, 3, 27, 9 }, + { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template<typename T> + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor;
icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA + iAMB - iBMA + BMB, which reduces to AMA + BMB because M is also symmetric (AMB = BMA) + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain in speed here (though not a factor 2...) as we only loop over the upper triangular part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + +
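Side note: the equivalence that the triangular implementation above relies on is easy to check in isolation. The sketch below is a standalone illustration with toy ncolor=3 values and equal denominators (so that the normalized matrix stays symmetric); it is not code from this patch. It verifies that summing the diagonal once and the doubled upper triangle, over real and imaginary parts separately, reproduces the naive complex quadratic form J^dagger (M/denom) J, which is exactly the identity that the constexpr cf2 matrix encodes at compile time:

#include <cassert>
#include <cmath>
#include <complex>

int main()
{
  constexpr int ncolor = 3; // toy size (the process above has ncolor=6)
  // Symmetric toy color matrix and equal denominators (illustrative values only)
  const double colorMatrix[ncolor][ncolor] = { { 27, 9, 3 }, { 9, 27, 9 }, { 3, 9, 27 } };
  const double colorDenom[ncolor] = { 3, 3, 3 };
  const std::complex<double> jamp[ncolor] = { { 1.0, -0.5 }, { -2.0, 0.25 }, { 0.5, 1.5 } };
  // Naive complex quadratic form: ME = sum_ij conj(J_i) * ( M_ij / denom_i ) * J_j
  std::complex<double> meNaive = 0;
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ )
      meNaive += std::conj( jamp[i] ) * ( colorMatrix[i][j] / colorDenom[i] ) * jamp[j];
  // Triangular real form: diagonal once, upper triangle doubled, real and imaginary parts summed separately
  double meTriangular = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztempR = ( colorMatrix[i][i] / colorDenom[i] ) * jamp[i].real();
    double ztempI = ( colorMatrix[i][i] / colorDenom[i] ) * jamp[i].imag();
    for( int j = i + 1; j < ncolor; j++ )
    {
      ztempR += 2 * ( colorMatrix[i][j] / colorDenom[i] ) * jamp[j].real();
      ztempI += 2 * ( colorMatrix[i][j] / colorDenom[i] ) * jamp[j].imag();
    }
    meTriangular += ztempR * jamp[i].real() + ztempI * jamp[i].imag();
  }
  assert( std::abs( meNaive.imag() ) < 1e-12 );                // real symmetric M: the quadratic form is real
  assert( std::abs( meNaive.real() - meTriangular ) < 1e-12 ); // AMA + BMB equals the triangular sum
  return 0;
}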
//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
However, just in case it helps performance, the same striding as in calculate_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from
DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] with Ztemp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Ztemp dot JampsVector ) + beta * ME + // Use gpuBlasTgemmStridedBatched (e.g. cublasSgemmStridedBatched) to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e.
for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/diagram_boilerplate.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/diagrams.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/diagrams.h new file mode 100644 index 0000000000..0c601d8e61 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/diagrams.h @@ -0,0 +1,247 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
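Side note: up to the column-major layout of the matrix in device memory, the two gpuBlasTgemm calls plus the two strided-batched gemms in color_sum_blas compute, for one helicity, the loops sketched below. The helper name and the plain double types are assumptions for this illustration (it is not code from the patch); the "new1" striding is the one documented above, i.e. allJamps[ix2 * ncolor * nevt + icol * nevt + ievt] holds the real (ix2=0) or imaginary (ix2=1) part of jamp(icol) for event ievt.

#include <vector>

// colorSumReference: illustrative reference for what color_sum_blas computes per helicity.
// normColMat[i * ncolor + j] is assumed to hold colorMatrix[i][j] / colorDenom[i].
void colorSumReference( double* allMEs,           // in/out: allMEs[nevt], |M|^2 contribution added for one helicity
                        const double* allJamps,   // input: jamps[2 * ncolor * nevt], "new1" striding
                        const double* normColMat, // input: normalized color matrix [ncolor * ncolor]
                        const int ncolor,
                        const int nevt )
{
  std::vector<double> ztemp( ncolor );
  for( int ix2 = 0; ix2 < 2; ix2++ ) // ix2=0: real parts, ix2=1: imaginary parts
  {
    const double* jamps = allJamps + ix2 * ncolor * nevt;
    for( int ievt = 0; ievt < nevt; ievt++ )
    {
      // Step 1 (the gpuBlasTgemm calls): ztemp[i] = sum_j normColMat[i][j] * jamp_j(ievt)
      for( int i = 0; i < ncolor; i++ )
      {
        ztemp[i] = 0;
        for( int j = 0; j < ncolor; j++ )
          ztemp[i] += normColMat[i * ncolor + j] * jamps[j * nevt + ievt];
      }
      // Step 2 (the 1x1 strided-batched gemms with beta=1): allMEs[ievt] += dot( jamp(ievt), ztemp )
      for( int i = 0; i < ncolor; i++ )
        allMEs[ievt] += jamps[i * nevt + ievt] * ztemp[i];
    }
  }
}

Comparing such a reference against the color_sum_kernel or color_sum_cpu results on a few events is a cheap way to validate a new striding convention or BLAS backend.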
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb-1) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 7 *** + // Wavefunction(s) for diagram number 1 + oxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); + oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + ixxxxx( momenta, 0., cHel[ihel][4], -1, w_fp[4], 4 ); + ixxxxx( momenta, 0., cHel[ihel][5], -1, w_fp[5], 5 ); + FFV1P0_3( w_fp[4], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[6] ); + FFV1P0_3( w_fp[5], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[7] ); + FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); + // Amplitude(s) for diagram number 1 + FFV1_0( w_fp[3], w_fp[8], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 36. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 4.
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 7 *** + // Wavefunction(s) for diagram number 2 + FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); + // Amplitude(s) for diagram number 2 + FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 36. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 7 *** + // Wavefunction(s) for diagram number 3 + FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 3 + VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 4.
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 7 *** + // Wavefunction(s) for diagram number 4 + FFV1_2( w_fp[5], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[3] ); + // Amplitude(s) for diagram number 4 + FFV1_0( w_fp[3], w_fp[1], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 36. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 12. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 7 *** + // Wavefunction(s) for diagram number 5 + FFV1_1( w_fp[1], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[3] ); + // Amplitude(s) for diagram number 5 + FFV1_0( w_fp[5], w_fp[3], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 36. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 12.
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 6 OF 7 *** + // Wavefunction(s) for diagram number 6 + FFV1_2( w_fp[4], w_fp[7], COUPs[1], 1.0, 0., 0., w_fp[3] ); + // Amplitude(s) for diagram number 6 + FFV1_0( w_fp[3], w_fp[0], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 36. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 12. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram7( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is also used #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 7 OF 7 *** + // Wavefunction(s) for diagram number 7 + FFV1_2( w_fp[4], w_fp[8], COUPs[1], 1.0, 0., 0., w_fp[3] ); + // Amplitude(s) for diagram number 7 + FFV1_0( w_fp[3], w_fp[0], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 36. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 12.
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/driver.f index f7f23196eb..fc9392238e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/driver.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/matrix1.f index 2a76dfeffb..14d46077f0 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/matrix1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -360,7 +360,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -408,7 +408,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(8) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -451,39 +452,32 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /2.700000000000000D+01 - $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D - $ +00,3.000000000000000D+00,9.000000000000000D+00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 6) /27,18,18,6,6,18/ C 1 T(1,4) T(2,5) T(3,6) - DATA (CF(I, 2),I= 1, 6) /9.000000000000000D+00 - $ ,2.700000000000000D+01,3.000000000000000D+00,9.000000000000000D - $ +00,9.000000000000000D+00,3.000000000000000D+00/ + DATA (CF(I),I= 7, 11) /27,6,18,18,6/ C 1 T(1,4) T(2,6) T(3,5) - DATA (CF(I, 3),I= 1, 6) /9.000000000000000D+00 - $ ,3.000000000000000D+00,2.700000000000000D+01,9.000000000000000D - $ +00,9.000000000000000D+00,3.000000000000000D+00/ + DATA (CF(I),I= 12, 15) /27,18,18,6/ C 1 T(1,5) T(2,4) T(3,6) - DATA (CF(I, 4),I= 1, 6) /3.000000000000000D+00 - $ ,9.000000000000000D+00,9.000000000000000D+00,2.700000000000000D - $ +01,3.000000000000000D+00,9.000000000000000D+00/ + DATA (CF(I),I= 16, 18) /27,6,18/ C 1 T(1,5) T(2,6) T(3,4) - DATA (CF(I, 5),I= 1, 6) /3.000000000000000D+00 - $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D - $ +00,2.700000000000000D+01,9.000000000000000D+00/ + DATA (CF(I),I= 19, 20) /27,18/ C 1 T(1,6) T(2,4) T(3,5) - DATA (CF(I, 6),I= 1, 6) /9.000000000000000D+00 - $ ,3.000000000000000D+00,3.000000000000000D+00,9.000000000000000D - $ +00,9.000000000000000D+00,2.700000000000000D+01/ + DATA (CF(I),I= 21, 
21) /27/ C 1 T(1,6) T(2,5) T(3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -553,10 +547,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -565,6 +561,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc index 83faf9192b..7fe0dd7a98 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -99,20 +101,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_sm_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_sm_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 6; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -171,57 +169,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: 
couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! 
in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using 
CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -229,489 +282,161 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 11; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! - //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) 
+ fptype* wfs = reinterpret_cast( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g. <> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = 
E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 14 *** - - // Wavefunction(s) for diagram number 1 - oxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - ixxxxx( momenta, 0., cHel[ihel][4], -1, w_fp[4], 4 ); - - ixxxxx( momenta, 0., cHel[ihel][5], -1, w_fp[5], 5 ); - - FFV1P0_3( w_fp[4], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[6] ); - FFV1P0_3( w_fp[5], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[7] ); - FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[3], w_fp[8], w_fp[7], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= 1. / 12. * amp_sv[0]; - jamp_sv[2] -= 1. / 12. * amp_sv[0]; - jamp_sv[3] += 1. / 36. * amp_sv[0]; - jamp_sv[4] += 1. / 4. * amp_sv[0]; - - // *** DIAGRAM 2 OF 14 *** - - // Wavefunction(s) for diagram number 2 - FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 4. * amp_sv[0]; - jamp_sv[1] -= 1. / 12. * amp_sv[0]; - jamp_sv[2] -= 1. / 12. * amp_sv[0]; - jamp_sv[3] += 1. / 36. * amp_sv[0]; - - // *** DIAGRAM 3 OF 14 *** - - // Wavefunction(s) for diagram number 3 - FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 3 - VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] -= 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 4 OF 14 *** - - // Wavefunction(s) for diagram number 4 - FFV1_2( w_fp[5], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[9] ); - - // Amplitude(s) for diagram number 4 - FFV1_0( w_fp[9], w_fp[1], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= 1. / 12. * amp_sv[0]; - jamp_sv[3] += 1. / 36. * amp_sv[0]; - jamp_sv[4] += 1. / 4. * amp_sv[0]; - jamp_sv[5] -= 1. / 12. 
* amp_sv[0]; - - // *** DIAGRAM 5 OF 14 *** - - // Wavefunction(s) for diagram number 5 - FFV1_1( w_fp[1], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[9] ); - - // Amplitude(s) for diagram number 5 - FFV1_0( w_fp[5], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 4. * amp_sv[0]; - jamp_sv[2] -= 1. / 12. * amp_sv[0]; - jamp_sv[3] += 1. / 36. * amp_sv[0]; - jamp_sv[5] -= 1. / 12. * amp_sv[0]; - - // *** DIAGRAM 6 OF 14 *** - - // Wavefunction(s) for diagram number 6 - FFV1P0_3( w_fp[4], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[9] ); - FFV1P0_3( w_fp[5], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[6] ); - FFV1_1( w_fp[2], w_fp[9], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[10] ); - - // Amplitude(s) for diagram number 6 - FFV1_0( w_fp[3], w_fp[10], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 12. * amp_sv[0]; - jamp_sv[1] -= 1. / 4. * amp_sv[0]; - jamp_sv[4] += 1. / 12. * amp_sv[0]; - jamp_sv[5] -= 1. / 36. * amp_sv[0]; - - // *** DIAGRAM 7 OF 14 *** - - // Wavefunction(s) for diagram number 7 - FFV1_2( w_fp[3], w_fp[9], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[10] ); - - // Amplitude(s) for diagram number 7 - FFV1_0( w_fp[10], w_fp[2], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 12. * amp_sv[0]; - jamp_sv[2] -= 1. / 4. * amp_sv[0]; - jamp_sv[4] += 1. / 12. * amp_sv[0]; - jamp_sv[5] -= 1. / 36. * amp_sv[0]; - - // *** DIAGRAM 8 OF 14 *** - - // Wavefunction(s) for diagram number 8 - // (none) - - // Amplitude(s) for diagram number 8 - VVV1_0( w_fp[9], w_fp[6], w_fp[8], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] -= 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 9 OF 14 *** - - // Wavefunction(s) for diagram number 9 - FFV1_2( w_fp[5], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 9 - FFV1_0( w_fp[10], w_fp[0], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 12. * amp_sv[0]; - jamp_sv[1] -= 1. / 4. * amp_sv[0]; - jamp_sv[3] += 1. / 12. * amp_sv[0]; - jamp_sv[5] -= 1. / 36. * amp_sv[0]; - - // *** DIAGRAM 10 OF 14 *** - - // Wavefunction(s) for diagram number 10 - FFV1_1( w_fp[0], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 10 - FFV1_0( w_fp[5], w_fp[10], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 12. * amp_sv[0]; - jamp_sv[2] -= 1. / 4. * amp_sv[0]; - jamp_sv[3] += 1. / 12. * amp_sv[0]; - jamp_sv[5] -= 1. / 36. 
* amp_sv[0]; - - // *** DIAGRAM 11 OF 14 *** - - // Wavefunction(s) for diagram number 11 - FFV1_2( w_fp[4], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 11 - FFV1_0( w_fp[10], w_fp[1], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - jamp_sv[2] -= 1. / 4. * amp_sv[0]; - jamp_sv[3] += 1. / 12. * amp_sv[0]; - jamp_sv[4] += 1. / 12. * amp_sv[0]; - jamp_sv[5] -= 1. / 36. * amp_sv[0]; - // *** DIAGRAM 12 OF 14 *** - - // Wavefunction(s) for diagram number 12 - FFV1_2( w_fp[4], w_fp[8], COUPs[1], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 12 - FFV1_0( w_fp[10], w_fp[1], w_fp[6], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - jamp_sv[1] -= 1. / 4. * amp_sv[0]; - jamp_sv[3] += 1. / 12. * amp_sv[0]; - jamp_sv[4] += 1. / 12. * amp_sv[0]; - jamp_sv[5] -= 1. / 36. * amp_sv[0]; - - // *** DIAGRAM 13 OF 14 *** - - // Wavefunction(s) for diagram number 13 - FFV1_2( w_fp[4], w_fp[7], COUPs[1], 1.0, 0., 0., w_fp[6] ); - - // Amplitude(s) for diagram number 13 - FFV1_0( w_fp[6], w_fp[0], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[0] += 1. / 4. * amp_sv[0]; - jamp_sv[1] -= 1. / 12. * amp_sv[0]; - jamp_sv[3] += 1. / 36. * amp_sv[0]; - jamp_sv[5] -= 1. / 12. 
* amp_sv[0]; - - // *** DIAGRAM 14 OF 14 *** - // Wavefunction(s) for diagram number 14 - // (none) - - // Amplitude(s) for diagram number 14 - FFV1_0( w_fp[10], w_fp[0], w_fp[7], COUPs[1], 1.0, &_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= 1. / 12. * amp_sv[0]; - jamp_sv[3] += 1. / 36. * amp_sv[0]; - jamp_sv[4] += 1. / 4. * amp_sv[0]; - jamp_sv[5] -= 1. / 12. * amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_2_uxux_ttxuxux()?) - - // The color denominators (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6] - - // The color matrix (initialize all array elements, with ncolor=6) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 27, 9, 9, 3, 3, 9 }, - { 9, 27, 3, 9, 9, 3 }, - { 9, 3, 27, 9, 9, 3 }, - { 3, 9, 9, 27, 3, 9 }, - { 3, 9, 9, 3, 27, 9 }, - { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. 
- // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += 
cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 14 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram8, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram9, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram10, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram11, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram12, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram13, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram14, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, 
jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram8( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram9( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram10( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram11( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram12( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram13( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram14( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif -#endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -798,7 +523,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -833,6 +562,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -875,6 +608,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_sm::mdl_MT ); m_masses.push_back( Parameters_sm::ZERO ); m_masses.push_back( Parameters_sm::ZERO ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -977,26 +714,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings, bsmIndepParam ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities 
@@ -1004,25 +741,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! 
+ atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: helicity selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= 
ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -1160,20 +1101,14 @@ namespace mg5amcCpu // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 72 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1185,17 +1120,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); 
+ gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1221,93 +1159,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) Then, In multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream + for( int ighel = 0; 
ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? 
ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1349,7 +1257,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1372,7 +1280,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1381,25 +1289,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1409,8 +1323,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1426,11 +1342,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1532,14 +1449,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h index 0b67fca178..ee62f5cc48 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The 
MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_sm.h" #include @@ -78,17 +79,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 14; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 6; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 11; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 6; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 14; //static const int ncomb = 64; // CPPProcess::ncomb @@ -125,23 +126,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -155,34 +159,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig.f index f8d2319067..dcc832fcc1 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig1.f index f9adb0c2a2..4bef7f631c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig1.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -140,7 +140,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF CX1=PDG2PDF(LPP(IB(1)),-4, IB(1),XBK(IB(1)), QSCALE) SX1=PDG2PDF(LPP(IB(1)),-3, IB(1),XBK(IB(1)), QSCALE) @@ -150,7 +150,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF CX2=PDG2PDF(LPP(IB(2)),-4, IB(2),XBK(IB(2)), QSCALE) SX2=PDG2PDF(LPP(IB(2)),-3, IB(2),XBK(IB(2)), QSCALE) @@ -237,7 +237,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -313,6 +313,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -398,24 +402,24 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) CX1(IVEC)=PDG2PDF(LPP(IB(1)),-4, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) SX1(IVEC)=PDG2PDF(LPP(IB(1)),-3, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) UX1(IVEC)=PDG2PDF(LPP(IB(1)),-2, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) DX1(IVEC)=PDG2PDF(LPP(IB(1)),-1, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) CX2(IVEC)=PDG2PDF(LPP(IB(2)),-4, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) SX2(IVEC)=PDG2PDF(LPP(IB(2)),-3, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) UX2(IVEC)=PDG2PDF(LPP(IB(2)),-2, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) DX2(IVEC)=PDG2PDF(LPP(IB(2)),-1, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -539,6 +543,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. 
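The Q2FACT hunks above all follow one pattern: the factorisation scale is now read with the literal beam slot (1 or 2) instead of the permuted index IB(i), while LPP, XBK and the PDF beam argument keep using IB(i). A one-line C++ analogue of the change (a sketch under that reading; q2fact and the slot convention are assumptions, not the Fortran code itself):

    #include <cmath>
    // Sketch: the scale follows the beam slot (1 or 2, Fortran convention), no longer IB().
    inline double qscaleForSlot( const double q2fact[2], int slot )
    {
      return std::sqrt( q2fact[slot - 1] ); // before the fix: q2fact[ib[slot - 1] - 1]
    }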
IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/color_sum.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/color_sum.cc new file mode 100644 index 0000000000..087618686d --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/color_sum.cc @@ -0,0 +1,387 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 1, 1, 1, 1, 1, 1 }; // 1-D array[6] + + // The color matrix (initialize all array elements, with ncolor=6) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 27, 9, 9, 3, 3, 9 }, + { 9, 27, 3, 9, 9, 3 }, + { 9, 3, 27, 9, 9, 3 }, + { 3, 9, 9, 27, 3, 9 }, + { 3, 9, 9, 3, 27, 9 }, + { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; 
icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + 
//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from 
DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/diagram_boilerplate.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/diagrams.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/diagrams.h new file mode 100644 index 0000000000..146b88ee10 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/diagrams.h @@ -0,0 +1,471 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
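Every diagramN kernel below repeats one pattern: compute the diagram's wavefunctions and a single amplitude, add |amp|^2 to the multichannel numerator (only when channelId equals this diagram's number) and to the denominator (for any non-zero channelId), then scatter the amplitude into the color-ordered jamps with fixed rational (occasionally imaginary) coefficients. A scalar sketch of that pattern (hypothetical helper, not the generated code):

    #include <complex>
    // Sketch of the accumulation performed by each generated diagramN kernel.
    inline void accumulateDiagram( unsigned int channelId,               // 0 disables SDE multichannel mode
                                   unsigned int thisDiagram,             // 1-based number of this diagram
                                   const std::complex<double>& amp,      // the diagram's single amplitude
                                   double& numerator,                    // SDE numerator, running sum
                                   double& denominator,                  // SDE denominator, running sum
                                   std::complex<double>* jamp,           // ncolor color-ordered partial amplitudes
                                   const std::complex<double>* coeff,    // e.g. { 0, -1./12., -1./12., 1./36., 1./4., 0 } for diagram 1
                                   int ncolor )
    {
      const double amp2 = std::norm( amp ); // |amp|^2
      if( channelId == thisDiagram ) numerator += amp2;
      if( channelId != 0 ) denominator += amp2;
      for( int icol = 0; icol < ncolor; icol++ )
        jamp[icol] += coeff[icol] * amp; // fixed color coefficients per diagram
    }

In the generated kernels the coefficients are inlined per diagram and the jamps are updated through J_ACCESS::kernelAccessIcol with the event striding described above.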
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 14 *** + // Wavefunction(s) for diagram number 1 + oxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); + oxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + ixxxxx( momenta, 0., cHel[ihel][4], -1, w_fp[4], 4 ); + ixxxxx( momenta, 0., cHel[ihel][5], -1, w_fp[5], 5 ); + FFV1P0_3( w_fp[4], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[6] ); + FFV1P0_3( w_fp[5], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[7] ); + FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); + // Amplitude(s) for diagram number 1 + FFV1_0( w_fp[3], w_fp[8], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 36. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 4. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 14 *** + // Wavefunction(s) for diagram number 2 + FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); + // Amplitude(s) for diagram number 2 + FFV1_0( w_fp[8], w_fp[2], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 36. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 14 *** + // Wavefunction(s) for diagram number 3 + FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 3 + VVV1_0( w_fp[6], w_fp[7], w_fp[8], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 4. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 14 *** + // Wavefunction(s) for diagram number 4 + FFV1_2( w_fp[5], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[9] ); + // Amplitude(s) for diagram number 4 + FFV1_0( w_fp[9], w_fp[1], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 36. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 12. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 14 *** + // Wavefunction(s) for diagram number 5 + FFV1_1( w_fp[1], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[9] ); + // Amplitude(s) for diagram number 5 + FFV1_0( w_fp[5], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 4. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 12. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 36. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 12. 
* amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 6 OF 14 ***
+    // Wavefunction(s) for diagram number 6
+    FFV1P0_3( w_fp[4], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[9] );
+    FFV1P0_3( w_fp[5], w_fp[0], COUPs[1], 1.0, 0., 0., w_fp[6] );
+    FFV1_1( w_fp[2], w_fp[9], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[10] );
+    // Amplitude(s) for diagram number 6
+    FFV1_0( w_fp[3], w_fp[10], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 12. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 4. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 12. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 36. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram7( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 7 OF 14 ***
+    // Wavefunction(s) for diagram number 7
+    FFV1_2( w_fp[3], w_fp[9], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[10] );
+    // Amplitude(s) for diagram number 7
+    FFV1_0( w_fp[10], w_fp[2], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 12. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 4. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 12. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 36. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram8( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 8 OF 14 ***
+    // Wavefunction(s) for diagram number 8
+    // (none)
+    // Amplitude(s) for diagram number 8
+    VVV1_0( w_fp[9], w_fp[6], w_fp[8], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 4. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 4. * cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram9( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 9 OF 14 ***
+    // Wavefunction(s) for diagram number 9
+    FFV1_2( w_fp[5], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[10] );
+    // Amplitude(s) for diagram number 9
+    FFV1_0( w_fp[10], w_fp[0], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 12. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 4. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 12. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 36. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram10( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 10 OF 14 ***
+    // Wavefunction(s) for diagram number 10
+    FFV1_1( w_fp[0], w_fp[9], COUPs[1], 1.0, 0., 0., w_fp[10] );
+    // Amplitude(s) for diagram number 10
+    FFV1_0( w_fp[5], w_fp[10], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 12. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 4. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 12. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 36. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram11( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 11 OF 14 ***
+    // Wavefunction(s) for diagram number 11
+    FFV1_2( w_fp[4], w_fp[6], COUPs[1], 1.0, 0., 0., w_fp[10] );
+    // Amplitude(s) for diagram number 11
+    FFV1_0( w_fp[10], w_fp[1], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 4. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 12. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 12. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 36. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram12( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 12 OF 14 ***
+    // Wavefunction(s) for diagram number 12
+    FFV1_2( w_fp[4], w_fp[8], COUPs[1], 1.0, 0., 0., w_fp[10] );
+    // Amplitude(s) for diagram number 12
+    FFV1_0( w_fp[10], w_fp[1], w_fp[6], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 4. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 12. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 12. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 36. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram13( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 13 OF 14 ***
+    // Wavefunction(s) for diagram number 13
+    FFV1_2( w_fp[4], w_fp[7], COUPs[1], 1.0, 0., 0., w_fp[6] );
+    // Amplitude(s) for diagram number 13
+    FFV1_0( w_fp[6], w_fp[0], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 4. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 12. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 36. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 12. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram14( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 14 OF 14 ***
+    // Wavefunction(s) for diagram number 14
+    // (none)
+    // Amplitude(s) for diagram number 14
+    FFV1_0( w_fp[10], w_fp[0], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 12. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 36. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 4. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 12. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+/* clang-format on */
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/driver.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/driver.f
index f7f23196eb..fc9392238e 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/driver.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/driver.f
@@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened)
       fopened=.false.
       tempname=filename
       fine=index(tempname,' ')
-c     fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)"
       if(fine.eq.0) fine=len(tempname)
       open(unit=lun,file=tempname,status='old',ERR=20)
       fopened=.true.
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/matrix1.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/matrix1.f
index 35761964e7..a4ee136e47 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/matrix1.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/matrix1.f
@@ -1,7 +1,7 @@
       SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
     $ ICOL)
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+C     Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
@@ -358,7 +358,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
       REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+C     Generated by MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -404,7 +404,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(16) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -447,39 +448,32 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 6) /2.700000000000000D+01 - $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D - $ +00,3.000000000000000D+00,9.000000000000000D+00/ + DATA DENOM/1/ + DATA (CF(I),I= 1, 6) /27,18,18,6,6,18/ C 1 T(1,4) T(2,5) T(3,6) - DATA (CF(I, 2),I= 1, 6) /9.000000000000000D+00 - $ ,2.700000000000000D+01,3.000000000000000D+00,9.000000000000000D - $ +00,9.000000000000000D+00,3.000000000000000D+00/ + DATA (CF(I),I= 7, 11) /27,6,18,18,6/ C 1 T(1,4) T(2,6) T(3,5) - DATA (CF(I, 3),I= 1, 6) /9.000000000000000D+00 - $ ,3.000000000000000D+00,2.700000000000000D+01,9.000000000000000D - $ +00,9.000000000000000D+00,3.000000000000000D+00/ + DATA (CF(I),I= 12, 15) /27,18,18,6/ C 1 T(1,5) T(2,4) T(3,6) - DATA (CF(I, 4),I= 1, 6) /3.000000000000000D+00 - $ ,9.000000000000000D+00,9.000000000000000D+00,2.700000000000000D - $ +01,3.000000000000000D+00,9.000000000000000D+00/ + DATA (CF(I),I= 16, 18) /27,6,18/ C 1 T(1,5) T(2,6) T(3,4) - DATA (CF(I, 5),I= 1, 6) /3.000000000000000D+00 - $ ,9.000000000000000D+00,9.000000000000000D+00,3.000000000000000D - $ +00,2.700000000000000D+01,9.000000000000000D+00/ + DATA (CF(I),I= 19, 20) /27,18/ C 1 T(1,6) T(2,4) T(3,5) - DATA (CF(I, 6),I= 1, 6) /9.000000000000000D+00 - $ ,3.000000000000000D+00,3.000000000000000D+00,9.000000000000000D - $ +00,9.000000000000000D+00,2.700000000000000D+01/ + DATA (CF(I),I= 21, 21) /27/ C 1 T(1,6) T(2,5) T(3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -585,10 +579,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -597,6 +593,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/addmothers.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/addmothers.f index 9a31ed201d..6f32477b9e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/addmothers.f @@ -21,7 +21,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, integer icol ! 
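
The matrix1.f hunk above replaces the full NCOLOR x NCOLOR REAL*8 color matrix by a packed integer upper triangle plus a common DENOM, and lets the inner loop start at J = I. This works because CF is symmetric: each off-diagonal entry is stored once with twice its value, and the missing J < I half of the bilinear sum is recovered through the doubled coefficient. A standalone C++ sketch (illustration only, not part of the patch; the matrix values are copied from the DATA statements above) checking that the triangular sum reproduces the full one:

```cpp
#include <cassert>
#include <cmath>
#include <complex>

int main()
{
  const int ncolor = 6;
  // Full symmetric color matrix, rows as in the old REAL*8 DATA statements
  const double cf[6][6] = { { 27, 9, 9, 3, 3, 9 },
                            { 9, 27, 3, 9, 9, 3 },
                            { 9, 3, 27, 9, 9, 3 },
                            { 3, 9, 9, 27, 3, 9 },
                            { 3, 9, 9, 3, 27, 9 },
                            { 9, 3, 3, 9, 9, 27 } };
  // Packed upper triangle from the new DATA statements: diagonal kept, off-diagonal doubled
  const double cfPacked[21] = { 27, 18, 18, 6, 6, 18, 27, 6, 18, 18, 6, 27, 18, 18, 6, 27, 6, 18, 27, 18, 27 };
  const double denom = 1; // DATA DENOM/1/
  const std::complex<double> jamp[6] = { { 1, 2 }, { -3, 1 }, { 0.5, -1 }, { 2, 0 }, { -1, -1 }, { 0, 3 } };
  double matrixFull = 0, matrixPacked = 0;
  for( int i = 0; i < ncolor; i++ ) // old code: full double loop over CF(J,I)
    for( int j = 0; j < ncolor; j++ )
      matrixFull += ( cf[i][j] * jamp[j] * std::conj( jamp[i] ) ).real();
  int cfIndex = 0;
  for( int i = 0; i < ncolor; i++ ) // new code: J runs from I, CF_INDEX walks the packed triangle
  {
    std::complex<double> ztemp( 0, 0 );
    for( int j = i; j < ncolor; j++ ) ztemp += cfPacked[cfIndex++] * jamp[j];
    matrixPacked += ( ztemp * std::conj( jamp[i] ) ).real();
  }
  matrixPacked /= denom; // one final division replaces the non-integer matrix entries
  // The full bilinear form is real, so taking Re( 2*CF_ij * JAMP_j * conj(JAMP_i) )
  // accounts for both the (i,j) and (j,i) terms of the symmetric sum
  assert( std::abs( matrixFull - matrixPacked ) < 1e-9 );
  return 0;
}
```

Besides halving the storage, the packed integer form avoids NCOLOR*(NCOLOR-1)/2 redundant multiplications per helicity and color-order combination.
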
color selected integer isym(nexternal,99), jsym - integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,nc,ic + integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,ic integer mo_color,da_color(2),itmp integer ito(-nexternal+3:nexternal),iseed,maxcolor,maxorg integer icolalt(2,-nexternal+2:2*nexternal-3) @@ -113,14 +113,15 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif lconfig = vec_igraph1(ivec) endif - + is_LC=.true. + maxcolor=0 c c Choose a color flow which is certain to work with the propagator c structure of the chosen diagram and use that as an alternative c if (icol.eq.0) then do i=1,nexternal - icolalt(1,i)=0 + icolalt(1,i)=0 icolalt(2,i)=0 enddo else diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cluster.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cluster.f index b8995283ed..907894ea89 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cluster.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cluster.f @@ -556,6 +556,8 @@ logical function cluster(p, ivec) jwin = 0 cluster=.false. clustered=.false. + iwin =0 + jwin =0 do i=0,3 pcmsp(i)=0 enddo @@ -665,8 +667,11 @@ logical function cluster(p, ivec) c initialize graph storage igraphs(0)=0 nleft=nexternal -c cluster - if (iwin.eq.0.or.jwin.eq.0) stop 21 + if(iwin.eq.0.or.jwin.eq.0)then + cluster=.false. + return + endif +c cluster do n=1,nexternal-2 c combine winner imocl(n)=imap(iwin,2)+imap(jwin,2) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/color_sum.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/color_sum.h new file mode 100644 index 0000000000..71b5b1eaad --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/color_sum.h @@ -0,0 +1,105 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+
+#ifndef COLOR_SUM_H
+#define COLOR_SUM_H 1
+
+#include "mgOnGpuConfig.h"
+
+#include "mgOnGpuVectors.h"
+
+#include "CPPProcess.h"
+#include "GpuAbstraction.h"
+
+#ifdef MGONGPUCPP_GPUIMPL
+namespace mg5amcGpu
+#else
+namespace mg5amcCpu
+#endif
+{
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  class DeviceAccessJamp
+  {
+  public:
+    static __device__ inline cxtype_ref
+    kernelAccessIcol( fptype* buffer, const int icol )
+    {
+      const int ncolor = CPPProcess::ncolor; // the number of leading colors
+      const int nevt = gridDim.x * blockDim.x;
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last)
+      //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old"
+      // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last)
+      // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS
+      return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1"
+      // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last)
+      //return cxtype_ref( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2"
+    }
+    static __device__ inline const cxtype
+    kernelAccessIcolConst( const fptype* buffer, const int icol )
+    {
+      const int ncolor = CPPProcess::ncolor; // the number of leading colors
+      const int nevt = gridDim.x * blockDim.x;
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last)
+      //return cxtype( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old"
+      // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last)
+      // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS
+      return cxtype( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1"
+      // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last)
+      //return cxtype( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2"
+    }
+  };
+#else
+  class HostAccessJamp
+  {
+  public:
+    static inline cxtype_sv&
+    kernelAccessIcol( cxtype_sv* buffer, const int icol )
+    {
+      return buffer[icol];
+    }
+    static inline cxtype_sv&
+    kernelAccessIcol( fptype* buffer, const int icol )
+    {
+      return reinterpret_cast<cxtype_sv*>( buffer )[icol];
+    }
+  };
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  void createNormalizedColorMatrix();
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifndef MGONGPUCPP_GPUIMPL
+  void
+  color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity
+                 const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity
+                 const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid)
+#endif
+
+
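
The three candidate jamp stridings discussed in the comments above differ only in index arithmetic. A minimal standalone sketch (illustration only; the helper names are hypothetical) of the "old" and "new1" index formulas quoted in DeviceAccessJamp:

```cpp
#include <cassert>

// Index of the real (part=0) or imag (part=1) component of jamp[icol]
// for event ievt, under the two layouts quoted above.
inline int jampIndexOld( int part, int icol, int ievt, int /*ncolor*/, int nevt )
{
  return icol * 2 * nevt + part * nevt + ievt; // "old": one 2*nevt block per color
}
inline int jampIndexNew1( int part, int icol, int ievt, int ncolor, int nevt )
{
  return part * ncolor * nevt + icol * nevt + ievt; // "new1": real plane then imag plane
}

int main()
{
  const int ncolor = 6, nevt = 32;
  // In both layouts consecutive events of one color are contiguous (coalesced access)...
  assert( jampIndexOld( 0, 3, 8, ncolor, nevt ) - jampIndexOld( 0, 3, 7, ncolor, nevt ) == 1 );
  assert( jampIndexNew1( 0, 3, 8, ncolor, nevt ) - jampIndexNew1( 0, 3, 7, ncolor, nevt ) == 1 );
  // ...but only "new1" packs all ncolor*nevt real parts into one dense matrix,
  // which is what allows handing the buffer to a BLAS GEMM without repacking.
  assert( jampIndexNew1( 0, ncolor - 1, nevt - 1, ncolor, nevt ) == ncolor * nevt - 1 );
  return 0;
}
```

This is why the "new1" striding can serve both the plain CUDA kernels and the cuBLAS path with a single buffer.
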
//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+      override HASBLAS = hasNoBlas
+    else
+      override HASBLAS = hasBlas
+    endif
+  else
+    override HASBLAS = hasNoBlas
+  endif
+endif
+
+#-------------------------------------------------------------------------------
+
 #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD
 
 # Set the build flags appropriate to OMPFLAGS
@@ -597,6 +627,30 @@ endif
 #$(info RNDCXXFLAGS=$(RNDCXXFLAGS))
 #$(info RNDLIBFLAGS=$(RNDLIBFLAGS))
 
+#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS
+
+$(info HASBLAS=$(HASBLAS))
+override BLASCXXFLAGS=
+override BLASLIBFLAGS=
+
+# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas")
+ifeq ($(HASBLAS),hasNoBlas)
+  override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS
+else ifeq ($(HASBLAS),hasBlas)
+  ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+    override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas
+  else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build
+    override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas
+  endif
+else
+  $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported)
+endif
+CXXFLAGS += $(BLASCXXFLAGS)
+GPUFLAGS += $(BLASCXXFLAGS)
+
+#$(info BLASCXXFLAGS=$(BLASCXXFLAGS))
+#$(info BLASLIBFLAGS=$(BLASLIBFLAGS))
+
 #-------------------------------------------------------------------------------
 
 #=== Configure Position-Independent Code
@@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
 MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX)
-gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o
+gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o
 gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o
 endif
 
@@ -799,7 +853,7 @@ ifneq ($(GPUCC),)
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib)
-	$(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB)
+	$(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS)
 # Bypass std::filesystem completely to ease portability on LUMI #803
 #ifneq ($(findstring hipcc,$(GPUCC)),)
 #	$(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs
@@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
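
The new BLASLIBFLAGS are needed because, with the "new1" jamp layout, the per-helicity color sum becomes dense linear algebra. A host-side reference sketch (illustration only, not the actual color_sum_gpu implementation; the overall color normalization via createNormalizedColorMatrix and the mixed-precision fptype2 path are omitted) of the computation that a single cublasDgemm/hipblasDgemm call replaces:

```cpp
#include <cassert>
#include <vector>

// With "new1" striding, each jamp component plane is a dense [ncolor x nevt]
// matrix J, and the color sum per event e is
//   ME[e] += sum_i sum_k J[i][e] * CF[i][k] * J[k][e]   (real plane + imag plane),
// i.e. one GEMM T = CF * J followed by a column-wise dot product of J and T.
void colorSumReference( const std::vector<double>& cf, // [ncolor*ncolor] symmetric color matrix
                        const std::vector<double>& jamps, // [2*ncolor*nevt] "new1": real plane then imag plane
                        std::vector<double>& allMEs, // [nevt] output, incremented
                        int ncolor, int nevt )
{
  for( int part = 0; part < 2; part++ ) // 0 = real plane, 1 = imag plane
  {
    const double* j = &jamps[part * ncolor * nevt];
    std::vector<double> t( ncolor * nevt, 0. ); // T = CF * J (the GEMM)
    for( int i = 0; i < ncolor; i++ )
      for( int k = 0; k < ncolor; k++ )
        for( int e = 0; e < nevt; e++ )
          t[i * nevt + e] += cf[i * ncolor + k] * j[k * nevt + e];
    for( int i = 0; i < ncolor; i++ ) // ME[e] += column e of J dot column e of T
      for( int e = 0; e < nevt; e++ )
        allMEs[e] += j[i * nevt + e] * t[i * nevt + e];
  }
}

int main()
{
  // One event, two colors: CF = [[2,1],[1,2]], jamp = (1,2) purely real
  std::vector<double> cf = { 2, 1, 1, 2 };
  std::vector<double> jamps = { 1, 2, 0, 0 }; // real plane (1,2), imag plane (0,0)
  std::vector<double> mes( 1, 0. );
  colorSumReference( cf, jamps, mes, 2, 1 );
  assert( mes[0] == 14. ); // (1,2) . ( CF . (1,2) ) = (1,2) . (4,5)
  return 0;
}
```

The imaginary cross terms drop out because CF is real and symmetric, so only the two real GEMMs are needed; batching all events into one GEMM is what makes offloading to cuBLAS/hipBLAS attractive.
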
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cuts.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cuts.f index 7898714201..bd50ab1357 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cuts.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cuts.f @@ -307,12 +307,18 @@ LOGICAL FUNCTION PASSCUTS(P, VECSIZE_USED) c c Limit S_hat c - if (dsqrt_shat.ne.0d0)then - if (nincoming.eq.2.and.sumdot(p(0,1),p(0,2),1d0) .lt. dsqrt_shat**2) then - passcuts=.false. - return - endif - endif + if(nincoming.eq.2) then + if (dsqrt_shat.ne.0d0.or.dsqrt_shatmax.ne.-1d0)then + xvar = sumdot(p(0,1),p(0,2),1d0) + if (xvar .lt. dsqrt_shat**2)then + passcuts=.false. 
+ return + else if (dsqrt_shatmax.ne.-1d0 .and. xvar .gt. dsqrt_shatmax**2)then + passcuts = .false. + return + endif + endif + endif C $B$ DESACTIVATE_CUT $E$ !This is a tag for MadWeight if(debug) write (*,*) '=============================' diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/diagram_boilerplate.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/diagram_boilerplate.h new file mode 100644 index 0000000000..96a34fb1bf --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/diagram_boilerplate.h @@ -0,0 +1,103 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin. + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-local-typedefs" // for CI_ACCESS and CD_ACCESS + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + + //------------- + // GPU only + //------------- + + //using namespace mg5amcGpu; + using W_ACCESS = DeviceAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = DeviceAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( channelIds ); +#endif + + // Wavefunctions + // Buffer wfs for one helicity and nevt events is a DeviceBufferSimple with ( nwf * nevt * nw6 * nx2 ) fptypes + // The striding between the nwf wavefunction buffers is ( nevt * nw6 * nx2 ) fptypes + // Internally diagramXXX methods pass a w_fp[iwf] to ixx/FFV methods (as argument 'fptype wavefunctions[]') + // Internally ixx/FFV methods call 'cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions )' and then use fi[iw6] + // This means that the fi pointer must point to a [RIRIRIRIRIRI] contiguous buffer of size nw6*nx2=12 + // The striding between events is nw6*nx2=12 and this is what W_ACCESS::kernelAccess must respect + // (En passant, note that this means that events cannot be contiguous in the present code, memory is not coalesced) + const int nevt = gridDim.x * blockDim.x; + fptype* w_fp[nwf]; + for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = wfs + iwf * nevt * nw6 * mgOnGpu::nx2; + + // Couplings + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic push +#pragma nv_diag_suppress 186 // e.g. 
<>
+#endif
+  // Dependent couplings, vary event-by-event
+  for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
+    allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup );
+  // Independent couplings, fixed for all events
+  for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup)
+    allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup );
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
+#pragma nv_diagnostic pop
+#endif
+  const fptype* COUPs[nxcoup];
+  for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
+
+#else
+
+  //-------------
+  // C++ only
+  //-------------
+
+  //using namespace mg5amcCpu;
+  using W_ACCESS = HostAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events
+  using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event
+  using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events
+  using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event
+  using J_ACCESS = HostAccessJamp;
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events
+  using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events
+  // SCALAR channelId for the current SIMD event page (C++)
+  unsigned int channelId = *channelIds;
+#endif
+
+  // Wavefunctions
+  // Reinterpret wfs as "cxtype_sv w_sv[nwf][nw6]" and build "fptype* w_fp[nwf]" where "w_fp[iwf] = (fptype*)( w_sv[iwf] )"
+  fptype (*w_fp)[nw6 * neppV * mgOnGpu::nx2] = (fptype (*)[nw6 * neppV * mgOnGpu::nx2])(wfs);
+
+#endif
+
+  //-------------
+  // GPU or C++
+  //-------------
+
+  // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV)
+  cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram
+  fptype* amp_fp; // proof of concept for using fptype* in the interface
+  amp_fp = reinterpret_cast<fptype*>( amp_sv );
+
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
+  fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+  fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
+#else
+  assert( channelIds == nullptr );
+  assert( numerators == nullptr );
+  assert( denominators == nullptr );
+#endif /* clang-format on */
+
+#pragma GCC diagnostic pop
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/genps.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/genps.f
index 1c32e93f5d..8503bdbec8 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/genps.f
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/genps.f
@@ -1373,6 +1373,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass)
       double precision smin,smax,spole,swidth,s,jac
       double precision x
       logical pass
+      include 'maxparticles.inc'
+      include '../../Source/vector.inc'
+      include 'run.inc'
+      include 'cuts.inc'
 c
 c     Local
 c
@@ -1384,6 +1388,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass)
 c-----
 c     Begin Code
 c-----
+      if (dsqrt_shatmax.ne.-1d0)then
+        smax = min(smax, dsqrt_shatmax**2)
+      endif
+
      pass=.true.
      if (jac .eq. 0 .and. .not.
warned0) then print*,'Input jacobian 0 in genps' @@ -1628,7 +1636,10 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) DOUBLE PRECISION ETA,ETAMIN,ETAMAX logical warned data warned/.false./ - + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' C------------ C BEGIN CODE C------------ @@ -1645,7 +1656,11 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) C IF THERE IS NO S CHANNEL POLE USE BELOW: TAUMIN = 0d0 !SMIN/S !keep scale fix - TAUMAX = 1D0 + if (dsqrt_shatmax.ne.-1d0)then + TAUMAX=dsqrt_shatmax**2/S + else + TAUMAX = 1D0 + endif TAU = (TAUMAX-TAUMIN)*X(1)+TAUMIN SJACOBI= sjacobi*(TAUMAX-TAUMIN) @@ -1915,7 +1930,7 @@ double precision function get_channel_cut(p, config) if(sde_strat.eq.2)then t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) - get_channel_cut = get_channel_cut / ((t-Mass)*(t+Mass)+stot*1d-10)**2 + get_channel_cut = get_channel_cut / (t-Mass**2+stot*1d-10)**2 endif c write(*,*) i, "t, Mass, fact", t, Mass, ((t-Mass)*(t+Mass))**2,get_channel_cut t = t/stot @@ -1930,9 +1945,9 @@ double precision function get_channel_cut(p, config) t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) Width = prwidth(-i, config) - tmp = (t-Mass)*(t+Mass) + tmp = (t-Mass**2) tmp2 = Mass*Width - get_channel_cut = get_channel_cut* (tmp**2 - tmp2**2)/(tmp**2 + tmp2**2)**2 + get_channel_cut = get_channel_cut/(tmp**2 + tmp2**2) endif c write(*,*) i, "s, Mass, Width, fact", t, Mass, Width, (((t-Mass)*(t+Mass) )**2 + Width**2*Mass**2), get_channel_cut endif diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/myamp.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/myamp.f index 9e5f8d44dd..5360566ef4 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/myamp.f @@ -231,6 +231,7 @@ subroutine set_peaks double precision x1,x2,xk(nexternal) double precision dr,mtot,etot,xqfact double precision spmass + double precision stot ! technically the min with dsqrt_shatmax**2 with the physical one integer i, iconfig, l1, l2, j, nt, nbw, iproc, k integer iden_part(-nexternal+1:nexternal) @@ -285,8 +286,8 @@ subroutine set_peaks integer lbw(0:nexternal) !Use of B.W. common /to_BW/ lbw - double precision stot,m1,m2 - common/to_stot/stot,m1,m2 + double precision real_stot,m1,m2 + common/to_stot/real_stot,m1,m2 include 'coupl.inc' ! 
needs VECSIZE_MEMMAX (defined in vector.inc) include 'cuts.inc' @@ -309,6 +310,12 @@ subroutine set_peaks c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1)then + stot = min(real_stot, dsqrt_shatmax**2) + else + stot = real_stot + endif + iconfig = this_config c needs to be initialise to avoid segfault do i = -nexternal,-1 diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/reweight.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/reweight.f index 0a0bafa7c1..189ba41449 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/reweight.f @@ -1186,7 +1186,7 @@ logical function setclscales(p, keepq2bck, ivec) if(nexternal.gt.3) pt2ijcl(nexternal-3)=q2fact(2) else if(.not.fixed_fac_scale1) q2fact(1)=scalefact**2*pt2ijcl(nexternal-2) - if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*q2fact(1) + if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*pt2ijcl(nexternal-2) endif elseif(jcentral(1).eq.0)then if(.not.fixed_fac_scale1) q2fact(1) = scalefact**2*pt2ijcl(jfirst(1)) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/runTest.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/runTest.cc index 4eec5db13c..216a90a302 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/symmetry.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/symmetry.f index 309540a0a2..2afd9e9f75 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/symmetry.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/symmetry.f @@ -232,7 +232,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, c write(*,*) 'mapping',ic,mapconfig(i),icode if (icode .eq. 
0) then c Create format string based on number of digits - write(formstr,'(a,i1,a)') '(I',nconf,'$)' + write(formstr,'(a,i1,a)') '(I',nconf,',$)' write(*,formstr) mapconfig(i) c Write symmetry factors write(formstr2,'(a,i2,a)') '(2i',nsym,')' @@ -242,10 +242,10 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode if(nconf+ncode+1.lt.10) then write(formstr,'(a,i1,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' else write(formstr,'(a,i2,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' endif write(*,formstr) dconfig c Write symmetry factors @@ -260,7 +260,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode write(27,formstr2) dconfig,use_config(i) endif - write(*,'(a$)') ' ' + write(*,'(a,$)') ' ' 100 call bw_increment_array(iarray,imax,ibase,done) enddo else diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/unwgt.f b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/unwgt.f index f602511c94..d1247f1849 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/unwgt.f +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/unwgt.f @@ -497,6 +497,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer ip, np, ic, nc integer ida(2),ito(-nexternal+3:nexternal),ns,nres,ires,icloop integer iseed + double precision beam_mass double precision pboost(0:3) double precision beta, get_betaz double precision ebi(0:3), ebo(0:3) @@ -506,7 +507,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer idup(nexternal,maxproc,maxsproc) integer mothup(2,nexternal) integer icolup(2,nexternal,maxflow,maxsproc) - + double precision eta integer nsym integer ievent @@ -638,21 +639,20 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) if (nincoming.eq.2) then if (xbk(1) .gt. 0d0 .and. xbk(1) .le. 1d0 .and. $ xbk(2) .gt. 0d0 .and. xbk(2) .le. 1d0) then - if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0).and.xbk(2).ne.1d0) then - ! construct the beam momenta in each frame and compute the related (z)boost - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4).and.ebeam(1).gt.10d0*m1)then - local_mass = 0d0 - else - local_mass = m1 - endif + if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0)) then + if((abs(lpp(1)).gt.2.and.abs(lpp(1)).ne.9).or.xbk(1).eq.1d0)then + beam_mass = pmass(1) + else + beam_mass = m1 + endif ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(1) ebo(1) = 0 ebo(2) = 0 - ebo(3) = DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(1).eq.1d0) then pb(0,isym(1,jsym)) = ebo(0) @@ -668,20 +668,19 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo else - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4.and.ebeam(2).gt.10d0*m2))then - local_mass = 0d0 - else - local_mass = m2 - endif - ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam + if((abs(lpp(2)).gt.2.and.abs(lpp(2)).ne.9).or.xbk(2).eq.1d0)then + beam_mass = pmass(2) + else + beam_mass = m2 + endif ebi(0) = p(0,2)/xbk(2) ! 
this assumes that particle 2 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = -1d0*DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = -1d0*DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(2) ebo(1) = 0 ebo(2) = 0 - ebo(3) = -1d0*DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = -1d0*DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(2).eq.1d0) then pb(0,isym(2,jsym)) = ebo(0) @@ -701,6 +700,21 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) write(*,*) 'Warning bad x1 or x2 in write_leshouche', $ xbk(1),xbk(2) endif + do j=1,nexternal + call zboost_with_beta(p(0,j),beta,pb(0,isym(j,jsym))) + pb(4,isym(j,jsym))=pmass(j) + enddo + + ! check for numerical_accuracy + if (pb(0,1).gt.ebeam(1).or.pb(0,2).gt.ebeam(2))then + ! go back to old method --more accurate when boosting with xbk close to one-- + eta = sqrt(xbk(1)*ebeam(1)/(xbk(2)*ebeam(2))) + pboost(0)=p(0,1)*(eta + 1d0/eta) + pboost(3)=p(0,1)*(eta - 1d0/eta) + do j=1,nexternal + call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) + enddo + endif else do j=1,nexternal call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) @@ -709,6 +723,8 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo endif + + if (IMIRROR.eq.2.and.pmass(1).ne.pmass(2)) then c Note that in this context isym(1,jsym) should never be "2" since the mass differ pb(4,isym(1,jsym))=pmass(2) diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/Gridpack/gridrun b/epochX/cudacpp/pp_tt012j.mad/bin/internal/Gridpack/gridrun index 8c8f7d3940..01d4ab53f5 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/Gridpack/gridrun +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/Gridpack/gridrun @@ -91,7 +91,7 @@ import internal.madevent_interface as cmd_interface try: - cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2]) + cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2], nprocs=args[3], maxevts=args[4]) except KeyboardInterrupt: print('Quit on KeyboardInterrupt') diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/Gridpack/run.sh b/epochX/cudacpp/pp_tt012j.mad/bin/internal/Gridpack/run.sh index 20adf572c2..2d149f96be 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/Gridpack/run.sh +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/Gridpack/run.sh @@ -14,6 +14,18 @@ # USAGE : run [num_events] [iseed] ## ############################################################################# +function usage() { + local retcode="${1:-1}" # default return code is 1 + echo "Usage:" + echo " run.sh [options] [num events] [seed]" + echo " run.sh [options] [num events] [seed] [granularity]" + echo "Options:" + echo " -h, --help print this message and exit" + echo " -p, --parallel [num procs] number of processes to run in parallel" + echo " -m, --maxevts [num events] maximum number of unweighted events per job" + exit $retcode +} + if [[ -d ./madevent ]]; then DIR='./madevent' else @@ -32,23 +44,46 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib # For Mac OS X export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib +pos_args=() +nprocs=1 +maxevts=2500 -if [[ ($1 != "") && ("$2" != "") && ("$3" == "") ]]; then - num_events=$1 - seed=$2 - gran=1 -elif [[ ($1 != "") && ("$2" != "") && ("$3" != "") ]]; then - num_events=$1 - seed=$2 - gran=$3 -else - echo "Warning: input is not correct. 
script requires two arguments: NB_EVENT SEED" -fi +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage 0 ;; + -p|--parallel) + nprocs="$2" && shift && shift ;; + -m|--maxevts) + maxevts="$2" && shift && shift ;; + -*) + echo "Error: Unknown option $1" && usage ;; + *) + pos_args+=("$1") && shift ;; + esac +done + +case `echo "${pos_args[@]}" | wc -w | tr -d " "` in + "2") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=1 + ;; + "3") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=${pos_args[2]} + ;; + *) + echo "Error: number of arguments is not correct" + usage + ;; +esac -echo "Now generating $num_events events with random seed $seed and granularity $gran" +echo "Now generating $num_events events with random seed $seed and granularity $gran using $nprocs processes" ############ RUN THE PYTHON CODE ##################### -${DIR}/bin/gridrun $num_events $seed $gran +${DIR}/bin/gridrun $num_events $seed $gran $nprocs $maxevts ######################################################## ########### POSTPROCESSING ##################### diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py index 42d82818d0..2efb5954a6 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py @@ -353,7 +353,7 @@ def modify_init_cross(self, cross, allow_zero=False): assert "init" in self cross = dict(cross) - for key in cross.keys(): + for key in list(cross.keys()): if isinstance(key, str) and key.isdigit() and int(key) not in cross: cross[int(key)] = cross[key] @@ -1991,6 +1991,11 @@ def default_setup(self): self.add_param("PartonLevel:FSRinResonances", True, hidden=True, always_write_to_card=False, comment="Do not allow shower to run from decay product of unstable particle") self.add_param("ProcessLevel:resonanceDecays", True, hidden=True, always_write_to_card=False, comment="Do not allow unstable particle to decay.") + # Parameters only needed for main164 type of run (not pythia8/MG5 interface) + self.add_param("Main:HepMC", True, hidden=True, always_write_to_card=False, + comment="""Specify the type of output to be used by the main164 run. """) + self.add_param("HepMC:output", 'hepmc.gz', hidden=True, always_write_to_card=False, + comment="Specify the HepMC output file to be used by the main164 run.") # Add parameters controlling the subruns execution flow. # These parameters should not be part of PY8SubRun daughter. self.add_default_subruns('parameters') @@ -2087,8 +2092,10 @@ def MadGraphSet(self, name, value, **opts): force = False if name.lower() not in self or (force or name.lower() not in self.user_set): self.__setitem__(name, value, change_userdefine=False, **opts) - self.system_set.add(name.lower()) - + self.system_set.add(name.lower()) + else: + raise Exception("The parameter %s is already set to %s. You can not change it." 
% (name, self[name])) + def defaultSet(self, name, value, **opts): self.__setitem__(name, value, change_userdefine=False, **opts) @@ -2144,9 +2151,19 @@ def pythia8_formatting(value, formatv=None): else: return ','.join([PY8Card.pythia8_formatting(arg) for arg in value]) + #change of name convention between MG5 old interface and main164 from Pythia8 + interface_to_164 = {'HEPMCoutput:file': 'HepMC:output', + 'SysCalc:fullCutVariation': '!SysCalc:fullCutVariation (not supported with 164)', + 'SysCalc:qCutList': '!SysCalc:qCutList (not supported with 164)', + 'SysCalc:qWeed': '!SysCalc:qWeed (not supported with 164)', + 'SysCalc:tmsList': '!SysCalc:tmsList (not supported with 164)', + 'HEPMCoutput:scaling' : '!HEPMCoutput :scaling (not supported with 164)', + 'LHEFInputs:nSubruns' : 'Main:numberOfSubruns'} + def write(self, output_file, template, read_subrun=False, - print_only_visible=False, direct_pythia_input=False, add_missing=True): + print_only_visible=False, direct_pythia_input=False, add_missing=True, + use_mg5amc_py8_interface=False): """ Write the card to output_file using a specific template. > 'print_only_visible' specifies whether or not the hidden parameters should be written out if they are in the hidden_params_to_always_write @@ -2155,7 +2172,12 @@ def write(self, output_file, template, read_subrun=False, in the self.visible_params_to_always_write list and are not user_set or system_set are commented. > If 'add_missing' is False then parameters that should be written_out but are absent - from the template will not be written out.""" + from the template will not be written out. + > use_mg5amc_py8_interface is a flag to indicate that the MG5aMC-PY8 interface is used or not + if not used some parameters need to be translated from the old convention to the new one + """ + + self.use_mg5amc_py8_interface = use_mg5amc_py8_interface # First list the visible parameters visible_param = [p for p in self if p.lower() not in self.hidden_param @@ -2297,7 +2319,16 @@ def group_params(params): else: # Just copy parameters which don't need to be specified if param.lower() not in self.params_to_never_write: - output.write(line) + + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param.strip()] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + output.write('%s=%s\n'%(param_entry,new_value)) + else: + output.write(line) else: output.write('! The following parameter was forced to be commented out by MG5aMC.\n') output.write('! 
%s'%line) @@ -2313,6 +2344,7 @@ def group_params(params): if ((not direct_pythia_input) or (param.lower() in self.visible_params_to_always_write) or (param.lower() in self.user_set) or + (param.lower() in self.hidden_params_to_always_write) or (param.lower() in self.system_set)): template = '%s=%s' else: @@ -2321,6 +2353,19 @@ def group_params(params): # then they shouldn't be passed to Pythia template = '!%s=%s' + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + if 'Main:InternalAnalysis'.lower() in self.user_set and \ + self['Main:InternalAnalysis'].lower() == 'on': + output.write('InternalAnalysis:output = ./djrs.dat\n') + + #elif param in self.interface_to_164.values() and not direct_pythia_input: + # misc.sprint(use_mg5amc_py8_interface, direct_pythia_input,param) + # raise Exception('The parameter %s is not supported in the MG5aMC-PY8 interface. Please use the new interface.'%param_entry output.write(template%(param_entry, value_entry.replace(value,new_value))) @@ -2365,6 +2410,8 @@ def group_params(params): comment = '\n'.join('! %s'%c for c in self.comments[param.lower()].split('\n')) output.write(comment+'\n') + if not use_mg5amc_py8_interface and param in self.interface_to_164: + continue output.write('%s=%s\n'%(param,PY8Card.pythia8_formatting(self[param]))) # Don't close the file if we were reading a subrun, but simply write @@ -3318,7 +3365,6 @@ def retro_compatible_custom_fct(lines, mode=None): for i,line in enumerate(lines[:]): if search and re.search(include_pat, line): name = re.findall(include_pat, line)[0] - misc.sprint('DETECTED INCLUDE', name) if 'vector.inc' in name: search = False if 'run.inc' in name: @@ -3326,7 +3372,6 @@ def retro_compatible_custom_fct(lines, mode=None): search = False sol.append(line) if re.search(function_pat, line): - misc.sprint("DETECTED FCT") search = True return sol @@ -4050,8 +4095,8 @@ def post_set_fixed_fac_scale(card, value, change_userdefine, raiseerror, **opt): if 'fixed_fac_scale2' in card.user_set: card.user_set.remove('fixed_fac_scale2') - # #card['pdlabel1'] = value - # #card['pdlabel2'] = value + dict.__setitem__(card, 'fixed_fac_scale1', card['fixed_fac_scale']) + dict.__setitem__(card, 'fixed_fac_scale2', card['fixed_fac_scale']) @staticmethod def post_set(card, value, change_userdefine, raiseerror, name='unknown', **opt): @@ -4201,6 +4246,7 @@ def default_setup(self): self.add_param("bwcutoff", 15.0) self.add_param("cut_decays", False, cut='d') self.add_param('dsqrt_shat',0., cut=True) + self.add_param('dsqrt_shatmax', -1, cut=True) self.add_param("nhel", 0, include=False) self.add_param("limhel", 1e-8, hidden=True, comment="threshold to determine if an helicity contributes when not MC over helicity.") #pt cut @@ -4451,11 +4497,11 @@ def check_validity(self): time.sleep(5) if self['drjj'] != 0: if 'drjj' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjj\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjj\' to 0') self['drjj'] = 0 if self['drjl'] != 0: if 'drjl' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjl\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjl\' to 0') self['drjl'] = 0 if not self['auto_ptj_mjj']: if self['mmjj'] > self['xqcut']: @@ -4753,7 +4799,6 @@ def create_default_for_process(self, 
proc_characteristic, history, proc_def): self['fixed_fac_scale1'] = True self['nhel'] = 1 for i in beam_id_split[1]: - exit if abs(i) == 11: self['lpp1'] = -math.copysign(3,i) self['lpp2'] = math.copysign(3,i) @@ -5577,6 +5622,9 @@ def default_setup(self): #technical self.add_param('folding', [1,1,1], include=False) + + #bias + self.add_param('flavour_bias',[5,1], hidden=True, comment="Example: '5,100' means that the probability to generate an event with a bottom (or anti-bottom) quark is increased by a factor 100, but the weight of those events is reduced by a factor 100. Requires that the 'event_norm' is set to 'bias'.") #merging self.add_param('ickkw', 0, allowed=[-1,0,3,4], comment=" - 0: No merging\n - 3: FxFx Merging : http://amcatnlo.cern.ch/FxFx_merging.htm\n - 4: UNLOPS merging (No interface within MG5aMC)\n - -1: NNLL+NLO jet-veto computation. See arxiv:1412.8408 [hep-ph]") @@ -5790,6 +5838,17 @@ def check_validity(self): if self['mcatnlo_delta'] and not self['parton_shower'].lower() == 'pythia8': raise InvalidRunCard("MC@NLO-DELTA only possible with matching to Pythia8") + # check that the flavour_bias is consistent + if len(self['flavour_bias']) != 2: + raise InvalidRunCard("'flavour_bias' should contain exactly two numbers: the abs(PDG) of the flavour to enhance, and the enhancement multiplication factor.") + for i in self['flavour_bias']: + if i < 0: + raise InvalidRunCard("flavour and multiplication factor should be positive in the flavour_bias parameter") + if self['flavour_bias'][1] != 1 and self['event_norm'] != 'bias': + logger.warning('Non-trivial flavour enhancement factor: setting event normalisation to "bias"') + self['event_norm']='bias' + + # check that ebeam is bigger than the proton mass. for i in [1,2]: # do not for proton mass if not proton PDF (or when scan initialization) diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/check_param_card.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/check_param_card.py index bc785b5de6..a34705f6bc 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/check_param_card.py @@ -1092,11 +1092,11 @@ def write_summary(self, path, order=None, lastline=False, nbcol=20): to_print = self.cross[-1:] for info in to_print: name = info['run_name'] - bench = info['bench'] + bench = [float(x) for x in info['bench']] data = [] for k in keys: if k in info: - data.append(info[k]) + data.append(float(info[k])) else: data.append(0.) 
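A note on the 'flavour_bias' run_card parameter added above: enhancing the sampling probability of one flavour by a factor while dividing the weights of those events by the same factor leaves every weighted average unchanged in expectation, which is why 'event_norm' must be switched to 'bias'. A toy sketch of that bookkeeping (illustrative only, not the MadEvent implementation; all names and numbers below are ours):

    import random

    def sample_flavour(p_bottom=0.05, factor=100.0):
        """Toy biased sampler: returns (is_bottom, event_weight)."""
        # biased probability of picking a bottom-flavour event
        p_biased = p_bottom * factor / (p_bottom * factor + (1.0 - p_bottom))
        if random.random() < p_biased:
            # enhanced probability, compensated (smaller) weight
            return True, p_bottom / p_biased
        return False, (1.0 - p_bottom) / (1.0 - p_biased)

    # the weighted bottom fraction stays close to p_bottom, so the physics is unchanged
    draws = [sample_flavour() for _ in range(100000)]
    frac = sum(w for b, w in draws if b) / sum(w for _, w in draws)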
ff.write(formatting % tuple([name] + bench + data)) diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/cluster.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/cluster.py index 95ef45b5f3..66069293d2 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/cluster.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/cluster.py @@ -602,6 +602,7 @@ def __init__(self, *args, **opt): self.submitted = six.moves.queue.Queue() # one entry by job submitted self.stoprequest = threading.Event() #flag to ensure everything to close self.demons = [] + self.gpus_list = [] self.nb_done =0 if 'nb_core' in opt: self.nb_core = opt['nb_core'] @@ -623,23 +624,46 @@ def __init__(self, *args, **opt): self.done_pid_queue = six.moves.queue.Queue() self.fail_msg = None + mg5_gpu_env_str = 'MG5_GPU_VISIBLE_DEVICES' + gpu_variables = [['NVIDIA_VISIBLE_DEVICES', 'CUDA_VISIBLE_DEVICES'], + ['ROCR_VISIBLE_DEVICES', 'HIP_VISIBLE_DEVICES'],] + if mg5_gpu_env_str in os.environ: + new_var = os.environ[mg5_gpu_env_str].split(',') + if len(new_var) == 2: + gpu_variables.insert(0, new_var) + else: + logger.error('Invalid format for %s=%s, it should be a comma-separated list of two elements' % (mg5_gpu_env_str, os.environ[mg5_gpu_env_str])) + for get_var,set_var in gpu_variables: + if get_var in os.environ: + self.gpus_list = os.environ.get(get_var).split(',') + self.gpu_set_var = set_var + self.gpus_count = len(self.gpus_list) + logger.info('Found %s GPUs: %s' % (self.gpus_count, self.gpus_list)) def start_demon(self): import threading - t = threading.Thread(target=self.worker) + env2 = None + if len(self.gpus_list): + env2 = os.environ.copy() + this_gpu_idx = len(self.demons) % self.gpus_count + env2[self.gpu_set_var] = self.gpus_list[this_gpu_idx] + t = threading.Thread(target=self.worker, kwargs={'env2': env2}) + else: + t = threading.Thread(target=self.worker) t.daemon = True t.start() self.demons.append(t) - def worker(self): + def worker(self, env2=None): import six.moves.queue import six.moves._thread while not self.stoprequest.isSet(): try: args = self.queue.get(timeout=10) tag, exe, arg, opt = args + opt['env'] = env2 try: # check for executable case if isinstance(exe,str): diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/common_run_interface.py index 9ff7390cf5..69291df0d4 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/common_run_interface.py @@ -750,8 +750,8 @@ def __init__(self, me_dir, options, *args, **opts): else: self.ninitial = self.proc_characteristics['ninitial'] - def make_make_all_html_results(self, folder_names = [], jobs=[]): - return sum_html.make_all_html_results(self, folder_names, jobs) + def make_make_all_html_results(self, folder_names = [], jobs=[], get_attr=None): + return sum_html.make_all_html_results(self, folder_names, jobs, get_attr) def write_RunWeb(self, me_dir): @@ -1463,11 +1463,15 @@ def create_plot(self, mode='parton', event_path=None, output=None, tag=None): self.run_name, '%s_pts.dat' % tag) for observable_name, data_path in [('djr',djr_path), ('pt',pt_path)]: - if not self.generate_Pythia8_HwU_plots( + try: + if not self.generate_Pythia8_HwU_plots( PY8_plots_root_path, merging_scale_name, observable_name,data_path): - return False - + return False + except Exception as error: + if os.path.exists(data_path): + logger.info('plot information present in %s' % data_path) + return True if mode == 'Pythia8': plot_files = 
glob.glob(pjoin(PY8_plots_root_path,'*.gnuplot')) if not misc.which('gnuplot'): @@ -1964,12 +1968,16 @@ def do_systematics(self, line): self.cluster.wait(os.path.dirname(output), update_status, update_first=update_status) except Exception: self.cluster.remove() + for i in range(nb_submit): + os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) old_run_mode = self.options['run_mode'] self.options['run_mode'] =0 + out =False try: out = self.do_systematics(line) finally: self.options['run_mode'] = old_run_mode + return out #collect the data all_cross = [] for i in range(nb_submit): @@ -1995,18 +2003,21 @@ def do_systematics(self, line): self.run_card['event_norm'] in ['unity']: all_cross= [cross/nb_event for cross in all_cross] - sys_obj = systematics.call_systematics([input, None] + opts, - log=lambda x: logger.info(str(x)), - result=result_file, - running=False - ) + + sys_obj = systematics.call_systematics([input, None] + opts, + log=lambda x: logger.info(str(x)), + result=result_file, + running=False + ) + sys_obj.print_cross_sections(all_cross, nb_event, result_file) - + #concatenate the output file subprocess.call(['cat']+\ ['./tmp_%s_%s' % (i, os.path.basename(output)) for i in range(nb_submit)], stdout=open(output,'w'), cwd=os.path.dirname(output)) + for i in range(nb_submit): os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) # os.remove('%s/log_sys_%s.txt' % (os.path.dirname(output),i)) @@ -3831,7 +3842,7 @@ def store_scan_result(self): """return the information that need to be kept for the scan summary. Auto-width are automatically added.""" - return {'cross': self.results.current['cross']} + return {'cross': self.results.current['cross'], 'error': self.results.current['error']} def add_error_log_in_html(self, errortype=None): @@ -5135,10 +5146,10 @@ def init_run(self, cards): self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), - 'lhc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), - 'lcc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), @@ -6740,7 +6751,15 @@ def postcmd(self, stop, line): return ending_question - + def help_update(self): + logger.info(""" syntax: update dependent: Change the mass/width of particles which are not free parameter for the model. + update missing: add to the current param_card missing blocks/parameters. + update to_slha1: pass SLHA2 card to SLHA1 convention. (beta) + update to_slha2: pass SLHA1 card to SLHA2 convention. 
(beta) + update to_full [run_card] + update XXX [where XXX corresponds to a hidden block of the run_card]: + supported blocks are %s + """, ', '.join(self.update_block)) def do_update(self, line, timer=0): @@ -6756,6 +6775,8 @@ def do_update(self, line, timer=0): logger.warning('miss an argument (dependent or missing). Please retry') return + args[0] = args[0].lower() + if args[0] == 'dependent': if not self.mother_interface: logger.warning('Failed to update dependent parameter. This might create trouble for external program (like MadSpin/shower/...)') @@ -6805,10 +6826,11 @@ def do_update(self, line, timer=0): self.modified_card.add('run') # delay writing of the run_card logger.info('add optional block %s to the run_card', args[0]) else: - self.help_update() + self.do_help('update') logger.warning('invalid options for update command. Please retry') + def update_to_full(self, line): """ trigger via update to_full LINE""" diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/extended_cmd.py index 789976beee..c321fd88e5 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/extended_cmd.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/extended_cmd.py @@ -1317,6 +1317,8 @@ def nice_error_handling(self, error, line): debug_file = open(self.debug_output, 'a') traceback.print_exc(file=debug_file) + if __debug__: + traceback.print_exc() if hasattr(error, 'filename'): debug_file.write("Related File: %s\n" % error.filename) # Create a nice error output @@ -1928,7 +1930,8 @@ def do_display(self, line, output=sys.stdout): for i, name in enumerate(split): try: __import__('.'.join(split[:i+1])) - exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1]))) + tmp = {} + exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1])), globals(),tmp) except ImportError: try: var = eval(args[1]) @@ -1939,7 +1942,7 @@ def do_display(self, line, output=sys.stdout): outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) else: - var = eval(args[1]) + var = eval(args[1], globals(), tmp) outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/file_writers.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/file_writers.py index 526756129f..74ba0d195c 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/file_writers.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/file_writers.py @@ -140,10 +140,6 @@ def preprocess_template(self, input_lines, context={}): else: raise self.FileWriterError("%s not string" % repr(input_lines)) - # Setup the contextual environment - for contextual_variable, value in context.items(): - exec('%s=%s'%(str(contextual_variable),repr(value))) - res = [] # The variable below tracks the conditional statements structure if_stack = [] @@ -166,7 +162,7 @@ def preprocess_template(self, input_lines, context={}): # Treat an if statement elif preproc_command.group('command')=='if': try: - if_stack.append(eval(preproc_command.group('body'))==True) + if_stack.append(eval(preproc_command.group('body'), globals(), context)==True) except Exception as e: raise self.FilePreProcessingError('Could not evaluate'+\ "python expression '%s' given the context %s provided."%\ diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/files.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/files.py index 551b71ddb6..3061b007e7 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/files.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/files.py @@ -147,9 
+147,14 @@ def cp(path1, path2, log=True, error=False): path2 = format_path(path2) try: shutil.copy(path1, path2) + except shutil.Error as why: + logger.debug('no cp since identical: %s', why) + return except IOError as why: import madgraph.various.misc as misc try: + if 'same file' in str(why): + return if os.path.exists(path2): path2 = os.path.join(path2, os.path.split(path1)[1]) misc.copytree(path1, path2) @@ -157,12 +162,10 @@ def cp(path1, path2, log=True, error=False): if error: raise if log: - logger.warning(why) + logger.warning("fail to cp %s %s: %s", path1, path2, why) else: - misc.sprint("fail to cp", why) - except shutil.Error: - # idetical file - pass + misc.sprint("fail to cp", path1, path2, why) + def rm(path, log=True): """removes path, that can be a single element or a list""" diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_cardhtml-pl b/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_cardhtml-pl index 1810c6c082..6e0e06533d 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_cardhtml-pl +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_cardhtml-pl @@ -137,7 +137,7 @@ until($listpos>$#incard){ print PAGE " Model: $model \n"; print PAGE " \n \n
\n"; print PAGE " \n"; - print PAGE "\"\" \n"; + print PAGE "\"\" \n"; print PAGE "
\n"; print PAGE " \n \n \n"; print PAGE " \n"; diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_crossxhtml.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_crossxhtml.py index 681bf9d09b..3114a4350c 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_crossxhtml.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_crossxhtml.py @@ -133,7 +133,7 @@ class AllResults(dict): web = False - _run_entries = ['cross', 'error','nb_event_pythia','run_mode','run_statistics', + _run_entries = ['cross', 'error','axsec','nb_event_pythia','run_mode','run_statistics', 'nb_event','cross_pythia','error_pythia', 'nb_event_pythia8','cross_pythia8','error_pythia8', 'shower_dir'] diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_jpeg-pl b/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_jpeg-pl index 87d03da394..31b7e9fe55 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_jpeg-pl +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_jpeg-pl @@ -1,16 +1,16 @@ #!/usr/bin/perl -w #--------------------------------------------------------------------- -# Run GS to create jpeg files defined as $gs +# Run GS to create PNG files defined as $gs #--------------------------------------------------------------------- -system("/bin/bash -c \"rm -f matrix*.jpg\" "); +system("/bin/bash -c \"rm -f matrix*.png\" "); $imatrix = ""; if (! -e "matrix.ps") {$imatrix = 1;} -$max_jpg = 2; -if ($imatrix eq "") {$max_jpg = 5;} -# add 1 to max_jpg, to get max_jpg pages -$max_jpg += 1; +$max_png = 2; +if ($imatrix eq "") {$max_png = 5;} +# add 1 to max_png, to get max_png pages +$max_png += 1; open(PAGE,"> diagrams.html") || die "Error creating diagrams.html"; print PAGE "\ \n"; print PAGE "\ \n"; @@ -21,22 +21,22 @@ while ( -e "matrix$imatrix.ps"){ open(IN, "< matrix$imatrix.ps") || die "No file matrix$imatrix.ps"; open(OUT, "> matrix-1.ps") || die "Could not open file matrix-1.ps"; while () { - if ($_ =~ m/^%%Page: $max_jpg $max_jpg/) {last;} + if ($_ =~ m/^%%Page: $max_png $max_png/) {last;} else {print OUT $_, "\n";} } close(OUT); close(IN); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=matrix$imatrix\%00d.jpg \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-r150 \-sOutputFile\=matrix$imatrix\%00d.png \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; system "rm -f matrix-1.ps"; -# Determine how many jpg files we have +# Determine how many png files we have $pages=1; - while(-e "matrix$imatrix$pages.jpg"){ + while(-e "matrix$imatrix$pages.png"){ $pages++; }#end of while #reduce it by one - if ($pages > $max_jpg){ + if ($pages > $max_png){ $pages -= 1; } # Find name of process @@ -45,24 +45,24 @@ while ( -e "matrix$imatrix.ps"){ if ($proc =~ /Process: (.+?)(\s\w+=\d+)*$/) { $proc = $1; } print PAGE "
\<B\>$proc\<\/B\>\<BR\> \n"; - if (-e "matrix$imatrix$max_jpg.jpg" ) { - print PAGE "\<BR\>\<BR\> To save bandwidth not all diagrams were converted to jpeg."; + if (-e "matrix$imatrix$max_png.png" ) { + print PAGE "\<BR\>\<BR\> To save bandwidth not all diagrams were converted to PNG."; print PAGE "\<BR\>\<BR\> 
To view all diagrams click on "; print PAGE "\ postscript. \<\/A\> \ \n"; # # Delete files which aren't included in diagrams.html # - system ("/bin/bash -c \"rm -f matrix$max_jpg.jpg\" "); + system ("/bin/bash -c \"rm -f matrix$max_png.png\" "); } # -# Now create jpeg file for card +# Now create PNG file for card # - if (! -e "../../HTML/card.jpg") { + if (! -e "../../HTML/card.png") { system ("/bin/bash -c \"head -352 matrix$imatrix.ps >& junk.ps\" "); open(JUNK,">> junk.ps") || die "Error opening junk.ps"; @@ -72,7 +72,7 @@ while ( -e "matrix$imatrix.ps"){ system ("/bin/bash -c \"cat matrix$imatrix.ps | sed 1,352d >> junk.ps\" "); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=card.jpg \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.jpg ../../HTML/card.jpg > /dev/null\" "; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-sOutputFile\=card.png \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.png ../../HTML/card.png > /dev/null\" "; } if ($imatrix eq "") {$imatrix = 0;} $imatrix = $imatrix + 1; @@ -82,3 +82,4 @@ print PAGE "\n"; print PAGE "\<\/BODY\> \n"; print PAGE "\<\/HTML\> \n"; close(PAGE); + diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_ximprove.py index 415ecc9de0..d5d7fc8faf 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/gen_ximprove.py @@ -30,6 +30,7 @@ import stat import sys import six +import time from six.moves import range from six.moves import zip @@ -304,6 +305,7 @@ def get_helicity(self, to_submit=True, clean=True): logger.debug('(%s) nb_hel: %s zero amp: %s bad_amps_hel: %s/%s', split_file[-1], len(good_hels),len(bad_amps),len(bad_amps_perhel), len(good_hels)*nb_amp ) if len(good_hels) == 1: files.cp(matrix_file, matrix_file.replace('orig','optim')) + files.cp(matrix_file.replace('.f','.o'), matrix_file.replace('orig','optim').replace('.f','.o')) continue # avoid optimization if onlye one helicity gauge = self.cmd.proc_characteristics['gauge'] @@ -1059,6 +1061,7 @@ def __init__(self, cmd, opt=None): # parameter for the gridpack run self.nreq = 2000 self.iseed = 4321 + self.maxevts = 2500 # placeholder for information self.results = 0 #updated in launch/update_html @@ -1200,6 +1203,10 @@ def reset_multijob(self): def write_multijob(self, Channel, nb_split): """ """ if nb_split <=1: + try: + os.remove(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat')) + except OSError: + pass return f = open(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat'), 'w') f.write('%i\n' % nb_split) @@ -1828,17 +1835,17 @@ class gen_ximprove_gridpack(gen_ximprove_v4): max_request_event = 1e12 # split jobs if a channel if it needs more than that max_event_in_iter = 4000 min_event_in_iter = 500 - combining_job = sys.maxsize gen_events_security = 1.00 - def __new__(cls, *args, **opts): + def __new__(cls, cmd, opts): cls.force_class = 'gridpack' - return super(gen_ximprove_gridpack, cls).__new__(cls, *args, **opts) + return super(gen_ximprove_gridpack, cls).__new__(cls, cmd, opts) - def __init__(self, *args, **opts): + def __init__(self, cmd, opts): self.ngran = -1 + self.nprocs = 1 self.gscalefact = {} self.readonly = False if 'ngran' in opts: @@ -1846,9 +1853,18 @@ def __init__(self, *args, **opts): # del opts['ngran'] if 'readonly' in opts: self.readonly = opts['readonly'] - super(gen_ximprove_gridpack,self).__init__(*args, **opts) + if 'nprocs' in 
opts: + self.nprocs = int(opts['nprocs']) + if 'maxevts' in opts and self.nprocs > 1: + self.max_request_event = int(opts['maxevts']) + super(gen_ximprove_gridpack,self).__init__(cmd, opts) if self.ngran == -1: self.ngran = 1 + + if self.nprocs > 1: + self.combining_job = 0 + else: + self.combining_job = sys.maxsize def find_job_for_event(self): """return the list of channel that need to be improved""" @@ -1876,8 +1892,8 @@ def find_job_for_event(self): continue # no event to generate events self.gscalefact[tag] = max(1, 1/(goal_lum * C.get('axsec')/ self.ngran)) #need to generate events - logger.debug('request events for ', C.get('name'), 'cross=', - C.get('axsec'), 'needed events = ', goal_lum * C.get('axsec')) + logger.debug('request events for %s cross=%g needed events = %g', + C.get('name'), C.get('axsec'), goal_lum * C.get('axsec')) to_refine.append(C) logger.info('need to improve %s channels' % len(to_refine)) @@ -1897,8 +1913,13 @@ def get_job_for_event(self): for C in to_refine: #1. Compute the number of points that are needed to reach the target needed_event = max(goal_lum*C.get('axsec'), self.ngran) - nb_split = 1 - + nb_split = int(max(1,((needed_event-1)// self.max_request_event) +1)) + if not self.split_channels: + nb_split = 1 + if nb_split > self.max_splitting: + nb_split = self.max_splitting + nb_split = max(1, nb_split) + #2. estimate how many points we need in each iteration if C.get('nunwgt') > 0: nevents = needed_event / nb_split * (C.get('nevents') / C.get('nunwgt')) @@ -1908,13 +1929,16 @@ def get_job_for_event(self): nevents = self.max_event_in_iter if nevents < self.min_event_in_iter: + nb_split = int(nb_split * nevents / self.min_event_in_iter) + 1 # sr dangerous? nevents = self.min_event_in_iter # # forbid too low/too large value nevents = max(self.min_event_in_iter, min(self.max_event_in_iter, nevents)) logger.debug("%s : need %s event. Need %s split job of %s points", C.name, needed_event, nb_split, nevents) - + # write the multi-job information + self.write_multijob(C, nb_split) + #create the info dict assume no splitting for the default info = {'name': self.cmd.results.current['run_name'], 'script_name': 'unknown', @@ -1925,7 +1949,7 @@ def get_job_for_event(self): 'nevents': nevents, #int(nevents*self.gen_events_security)+1, 'maxiter': self.max_iter, 'miniter': self.min_iter, - 'precision': -1*int(needed_event)/C.get('axsec'), + 'precision': -goal_lum/nb_split, # -1*int(needed_event)/C.get('axsec'), 'requested_event': needed_event, 'nhel': self.run_card['nhel'], 'channel': C.name.replace('G',''), @@ -1938,27 +1962,59 @@ def get_job_for_event(self): basedir = pjoin(os.path.dirname(__file__), '..','..','SubProcesses', info['P_dir'], info['directory']) info['base_directory'] = basedir - jobs.append(info) - + if nb_split == 1: + jobs.append(info) + else: + for i in range(nb_split): + new_info = dict(info) + new_info['offset'] = i+1 + new_info['directory'] += self.alphabet[i % 26] + str((i+1)//26) + new_info['base_directory'] = info['directory'] + jobs.append(new_info) write_dir = '.' 
if self.readonly else None self.create_ajob(pjoin(self.me_dir, 'SubProcesses', 'refine.sh'), jobs, write_dir) + if self.nprocs > 1: + nprocs_cluster = cluster.MultiCore(nb_core=self.nprocs) + gridpack_start = time.time() + def gridpack_wait_monitoring(Idle, Running, Done): + if Idle+Running+Done == 0: + return + logger.info("Gridpack event generation: %s Idle, %s Running, %s Done [%s]" + % (Idle, Running, Done, misc.format_time(time.time()-gridpack_start))) + done = [] for j in jobs: - if j['P_dir'] in done: - continue - done.append(j['P_dir']) + if self.nprocs == 1: + if j['P_dir'] in done: + continue + done.append(j['P_dir']) + # Give a little status. Sometimes these jobs run very long, and having hours without any + # console output can be a bit frightening and make users think we are looping. + if len(done)%5==0: + logger.info(f"Working on job {len(done)} of {len(jobs)}") + # set the working directory path. pwd = pjoin(os.getcwd(),j['P_dir']) if self.readonly else pjoin(self.me_dir, 'SubProcesses', j['P_dir']) - exe = pjoin(pwd, 'ajob1') + exe = pjoin(pwd, j['script_name']) st = os.stat(exe) os.chmod(exe, st.st_mode | stat.S_IEXEC) # run the code\ - cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + if self.nprocs == 1: + cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + else: + nprocs_cluster.cluster_submit(exe, cwd=pwd, packet_member=j['packet']) write_dir = '.' if self.readonly else pjoin(self.me_dir, 'SubProcesses') + if self.nprocs > 1: + nprocs_cluster.wait(self.me_dir, gridpack_wait_monitoring) + + if self.readonly: + combine_runs.CombineRuns(write_dir) + else: + combine_runs.CombineRuns(self.me_dir) self.check_events(goal_lum, to_refine, jobs, write_dir) def check_events(self, goal_lum, to_refine, jobs, Sdir): diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/hel_recycle.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/hel_recycle.py index 1471de4bcb..978ba6575e 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/hel_recycle.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/hel_recycle.py @@ -550,7 +550,7 @@ def get_jamp_lines(self, line): def get_amp2_lines(self, line): if line.startswith(' DO I = 1, NCOLOR'): self.in_amp2 = False - elif not line.isspace(): + elif not line.isspace() and 'DENOM' not in line: self.template_dict['amp2_lines'] += f'{line[0:6]} {self.add_indices(line[6:])}' def prepare_bools(self): diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/histograms.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/histograms.py index 51ae2914fc..6fbdd98100 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/histograms.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/histograms.py @@ -1149,11 +1149,8 @@ def parse_one_histo_from_stream(self, stream, all_weight_header, boundaries = [0.0,0.0] for j, weight in \ enumerate(HwU.histo_bin_weight_re.finditer(line_bin)): - if (j == len(weight_header)): - continue - if j == len(all_weight_header): - raise HwU.ParseError("There is more bin weights"+\ - " specified than expected (%i)"%len(weight_header)) + #if (j == len(weight_header)): + # continue if selected_central_weight == all_weight_header[j]: bin_weights['central'] = float(weight.group('weight')) if all_weight_header[j] == 'boundary_xmin': @@ -1858,6 +1855,8 @@ def parse_histos_from_PY8_XML_stream(self, stream, run_id=None, # If merging cut is negative, then pick only the one of the central scale # If not specified, then take them all but use the PDF and scale weight # of the central merging_scale for the 
variation. + if not all_weights: + raise MadGraph5Error('No weights were found in the HwU XML source.') if merging_scale is None or merging_scale < 0.0: merging_scale_chosen = all_weights[2]['MERGING'] else: @@ -2480,14 +2479,14 @@ def get_main_central_plot_lines(HwU_name, block_position, color_index, # return [template_no_stat%rep_dic]+\ # ([template%rep_dic] if show_mc_uncertainties else []) - # The use of sqrt(-1) is just a trick to prevent the line to display + # The use of 1/0 is just a trick to prevent the line to display res = [] - rep_dic['data'] = '($3 < 0 ? sqrt(-1) : $3)' + rep_dic['data'] = '($3 < 0 ? 1/0 : $3)' res.append(template_no_stat%rep_dic) rep_dic['title'] = " title ''" if show_mc_uncertainties: res.append(template%rep_dic) - rep_dic['data'] = '($3 >= 0 ? sqrt(-1) : abs($3))' + rep_dic['data'] = '($3 >= 0 ? 1/0 : abs($3))' rep_dic['ls'] = ' ls %d'%(100+color_index) res.append(template_no_stat%rep_dic) if show_mc_uncertainties: @@ -2739,13 +2738,13 @@ def ratio_no_correlations(wgtsA, wgtsB): """#-- rendering subhistograms '%(subhistogram_type)s' %(unset label)s %(set_format_y)s +%(set_yscale)s set yrange [%(ymin).4e:%(ymax).4e] set origin %(origin_x).4e, %(origin_y).4e set size %(size_x).4e, %(size_y).4e set mytics %(mytics)d %(set_ytics)s %(set_format_x)s -%(set_yscale)s %(set_ylabel)s %(set_histo_label)s plot \\""" @@ -2878,7 +2877,7 @@ def ratio_no_correlations(wgtsA, wgtsB): # We decide to show uncertainties in the main plot only if they # are part of a monocolor band. Otherwise, they will only be - # shown in the first subplot. Notice that plotting 'sqrt(-1)' + # shown in the first subplot. Notice that plotting '1/0' # is just a trick so as to have only the key printed with no # line @@ -2890,7 +2889,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, scale variation'%title, band='scale' in use_band) else: uncertainty_plot_lines[-1]['scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] # And now PDF_variation if available if not PDF_var_pos is None and len(PDF_var_pos)>0: if 'pdf' in use_band: @@ -2899,7 +2898,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, PDF variation'%title, band='pdf' in use_band) else: uncertainty_plot_lines[-1]['pdf'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] # And now merging variation if available if not merging_var_pos is None and len(merging_var_pos)>0: if 'merging_scale' in use_band: @@ -2908,7 +2907,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, merging scale variation'%title, band='merging_scale' in use_band) else: uncertainty_plot_lines[-1]['merging_scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] # And now alpsfact variation if available if not alpsfact_var_pos is None and len(alpsfact_var_pos)>0: if 'alpsfact' in use_band: @@ -2917,7 +2916,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, alpsfact variation'%title, band='alpsfact' in use_band) else: uncertainty_plot_lines[-1]['alpsfact'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] # plot_lines.append( # "'%s' index %d using (($1+$2)/2):3 ls %d title '%s'"\ diff --git 
a/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py index 0924927785..262d39a736 100644 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Aug 2023) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. import logging import os @@ -33,7 +33,7 @@ def compile(self, *args, **opts): if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') common_run_interface.CommonRunCmd.update_make_opts_full(path, - {'FPTYPE': self.run_card['floating_type'] }) + {'override FPTYPE': self.run_card['floating_type'] }) misc.sprint('FPTYPE checked') cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): @@ -76,7 +76,7 @@ def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + common_run_interface.CommonRunCmd.update_make_opts_full({'override FPTYPE': new_value}) else: raise Exception Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') @@ -133,7 +133,8 @@ def default_setup(self): super().default_setup() # change default value: self['cudacpp_backend'] = 'cuda' - self['vector_size'] = 16384 # already setup in default class (just change value) + self['vector_size'] = 32 # ZW: default to 32, might want to change to 64 to utilise AMD GPUs better as well # 16384 # already setup in default class (just change value) + self['nb_warp'] = 512 # number of warps per kernel call, for now setting to 16 384 / vector_size MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/lhe_parser.py index f6e47956cd..81d17f7cb1 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/lhe_parser.py @@ -1035,12 +1035,12 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): from_init = True if not from_init: - if group in grouped_cross: - grouped_cross[group] += self.allcross[i] - grouped_error[group] += self.error[i]**2 + if int(group) in grouped_cross: + grouped_cross[int(group)] += self.allcross[i] + grouped_error[int(group)] += self.error[i]**2 else: - grouped_cross[group] = self.allcross[i] - grouped_error[group] = self.error[i]**2 + grouped_cross[int(group)] = self.allcross[i] + grouped_error[int(group)] = self.error[i]**2 else: ban = banner_mod.Banner(ff.banner) for line in ban['init'].split('\n'): @@ -1048,11 +1048,11 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): if len(splitline)==4: cross, error, _, group = splitline if int(group) in grouped_cross: - grouped_cross[group] += float(cross) - grouped_error[group] += float(error)**2 + grouped_cross[int(group)] += float(cross) + grouped_error[int(group)] += float(error)**2 else: - grouped_cross[group] 
= float(cross) - grouped_error[group] = float(error)**2 + grouped_cross[int(group)] = float(cross) + grouped_error[int(group)] = float(error)**2 nb_group = len(grouped_cross) # compute the information for the first line @@ -1086,6 +1086,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): self.seek(0) if init_information["idbmup2"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup2"] = event[1].pdg self.seek(0) @@ -2166,10 +2168,13 @@ def check(self): abspz += abs(particle.pz) # check mass fourmass = FourMomentum(particle).mass - - if particle.mass and (abs(particle.mass) - fourmass)/ abs(particle.mass) > threshold: - raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) - + if particle.mass: + expected = (particle.E - math.sqrt(particle.E**2 -particle.mass**2))/particle.E + if expected > 1e-8: + mass_threshold = particle.E**2 - (particle.E-threshold)**2 + if (abs(particle.mass) - fourmass)/ mass_threshold > 5: + raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) + if E/absE > threshold: logger.critical(self) @@ -2953,8 +2958,8 @@ def pt(self): @property def pseudorapidity(self): - norm = math.sqrt(self.px**2 + self.py**2+self.pz**2) - return 0.5* math.log((norm - self.pz) / (norm + self.pz)) + norm = math.sqrt(self.px**2 + self.py**2 + self.pz**2) + return 0.5* math.log((norm + self.pz) / (norm - self.pz)) @property def rapidity(self): diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/madevent_interface.py index 85e5bcf5e3..4d5597c722 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/madevent_interface.py @@ -1171,10 +1171,10 @@ def check_survey(self, args, cmd='survey'): for opt,value in self._survey_options.items(): if arg.startswith('--%s=' % opt): exec('self.opts[\'%s\'] = %s(arg.split(\'=\')[-1])' % \ - (opt, value[0])) + (opt, value[0]), globals(), {'self':self, 'arg':arg}) arg = "" if arg != "": raise Exception - except Exception: + except Exception as error: self.help_survey() raise self.InvalidCmd('invalid %s argument'% arg) @@ -2827,10 +2827,10 @@ def print_results_in_shell(self, data): logger.info(" Nb of events after matching/merging : %d" % int(data['nb_event_pythia'])) if self.run_card['use_syst'] in self.true and \ (int(self.run_card['ickkw'])==1 or self.run_card['ktdurham']>0.0 - or self.run_card['ptlund']>0.0): + or self.run_card['ptlund']>0.0) and data['cross_pythia'] == -1: logger.info(" Notice that because Systematics computation is turned on, the merging did not veto events but modified their weights instead.\n"+\ " The resulting hepmc/stdhep file should therefore be use with those weights.") - else: + elif data['cross_pythia'] == -1: logger.info(" Nb of events after merging : %s" % data['nb_event_pythia']) logger.info(" " ) @@ -3055,6 +3055,7 @@ def do_multi_run(self, line): crossoversig = 0 inv_sq_err = 0 nb_event = 0 + madspin = False for i in range(nb_run): self.nb_refine = 0 self.exec_cmd('generate_events %s_%s -f' % (main_name, i), postcmd=False) @@ -3067,6 +3068,8 @@ def do_multi_run(self, line): inv_sq_err+=1.0/error**2 self.results[main_name][-1]['cross'] = crossoversig/inv_sq_err self.results[main_name][-1]['error'] = 
math.sqrt(1.0/inv_sq_err) + if 'decayed' in self.run_name: + madspin = True self.results.def_current(main_name) self.run_name = main_name self.update_status("Merging LHE files", level='parton') @@ -3074,9 +3077,12 @@ def do_multi_run(self, line): os.mkdir(pjoin(self.me_dir,'Events', self.run_name)) except Exception: pass - os.system('%(bin)s/merge.pl %(event)s/%(name)s_*/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' + + os.system('%(bin)s/merge.pl %(event)s/%(name)s_*%(madspin)s/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' % {'bin': self.dirbin, 'event': pjoin(self.me_dir,'Events'), - 'name': self.run_name}) + 'name': self.run_name, + 'madspin': '_decayed_*' if madspin else '' + }) eradir = self.options['exrootanalysis_path'] if eradir and misc.is_executable(pjoin(eradir,'ExRootLHEFConverter')): @@ -3656,9 +3662,11 @@ def do_refine(self, line): else: self.refine_mode = "new" - cross, error = self.make_make_all_html_results() + cross, error, across = self.make_make_all_html_results(get_attr=('xsec','xerru','axsec')) + self.results.add_detail('cross', cross) self.results.add_detail('error', error) + self.results.add_detail('axsec', across) self.results.add_detail('run_statistics', dict(self.results.get_detail('run_statistics'))) @@ -3757,6 +3765,8 @@ def split(a, n): k, m = divmod(len(a), n) return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + Gdirs = self.remove_empty_events(Gdirs) + partials_info = [] if len(Gdirs) >= max_G: start_unweight= time.perf_counter() @@ -3786,7 +3796,7 @@ def split(a, n): for i, local_G in enumerate(split(Gdirs, nb_chunk)): line = [pjoin(self.me_dir, "Events", self.run_name, "partials%d.lhe.gz" % i)] line.append(pjoin(self.me_dir, 'Events', self.run_name, '%s_%s_banner.txt' % (self.run_name, tag))) - line.append(str(self.results.current['cross'])) + line.append(str(self.results.current.get('axsec'))) line += local_G partials_info.append(self.do_combine_events_partial(' '.join(line), preprocess_only=True)) mycluster.submit(sys.executable, @@ -4223,7 +4233,7 @@ def mg5amc_py8_interface_consistency_warning(options): return None - def setup_Pythia8RunAndCard(self, PY8_Card, run_type): + def setup_Pythia8RunAndCard(self, PY8_Card, run_type, use_mg5amc_py8_interface): """ Setup the Pythia8 Run environment and card. In particular all the process and run specific parameters of the card are automatically set here. This function returns the path where HEPMC events will be output, if any.""" @@ -4338,10 +4348,10 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.systemSet('Beams:setProductionScalesFromLHEF',True) # Automatically set qWeed to xqcut if not defined by the user. 
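For reference, a minimal sketch of the renaming that PY8Card.write applies (via the interface_to_164 map introduced earlier in this diff) when the card is written for Pythia8's main164 directly rather than through the MG5aMC_PY8_interface; the helper function and the dict subset below are ours, not the plugin's API:

    # subset of the interface_to_164 map shown earlier in this diff
    INTERFACE_TO_164 = {
        'HEPMCoutput:file': 'HepMC:output',
        'LHEFInputs:nSubruns': 'Main:numberOfSubruns',
    }

    def translate_setting(param, value, direct_pythia_input=True):
        """Return the .cmd lines to emit for one old-interface setting."""
        if direct_pythia_input and param in INTERFACE_TO_164:
            new_param = INTERFACE_TO_164[param]
            lines = []
            if new_param == 'HepMC:output':
                # special case: HepMC output needs two flags in main164
                lines.append('Main:HepMC=on')
            lines.append('%s=%s' % (new_param, value))
            return lines
        return ['%s=%s' % (param, value)]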
- if PY8_Card['SysCalc:qWeed']==-1.0: + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qWeed']==-1.0: PY8_Card.MadGraphSet('SysCalc:qWeed',self.run_card['xqcut'], force=True) - if PY8_Card['SysCalc:qCutList']=='auto': + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qCutList']=='auto': if self.run_card['use_syst']: if self.run_card['sys_matchscale']=='auto': qcut = PY8_Card['JetMatching:qCut'] @@ -4368,7 +4378,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): # Specific MLM settings # PY8 should not implement the MLM veto since the driver should do it # if merging scale variation is turned on - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('JetMatching:doVeto',False) @@ -4444,7 +4454,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.MadGraphSet('SpaceShower:pTmaxMatch',1) PY8_Card.MadGraphSet('SpaceShower:rapidityOrder',False) # PY8 should not implement the CKKW veto since the driver should do it. - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('Merging:applyVeto',False) @@ -4516,6 +4526,12 @@ def do_pythia8(self, line): else: no_default = False + if '--old_interface' in args: + use_mg5amc_py8_interface = True + args.remove('--old_interface') + else: + use_mg5amc_py8_interface = False + if not self.run_name: self.check_pythia8(args) self.configure_directory(html_opening =False) @@ -4545,20 +4561,27 @@ def do_pythia8(self, line): #"Please use 'event_norm = average' in the run_card to avoid this problem.") - - if not self.options['mg5amc_py8_interface_path'] or not \ - os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface')): - raise self.InvalidCmd( -"""The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. -Please install this tool with the following MG5_aMC command: - MG5_aMC> install mg5amc_py8_interface_path""") + if use_mg5amc_py8_interface: + if not self.options['mg5amc_py8_interface_path'] or not \ + os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface')): + raise self.InvalidCmd( + """The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. + Please install this tool with the following MG5_aMC command: + MG5_aMC> install mg5amc_py8_interface_path""") + else: + pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface') + warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) + if warnings: + logger.warning(warnings) else: - pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface') - warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) - if warnings: - logger.warning(warnings) + pythia_main = pjoin(self.options['pythia8_path'], 'share', 'Pythia8', 'examples', 'main164') + if not os.path.exists(pythia_main): + pythia_main = pjoin(self.options['pythia8_path'], 'examples', 'main164') + if not os.path.exists(pythia_main): + logger.warning('main164 not found (or not compiled). 
Will try the old interface instead.') + return self.do_pythia8(line + ' --old_interface') self.results.add_detail('run_mode', 'madevent') @@ -4583,14 +4606,19 @@ def do_pythia8(self, line): run_type = 'CKKW' # Edit the card and run environment according to the run specification - HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type) + HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type, use_mg5amc_py8_interface=use_mg5amc_py8_interface) + + if not use_mg5amc_py8_interface and (self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1)): + PY8_Card['Main:numberOfEvents'] = self.run_card['nevents'] + # Now write the card. pythia_cmd_card = pjoin(self.me_dir, 'Events', self.run_name , '%s_pythia8.cmd' % tag) cmd_card = StringIO.StringIO() PY8_Card.write(cmd_card,pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Now setup the preamble to make sure that everything will use the locally # installed tools (if present) even if the user did not add it to its @@ -4632,7 +4660,7 @@ def do_pythia8(self, line): " command '/usr/bin/env %s' exists and returns a valid path."%shell) exe_cmd = "#!%s\n%s"%(shell_exe,' '.join( - [preamble+pythia_main, + [preamble+pythia_main, '' if use_mg5amc_py8_interface else '-c', os.path.basename(pythia_cmd_card)])) wrapper.write(exe_cmd) @@ -4699,6 +4727,7 @@ def do_pythia8(self, line): n_cores = max(min(min_n_core,n_cores),1) if self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + # No need for parallelization anymore self.cluster = None logger.info('Follow Pythia8 shower by running the '+ @@ -4744,20 +4773,22 @@ def do_pythia8(self, line): ParallelPY8Card.subruns[0].systemSet('Beams:LHEF','events.lhe.gz') ParallelPY8Card.write(pjoin(parallelization_dir,'PY8Card.dat'), pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Write the wrapper wrapper_path = pjoin(parallelization_dir,'run_PY8.sh') wrapper = open(wrapper_path,'w') if self.options['cluster_temp_path'] is None: exe_cmd = \ -"""#!%s -./%s PY8Card.dat >& PY8_log.txt -""" +"""#!%%s ./%%s %s PY8Card.dat >& PY8_log.txt """ % ('' if use_mg5amc_py8_interface else '-c') + else: exe_cmd = \ -"""#!%s +"""#!%%s ln -s ./events_$1.lhe.gz ./events.lhe.gz -./%s PY8Card_$1.dat >& PY8_log.txt +./%%s %s PY8Card_$1.dat >& PY8_log.txt mkdir split_$1 if [ -f ./events.hepmc ]; then @@ -4776,7 +4807,7 @@ def do_pythia8(self, line): mv ./PY8_log.txt ./split_$1/ fi tar -czf split_$1.tar.gz split_$1 -""" +""" % ('' if use_mg5amc_py8_interface else '-c') exe_cmd = exe_cmd%(shell_exe,os.path.basename(pythia_main)) wrapper.write(exe_cmd) wrapper.close() @@ -4812,19 +4843,27 @@ def do_pythia8(self, line): pjoin(parallelization_dir,split_files[-1])) logger.info('Submitting Pythia8 jobs...') + for i, split_file in enumerate(split_files): # We must write a PY8Card tailored for each split so as to correct the normalization # HEPMCoutput:scaling of each weight since the lhe showered will no longer contain the # same original number of events - split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat')) + split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat'), setter='user') + assert split_PY8_Card['JetMatching:nJetMax'] == PY8_Card['JetMatching:nJetMax'] + + + # Make sure to 
use the number of split_events determined during the splitting. - split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i]) + split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i], force=True) + assert split_PY8_Card['Main:numberOfEvents'] == partition_for_PY8[i] split_PY8_Card.systemSet('HEPMCoutput:scaling',split_PY8_Card['HEPMCoutput:scaling']* - (float(partition_for_PY8[i]))) + (float(partition_for_PY8[i])), force=True) # Add_missing set to False so as to be sure not to add any additional parameter w.r.t # the ones in the original PY8 param_card copied. split_PY8_Card.write(pjoin(parallelization_dir,'PY8Card_%d.dat'%i), - pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False) + pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False, + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) in_files = [pjoin(parallelization_dir,os.path.basename(pythia_main)), pjoin(parallelization_dir,'PY8Card_%d.dat'%i), pjoin(parallelization_dir,split_file)] @@ -5073,7 +5112,7 @@ def wait_monitoring(Idle, Running, Done): # works both for fixed number of generated events and fixed accepted events self.results.add_detail('error_pythia', error_m) - if self.run_card['use_syst']: + if self.run_card['use_syst'] and use_mg5amc_py8_interface: self.results.add_detail('cross_pythia', -1) self.results.add_detail('error_pythia', 0) @@ -6132,7 +6171,102 @@ def get_Gdir(self, Pdir=None, symfact=None): mfactors[pjoin(P, "G%s" % tag)] = mfactor self.Gdirs = (Gdirs, mfactors) return self.get_Gdir(Pdir, symfact=symfact) + + ############################################################################ + def remove_empty_events(self, Gdir): + """return Gdir stripped of the directories providing empty events.lhe files.""" + + reasons = collections.defaultdict(list) + Gdirs = Gdir[:] + for G in Gdirs[:]: + try: + size = os.path.getsize(pjoin(G, 'events.lhe')) + except Exception as error: + size = 0 + if size < 10: + Gdirs.remove(G) + try: + log = misc.BackRead(pjoin(G, 'log.txt')) + except Exception as error: + log = misc.BackRead(pjoin(G, 'run1_app.log')) + found = -1 + for line in log: + if 'Deleting file events.lhe' in line: + found = 0 + elif "Impossible BW configuration" in line: + reasons['bwconfig'].append(G) + break + elif found < -150: + reasons['not found'].append(G) + Gdirs.append(G) + break + elif found < 0: + found -= 1 + elif 'Loosen cuts or increase max_events' in line: + reasons['cuts'].append(G) + break + elif 'all returned zero' in line: + reasons['zero'].append(G) + break + elif found > 5: + reasons['unknown'].append(G) + break + else: + found += 1 + + if len(reasons): + logger.debug('Reasons for empty events.lhe:') + if len(reasons['unknown']): + logger.debug(' - unknown: %s' % len(reasons['unknown'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['unknown'][:10]])) + if len(reasons['not found']): + logger.debug(' - not found in log: %s' % len(reasons['not found'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['not found'][:10]])) + if len(reasons['zero']): + logger.debug(' - zero amplitudes: %s' % len(reasons['zero'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['zero'][:10]])) + if len(reasons['bwconfig']): + critical_bwconfig = set() + for G in reasons['bwconfig']: + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_bwconfig.add(os.sep.join(base.rsplit(os.sep)[-2:])) + 
for G in critical_bwconfig: + logger.warning('Gdirectory %s has no events.lhe file (no possible BW configuration).' % G) + + logger.debug(' - impossible BW configuration: %s' % len(reasons['bwconfig'])) + logger.debug(' - channel with no possible BW configuration: %s' % len(critical_bwconfig)) + + if len(reasons['cuts']): + critical_nb_cuts = collections.defaultdict(int) + for G in reasons['cuts']: + if '.' in os.path.basename(G): + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_nb_cuts[os.sep.join(base.rsplit(os.sep)[-2:])] += 1 + else: + critical_nb_cuts[''] += 1 + logger.warning('Gdirectory %s has no events.lhe file. (no points passed cuts found)' % G) + for G, nb in critical_nb_cuts.items(): + if not G: + continue + else: + logger.warning('%s channel %s.XXX has no events.lhe file. (no points passed cuts). No %s with events detected' % (nb, G, G)) + logger.debug(' - no points passed cuts: %s' % len(reasons['cuts'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['cuts'][:10]])) + logger.debug(' - without any BW handling (critical): %s' % critical_nb_cuts['']) + logger.debug(' - with BW but all zero (critical): %s' % sum([nb for v, nb in critical_nb_cuts.items() if v!=''], 0)) + #logger.debug(' - cuts (with BW conflict where other channel contributes): %s' % (len(reasons['cuts'])- critical_nb_cuts)) + + return Gdirs + + ############################################################################ def set_run_name(self, name, tag=None, level='parton', reload_card=False, allow_new_tag=True): @@ -6749,7 +6883,7 @@ def get_subP_ids(path): class GridPackCmd(MadEventCmd): """The command for the gridpack --Those are not supposed to be used interactively--""" - def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **stdin): + def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, nprocs=1, maxevts=2500, *completekey, **stdin): """Initialize the command and directly run""" # Initialize properly @@ -6759,6 +6893,8 @@ def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **s self.random = seed self.random_orig = self.random self.granularity = gran + self.nprocs = nprocs + self.maxevts = maxevts self.options['automatic_html_opening'] = False #write the grid_card.dat on disk @@ -6874,7 +7010,7 @@ def launch(self, nb_event, seed): #misc.call([pjoin(self.me_dir,'bin','refine4grid'), # str(nb_event), '0', 'Madevent','1','GridRun_%s' % seed], # cwd=self.me_dir) - self.refine4grid(nb_event) + self.gridpack_cross = self.refine4grid(nb_event) # 3) Combine the events/pythia/... self.exec_cmd('combine_events') @@ -6902,6 +7038,8 @@ def refine4grid(self, nb_event): precision = nb_event + across = self.make_make_all_html_results(get_attr='axsec') + self.opts = dict([(key,value[1]) for (key,value) in \ self._survey_options.items()]) @@ -6915,8 +7053,9 @@ def refine4grid(self, nb_event): self.update_status('Refine results to %s' % precision, level=None) logger.info("Using random number seed offset = %s" % self.random) - refine_opt = {'err_goal': nb_event, 'split_channels': False, - 'ngran':self.granularity, 'readonly': self.readonly} + refine_opt = {'err_goal': nb_event, 'split_channels': True, + 'ngran':self.granularity, 'readonly': self.readonly, + 'nprocs': self.nprocs, 'maxevts': self.maxevts} x_improve = gen_ximprove.gen_ximprove_gridpack(self, refine_opt) x_improve.launch() # create the ajob for the refinement and run those! 
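The gridpack refinement now enables channel splitting ('split_channels': True) so that a channel needing many events is spread over several jobs. A sketch of the splitting arithmetic used in get_job_for_event above; the helper name and the default values (2500 mirrors the new 'maxevts' default of GridPackCmd, max_splitting=100 is illustrative) are ours:

    def compute_nb_split(needed_event, max_request_event=2500,
                         max_splitting=100, split_channels=True):
        """Ceil-divide the requested events into jobs, with a hard cap."""
        if not split_channels:
            return 1
        nb_split = int(max(1, (needed_event - 1) // max_request_event + 1))
        return max(1, min(nb_split, max_splitting))

    assert compute_nb_split(6000) == 3   # ceil(6000 / 2500)
    assert compute_nb_split(100) == 1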
self.gscalefact = x_improve.gscalefact #store jacobian associate to the gridpack @@ -6926,7 +7065,7 @@ def refine4grid(self, nb_event): #print 'run combine!!!' #combine_runs.CombineRuns(self.me_dir) - return + return across #update html output Presults = sum_html.collect_result(self) cross, error = Presults.xsec, Presults.xerru @@ -7051,10 +7190,13 @@ def do_combine_events(self, line): sum_axsec += result.get('axsec')*gscalefact[Gdir] if len(AllEvent) >= 80: #perform a partial unweighting - if self.results.current['cross'] == 0 and self.run_card['gridpack']: - nb_event= self.nb_event + if not self.results.current.get('axsec'): + if self.run_card['gridpack'] and self.gridpack_cross: + nb_event = min(abs(1.05*self.nb_event*sum_axsec/self.gridpack_cross),self.nb_event) + else: + nb_event= self.nb_event else: - nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current['cross']),self.run_card['nevents']) + nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current.get('axsec')),self.run_card['nevents'], self.nb_event, self.gridpack_cross, sum_axsec) AllEvent.unweight(pjoin(outdir, self.run_name, "partials%s.lhe.gz" % partials), get_wgt, log_level=5, trunc_error=1e-2, event_target=nb_event) AllEvent = lhe_parser.MultiEventFile() @@ -7068,6 +7210,7 @@ def do_combine_events(self, line): for data in partials_info: AllEvent.add(*data) + sum_xsec += data[1] if not hasattr(self,'proc_characteristic'): self.proc_characteristic = self.get_characteristics() diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/restore_data b/epochX/cudacpp/pp_tt012j.mad/bin/internal/restore_data index 6205bb9567..407ed7aa91 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/restore_data +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/restore_data @@ -48,8 +48,17 @@ for i in `cat subproc.mg` ; do cd ../ done +# check if we are on a Mac, otherwise assume Linux +if [[ "$OSTYPE" == "darwin"* ]]; then + # no nproc on Mac, so use sysctl instead + # use -S1024 because there is a limit on the length of the command + xargs_opts="-P $(sysctl -n hw.ncpu) -S1024" +else + xargs_opts="-P $(nproc --all)" +fi + find . -mindepth 2 -maxdepth 2 -type d -name 'G*' -print0 \ - | xargs --null -P "$(nproc --all)" -I{} bash -c " + | xargs --null ${xargs_opts} -I{} bash -c " cd {} for j in $1_results.dat ; do if [[ -e \$j ]] ; then diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/sum_html.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/sum_html.py index 9dd5826f71..fb8dd3a74a 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/sum_html.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/sum_html.py @@ -770,7 +770,7 @@ def collect_result(cmd, folder_names=[], jobs=None, main_dir=None): return all -def make_all_html_results(cmd, folder_names = [], jobs=[]): +def make_all_html_results(cmd, folder_names = [], jobs=[], get_attr=None): """ folder_names and jobs have been added for the amcatnlo runs """ run = cmd.results.current['run_name'] if not os.path.exists(pjoin(cmd.me_dir, 'HTML', run)): @@ -794,7 +794,12 @@ def make_all_html_results(cmd, folder_names = [], jobs=[]): fsock.write('%s
<br><br>' % Presults.get_html(run, unit, cmd.me_dir)) fsock.write('%s <br>
' % P_text) - return Presults.xsec, Presults.xerru + if not get_attr: + return Presults.xsec, Presults.xerru + else: + if isinstance(get_attr, tuple): + return [getattr(Presults, _) for _ in get_attr] + return getattr(Presults, get_attr) diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/ufomodel/write_param_card.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/ufomodel/write_param_card.py index 57a85b0614..33a89259f8 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/ufomodel/write_param_card.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/ufomodel/write_param_card.py @@ -116,9 +116,10 @@ def write_param(self, param, lhablock): def write_dep_param_block(self, lhablock): import cmath from parameters import all_parameters + param_values = {'cmath':cmath} for parameter in all_parameters: try: - exec("%s = %s" % (parameter.name, parameter.value)) + exec("%s = %s" % (parameter.name, parameter.value), globals(), param_values) except Exception: pass text = "## Not dependent paramater.\n" @@ -134,7 +135,7 @@ def write_dep_param_block(self, lhablock): prefix = "DECAY " for part, param in data: if isinstance(param.value, str): - value = complex(eval(param.value)).real + value = complex(eval(param.value, globals(), param_values)).real else: value = param.value diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/madevent b/epochX/cudacpp/pp_tt012j.mad/bin/madevent index dff9711b73..9c5363e682 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/madevent +++ b/epochX/cudacpp/pp_tt012j.mad/bin/madevent @@ -178,6 +178,17 @@ force_run = False if (args and args[0] == 'treatcards'): force_run=True + +# check that madgraph is not in PYTHONPATH +try: + import madgraph +except ImportError: + pass +else: + logging.getLogger('madgraph').error('It looks like madgraph is in your PYTHONPATH (or you are running this executable from the main MG5aMC directory). This executable will likely not work in that case.') + + + # Call the cmd interface main loop try: if '-h' in args or '--help' in args: diff --git a/epochX/cudacpp/pp_tt012j.mad/src/HelAmps_sm.h b/epochX/cudacpp/pp_tt012j.mad/src/HelAmps_sm.h index 53dd560ed6..c30f753dcb 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/HelAmps_sm.h @@ -2,13 +2,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Sep 2010) for the MG5aMC backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -860,7 +860,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6] - template + template __device__ INLINE void VVV1_0( const fptype allV1[], const fptype allV2[], @@ -872,7 +872,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ INLINE void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -885,7 +885,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ INLINE void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -897,7 +897,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ INLINE void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -910,7 +910,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ INLINE void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -923,7 +923,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ INLINE void FFV1P0_3( const fptype allF1[], const fptype allF2[], @@ -936,7 +936,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV1_0( const fptype allV1[], const fptype allV2[], @@ -949,7 +949,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -963,7 +963,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV3_0( const fptype allV1[], const fptype allV2[], @@ -976,7 +976,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV3P0_1( const fptype allV2[], const fptype allV3[], @@ -990,7 +990,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV4_0( const fptype allV1[], const fptype 
allV2[], @@ -1003,7 +1003,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV4P0_1( const fptype allV2[], const fptype allV3[], @@ -1017,7 +1017,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6] - template + template __device__ void VVV1_0( const fptype allV1[], const fptype allV2[], @@ -1030,7 +1030,7 @@ namespace mg5amcCpu const cxtype_sv* V1 = W_ACCESS::kernelAccessConst( allV1 ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P1[4] = { +cxreal( V1[0] ), +cxreal( V1[1] ), +cximag( V1[1] ), +cximag( V1[0] ) }; @@ -1053,7 +1053,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -1066,7 +1066,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P2[4] = { +cxreal( V2[0] ), +cxreal( V2[1] ), +cximag( V2[1] ), +cximag( V2[0] ) }; @@ -1091,7 +1091,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -1104,7 +1104,7 @@ namespace mg5amcCpu const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. 
); const cxtype_sv TMP9 = ( F1[2] * ( F2[4] * ( V3[2] + V3[5] ) + F2[5] * ( V3[3] + cI * V3[4] ) ) + ( F1[3] * ( F2[4] * ( V3[3] - cI * V3[4] ) + F2[5] * ( V3[2] - V3[5] ) ) + ( F1[4] * ( F2[2] * ( V3[2] - V3[5] ) - F2[3] * ( V3[3] + cI * V3[4] ) ) + F1[5] * ( F2[2] * ( -V3[3] + cI * V3[4] ) + F2[3] * ( V3[2] + V3[5] ) ) ) ) ); @@ -1116,7 +1116,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -1129,7 +1129,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F1 = W_ACCESS::kernelAccess( allF1 ); const cxtype cI = cxmake( 0., 1. ); F1[0] = +F2[0] + V3[0]; @@ -1148,7 +1148,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -1161,7 +1161,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F2 = W_ACCESS::kernelAccess( allF2 ); const cxtype cI = cxmake( 0., 1. ); F2[0] = +F1[0] + V3[0]; @@ -1180,7 +1180,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ void FFV1P0_3( const fptype allF1[], const fptype allF2[], @@ -1193,7 +1193,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V3 = W_ACCESS::kernelAccess( allV3 ); const cxtype cI = cxmake( 0., 1. ); V3[0] = +F1[0] + F2[0]; @@ -1211,7 +1211,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ void VVVV1_0( const fptype allV1[], const fptype allV2[], @@ -1226,7 +1226,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. 
); const cxtype_sv TMP10 = ( V1[2] * V4[2] - V1[3] * V4[3] - V1[4] * V4[4] - V1[5] * V4[5] ); @@ -1241,7 +1241,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ void VVVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -1256,7 +1256,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); V1[0] = +V2[0] + V3[0] + V4[0]; @@ -1276,7 +1276,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ void VVVV3_0( const fptype allV1[], const fptype allV2[], @@ -1291,7 +1291,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const cxtype_sv TMP1 = ( V2[2] * V1[2] - V2[3] * V1[3] - V2[4] * V1[4] - V2[5] * V1[5] ); @@ -1306,7 +1306,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ void VVVV3P0_1( const fptype allV2[], const fptype allV3[], @@ -1321,7 +1321,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); V1[0] = +V2[0] + V3[0] + V4[0]; @@ -1341,7 +1341,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ void VVVV4_0( const fptype allV1[], const fptype allV2[], @@ -1356,7 +1356,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. 
); const cxtype_sv TMP1 = ( V2[2] * V1[2] - V2[3] * V1[3] - V2[4] * V1[4] - V2[5] * V1[5] ); @@ -1371,7 +1371,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6], V4[6] - template + template __device__ void VVVV4P0_1( const fptype allV2[], const fptype allV3[], @@ -1386,7 +1386,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); V1[0] = +V2[0] + V3[0] + V4[0]; diff --git a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.cc b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.cc index 47a3a011b8..fd5642f3e3 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h index 76066c7bb1..f4b086fc96 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -310,7 +310,7 @@ namespace mg5amcCpu #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #endif // Compute the output couplings (e.g. gc10 and gc11) from the input gs - template<class G_ACCESS, class C_ACCESS> + template<class G_ACCESS, class CD_ACCESS> __device__ inline void G2COUP( const fptype gs[], fptype couplings[], @@ -320,12 +320,12 @@ namespace mg5amcCpu using namespace Parameters_sm_dependentCouplings; const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr ); - fptype* GC_10s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); - fptype* GC_11s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); - fptype* GC_12s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_12 ); - cxtype_sv_ref GC_10s_sv = C_ACCESS::kernelAccess( GC_10s ); - cxtype_sv_ref GC_11s_sv = C_ACCESS::kernelAccess( GC_11s ); - cxtype_sv_ref GC_12s_sv = C_ACCESS::kernelAccess( GC_12s ); + fptype* GC_10s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); + fptype* GC_11s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); + fptype* GC_12s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_12 ); + cxtype_sv_ref GC_10s_sv = CD_ACCESS::kernelAccess( GC_10s ); + cxtype_sv_ref GC_11s_sv = CD_ACCESS::kernelAccess( GC_11s ); + cxtype_sv_ref GC_12s_sv = CD_ACCESS::kernelAccess( GC_12s ); GC_10s_sv = couplings_sv.GC_10; GC_11s_sv = couplings_sv.GC_11; GC_12s_sv = couplings_sv.GC_12; diff --git a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h index 7c6a082392..ca859a602e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose whether cuBLAS and hipBLAS are supported for generating random numbers +// For both CUDA and HIP, by default, assume that BLAS is available, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?)
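
As a quick illustration of the default logic introduced here (a sketch, not part of the patch; the header name is real, the constant hasGpuBlas is invented for this example): a translation unit can probe at compile time whether a BLAS backend was compiled in.

    #include "mgOnGpuConfig.h"
    // MGONGPU_HAS_NO_BLAS is left undefined by default in CUDA (__CUDACC__) and HIP
    // (__HIPCC__) builds, and is always defined in plain C++ builds.
    #ifdef MGONGPU_HAS_NO_BLAS
    constexpr bool hasGpuBlas = false; // C++ build, or -DMGONGPU_HAS_NO_BLAS was passed
    #else
    constexpr bool hasGpuBlas = true; // CUDA/HIP build with cuBLAS/hipBLAS assumed available
    #endif
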
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuCxtypes.h index 92d74fd6db..e98e925f2a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -717,12 +717,24 @@ namespace mg5amcCpu : m_preal( &r ), m_pimag( &i ) {} // copy (create from) const refs cxtype_ref& operator=( const cxtype_ref& ) = delete; //__host__ __device__ cxtype_ref& operator=( cxtype_ref&& c ) {...} // REMOVED! Should copy refs or copy values? 
No longer needed in cxternary - __host__ __device__ cxtype_ref& operator=( const cxtype& c ) + __host__ __device__ cxtype_ref& operator=( const cxtype& c ) // copy (assign) const values { *m_preal = cxreal( c ); *m_pimag = cximag( c ); return *this; - } // copy (assign) non-const values + } + __host__ __device__ cxtype_ref& operator+=( const cxtype& c ) + { + *m_preal += cxreal( c ); + *m_pimag += cximag( c ); + return *this; + } + __host__ __device__ cxtype_ref& operator-=( const cxtype& c ) + { + *m_preal -= cxreal( c ); + *m_pimag -= cximag( c ); + return *this; + } __host__ __device__ operator cxtype() const { return cxmake( *m_preal, *m_pimag ); } private: fptype* const m_preal; // const pointer to non-const fptype R diff --git a/epochX/cudacpp/pp_tt012j.mad/test/cudacpp_test.mk b/epochX/cudacpp/pp_tt012j.mad/test/cudacpp_test.mk index f703a1ae7c..1f9f8bbc46 100644 --- a/epochX/cudacpp/pp_tt012j.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/pp_tt012j.mad/test/cudacpp_test.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) @@ -19,7 +19,7 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt b/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt index 85f434b58f..2ebaacfa09 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt +++ b/epochX/cudacpp/smeft_gg_tttt.mad/CODEGEN_mad_smeft_gg_tttt_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.3 2025-06-12 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -49,14 +49,14 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F set auto_convert_model T save options auto_convert_model -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model SMEFTsim_topU3l_MwScheme_UFO -massless_4t INFO: load particles INFO: load vertices @@ -73,7 +73,7 @@ INFO: load vertices DEBUG: MG5 converter defines FFFF26 to Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjP(-5,1)*ProjP(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjP(-5,3)*ProjP(-3,1) + Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjM(-5,1)*ProjM(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjM(-5,3)*ProjM(-3,1)  DEBUG: MG5 converter defines FFFF27 to ProjP(2,1)*ProjP(4,3) + ProjM(2,1)*ProjM(4,3)  DEBUG: MG5 converter defines FFFF112 to ProjM(2,3)*ProjM(4,1) + ProjP(2,3)*ProjP(4,1)  -DEBUG: model prefixing takes 0.12868547439575195  +DEBUG: model prefixing takes 0.1283280849456787  INFO: Change particles name to pass to MG5 convention Defined multiparticle p = g u c d s u~ c~ d~ s~ Defined multiparticle j = g u c d s u~ c~ d~ s~ @@ -88,21 +88,21 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED+99*SMHLOOP+99*NP+99*NPshifts+99*NPprop+99*NPcpv+NPcbb+NPcbB+NPcbBB+NPcbd1+NPcbd8+NPcbe+NPcbG+NPcbH+NPcbj1+NPcbj8+NPcbl+NPcbu1+NPcbu8+NPcbW+NPcdB+NPcdd1+NPcdd8+NPcdG+NPcdH+NPcdW+NPceB+NPced+NPcee+NPceH+NPceu+NPceW+NPcG+NPcGtil+NPcH+NPcHB+NPcHbox+NPcHbq+NPcHBtil+NPcHd+NPcHDD+NPcHe+NPcHG+NPcHGtil+NPcHj1+NPcHj3+NPcHl1+NPcHl3+NPcHQ1+NPcHQ3+NPcHt+NPcHtb+NPcHu+NPcHud+NPcHW+NPcHWB+NPcHWBtil+NPcHWtil+NPcjd1+NPcjd8+NPcje+NPcjj11+NPcjj18+NPcjj31+NPcjj38+NPcjQbd1+NPcjQbd8+NPcjQtu1+NPcjQtu8+NPcjtQd1+NPcjtQd8+NPcju1+NPcju8+NPcjujd1+NPcjujd11+NPcjujd8+NPcjujd81+NPcjuQb1+NPcjuQb8+NPcld+NPcle+NPclebQ+NPcledj+NPcleju1+NPcleju3+NPcleQt1+NPcleQt3+NPclj1+NPclj3+NPcll+NPcll1+NPclu+NPcQb1+NPcQb8+NPcQd1+NPcQd8+NPcQe+NPcQj11+NPcQj18+NPcQj31+NPcQj38+NPcQl1+NPcQl3+NPcQQ1+NPcQQ8+NPcQt1+NPcQt8+NPcQtjd1+NPcQtjd8+NPcQtQb1+NPcQtQb8+NPcQu1+NPcQu8+NPcQujb1+NPcQujb8+NPctB+NPctb1+NPctb8+NPctd1+NPctd8+NPcte+NPctG+NPctH+NPctj1+NPctj8+NPctl+NPctt+NPctu1+NPctu8+NPctW+NPcuB+NPcud1+NPcud8+NPcuG+NPcuH+NPcutbd1+NPcutbd8+NPcuu1+NPcuu8+NPcuW+NPcW+NPcWtil+NPQjujb8 INFO: Trying process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Process has 72 diagrams -1 processes with 72 diagrams generated in 3.699 s +1 processes with 72 diagrams generated in 3.707 s Total: 1 processes with 72 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_smeft_gg_tttt --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4334]  +DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4166]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  +DEBUG: Entering 
PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  INFO: initialize a new directory: CODEGEN_mad_smeft_gg_tttt INFO: remove old information in CODEGEN_mad_smeft_gg_tttt -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ t t~ @1 @@ -114,25 +114,25 @@ FileWriter t t~ t t~ WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxttx -DEBUG: len(subproc_diagrams_for_config) =  70 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 67: 68, 68: 69, 69: 71, 70: 72} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 68: 67, 69: 68, 71: 69, 72: 70} [model_handling.py at line 1552]  -Generated helas calls for 1 subprocesses (72 diagrams) in 0.189 s -Wrote files for 119 helas calls in 0.388 s +DEBUG: len(subproc_diagrams_for_config) =  70 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 
56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 67: 68, 68: 69, 69: 71, 70: 72} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56, 57: 57, 58: 58, 59: 59, 60: 60, 61: 61, 62: 62, 63: 63, 64: 64, 65: 65, 66: 66, 68: 67, 69: 68, 71: 69, 72: 70} [model_handling.py at line 1665]  +Generated helas calls for 1 subprocesses (72 diagrams) in 0.185 s +Wrote files for 119 helas calls in 0.383 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines -ALOHA: aloha creates 5 routines in 0.319 s +ALOHA: aloha creates 5 routines in 0.309 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines -ALOHA: aloha creates 10 routines in 0.341 s +ALOHA: aloha creates 10 routines in 0.324 s VVV5 VVV5 FFV1 @@ -142,37 +142,37 @@ ALOHA: aloha creates 10 routines in 0.341 s VVVV1 VVVV9 VVVV10 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h -INFO: Created file HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h +INFO: Created file HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc INFO: Created files Parameters_SMEFTsim_topU3l_MwScheme_UFO.h and Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. 
If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt; patch -p4 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file SubProcesses/makefile -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/SubProcesses/P1_gg_ttxttx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/SubProcesses/P1_gg_ttxttx; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #2 succeeded at 275 (offset 48 lines). -DEBUG: p.returncode =  0 [output.py at line 263]  -Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt done. +DEBUG: p.returncode =  0 [output.py at line 268]  +Output to directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt done. Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/README +/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/README Run "open index.html" to see more information about this process. quit -real 0m7.169s -user 0m6.853s -sys 0m0.298s +real 0m7.149s +user 0m6.831s +sys 0m0.290s Code generation completed in 7 seconds ************************************************************ * * @@ -186,7 +186,7 @@ Code generation completed in 7 seconds * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.3 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -194,9 +194,9 @@ Code generation completed in 7 seconds * Type 'help' for in-line help. 
* * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt @@ -216,7 +216,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.3 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -224,9 +224,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_smeft_gg_tttt/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/me5_configuration.txt b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/me5_configuration.txt index 68b4c46295..07d8d59d1b 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/proc_card_mg5.dat index 9bcf8cac8c..1e922c1025 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.6.0 2024-09-30 * +#* VERSION 3.6.3 2025-06-12 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/run_card.dat b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/run_card.dat index 6b82577032..000832aacd 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/run_card.dat +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/run_card.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/run_card_default.dat b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/run_card_default.dat index b8db871c35..85e1d39035 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Cards/run_card_default.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! 
maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/MGMEVersion.txt b/epochX/cudacpp/smeft_gg_tttt.mad/MGMEVersion.txt index 084e244cea..1ac53bb4bd 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/MGMEVersion.txt +++ b/epochX/cudacpp/smeft_gg_tttt.mad/MGMEVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.3 \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Source/.make_opts b/epochX/cudacpp/smeft_gg_tttt.mad/Source/.make_opts index de3864242b..56ba259c56 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Source/.make_opts +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Source/.make_opts @@ -102,6 +102,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf + alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -113,10 +114,11 @@ ifneq ($(lhapdf),) endif else alfas_functions=alfas_functions + alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif # Helper function to check MG5 version define CHECK_MG5AMC_VERSION python -c 'import re; from distutils.version import StrictVersion; print StrictVersion("$(MG5AMC_VERSION)") >= StrictVersion("$(1)") if re.match("^[\d\.]+$$","$(MG5AMC_VERSION)") else True;' -endef \ No newline at end of file +endef diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Source/alfas_functions.f b/epochX/cudacpp/smeft_gg_tttt.mad/Source/alfas_functions.f index bb69a6384e..84aeff369c 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Source/alfas_functions.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Source/alfas_functions.f @@ -188,6 +188,10 @@ SUBROUTINE NEWTON1(T,A_IN,A_OUT,NLOOP,NF) A_OUT=A_IN/(1D0+A_IN*B0(NF)*T) IF (NLOOP .EQ. 1) RETURN + if (1D0+A_IN*B0(NF)*T.le.0d0)THEN + A_OUT = 9d98 + RETURN + ENDIF A_OUT=A_IN/(1D0+B0(NF)*A_IN*T+C1(NF)*A_IN*LOG(1D0+A_IN*B0(NF)*T)) IF (A_OUT .LT. 
0D0) AS=0.3D0 30 AS=A_OUT diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Source/cuts.inc b/epochX/cudacpp/smeft_gg_tttt.mad/Source/cuts.inc index 23d099e5f7..a8ccc7420d 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Source/cuts.inc +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Source/cuts.inc @@ -37,7 +37,7 @@ C REAL*8 misset,missetmax,ptheavy REAL*8 ptllmin,ptllmax integer maxjetflavor - REAl*8 dsqrt_shat + REAl*8 dsqrt_shat,dsqrt_shatmax COMMON /to_min_max_cuts/ & PTJmax,PTBmax,PTAmax,PTLmax, @@ -60,7 +60,7 @@ C & ht2max,ht3max,ht4max, & htjmin,htjmax,ihtmin,ihtmax, & misset,missetmax,ptheavy, - & ptllmin,ptllmax,dsqrt_shat, + & ptllmin,ptllmax,dsqrt_shat,dsqrt_shatmax, & maxjetflavor C diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Source/make_opts b/epochX/cudacpp/smeft_gg_tttt.mad/Source/make_opts index e4b87ee6ad..f10336e42e 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Source/make_opts +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Source/make_opts @@ -103,6 +103,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf +alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -114,6 +115,7 @@ endif endif else alfas_functions=alfas_functions +alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Source/makefile b/epochX/cudacpp/smeft_gg_tttt.mad/Source/makefile index 291ca907ee..87a9e61723 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Source/makefile +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Source/makefile @@ -37,10 +37,12 @@ all: $(LIBRARIES) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDI $(LIBDIR)libdsample.$(libext): $(DSAMPLE) $(call CREATELIB, $@, $^) $(LIBDIR)libgeneric.$(libext): $(GENERIC) + rm -f $@ 2>/dev/null $(call CREATELIB, $@, $^) + rm -f $(alfas_to_clean) 2>/dev/null $(LIBDIR)libdhelas.$(libext): DHELAS cd DHELAS; make; cd .. -$(LIBDIR)libpdf.$(libext): PDF make_opts +$(LIBDIR)libpdf.$(libext): PDF $(alfas_functions).o cd PDF; make; cd .. ifneq (,$(filter edff chff, $(pdlabel1) $(pdlabel2))) $(LIBDIR)libgammaUPC.$(libext): PDF/gammaUPC @@ -73,6 +75,7 @@ $(BINDIR)gensudgrid: $(GENSUDGRID) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUP # Dependencies dsample.o: DiscreteSampler.o dsample.f genps.inc StringCast.o vector.inc +pawgraph.o: vector.inc DiscreteSampler.o: StringCast.o invarients.o: invarients.f genps.inc gen_ximprove.o: gen_ximprove.f run_config.inc run_card.inc diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/Source/run_card.inc b/epochX/cudacpp/smeft_gg_tttt.mad/Source/run_card.inc index 1a1bc782bd..8bd5f73840 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/Source/run_card.inc +++ b/epochX/cudacpp/smeft_gg_tttt.mad/Source/run_card.inc @@ -88,6 +88,8 @@ DSQRT_SHAT = 0.000000000000000D+00 + DSQRT_SHATMAX = -1 + LIMHEL = 0.000000000000000D+00 PTJ = 2.000000000000000D+01 diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/Bridge.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/Bridge.h index 87aa648dd2..a45024704a 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. 
Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -255,11 +255,15 @@ namespace mg5amcCpu throw std::logic_error( "Bridge constructor: FIXME! cannot choose gputhreads" ); // this should never happen! m_gpublocks = m_nevt / m_gputhreads; } +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters @@ -290,8 +294,10 @@ namespace mg5amcCpu throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -347,7 +353,9 @@ namespace mg5amcCpu if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -400,7 +408,9 @@ namespace mg5amcCpu } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
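
The GpuAbstraction.h changes below extend the single-source macro layer that maps gpu* names onto either the CUDA or the HIP runtime. A minimal usage sketch, assuming a CUDA or HIP build (the copyRoundTrip helper is invented here; gpuMalloc, gpuMemcpy and gpuFree are the macros from this header, and checkGpu comes from GpuRuntime.h):

    #include "GpuRuntime.h" // brings in GpuAbstraction.h and the checkGpu error macro
    #ifdef MGONGPUCPP_GPUIMPL
    // Round-trip n doubles through device memory; the same code compiles for CUDA
    // (cudaMalloc/cudaMemcpy/cudaFree) and for HIP (hipMalloc/hipMemcpy/hipFree).
    void copyRoundTrip( const double* hstIn, double* hstOut, size_t n )
    {
      double* dev = nullptr;
      gpuMalloc( &dev, n * sizeof( double ) );
      gpuMemcpy( dev, hstIn, n * sizeof( double ), gpuMemcpyHostToDevice );
      gpuMemcpy( hstOut, dev, n * sizeof( double ), gpuMemcpyDeviceToHost );
      gpuFree( dev );
    }
    #endif
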
#ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + +#include <cassert> //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1,
type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin.
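
The GpuAbstraction.h hunks above build a vendor-neutral BLAS vocabulary on top of the existing gpu* macros: gpuBlas* resolves to cuBLAS names under __CUDACC__ and to hipBLAS names under __HIPCC__, and the gpuBlasT* aliases pick the single- or double-precision entry points according to MGONGPU_FPTYPE2_FLOAT. A hedged sketch of how calling code stays vendor- and precision-neutral (the function, buffer names and matrix shapes are illustrative, not the plugin's actual color-sum layout; fptype2 is the color-algebra floating-point type from mgOnGpuConfig.h, and checkGpuBlas is the helper added in GpuRuntime.h just below; at runtime the plugin only takes this BLAS path when CUDACPP_RUNTIME_BLASCOLORSUM is set, as shown in MatrixElementKernels.cc further down):

  #ifndef MGONGPU_HAS_NO_BLAS
  // One small [ncolor x ncolor] * [ncolor x 1] product per event, batched over
  // nevt events on the stream attached to this handle. In a double-precision
  // build (MGONGPU_FPTYPE2_FLOAT undefined) gpuBlasTgemmStridedBatched expands
  // to cublasDgemmStridedBatched or hipblasDgemmStridedBatched.
  inline void batchedColorProductSketch( gpuBlasHandle_t handle, const fptype2* dColorMatrix,
                                         const fptype2* dJamps, fptype2* dTmp, int ncolor, int nevt )
  {
    const fptype2 alpha = 1, beta = 0;
    checkGpuBlas( gpuBlasTgemmStridedBatched( handle, GPUBLAS_OP_N, GPUBLAS_OP_N,
                                              ncolor, 1, ncolor, &alpha,
                                              dColorMatrix, ncolor, 0, // stride 0: reuse one matrix for all events
                                              dJamps, ncolor, ncolor,  // one jamp vector per event
                                              &beta, dTmp, ncolor, ncolor, nevt ) );
  }
  #endif
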
#ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MGVersion.txt index 084e244cea..1ac53bb4bd 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.3 \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.cc index f463977c1a..a68ae314eb 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
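
The checkGpuBlas/assertGpuBlas pair added to GpuRuntime.h above extends the pre-existing checkGpu/assertGpu pattern to gpuBlasStatus_t return codes: any non-SUCCESS status prints the numeric code with file and line and then aborts via assert. A small usage sketch for GPU builds (makeBlasHandleOnStream is a hypothetical helper written for this note; the plugin performs the equivalent calls inline in MatrixElementKernelDevice::computeGoodHelicities further down in this diff):

  #ifndef MGONGPU_HAS_NO_BLAS
  inline gpuBlasHandle_t makeBlasHandleOnStream( gpuStream_t stream )
  {
    gpuBlasHandle_t handle;
    checkGpuBlas( gpuBlasCreate( &handle ) );           // aborts with file:line if the BLAS runtime fails
    checkGpuBlas( gpuBlasSetStream( handle, stream ) ); // all work on this handle then runs on 'stream'
    return handle;
  }
  #endif
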
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,28 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() + , m_pHelWfs() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_helBlasHandles() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +353,82 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); + // Create the "one-helicity" wavefunction buffer that will be used for helicity filtering + m_pHelWfs.reset( new DeviceBufferSimple( CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? 
+ std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { +#ifndef MGONGPU_HAS_NO_BLAS + if( m_helBlasHandles[ihel] ) gpuBlasDestroy( m_helBlasHandles[ihel] ); +#endif + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +445,61 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. 
Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity + // Attach a different stream to each cuBLAS/hipBLAS handle + if( m_blasColorSum ) + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + checkGpuBlas( gpuBlasCreate( &m_helBlasHandles[ighel] ) ); + checkGpuBlas( gpuBlasSetStream( m_helBlasHandles[ighel], m_helStreams[ighel] ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_helBlasHandles[ighel], CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelWfs.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +507,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* ghelBlasHandles = ( m_blasColorSum ? m_helBlasHandles : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* ghelBlasHandles = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +527,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? 
m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.h index 7acff4b308..c901874333 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include <memory> #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] - static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,24 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelJamps; + + // The super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelWfs; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +220,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of
nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr<DeviceBufferSimple2> m_pHelBlasTmp; + + // The array of cuBLAS/hipBLAS handles (one for each good helicity) + gpuBlasHandle_t m_helBlasHandles[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryAccessAmplitudes.h index 0d92f69c43..a49f041e05 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessAmplitudes_H #define MemoryAccessAmplitudes_H 1 @@ -10,10 +10,6 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_AMPLITUDES 1 - // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -23,120 +19,11 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // A class describing the internal layout of memory buffers for amplitudes - // This implementation uses an AOSOA[npagA][nx2][neppA] where nevt=npagA*neppA - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessAmplitudesBase //_AOSOAv1 - { - public: - - // Number of Events Per Page in the amplitude AOSOA memory buffer layout - static constexpr int neppA = 1; // AOS (just a test...)
- - private: - - friend class MemoryAccessHelper<MemoryAccessAmplitudesBase>; - friend class KernelAccessHelper<MemoryAccessAmplitudesBase, true>; - friend class KernelAccessHelper<MemoryAccessAmplitudesBase, false>; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) - { - const int ipagA = ievt / neppA; // #event "A-page" - const int ieppA = ievt % neppA; // #event in the current event A-page - constexpr int ix2 = 0; - return &( buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA] ); // AOSOA[ipagA][ix2][ieppA] - } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... args" to "const int ix2" and rename "Field" as "Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int ix2 ) - { - constexpr int ipagA = 0; - constexpr int ieppA = 0; - return buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA]; // AOSOA[ipagA][ix2][ieppA] - } - }; - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessAmplitudes : public MemoryAccessAmplitudesBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper<MemoryAccessAmplitudesBase>::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper<MemoryAccessAmplitudesBase>::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2 = MemoryAccessHelper<MemoryAccessAmplitudesBase>::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2Const = - MemoryAccessHelper<MemoryAccessAmplitudesBase>::template decodeRecordConst<int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt,
const int ix2 ) <===] - static constexpr auto ieventAccessIx2 = - MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessField<int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===] - static constexpr auto ieventAccessIx2Const = - MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessFieldConst<int>; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations + // A class providing trivial access to amplitude memory buffers template<bool onDevice> class KernelAccessAmplitudes { public: - -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2 = - KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessField<int>; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2Const = - KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessFieldConst<int>; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { @@ -148,8 +35,6 @@ namespace mg5amcCpu { return reinterpret_cast<const cxtype_sv*>( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES }; //---------------------------------------------------------------------------- diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryAccessCouplings.h index a4f3a481bb..84c20a1f30 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryAccessCouplings.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
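
The MemoryAccessAmplitudes.h deletion above is safe because the removed MemoryAccessAmplitudesBase hardcoded neppA = 1, so its AOSOA decode buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA] was already plain per-event AOS indexing, buffer[ievt * nx2 + ix2]; the surviving trivial reinterpret_cast access is therefore equivalent. A minimal worked check of that equivalence (the scalar typedefs stand in for fptype and cxtype_sv and are illustrative only):

  #include <cassert>
  #include <complex>
  typedef double fptype;
  typedef std::complex<double> cxtype;
  int main()
  {
    constexpr int nx2 = 2;                         // real and imaginary components
    fptype buffer[3 * nx2] = { 1, 2, 3, 4, 5, 6 }; // three events in AOS layout
    const int ievt = 1, ix2 = 0;
    // AOSOA decode with neppA=1 (ipagA=ievt, ieppA=0) ...
    assert( buffer[ievt * nx2 * 1 + ix2 * 1 + 0] == 3 );
    // ... picks the same element as the trivial reinterpret_cast view
    cxtype* amps = reinterpret_cast<cxtype*>( buffer );
    assert( amps[ievt] == cxtype( 3, 4 ) );
    return 0;
  }
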
#ifndef MemoryAccessCouplings_H #define MemoryAccessCouplings_H 1 @@ -235,7 +235,7 @@ namespace mg5amcCpu /* fptype_sv& real = kernelAccessIx2( buffer, 0 ); fptype_sv& imag = kernelAccessIx2( buffer, 1 ); - printf( "C_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv_ref( real, imag ); */ return cxtype_sv_ref( kernelAccessIx2( buffer, 0 ), @@ -250,7 +250,7 @@ namespace mg5amcCpu /* const fptype_sv& real = kernelAccessIx2Const( buffer, 0 ); const fptype_sv& imag = kernelAccessIx2Const( buffer, 1 ); - printf( "C_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv( real, imag ); */ return cxtype_sv( kernelAccessIx2Const( buffer, 0 ), diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryAccessWavefunctions.h index 9f4c620bc7..bbffc1fb36 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessWavefunctions_H #define MemoryAccessWavefunctions_H 1 @@ -10,9 +10,7 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 +#include "CPPProcess.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL @@ -23,147 +21,44 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // A class describing the internal layout of memory buffers for wavefunctions - // This implementation uses an AOSOA[npagW][nw6][nx2][neppW] where nevt=npagW*neppW - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessWavefunctionsBase //_AOSOAv1 +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessWavefunctions { public: - - // Number of Events Per Page in the wavefunction AOSOA memory buffer layout - static constexpr int neppW = 1; // AOS (just a test...) 
- - private: - - friend class MemoryAccessHelper<MemoryAccessWavefunctionsBase>; - friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, true>; - friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, false>; - - // The number of components of a (fermion or vector) wavefunction - static constexpr int nw6 = mgOnGpu::nw6; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) + static __host__ __device__ inline cxtype_sv* + kernelAccess( fptype* buffer ) { - const int ipagW = ievt / neppW; // #event "W-page" - const int ieppW = ievt % neppW; // #event in the current event W-page - constexpr int iw6 = 0; - constexpr int ix2 = 0; - return &( buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW] ); // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast<cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts...
args" to "const int iw6, const int ix2" and rename "Field" as "Iw6Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int iw6, - const int ix2 ) + static __host__ __device__ inline const cxtype_sv* + kernelAccessConst( const fptype* buffer ) { - constexpr int ipagW = 0; - constexpr int ieppW = 0; - return buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW]; // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast<const cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } }; +#endif //---------------------------------------------------------------------------- - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessWavefunctions : public MemoryAccessWavefunctionsBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2 = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2Const = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template decodeRecordConst<int, int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIw6Ix2( fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2 = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessField<int, int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIw6Ix2Const( const fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2Const = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessFieldConst<int, int>; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations - template<bool onDevice> - class KernelAccessWavefunctions + class HostAccessWavefunctions { public: - -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes
(input) - // [Signature (non-const) ===> fptype& kernelAccessIw6Ix2( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2 = - KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessField<int, int>; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIw6Ix2Const( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2Const = - KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessFieldConst<int, int>; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { return reinterpret_cast<cxtype_sv*>( buffer ); } - static __host__ __device__ inline const cxtype_sv* kernelAccessConst( const fptype* buffer ) { return reinterpret_cast<const cxtype_sv*>( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS }; //---------------------------------------------------------------------------- - typedef KernelAccessWavefunctions<false> HostAccessWavefunctions; - typedef KernelAccessWavefunctions<true> DeviceAccessWavefunctions; - - //---------------------------------------------------------------------------- - } // end namespace mg5amcGpu/mg5amcCpu #endif // MemoryAccessWavefunctions_H diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryBuffers.h index 2f711d8cc1..7f3a4e3dca 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
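
The wavefunction access above goes the opposite way from the amplitudes: on the device it becomes non-trivial, because with the per-helicity kernel splitting each thread now addresses its own slice of a global wavefunction buffer at offset ievt * CPPProcess::nw6 * mgOnGpu::nx2. A sizing sketch for the buffers this implies (the numbers are illustrative: nwf = 18 is the value quoted for this P1_gg_ttxttx subprocess later in this diff, nw6 = 6 and nx2 = 2 as usual, and the nGoodHel factor matches the m_pHelWfs super-buffer allocated in MatrixElementKernels.cc above):

  #include <cstddef>
  #include <iostream>
  int main()
  {
    const std::size_t nwf = 18, nw6 = 6, nx2 = 2;      // 216 fptype values per event per helicity
    const std::size_t nevt = 16384, nGoodHel = 64;     // illustrative grid size and helicity count
    const std::size_t oneHel = nwf * nw6 * nx2 * nevt; // the "one-helicity" buffer used for helicity filtering
    std::cout << "one-helicity wfs buffer:    " << oneHel << " fptype elements\n";
    std::cout << "many-helicity super-buffer: " << nGoodHel * oneHel << " fptype elements\n";
    return 0;
  }
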
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_SMEFTsim_topU3l_MwScheme_UFO_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase<T>( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase<T>( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template<typename T, size_t sizePerEvent> - class DeviceBuffer : public DeviceBufferBase<T>, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase<T>, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase<T>( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase<T>( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer<fptype, 1> DeviceBufferSimple; + typedef DeviceBuffer<fptype2, 1> DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase<fptype> BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer<fptype, sizePerEventNumerators> HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer<fptype, sizePerEventNumerators> PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer<fptype, sizePerEventNumerators> DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer<fptype, sizePerEventDenominators> HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators
typedef PinnedHostBuffer<fptype, sizePerEventDenominators> PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer<fptype, sizePerEventDenominators> DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer<fptype, sizePerEventCouplings> HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer<fptype, sizePerEventCouplings> PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer<fptype, sizePerEventCouplings> DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer<fptype, sizePerEventJamps> DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template<class Tdst, class Tsrc> void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.cc b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.cc index 96d77e5403..42eaa96778 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.cc +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,20 +98,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_SMEFTsim_topU3l_MwScheme_UFO_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_SMEFTsim_topU3l_MwScheme_UFO_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 12; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,57 +166,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer 
includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a 
function definition { #ifdef MGONGPUCPP_GPUIMPL - using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -226,1333 +279,277 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 18; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
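// ------------------------------------------------------------------------------------
// [Editor's note] The WAVEFUNCTION BUFFERS split introduced here is the heart of the
// kernel splitting: in C++ the wavefunctions can stay in a small local array (trivial
// access, one event page at a time), while in CUDA they must live in a global-memory
// buffer (allWfs) that all per-diagram kernels share. Below is a minimal standalone
// sketch of the structure-of-arrays (SoA) indexing convention assumed by such shared
// buffers (compare DeviceAccessJamp2::kernelAccessIcol above, which indexes
// buffer[icol * nevt + ievt]); the nevt and ncol values are illustrative only:

#include <cassert>
#include <vector>
int main()
{
  const int nevt = 8; // illustrative: number of events (one GPU thread per event)
  const int ncol = 3; // illustrative: number of color flows (or wavefunction slots)
  std::vector<double> buffer( ncol * nevt );
  // SoA fill: for each icol, all events are contiguous in memory
  for( int icol = 0; icol < ncol; icol++ )
    for( int ievt = 0; ievt < nevt; ievt++ )
      buffer[icol * nevt + ievt] = 100. * icol + ievt;
  // For a fixed icol, consecutive ievt sit at adjacent addresses: on a GPU, a warp of
  // consecutive threads (consecutive ievt) therefore issues a single coalesced load
  assert( &buffer[0 * nevt + 1] == &buffer[0 * nevt + 0] + 1 );
  return 0;
}

// This is why the CUDA branch trades the old per-event local buffers for event-major
// global arrays: each diagram kernel indexes them with its own (blockIdx, threadIdx).
// ------------------------------------------------------------------------------------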
-    //MemoryBufferWavefunctions w_buffer[nwf]{ neppV };
+    // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code
     cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles)
-    cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram
-
-    // Proof of concept for using fptype* in the interface
-    fptype* w_fp[nwf];
-    for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] );
-    fptype* amp_fp;
-    amp_fp = reinterpret_cast<fptype*>( amp_sv );
-
-    // Local variables for the given CUDA event (ievt) or C++ event page (ipagV)
-    // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination]
-    cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!)
+    fptype* wfs = reinterpret_cast<fptype*>( w_sv );
+#else
+    // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt)
+    // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting
+    fptype* wfs = allWfs;
+#endif

     // === Calculate wavefunctions and amplitudes for all diagrams in all processes ===
     // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++) ===
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-    // Mixed fptypes #537: float for color algebra and double elsewhere
-    // Delay color algebra and ME updates (only on even pages)
-    cxtype_sv jamp_sv_previous[ncolor] = {};
-    fptype* MEs_previous = 0;
-#endif
+
+    // *****************************
+    // *** START LOOP ON IPARITY ***
+    // *****************************
     for( int iParity = 0; iParity < nParity; ++iParity )
-    { // START LOOP ON IPARITY
+    {
 #ifndef MGONGPUCPP_GPUIMPL
       const int ievt0 = ievt00 + iParity * neppV;
 #endif
-      //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823)
-      constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823)
-      const fptype* allCOUPs[nxcoup];
-#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
-#pragma nv_diagnostic push
-#pragma nv_diag_suppress 186 // e.g.
<> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 72 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][4], +1, w_fp[4], 4 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][5], -1, w_fp[5], 5 ); - - VVV5P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[6] ); - FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[7] ); - FFV1_1( w_fp[4], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( 
w_fp[5], w_fp[8], w_fp[7], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 2 OF 72 *** - - // Wavefunction(s) for diagram number 2 - FFV1_2( w_fp[5], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[9], w_fp[4], w_fp[7], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 3 OF 72 *** - - // Wavefunction(s) for diagram number 3 - FFV1P0_3( w_fp[5], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 3 - VVV5_0( w_fp[6], w_fp[7], w_fp[10], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += 1. / 2. * amp_sv[0]; - jamp_sv[2] -= 1. / 2. * amp_sv[0]; - jamp_sv[9] -= 1. / 2. * amp_sv[0]; - jamp_sv[10] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 4 OF 72 *** - - // Wavefunction(s) for diagram number 4 - FFV1P0_3( w_fp[5], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[11] ); - FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); - - // Amplitude(s) for diagram number 4 - FFV1_0( w_fp[12], w_fp[4], w_fp[11], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 5 OF 72 *** - - // Wavefunction(s) for diagram number 5 - // (none) - - // Amplitude(s) for diagram number 5 - FFV1_0( w_fp[3], w_fp[8], w_fp[11], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 6 OF 72 *** - - // Wavefunction(s) for diagram number 6 - FFV1P0_3( w_fp[3], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 6 - VVV5_0( w_fp[6], w_fp[11], w_fp[8], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[3] += 1. / 2. * amp_sv[0]; - jamp_sv[8] += 1. / 2. * amp_sv[0]; - jamp_sv[11] -= 1. / 2. 
* amp_sv[0]; - - // *** DIAGRAM 7 OF 72 *** - - // Wavefunction(s) for diagram number 7 - FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); - - // Amplitude(s) for diagram number 7 - FFV1_0( w_fp[5], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 8 OF 72 *** - - // Wavefunction(s) for diagram number 8 - // (none) - - // Amplitude(s) for diagram number 8 - FFV1_0( w_fp[9], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 9 OF 72 *** - - // Wavefunction(s) for diagram number 9 - // (none) - - // Amplitude(s) for diagram number 9 - FFV1_0( w_fp[3], w_fp[13], w_fp[10], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 10 OF 72 *** - - // Wavefunction(s) for diagram number 10 - // (none) - - // Amplitude(s) for diagram number 10 - FFV1_0( w_fp[12], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 11 OF 72 *** - - // Wavefunction(s) for diagram number 11 - FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); - FFV1_2( w_fp[3], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); - FFV1P0_3( w_fp[5], w_fp[12], COUPs[1], 1.0, 0., 0., w_fp[9] ); - - // Amplitude(s) for diagram number 11 - FFV1_0( w_fp[13], w_fp[4], w_fp[9], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[5] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 12 OF 72 *** - - // Wavefunction(s) for diagram number 12 - // (none) - - // Amplitude(s) for diagram number 12 - FFV1_0( w_fp[13], w_fp[12], w_fp[10], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 6. * amp_sv[0]; - jamp_sv[5] += 1. / 2. 
* amp_sv[0]; - - // *** DIAGRAM 13 OF 72 *** - - // Wavefunction(s) for diagram number 13 - FFV1_1( w_fp[4], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[6] ); - FFV1P0_3( w_fp[3], w_fp[12], COUPs[1], 1.0, 0., 0., w_fp[14] ); - - // Amplitude(s) for diagram number 13 - FFV1_0( w_fp[5], w_fp[6], w_fp[14], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] -= 1. / 6. * amp_sv[0]; - jamp_sv[5] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 14 OF 72 *** - - // Wavefunction(s) for diagram number 14 - // (none) - - // Amplitude(s) for diagram number 14 - FFV1_0( w_fp[3], w_fp[6], w_fp[9], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] -= 1. / 2. * amp_sv[0]; - jamp_sv[5] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 15 OF 72 *** - - // Wavefunction(s) for diagram number 15 - FFV1_2( w_fp[5], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[15] ); - - // Amplitude(s) for diagram number 15 - FFV1_0( w_fp[15], w_fp[4], w_fp[14], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += 1. / 2. * amp_sv[0]; - jamp_sv[4] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 16 OF 72 *** - - // Wavefunction(s) for diagram number 16 - // (none) - - // Amplitude(s) for diagram number 16 - FFV1_0( w_fp[15], w_fp[12], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += 1. / 6. * amp_sv[0]; - jamp_sv[4] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 17 OF 72 *** - - // Wavefunction(s) for diagram number 17 - FFV1_1( w_fp[12], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); - - // Amplitude(s) for diagram number 17 - FFV1_0( w_fp[5], w_fp[16], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 17 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[1] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 18 OF 72 *** - - // Wavefunction(s) for diagram number 18 - // (none) - - // Amplitude(s) for diagram number 18 - VVV5_0( w_fp[1], w_fp[8], w_fp[9], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 19 OF 72 *** - - // Wavefunction(s) for diagram number 19 - // (none) - - // Amplitude(s) for diagram number 19 - FFV1_0( w_fp[3], w_fp[16], w_fp[10], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 6. * amp_sv[0]; - jamp_sv[1] += 1. / 2. 
* amp_sv[0]; - - // *** DIAGRAM 20 OF 72 *** - - // Wavefunction(s) for diagram number 20 - // (none) - - // Amplitude(s) for diagram number 20 - VVV5_0( w_fp[1], w_fp[10], w_fp[14], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 21 OF 72 *** - - // Wavefunction(s) for diagram number 21 - FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] ); - FFV1_1( w_fp[2], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); - FFV1P0_3( w_fp[14], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[9] ); - - // Amplitude(s) for diagram number 21 - FFV1_0( w_fp[5], w_fp[16], w_fp[9], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 21 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[6] += 1. / 6. * amp_sv[0]; - jamp_sv[8] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 22 OF 72 *** - - // Wavefunction(s) for diagram number 22 - // (none) - - // Amplitude(s) for diagram number 22 - FFV1_0( w_fp[14], w_fp[16], w_fp[10], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[6] += 1. / 2. * amp_sv[0]; - jamp_sv[8] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 23 OF 72 *** - - // Wavefunction(s) for diagram number 23 - FFV1P0_3( w_fp[14], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[12] ); - - // Amplitude(s) for diagram number 23 - FFV1_0( w_fp[5], w_fp[6], w_fp[12], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] -= 1. / 6. * amp_sv[0]; - jamp_sv[10] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 24 OF 72 *** - - // Wavefunction(s) for diagram number 24 - // (none) - - // Amplitude(s) for diagram number 24 - FFV1_0( w_fp[14], w_fp[6], w_fp[11], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] -= 1. / 2. * amp_sv[0]; - jamp_sv[10] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 25 OF 72 *** - - // Wavefunction(s) for diagram number 25 - // (none) - - // Amplitude(s) for diagram number 25 - FFV1_0( w_fp[15], w_fp[4], w_fp[12], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 25 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] -= 1. / 6. * amp_sv[0]; - jamp_sv[6] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 26 OF 72 *** - - // Wavefunction(s) for diagram number 26 - // (none) - - // Amplitude(s) for diagram number 26 - FFV1_0( w_fp[15], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] -= 1. / 2. * amp_sv[0]; - jamp_sv[6] += 1. / 6. 
* amp_sv[0]; - - // *** DIAGRAM 27 OF 72 *** - - // Wavefunction(s) for diagram number 27 - FFV1_2( w_fp[14], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[17] ); - - // Amplitude(s) for diagram number 27 - FFV1_0( w_fp[17], w_fp[4], w_fp[11], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[8] -= 1. / 2. * amp_sv[0]; - jamp_sv[10] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 28 OF 72 *** - - // Wavefunction(s) for diagram number 28 - // (none) - - // Amplitude(s) for diagram number 28 - VVV5_0( w_fp[1], w_fp[11], w_fp[9], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 29 OF 72 *** - - // Wavefunction(s) for diagram number 29 - // (none) - - // Amplitude(s) for diagram number 29 - FFV1_0( w_fp[17], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[8] -= 1. / 6. * amp_sv[0]; - jamp_sv[10] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 30 OF 72 *** - - // Wavefunction(s) for diagram number 30 - // (none) - - // Amplitude(s) for diagram number 30 - VVV5_0( w_fp[1], w_fp[10], w_fp[12], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[6] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 31 OF 72 *** - - // Wavefunction(s) for diagram number 31 - FFV1_1( w_fp[4], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); - FFV1P0_3( w_fp[3], w_fp[12], COUPs[1], 1.0, 0., 0., w_fp[17] ); - - // Amplitude(s) for diagram number 31 - FFV1_0( w_fp[5], w_fp[16], w_fp[17], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[6] += 1. / 6. * amp_sv[0]; - jamp_sv[7] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 32 OF 72 *** - - // Wavefunction(s) for diagram number 32 - FFV1P0_3( w_fp[5], w_fp[12], COUPs[1], 1.0, 0., 0., w_fp[9] ); - - // Amplitude(s) for diagram number 32 - FFV1_0( w_fp[3], w_fp[16], w_fp[9], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 32 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[6] += 1. / 2. * amp_sv[0]; - jamp_sv[7] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 33 OF 72 *** - - // Wavefunction(s) for diagram number 33 - // (none) - - // Amplitude(s) for diagram number 33 - FFV1_0( w_fp[13], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] += 1. / 2. * amp_sv[0]; - jamp_sv[7] -= 1. / 6. 
* amp_sv[0]; - - // *** DIAGRAM 34 OF 72 *** - - // Wavefunction(s) for diagram number 34 - // (none) - - // Amplitude(s) for diagram number 34 - FFV1_0( w_fp[13], w_fp[12], w_fp[11], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 34 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] += 1. / 6. * amp_sv[0]; - jamp_sv[7] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 35 OF 72 *** - - // Wavefunction(s) for diagram number 35 - // (none) - - // Amplitude(s) for diagram number 35 - FFV1_0( w_fp[15], w_fp[2], w_fp[17], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] -= 1. / 2. * amp_sv[0]; - jamp_sv[6] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 36 OF 72 *** - - // Wavefunction(s) for diagram number 36 - // (none) - - // Amplitude(s) for diagram number 36 - FFV1_0( w_fp[15], w_fp[12], w_fp[7], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] -= 1. / 6. * amp_sv[0]; - jamp_sv[6] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 37 OF 72 *** - - // Wavefunction(s) for diagram number 37 - FFV1_1( w_fp[12], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] ); - - // Amplitude(s) for diagram number 37 - FFV1_0( w_fp[5], w_fp[14], w_fp[7], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 37 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] += 1. / 2. * amp_sv[0]; - jamp_sv[3] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 38 OF 72 *** - - // Wavefunction(s) for diagram number 38 - // (none) - - // Amplitude(s) for diagram number 38 - VVV5_0( w_fp[1], w_fp[7], w_fp[9], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 38 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 39 OF 72 *** - - // Wavefunction(s) for diagram number 39 - // (none) - - // Amplitude(s) for diagram number 39 - FFV1_0( w_fp[3], w_fp[14], w_fp[11], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 39 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] += 1. / 6. * amp_sv[0]; - jamp_sv[3] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 40 OF 72 *** - - // Wavefunction(s) for diagram number 40 - // (none) - - // Amplitude(s) for diagram number 40 - VVV5_0( w_fp[1], w_fp[11], w_fp[17], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 40 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 41 OF 72 *** - - // Wavefunction(s) for diagram number 41 - FFV1_2( w_fp[5], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[17] ); - FFV1P0_3( w_fp[17], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[14] ); - - // Amplitude(s) for diagram number 41 - FFV1_0( w_fp[3], w_fp[16], w_fp[14], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 41 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[7] -= 1. / 6. * amp_sv[0]; - jamp_sv[9] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 42 OF 72 *** - - // Wavefunction(s) for diagram number 42 - // (none) - - // Amplitude(s) for diagram number 42 - FFV1_0( w_fp[17], w_fp[16], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 42 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[7] -= 1. / 2. * amp_sv[0]; - jamp_sv[9] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 43 OF 72 *** - - // Wavefunction(s) for diagram number 43 - FFV1P0_3( w_fp[17], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[9] ); - - // Amplitude(s) for diagram number 43 - FFV1_0( w_fp[13], w_fp[4], w_fp[9], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 43 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[5] += 1. / 6. * amp_sv[0]; - jamp_sv[7] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 44 OF 72 *** - - // Wavefunction(s) for diagram number 44 - // (none) - - // Amplitude(s) for diagram number 44 - FFV1_0( w_fp[13], w_fp[2], w_fp[14], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 44 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[5] += 1. / 2. * amp_sv[0]; - jamp_sv[7] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 45 OF 72 *** - - // Wavefunction(s) for diagram number 45 - // (none) - - // Amplitude(s) for diagram number 45 - FFV1_0( w_fp[3], w_fp[6], w_fp[9], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 45 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[5] += 1. / 6. * amp_sv[0]; - jamp_sv[11] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 46 OF 72 *** - - // Wavefunction(s) for diagram number 46 - // (none) - - // Amplitude(s) for diagram number 46 - FFV1_0( w_fp[17], w_fp[6], w_fp[7], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 46 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[5] += 1. / 2. * amp_sv[0]; - jamp_sv[11] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 47 OF 72 *** - - // Wavefunction(s) for diagram number 47 - FFV1_2( w_fp[17], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); - - // Amplitude(s) for diagram number 47 - FFV1_0( w_fp[12], w_fp[4], w_fp[7], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 47 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[9] += 1. / 2. * amp_sv[0]; - jamp_sv[11] -= 1. / 6. 
* amp_sv[0]; - - // *** DIAGRAM 48 OF 72 *** - - // Wavefunction(s) for diagram number 48 - // (none) - - // Amplitude(s) for diagram number 48 - VVV5_0( w_fp[1], w_fp[7], w_fp[14], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 48 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[5] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 49 OF 72 *** - - // Wavefunction(s) for diagram number 49 - // (none) - - // Amplitude(s) for diagram number 49 - FFV1_0( w_fp[12], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 49 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[9] += 1. / 6. * amp_sv[0]; - jamp_sv[11] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 50 OF 72 *** - - // Wavefunction(s) for diagram number 50 - // (none) - - // Amplitude(s) for diagram number 50 - VVV5_0( w_fp[1], w_fp[8], w_fp[9], COUPs[0], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 50 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[7] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 51 OF 72 *** - - // Wavefunction(s) for diagram number 51 - FFV1_1( w_fp[16], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); - - // Amplitude(s) for diagram number 51 - FFV1_0( w_fp[5], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 51 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[8] -= 1. / 2. * amp_sv[0]; - jamp_sv[9] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 52 OF 72 *** - - // Wavefunction(s) for diagram number 52 - VVV5P0_1( w_fp[0], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[12] ); - - // Amplitude(s) for diagram number 52 - FFV1_0( w_fp[5], w_fp[16], w_fp[12], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 52 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[7] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 53 OF 72 *** - - // Wavefunction(s) for diagram number 53 - // (none) - - // Amplitude(s) for diagram number 53 - FFV1_0( w_fp[3], w_fp[9], w_fp[10], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 53 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[8] -= 1. / 6. * amp_sv[0]; - jamp_sv[9] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 54 OF 72 *** - - // Wavefunction(s) for diagram number 54 - VVV5P0_1( w_fp[0], w_fp[10], COUPs[0], 1.0, 0., 0., w_fp[9] ); - - // Amplitude(s) for diagram number 54 - FFV1_0( w_fp[3], w_fp[16], w_fp[9], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 54 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[6] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 55 OF 72 *** - - // Wavefunction(s) for diagram number 55 - FFV1_2( w_fp[13], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); - - // Amplitude(s) for diagram number 55 - FFV1_0( w_fp[16], w_fp[4], w_fp[11], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 55 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[2] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 56 OF 72 *** - - // Wavefunction(s) for diagram number 56 - VVV5P0_1( w_fp[0], w_fp[11], COUPs[0], 1.0, 0., 0., w_fp[14] ); - - // Amplitude(s) for diagram number 56 - FFV1_0( w_fp[13], w_fp[4], w_fp[14], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 56 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 57 OF 72 *** - - // Wavefunction(s) for diagram number 57 - // (none) - - // Amplitude(s) for diagram number 57 - FFV1_0( w_fp[16], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 57 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[0] -= 1. / 6. * amp_sv[0]; - jamp_sv[2] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 58 OF 72 *** - - // Wavefunction(s) for diagram number 58 - // (none) - - // Amplitude(s) for diagram number 58 - FFV1_0( w_fp[13], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 58 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 59 OF 72 *** - - // Wavefunction(s) for diagram number 59 - FFV1_1( w_fp[6], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); - - // Amplitude(s) for diagram number 59 - FFV1_0( w_fp[5], w_fp[13], w_fp[7], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 59 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[10] += 1. / 2. * amp_sv[0]; - jamp_sv[11] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 60 OF 72 *** - - // Wavefunction(s) for diagram number 60 - VVV5P0_1( w_fp[0], w_fp[7], COUPs[0], 1.0, 0., 0., w_fp[16] ); - - // Amplitude(s) for diagram number 60 - FFV1_0( w_fp[5], w_fp[6], w_fp[16], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 60 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[5] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 61 OF 72 *** - - // Wavefunction(s) for diagram number 61 - // (none) - - // Amplitude(s) for diagram number 61 - FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 61 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[10] += 1. / 6. * amp_sv[0]; - jamp_sv[11] -= 1. / 2. 
* amp_sv[0]; - - // *** DIAGRAM 62 OF 72 *** - - // Wavefunction(s) for diagram number 62 - // (none) - - // Amplitude(s) for diagram number 62 - FFV1_0( w_fp[3], w_fp[6], w_fp[14], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 62 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[4] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 63 OF 72 *** - - // Wavefunction(s) for diagram number 63 - FFV1_2( w_fp[15], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[6] ); - - // Amplitude(s) for diagram number 63 - FFV1_0( w_fp[6], w_fp[4], w_fp[7], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 63 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += 1. / 2. * amp_sv[0]; - jamp_sv[3] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 64 OF 72 *** - - // Wavefunction(s) for diagram number 64 - // (none) - - // Amplitude(s) for diagram number 64 - FFV1_0( w_fp[15], w_fp[4], w_fp[16], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 64 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 65 OF 72 *** - - // Wavefunction(s) for diagram number 65 - // (none) - - // Amplitude(s) for diagram number 65 - FFV1_0( w_fp[6], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 65 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] += 1. / 6. * amp_sv[0]; - jamp_sv[3] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 66 OF 72 *** - - // Wavefunction(s) for diagram number 66 - // (none) - - // Amplitude(s) for diagram number 66 - FFV1_0( w_fp[15], w_fp[2], w_fp[12], COUPs[1], 1.0, &_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 66 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 67 OF 72 *** - - // Wavefunction(s) for diagram number 67 - // (none) - - // Amplitude(s) for diagram number 67 - VVVV1_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[1] += 1. / 2. * amp_sv[0]; - jamp_sv[2] -= 1. / 2. * amp_sv[0]; - jamp_sv[9] -= 1. / 2. * amp_sv[0]; - jamp_sv[10] += 1. / 2. * amp_sv[0]; - VVVV9_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[1] += 1. / 2. * amp_sv[0]; - jamp_sv[5] -= 1. / 2. * amp_sv[0]; - jamp_sv[6] -= 1. / 2. * amp_sv[0]; - jamp_sv[10] += 1. / 2. * amp_sv[0]; - VVVV10_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &_fp[0] ); - jamp_sv[2] += 1. / 2. * amp_sv[0]; - jamp_sv[5] -= 1. / 2. * amp_sv[0]; - jamp_sv[6] -= 1. / 2. * amp_sv[0]; - jamp_sv[9] += 1. / 2. 
* amp_sv[0];
-
-    // *** DIAGRAM 68 OF 72 ***
-
-    // Wavefunction(s) for diagram number 68
-    // (none)
-
-    // Amplitude(s) for diagram number 68
-    VVV5_0( w_fp[1], w_fp[10], w_fp[16], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 68 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[1] += 1. / 2. * amp_sv[0];
-    jamp_sv[5] -= 1. / 2. * amp_sv[0];
-    jamp_sv[6] -= 1. / 2. * amp_sv[0];
-    jamp_sv[10] += 1. / 2. * amp_sv[0];
-
-    // *** DIAGRAM 69 OF 72 ***
-
-    // Wavefunction(s) for diagram number 69
-    // (none)
-
-    // Amplitude(s) for diagram number 69
-    VVV5_0( w_fp[1], w_fp[7], w_fp[9], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 69 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[2] += 1. / 2. * amp_sv[0];
-    jamp_sv[5] -= 1. / 2. * amp_sv[0];
-    jamp_sv[6] -= 1. / 2. * amp_sv[0];
-    jamp_sv[9] += 1. / 2. * amp_sv[0];
-
-    // *** DIAGRAM 70 OF 72 ***
-
-    // Wavefunction(s) for diagram number 70
-    // (none)
-
-    // Amplitude(s) for diagram number 70
-    VVVV1_0( w_fp[0], w_fp[1], w_fp[11], w_fp[8], COUPs[2], 1.0, &amp_fp[0] );
-    jamp_sv[0] -= 1. / 2. * amp_sv[0];
-    jamp_sv[3] += 1. / 2. * amp_sv[0];
-    jamp_sv[8] += 1. / 2. * amp_sv[0];
-    jamp_sv[11] -= 1. / 2. * amp_sv[0];
-    VVVV9_0( w_fp[0], w_fp[1], w_fp[11], w_fp[8], COUPs[2], 1.0, &amp_fp[0] );
-    jamp_sv[0] -= 1. / 2. * amp_sv[0];
-    jamp_sv[4] += 1. / 2. * amp_sv[0];
-    jamp_sv[7] += 1. / 2. * amp_sv[0];
-    jamp_sv[11] -= 1. / 2. * amp_sv[0];
-    VVVV10_0( w_fp[0], w_fp[1], w_fp[11], w_fp[8], COUPs[2], 1.0, &amp_fp[0] );
-    jamp_sv[3] -= 1. / 2. * amp_sv[0];
-    jamp_sv[4] += 1. / 2. * amp_sv[0];
-    jamp_sv[7] += 1. / 2. * amp_sv[0];
-    jamp_sv[8] -= 1. / 2. * amp_sv[0];
-
-    // *** DIAGRAM 71 OF 72 ***
-
-    // Wavefunction(s) for diagram number 71
-    // (none)
-
-    // Amplitude(s) for diagram number 71
-    VVV5_0( w_fp[1], w_fp[8], w_fp[14], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 71 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[0] -= 1. / 2. * amp_sv[0];
-    jamp_sv[4] += 1. / 2. * amp_sv[0];
-    jamp_sv[7] += 1. / 2. * amp_sv[0];
-    jamp_sv[11] -= 1. / 2. * amp_sv[0];
-
-    // *** DIAGRAM 72 OF 72 ***
-
-    // Wavefunction(s) for diagram number 72
-    // (none)
-
-    // Amplitude(s) for diagram number 72
-    VVV5_0( w_fp[1], w_fp[11], w_fp[12], COUPs[0], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    if( channelId == 72 ) numerators_sv += cxabs2( amp_sv[0] );
-    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-    jamp_sv[3] -= 1. / 2. * amp_sv[0];
-    jamp_sv[4] += 1. / 2. * amp_sv[0];
-    jamp_sv[7] += 1. / 2. * amp_sv[0];
-    jamp_sv[8] -= 1. / 2. * amp_sv[0];
-
-    // *** COLOR CHOICE BELOW ***
-    // Store the leading color flows for choice of color
-    if( jamp2_sv ) // disable color choice if nullptr
-      for( int icol = 0; icol < ncolor; icol++ )
-        jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831
-
-    // *** COLOR MATRIX BELOW ***
-    // (This method used to be called CPPProcess::matrix_1_gg_ttxttx()?)
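// ------------------------------------------------------------------------------------
// [Editor's note] A minimal standalone check of the triangular color-sum trick used by
// the TriangularNormalizedColorMatrix struct just below. Since the color matrix cf is
// real and symmetric, the quadratic form
//   |M|^2 = sum_{i,j} conj(jamp[i]) * ( cf[i][j] / denom[i] ) * jamp[j]
// reduces to real arithmetic (AMA + BMB for jamp = A + iB, see #475) and can be summed
// over the upper triangle only, with the factor 2 and the 1/denom[i] normalization
// folded in at compile time. The 2x2 values below are toy numbers with a uniform
// denom, not the real 12x12 color matrix of this process:

#include <cassert>
#include <cmath>
#include <complex>
int main()
{
  const int ncolor = 2;
  const double cf[2][2] = { { 48, 16 }, { 16, 48 } }; // toy symmetric "color matrix"
  const double denom[2] = { 3, 3 };                   // toy (uniform) color denominators
  const std::complex<double> jamp[2] = { { 1., 2. }, { -3., 0.5 } };
  // Reference: the full quadratic form in complex arithmetic
  std::complex<double> me2ref = 0;
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ )
      me2ref += std::conj( jamp[i] ) * ( cf[i][j] / denom[i] ) * jamp[j];
  // Triangular form: diagonal weight cf[i][i]/denom[i], off-diagonal weight 2*cf[i][j]/denom[i]
  double me2 = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztempR = ( cf[i][i] / denom[i] ) * jamp[i].real();
    double ztempI = ( cf[i][i] / denom[i] ) * jamp[i].imag();
    for( int j = i + 1; j < ncolor; j++ )
    {
      ztempR += ( 2 * cf[i][j] / denom[i] ) * jamp[j].real();
      ztempI += ( 2 * cf[i][j] / denom[i] ) * jamp[j].imag();
    }
    me2 += jamp[i].real() * ztempR + jamp[i].imag() * ztempI;
  }
  assert( std::abs( me2 - me2ref.real() ) < 1e-12 );
  return 0;
}

// The CUDA branch keeps the full (non-triangular) loop because it was measured to be
// faster there, as noted in the removed comments above.
// ------------------------------------------------------------------------------------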
- - // The color denominators (initialize all array elements, with ncolor=12) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }; // 1-D array[12] - - // The color matrix (initialize all array elements, with ncolor=12) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 48, 16, 16, 6, 0, 16, -2, 0, -6, -2, -2, 6 }, - { 16, 48, 6, 16, 16, 0, 0, -2, -2, -6, 6, -2 }, - { 16, 6, 48, 16, -2, 0, 0, 16, -2, 6, -6, -2 }, - { 6, 16, 16, 48, 0, -2, 16, 0, 6, -2, -2, -6 }, - { 0, 16, -2, 0, 48, 16, 16, 6, 0, -2, 16, 0 }, - { 16, 0, 0, -2, 16, 48, 6, 16, -2, 0, 0, 16 }, - { -2, 0, 0, 16, 16, 6, 48, 16, 16, 0, 0, -2 }, - { 0, -2, 16, 0, 6, 16, 16, 48, 0, 16, -2, 0 }, - { -6, -2, -2, 6, 0, -2, 16, 0, 48, 16, 16, 6 }, - { -2, -6, 6, -2, -2, 0, 0, 16, 16, 48, 6, 16 }, - { -2, 6, -6, -2, 16, 0, 0, -2, 16, 6, 48, 16 }, - { 6, -2, -2, -6, 0, 16, -2, 0, 6, 16, 16, 48 } }; // 2-D array[12][12] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! 
skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... 
icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast( iParity == 0 ? 
jamp_sv : &( jamp_sv[ncolor] ) ); #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------ + // --- CHANNELIDS --- + // ------------------ +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - } - // *** STORE THE RESULTS *** - - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); +#endif #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + fptype* numerators = 
nullptr; + fptype* denominators = nullptr; #endif + + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ + + // *** DIAGRAMS 1 TO 72 *** +#ifdef MGONGPUCPP_GPUIMPL + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram8, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram9, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram10, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram11, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram12, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram13, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram14, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram15, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram16, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram17, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram18, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram19, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram20, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram21, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram22, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram23, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram24, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram25, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + 
gpuLaunchKernelStream( diagram26, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram27, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram28, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram29, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram30, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram31, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram32, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram33, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram34, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram35, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram36, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram37, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram38, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram39, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram40, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram41, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram42, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram43, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram44, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram45, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram46, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram47, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram48, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram49, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram50, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram51, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram52, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, 
couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram53, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram54, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram55, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram56, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram57, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram58, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram59, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram60, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram61, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram62, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram63, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram64, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram65, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram66, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram67, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram68, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram69, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram70, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram71, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram72, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); +#else + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram8( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram9( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram10( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram11( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram12( wfs, jamps, channelIds, COUPs, 
numerators, denominators ); + diagram13( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram14( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram15( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram16( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram17( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram18( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram19( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram20( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram21( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram22( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram23( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram24( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram25( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram26( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram27( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram28( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram29( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram30( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram31( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram32( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram33( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram34( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram35( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram36( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram37( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram38( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram39( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram40( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram41( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram42( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram43( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram44( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram45( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram46( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram47( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram48( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram49( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram50( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram51( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram52( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram53( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram54( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram55( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram56( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram57( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram58( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram59( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram60( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram61( wfs, jamps, channelIds, COUPs, numerators, denominators 
); + diagram62( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram63( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram64( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram65( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram66( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram67( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram68( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram69( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram70( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram71( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram72( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -1639,7 +636,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -1674,6 +675,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -1716,6 +721,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_SMEFTsim_topU3l_MwScheme_UFO::mdl_MT ); m_masses.push_back( Parameters_SMEFTsim_topU3l_MwScheme_UFO::mdl_MT ); m_masses.push_back( Parameters_SMEFTsim_topU3l_MwScheme_UFO::mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -1818,26 +827,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings, bsmIndepParam ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -1845,25 +854,40 @@ namespace 
mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<16) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<16) + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR!
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype*
ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -2001,20 +1214,14 @@ namespace mg5amcCpu // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 1024 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS =
DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -2026,17 +1233,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -2062,93 +1272,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! 
#898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamps for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } - } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) Then, in multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int!
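+ // Worked example of the C-indexing-in, Fortran-indexing-out convention (toy 5-channel mapping for illustration only, not the generated mgOnGpu tables): with channel2iconfig = { 1, 2, -1, 3, 3 }, channelId=4 (Fortran-style, in [1,nchannels]) is looked up as channel2iconfig[4-1] and yields iconfig=3 (Fortran-style, in [1,nconfigSDE]), while channelId=3 yields channel2iconfig[3-1]=-1, i.e. no associated SDE iconfig, and must fail the sanity checks below.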
- const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) + { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ?
&( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this until after the helicity loop to avoid breaking stream parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -2190,7 +1370,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -2213,7 +1393,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId!
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -2222,25 +1402,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -2250,8 +1436,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -2267,11 +1455,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -2373,14 +1562,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.h index d207c3303f..b147b40b3b 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The 
MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_SMEFTsim_topU3l_MwScheme_UFO.h" #include @@ -75,17 +76,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 72; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 12; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 18; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 6; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 76; //static const int ncomb = 64; // CPPProcess::ncomb @@ -122,23 +123,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -152,34 +156,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/auto_dsig.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/auto_dsig.f index ef1e17705f..95e59c2089 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/auto_dsig.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/auto_dsig1.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/auto_dsig1.f index 2086a21e98..b046c442e3 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/auto_dsig1.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -137,14 +137,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -219,7 +219,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -290,6 +290,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -373,12 +377,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -484,6 +488,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/color_sum.cc b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/color_sum.cc new file mode 100644 index 0000000000..065151d9f1 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/color_sum.cc @@ -0,0 +1,393 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
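+ // This file implements the color sum of the QCD partial amplitudes: for one helicity, it adds to |M|^2 the quantity deltaME = sum_{i,j} ( Re(jamp_i) * cf_ij * Re(jamp_j) + Im(jamp_i) * cf_ij * Im(jamp_j) ), with cf_ij = colorMatrix[i][j] / colorDenom[i]. + // Toy numerical illustration (hypothetical ncolor=2 inputs, not this process's 12x12 matrix): with colorDenom = { 3, 3 }, colorMatrix = { { 16, -2 }, { -2, 16 } }, jamp0 = 1 + 0.5i and jamp1 = -0.25 + 2i, + // ztempR0 = ( 16*1 + (-2)*(-0.25) ) / 3 = 5.5, ztempI0 = ( 16*0.5 + (-2)*2 ) / 3 = 4/3, ztempR1 = ( (-2)*1 + 16*(-0.25) ) / 3 = -2, ztempI1 = ( (-2)*0.5 + 16*2 ) / 3 = 31/3, + // hence deltaME = 5.5*1 + (4/3)*0.5 + (-2)*(-0.25) + (31/3)*2 = 6 + 64/3 ~ 27.33.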
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=12) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }; // 1-D array[12] + + // The color matrix (initialize all array elements, with ncolor=12) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 48, 16, 16, 6, 0, 16, -2, 0, -6, -2, -2, 6 }, + { 16, 48, 6, 16, 16, 0, 0, -2, -2, -6, 6, -2 }, + { 16, 6, 48, 16, -2, 0, 0, 16, -2, 6, -6, -2 }, + { 6, 16, 16, 48, 0, -2, 16, 0, 6, -2, -2, -6 }, + { 0, 16, -2, 0, 48, 16, 16, 6, 0, -2, 16, 0 }, + { 16, 0, 0, -2, 16, 48, 6, 16, -2, 0, 0, 16 }, + { -2, 0, 0, 16, 16, 6, 48, 16, 16, 0, 0, -2 }, + { 0, -2, 16, 0, 6, 16, 16, 48, 0, 16, -2, 0 }, + { -6, -2, -2, 6, 0, -2, 16, 0, 48, 16, 16, 6 }, + { -2, -6, 6, -2, -2, 0, 0, 16, 16, 48, 6, 16 }, + { -2, 6, -6, -2, 16, 0, 0, -2, 16, 6, 48, 16 }, + { 6, -2, -2, -6, 0, 16, -2, 0, 6, 16, 16, 48 } }; // 2-D array[12][12] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = 
+    // Use the property that M is a real matrix (see #475):
+    // we can rewrite the quadratic form (A-iB)M(A+iB) as AMA + iAMB - iBMA + BMB = AMA + BMB
+    // In addition, in C++ use the property that M is symmetric (see #475),
+    // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time:
+    // we gain (though not a factor 2...) in speed here as we only loop over the upper triangular part of the matrix.
+    // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
+    fptype_sv deltaMEs = { 0 };
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+    fptype_sv deltaMEs_next = { 0 };
+    // Mixed mode: merge two neppV vectors into one neppV2 vector
+    fptype2_sv jampR_sv[ncolor];
+    fptype2_sv jampI_sv[ncolor];
+    for( int icol = 0; icol < ncolor; icol++ )
+    {
+      jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) );
+      jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) );
+    }
+#else
+    const cxtype_sv* jamp_sv = allJamp_sv;
+#endif
+    // Loop over icol
+    for( int icol = 0; icol < ncolor; icol++ )
+    {
+      // Diagonal terms
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+      fptype2_sv& jampRi_sv = jampR_sv[icol];
+      fptype2_sv& jampIi_sv = jampI_sv[icol];
+#else
+      fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) );
+      fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) );
+#endif
+      fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv;
+      fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv;
+      // Loop over jcol
+      for( int jcol = icol + 1; jcol < ncolor; jcol++ )
+      {
+        // Off-diagonal terms
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+        fptype2_sv& jampRj_sv = jampR_sv[jcol];
+        fptype2_sv& jampIj_sv = jampI_sv[jcol];
+#else
+        fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) );
+        fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) );
+#endif
+        ztempR_sv += cf2.value[icol][jcol] * jampRj_sv;
+        ztempI_sv += cf2.value[icol][jcol] * jampIj_sv;
+      }
+      fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+      deltaMEs += fpvsplit0( deltaMEs2 );
+      deltaMEs_next += fpvsplit1( deltaMEs2 );
+#else
+      deltaMEs += deltaMEs2;
+#endif
+    }
+    // *** STORE THE RESULTS ***
+    using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events
+    fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 );
+    // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
+    fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs );
+    MEs_sv += deltaMEs; // fix #435
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+    fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV );
+    fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next );
+    MEs_sv_next += deltaMEs_next;
+#endif
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  __global__ void
+  color_sum_kernel( fptype* allMEs,          // output: allMEs[nevt], add |M|^2 for this specific helicity
+                    const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity
+  {
+    using J_ACCESS = 
DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! 
However, the same striding as in compute_jamps and cuBLAS is used here, just in case this is better for performance
+    for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ )
+      for( int icol = 0; icol < ncolor; icol++ )
+        allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding
+  }
+#endif
+#endif
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+#ifndef MGONGPU_HAS_NO_BLAS
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+  __global__ void
+  convertF2D_MEs( fptype* allMEs,             // output: allMEs[nevt] for one specific helicity
+                  const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity
+  {
+    const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+    allMEs[ievt] = allMEsFpt2[ievt];
+  }
+#endif
+#endif
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#ifndef MGONGPU_HAS_NO_BLAS
+  void
+  color_sum_blas( fptype* allMEs,               // output: allMEs[nevt], add |M|^2 for this specific helicity
+                  const fptype* allJamps,       // input: jamp[ncolor*2*nevt] for one specific helicity
+                  fptype2* allBlasTmp,          // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+                  gpuStream_t stream,           // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m)
+#else
+                  gpuStream_t /*stream*/,       // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m)
+#endif
+                  gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+                  const int gpublocks,          // input: cuda gpublocks
+                  const int gputhreads )        // input: cuda gputhreads
+  {
+    const int nevt = gpublocks * gputhreads;
+
+    // Get the address associated with the normalized color matrix in device memory
+    static fptype2* devNormColMat = nullptr;
+    if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 );
+
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+    // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity
+    fptype2* allZtempBoth = allBlasTmp;                                  // start of first fptype2[ncolor*2*nevt] buffer
+    fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt;   // start of second fptype2[ncolor*2*nevt] buffer
+    fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer
+    // Convert jamps from double to float
+    gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps );
+    // Real and imaginary components
+    const fptype2* allJampsReal = allJampsFpt2;
+    const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt;
+#else
+    static_assert( std::is_same<fptype, fptype2>::value ); // sanity check
+    // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer
+    fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer
+    fptype2* allMEsFpt2 = allMEs;
+    // Real and imaginary components
+    const fptype2* allJampsReal = allJamps;                 // this is not a cast (the two types are identical)
+    const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical)
+#endif
+    // Real and imaginary components
+    fptype2* allZtempReal = allZtempBoth;
+    fptype2* allZtempImag = allZtempBoth + ncolor * nevt;
+
+    // Note, new striding for cuBLAS from 
DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. 
for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/color_sum.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/diagram_boilerplate.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/diagrams.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/diagrams.h new file mode 100644 index 0000000000..d29bb82ea5 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/diagrams.h @@ -0,0 +1,2237 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
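+
+// Schematic overview (a sketch of the common structure, inferred from the generated code
+// below rather than stated anywhere upstream): each diagramN kernel
+//   1. computes the wavefunctions w_fp[...] needed by that diagram (diagram1 also computes
+//      the external-particle wavefunctions via vxxxxx/oxxxxx/ixxxxx from the momenta);
+//   2. computes the amplitude amp_sv[0] through the relevant FFV/VVV vertex routine;
+//   3. if MGONGPU_SUPPORTS_MULTICHANNEL is defined, adds |amp|^2 to the single-diagram
+//      numerators (when channelId matches the diagram number) and to the denominators
+//      (whenever channelId != 0);
+//   4. accumulates amp_sv[0] into the color-ordered jamps with that diagram's color
+//      coefficients (e.g. +-1/2 and +-1/6, possibly times a factor i).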
+
+/* clang-format off */
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram1( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+            fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+            const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+            const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+            const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+            fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+            fptype* denominators,           // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+            const fptype* momenta,          // input: momenta[npar*4*nevtORneppV]
+            const int ihel )                // input: helicity (0 to ncomb)
+  {
+    // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+#ifdef MGONGPUCPP_GPUIMPL
+    using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events
+#else
+    using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events
+#endif
+    // *** DIAGRAM 1 OF 72 ***
+    // Wavefunction(s) for diagram number 1
+    vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 );
+    vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 );
+    oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 );
+    ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 );
+    oxxxxx( momenta, cIPD[0], cHel[ihel][4], +1, w_fp[4], 4 );
+    ixxxxx( momenta, cIPD[0], cHel[ihel][5], -1, w_fp[5], 5 );
+    VVV5P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[6] );
+    FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[7] );
+    FFV1_1( w_fp[4], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] );
+    // Amplitude(s) for diagram number 1
+    FFV1_0( w_fp[5], w_fp[8], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 6. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 72 *** + // Wavefunction(s) for diagram number 2 + FFV1_2( w_fp[5], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); + // Amplitude(s) for diagram number 2 + FFV1_0( w_fp[9], w_fp[4], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 72 *** + // Wavefunction(s) for diagram number 3 + FFV1P0_3( w_fp[5], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 3 + VVV5_0( w_fp[6], w_fp[7], w_fp[10], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= 1. / 2. 
* amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 72 *** + // Wavefunction(s) for diagram number 4 + FFV1P0_3( w_fp[5], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[11] ); + FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); + // Amplitude(s) for diagram number 4 + FFV1_0( w_fp[12], w_fp[4], w_fp[11], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 72 *** + // Wavefunction(s) for diagram number 5 + // (none) + // Amplitude(s) for diagram number 5 + FFV1_0( w_fp[3], w_fp[8], w_fp[11], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 6 OF 72 *** + // Wavefunction(s) for diagram number 6 + FFV1P0_3( w_fp[3], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 6 + VVV5_0( w_fp[6], w_fp[11], w_fp[8], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram7( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 7 OF 72 *** + // Wavefunction(s) for diagram number 7 + FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); + // Amplitude(s) for diagram number 7 + FFV1_0( w_fp[5], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 7 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram8( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 8 OF 72 *** + // Wavefunction(s) for diagram number 8 + // (none) + // Amplitude(s) for diagram number 8 + FFV1_0( w_fp[9], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 8 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram9( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 9 OF 72 *** + // Wavefunction(s) for diagram number 9 + // (none) + // Amplitude(s) for diagram number 9 + FFV1_0( w_fp[3], w_fp[13], w_fp[10], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 9 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 6. 
* cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram10( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 10 OF 72 *** + // Wavefunction(s) for diagram number 10 + // (none) + // Amplitude(s) for diagram number 10 + FFV1_0( w_fp[12], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 10 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram11( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 11 OF 72 *** + // Wavefunction(s) for diagram number 11 + FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); + FFV1_2( w_fp[3], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); + FFV1P0_3( w_fp[5], w_fp[12], COUPs[1], 1.0, 0., 0., w_fp[9] ); + // Amplitude(s) for diagram number 11 + FFV1_0( w_fp[13], w_fp[4], w_fp[9], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 11 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram12( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 12 OF 72 *** + // Wavefunction(s) for diagram number 12 + // (none) + // Amplitude(s) for diagram number 12 + FFV1_0( w_fp[13], w_fp[12], w_fp[10], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 12 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram13( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 13 OF 72 *** + // Wavefunction(s) for diagram number 13 + FFV1_1( w_fp[4], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[6] ); + FFV1P0_3( w_fp[3], w_fp[12], COUPs[1], 1.0, 0., 0., w_fp[14] ); + // Amplitude(s) for diagram number 13 + FFV1_0( w_fp[5], w_fp[6], w_fp[14], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 13 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram14( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 14 OF 72 *** + // Wavefunction(s) for diagram number 14 + // (none) + // Amplitude(s) for diagram number 14 + FFV1_0( w_fp[3], w_fp[6], w_fp[9], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 14 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram15( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 15 OF 72 *** + // Wavefunction(s) for diagram number 15 + FFV1_2( w_fp[5], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[15] ); + // Amplitude(s) for diagram number 15 + FFV1_0( w_fp[15], w_fp[4], w_fp[14], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 15 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram16( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 16 OF 72 *** + // Wavefunction(s) for diagram number 16 + // (none) + // Amplitude(s) for diagram number 16 + FFV1_0( w_fp[15], w_fp[12], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 16 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram17( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 17 OF 72 *** + // Wavefunction(s) for diagram number 17 + FFV1_1( w_fp[12], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); + // Amplitude(s) for diagram number 17 + FFV1_0( w_fp[5], w_fp[16], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 17 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram18( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 18 OF 72 *** + // Wavefunction(s) for diagram number 18 + // (none) + // Amplitude(s) for diagram number 18 + VVV5_0( w_fp[1], w_fp[8], w_fp[9], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 18 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram19( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 19 OF 72 *** + // Wavefunction(s) for diagram number 19 + // (none) + // Amplitude(s) for diagram number 19 + FFV1_0( w_fp[3], w_fp[16], w_fp[10], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 19 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram20( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 20 OF 72 *** + // Wavefunction(s) for diagram number 20 + // (none) + // Amplitude(s) for diagram number 20 + VVV5_0( w_fp[1], w_fp[10], w_fp[14], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 20 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram21( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 21 OF 72 *** + // Wavefunction(s) for diagram number 21 + FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] ); + FFV1_1( w_fp[2], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); + FFV1P0_3( w_fp[14], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[9] ); + // Amplitude(s) for diagram number 21 + FFV1_0( w_fp[5], w_fp[16], w_fp[9], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 21 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram22( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 22 OF 72 *** + // Wavefunction(s) for diagram number 22 + // (none) + // Amplitude(s) for diagram number 22 + FFV1_0( w_fp[14], w_fp[16], w_fp[10], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 22 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram23( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 23 OF 72 *** + // Wavefunction(s) for diagram number 23 + FFV1P0_3( w_fp[14], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[12] ); + // Amplitude(s) for diagram number 23 + FFV1_0( w_fp[5], w_fp[6], w_fp[12], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 23 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram24( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 24 OF 72 *** + // Wavefunction(s) for diagram number 24 + // (none) + // Amplitude(s) for diagram number 24 + FFV1_0( w_fp[14], w_fp[6], w_fp[11], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 24 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram25( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 25 OF 72 *** + // Wavefunction(s) for diagram number 25 + // (none) + // Amplitude(s) for diagram number 25 + FFV1_0( w_fp[15], w_fp[4], w_fp[12], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 25 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram26( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 26 OF 72 *** + // Wavefunction(s) for diagram number 26 + // (none) + // Amplitude(s) for diagram number 26 + FFV1_0( w_fp[15], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 26 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 6. 
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram27( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 27 OF 72 ***
+    // Wavefunction(s) for diagram number 27
+    FFV1_2( w_fp[14], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[17] );
+    // Amplitude(s) for diagram number 27
+    FFV1_0( w_fp[17], w_fp[4], w_fp[11], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 27 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 6. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram28( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 28 OF 72 ***
+    // Wavefunction(s) for diagram number 28
+    // (none)
+    // Amplitude(s) for diagram number 28
+    VVV5_0( w_fp[1], w_fp[11], w_fp[9], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 28 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+  }
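The numerical pattern in the jamp updates is standard SU(3) color algebra: a gluon exchanged between two quark lines is projected onto the color-flow basis through the Fierz identity

    T^a_{ij} T^a_{kl} = 1/2 ( delta_{il} delta_{jk} - 1/3 delta_{ij} delta_{kl} )

which is why each FFV-type amplitude feeds two jamps with coefficients 1/2 and 1/6 (= 1/2 * 1/3) of opposite sign, while the cxtype( 0, 1 ) factor appearing with the triple-gluon VVV5_0 amplitudes (as in diagram28 just above) is the imaginary unit carried by the i f^{abc} color structure. This reading is inferred from the generated coefficients, not stated in the patch.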
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram29( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 29 OF 72 ***
+    // Wavefunction(s) for diagram number 29
+    // (none)
+    // Amplitude(s) for diagram number 29
+    FFV1_0( w_fp[17], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 29 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 6. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram30( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 30 OF 72 ***
+    // Wavefunction(s) for diagram number 30
+    // (none)
+    // Amplitude(s) for diagram number 30
+    VVV5_0( w_fp[1], w_fp[10], w_fp[12], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 30 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram31( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 31 OF 72 ***
+    // Wavefunction(s) for diagram number 31
+    FFV1_1( w_fp[4], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] );
+    FFV1P0_3( w_fp[3], w_fp[12], COUPs[1], 1.0, 0., 0., w_fp[17] );
+    // Amplitude(s) for diagram number 31
+    FFV1_0( w_fp[5], w_fp[16], w_fp[17], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 31 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 6. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 2. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram32( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 32 OF 72 ***
+    // Wavefunction(s) for diagram number 32
+    FFV1P0_3( w_fp[5], w_fp[12], COUPs[1], 1.0, 0., 0., w_fp[9] );
+    // Amplitude(s) for diagram number 32
+    FFV1_0( w_fp[3], w_fp[16], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 32 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 2. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 6. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram33( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 33 OF 72 ***
+    // Wavefunction(s) for diagram number 33
+    // (none)
+    // Amplitude(s) for diagram number 33
+    FFV1_0( w_fp[13], w_fp[2], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 33 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 6. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram34( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 34 OF 72 ***
+    // Wavefunction(s) for diagram number 34
+    // (none)
+    // Amplitude(s) for diagram number 34
+    FFV1_0( w_fp[13], w_fp[12], w_fp[11], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 34 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 6. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 2. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram35( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 35 OF 72 ***
+    // Wavefunction(s) for diagram number 35
+    // (none)
+    // Amplitude(s) for diagram number 35
+    FFV1_0( w_fp[15], w_fp[2], w_fp[17], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 35 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 6. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram36( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 36 OF 72 ***
+    // Wavefunction(s) for diagram number 36
+    // (none)
+    // Amplitude(s) for diagram number 36
+    FFV1_0( w_fp[15], w_fp[12], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 36 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 6. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 2. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram37( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 37 OF 72 ***
+    // Wavefunction(s) for diagram number 37
+    FFV1_1( w_fp[12], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] );
+    // Amplitude(s) for diagram number 37
+    FFV1_0( w_fp[5], w_fp[14], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 37 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 6. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram38( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 38 OF 72 ***
+    // Wavefunction(s) for diagram number 38
+    // (none)
+    // Amplitude(s) for diagram number 38
+    VVV5_0( w_fp[1], w_fp[7], w_fp[9], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 38 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram39( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 39 OF 72 ***
+    // Wavefunction(s) for diagram number 39
+    // (none)
+    // Amplitude(s) for diagram number 39
+    FFV1_0( w_fp[3], w_fp[14], w_fp[11], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 39 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 6. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram40( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 40 OF 72 ***
+    // Wavefunction(s) for diagram number 40
+    // (none)
+    // Amplitude(s) for diagram number 40
+    VVV5_0( w_fp[1], w_fp[11], w_fp[17], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 40 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram41( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 41 OF 72 ***
+    // Wavefunction(s) for diagram number 41
+    FFV1_2( w_fp[5], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[17] );
+    FFV1P0_3( w_fp[17], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[14] );
+    // Amplitude(s) for diagram number 41
+    FFV1_0( w_fp[3], w_fp[16], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 41 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 6. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram42( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 42 OF 72 ***
+    // Wavefunction(s) for diagram number 42
+    // (none)
+    // Amplitude(s) for diagram number 42
+    FFV1_0( w_fp[17], w_fp[16], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 42 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 2. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 6. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram43( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 43 OF 72 ***
+    // Wavefunction(s) for diagram number 43
+    FFV1P0_3( w_fp[17], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[9] );
+    // Amplitude(s) for diagram number 43
+    FFV1_0( w_fp[13], w_fp[4], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 43 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 6. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 2. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram44( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 44 OF 72 ***
+    // Wavefunction(s) for diagram number 44
+    // (none)
+    // Amplitude(s) for diagram number 44
+    FFV1_0( w_fp[13], w_fp[2], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 44 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 2. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 6. * amp_sv[0];
+  }
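The signature comment jamps[ncolor*2*nevtORneppV] implies ncolor complex color amplitudes stored as separate real and imaginary parts per event page. A minimal sketch of a kernelAccessIcol accessor consistent with that layout, assuming cxtype_sv is a (real, imaginary) pair of fptype_sv; the actual J_ACCESS class is defined elsewhere and may differ:

    // Sketch only: one plausible J_ACCESS::kernelAccessIcol for the C++ event-page case
    static inline cxtype_sv&
    kernelAccessIcol( fptype* jamps, const int icol )
    {
      // return a reference to the icol-th complex color amplitude of this event page
      return reinterpret_cast<cxtype_sv*>( jamps )[icol];
    }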
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram45( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 45 OF 72 ***
+    // Wavefunction(s) for diagram number 45
+    // (none)
+    // Amplitude(s) for diagram number 45
+    FFV1_0( w_fp[3], w_fp[6], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 45 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 6. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 2. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram46( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 46 OF 72 ***
+    // Wavefunction(s) for diagram number 46
+    // (none)
+    // Amplitude(s) for diagram number 46
+    FFV1_0( w_fp[17], w_fp[6], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 46 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 2. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 6. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram47( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 47 OF 72 ***
+    // Wavefunction(s) for diagram number 47
+    FFV1_2( w_fp[17], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] );
+    // Amplitude(s) for diagram number 47
+    FFV1_0( w_fp[12], w_fp[4], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 47 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 6. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram48( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 48 OF 72 ***
+    // Wavefunction(s) for diagram number 48
+    // (none)
+    // Amplitude(s) for diagram number 48
+    VVV5_0( w_fp[1], w_fp[7], w_fp[14], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 48 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram49( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 49 OF 72 ***
+    // Wavefunction(s) for diagram number 49
+    // (none)
+    // Amplitude(s) for diagram number 49
+    FFV1_0( w_fp[12], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 49 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 6. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 2. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram50( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 50 OF 72 ***
+    // Wavefunction(s) for diagram number 50
+    // (none)
+    // Amplitude(s) for diagram number 50
+    VVV5_0( w_fp[1], w_fp[8], w_fp[9], COUPs[0], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 50 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram51( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 51 OF 72 ***
+    // Wavefunction(s) for diagram number 51
+    FFV1_1( w_fp[16], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] );
+    // Amplitude(s) for diagram number 51
+    FFV1_0( w_fp[5], w_fp[9], w_fp[8], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 51 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 6. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram52( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 52 OF 72 ***
+    // Wavefunction(s) for diagram number 52
+    VVV5P0_1( w_fp[0], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[12] );
+    // Amplitude(s) for diagram number 52
+    FFV1_0( w_fp[5], w_fp[16], w_fp[12], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 52 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+  }
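Taken together, the two multichannel lines in every kernel accumulate |amp|^2 of the single selected diagram into numerators and |amp|^2 of every contributing diagram into denominators, so that the single-diagram-enhancement weight for an event can eventually be formed as

    w_SDE = numerators / denominators = |A_channel|^2 / sum_d |A_d|^2

summed over helicities, with channelId == 0 disabling the whole machinery as noted in the signature comment. This summary is inferred from the generated pattern above.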
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram53( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 53 OF 72 ***
+    // Wavefunction(s) for diagram number 53
+    // (none)
+    // Amplitude(s) for diagram number 53
+    FFV1_0( w_fp[3], w_fp[9], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 53 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 6. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram54( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 54 OF 72 ***
+    // Wavefunction(s) for diagram number 54
+    VVV5P0_1( w_fp[0], w_fp[10], COUPs[0], 1.0, 0., 0., w_fp[9] );
+    // Amplitude(s) for diagram number 54
+    FFV1_0( w_fp[3], w_fp[16], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 54 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 9 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram55( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 55 OF 72 ***
+    // Wavefunction(s) for diagram number 55
+    FFV1_2( w_fp[13], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] );
+    // Amplitude(s) for diagram number 55
+    FFV1_0( w_fp[16], w_fp[4], w_fp[11], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 55 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 6. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram56( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 56 OF 72 ***
+    // Wavefunction(s) for diagram number 56
+    VVV5P0_1( w_fp[0], w_fp[11], COUPs[0], 1.0, 0., 0., w_fp[14] );
+    // Amplitude(s) for diagram number 56
+    FFV1_0( w_fp[13], w_fp[4], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 56 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram57( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 57 OF 72 ***
+    // Wavefunction(s) for diagram number 57
+    // (none)
+    // Amplitude(s) for diagram number 57
+    FFV1_0( w_fp[16], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 57 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 6. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram58( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 58 OF 72 ***
+    // Wavefunction(s) for diagram number 58
+    // (none)
+    // Amplitude(s) for diagram number 58
+    FFV1_0( w_fp[13], w_fp[2], w_fp[9], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 58 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram59( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 59 OF 72 ***
+    // Wavefunction(s) for diagram number 59
+    FFV1_1( w_fp[6], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] );
+    // Amplitude(s) for diagram number 59
+    FFV1_0( w_fp[5], w_fp[13], w_fp[7], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 59 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 6. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram60( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 60 OF 72 ***
+    // Wavefunction(s) for diagram number 60
+    VVV5P0_1( w_fp[0], w_fp[7], COUPs[0], 1.0, 0., 0., w_fp[16] );
+    // Amplitude(s) for diagram number 60
+    FFV1_0( w_fp[5], w_fp[6], w_fp[16], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 60 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram61( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 61 OF 72 ***
+    // Wavefunction(s) for diagram number 61
+    // (none)
+    // Amplitude(s) for diagram number 61
+    FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 61 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 6. * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 2. * amp_sv[0];
+  }
+
+  //--------------------------------------------------------------------------
+
+  __global__ void
+  diagram62( fptype* wfs,                    // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+             fptype* jamps,                  // output jamps[ncolor*2*nevtORneppV]
+             const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+             const fptype* couplings,        // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+             const fptype** COUPs,           // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+             fptype* numerators,             // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+             fptype* denominators )          // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+  {
+    // A uniform interface (including channelIds, numerators and denominators) is used for all diagramXXX functions, also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined
+    // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+    // *** DIAGRAM 62 OF 72 ***
+    // Wavefunction(s) for diagram number 62
+    // (none)
+    // Amplitude(s) for diagram number 62
+    FFV1_0( w_fp[3], w_fp[6], w_fp[14], COUPs[1], 1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    if( channelId == 62 ) numerators_sv += cxabs2( amp_sv[0] );
+    if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+#endif
+    J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+    J_ACCESS::kernelAccessIcol( jamps, 11 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0];
+  }
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram63( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 63 OF 72 *** + // Wavefunction(s) for diagram number 63 + FFV1_2( w_fp[15], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[6] ); + // Amplitude(s) for diagram number 63 + FFV1_0( w_fp[6], w_fp[4], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 63 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram64( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 64 OF 72 *** + // Wavefunction(s) for diagram number 64 + // (none) + // Amplitude(s) for diagram number 64 + FFV1_0( w_fp[15], w_fp[4], w_fp[16], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 64 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram65( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 65 OF 72 *** + // Wavefunction(s) for diagram number 65 + // (none) + // Amplitude(s) for diagram number 65 + FFV1_0( w_fp[6], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 65 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram66( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 66 OF 72 *** + // Wavefunction(s) for diagram number 66 + // (none) + // Amplitude(s) for diagram number 66 + FFV1_0( w_fp[15], w_fp[2], w_fp[12], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 66 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram67( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 67 OF 72 *** + // Wavefunction(s) for diagram number 67 + // (none) + // Amplitude(s) for diagram number 67 + VVVV1_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * amp_sv[0]; + VVVV9_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * amp_sv[0]; + VVVV10_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram68( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 68 OF 72 *** + // Wavefunction(s) for diagram number 68 + // (none) + // Amplitude(s) for diagram number 68 + VVV5_0( w_fp[1], w_fp[10], w_fp[16], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 68 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram69( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 69 OF 72 *** + // Wavefunction(s) for diagram number 69 + // (none) + // Amplitude(s) for diagram number 69 + VVV5_0( w_fp[1], w_fp[7], w_fp[9], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 69 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram70( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 70 OF 72 *** + // Wavefunction(s) for diagram number 70 + // (none) + // Amplitude(s) for diagram number 70 + VVVV1_0( w_fp[0], w_fp[1], w_fp[11], w_fp[8], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 2. * amp_sv[0]; + VVVV9_0( w_fp[0], w_fp[1], w_fp[11], w_fp[8], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 2. * amp_sv[0]; + VVVV10_0( w_fp[0], w_fp[1], w_fp[11], w_fp[8], COUPs[2], 1.0, &_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram71( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 71 OF 72 *** + // Wavefunction(s) for diagram number 71 + // (none) + // Amplitude(s) for diagram number 71 + VVV5_0( w_fp[1], w_fp[8], w_fp[14], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 71 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram72( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 72 OF 72 *** + // Wavefunction(s) for diagram number 72 + // (none) + // Amplitude(s) for diagram number 72 + VVV5_0( w_fp[1], w_fp[11], w_fp[12], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 72 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2. 
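All fourteen kernels above follow one template: include diagram_boilerplate.h, compute any new wavefunctions, compute the diagram amplitude, optionally update the single-diagram-enhancement (SDE) numerators and denominators, and fold the amplitude into the color-ordered jamps with fixed rational (or imaginary) coefficients. Below is a minimal scalar sketch of that accumulation step, assuming plain std::complex<double> in place of the plugin's cxtype_sv/fptype_sv types and a bare array in place of the J_ACCESS accessors (toyDiagram and its arguments are illustrative names, not plugin API):

  #include <array>
  #include <complex>

  using cxtype = std::complex<double>;

  constexpr int ncolor = 12; // leading colors for gg -> ttxttx (jamps 0..11 above)

  inline double cxabs2( const cxtype& c ) { return std::norm( c ); } // |amp|^2

  // Analogue of one diagramXXX kernel: fold one amplitude into the
  // color-ordered jamps and into the single-diagram-enhancement sums.
  void toyDiagram( unsigned int thisDiagram,  // e.g. 59
                   unsigned int channelId,    // 0 disables SDE
                   const cxtype& amp,         // amplitude of this diagram
                   std::array<cxtype, ncolor>& jamp,
                   double& numerator,
                   double& denominator )
  {
    if( channelId == thisDiagram ) numerator += cxabs2( amp );
    if( channelId != 0 ) denominator += cxabs2( amp );
    // color coefficients of diagram 59: +1/2 on flow 10, -1/6 on flow 11
    jamp[10] += 1. / 2. * amp;
    jamp[11] -= 1. / 6. * amp;
  }

The uniform signature, which keeps channelIds, numerators and denominators even in non-multichannel builds, is what allows every kernel to include the same boilerplate header unchanged.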
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/driver.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/driver.f
index f7f23196eb..fc9392238e 100644
--- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/driver.f
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/driver.f
@@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened)
       fopened=.false.
       tempname=filename
       fine=index(tempname,' ')
-c     fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)"
       if(fine.eq.0) fine=len(tempname)
       open(unit=lun,file=tempname,status='old',ERR=20)
       fopened=.true.
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/matrix1.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/matrix1.f
index 45032ad41c..a833594d67 100644
--- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/matrix1.f
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/matrix1.f
@@ -1,7 +1,7 @@
       SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
     $     ICOL)
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+C     Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
@@ -355,7 +355,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL,
       REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
 C
-C     Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+C     Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
 C     By the MadGraph5_aMC@NLO Development Team
 C     Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 C
@@ -398,7 +398,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
 C
       INTEGER I,J,M,N
       COMPLEX*16 ZTEMP, TMP_JAMP(34)
-      REAL*8 CF(NCOLOR,NCOLOR)
+      INTEGER CF(NCOLOR*(NCOLOR+1)/2)
+      INTEGER DENOM, CF_INDEX
       COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO)
       COMPLEX*16 W(6,NWAVEFUNCS)
 C     Needed for v4 models
@@ -441,111 +442,44 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC)
 C
 C     COLOR DATA
 C
-      DATA (CF(I, 1),I= 1, 6) /1.600000000000000D+01
-     $ ,5.333333333333333D+00,5.333333333333333D+00,2.000000000000000D
-     $ +00,0.000000000000000D+00,5.333333333333333D+00/
-      DATA (CF(I, 1),I= 7, 12) /-6.666666666666666D-01
-     $ ,0.000000000000000D+00,-2.000000000000000D+00,
-     $ -6.666666666666666D-01,-6.666666666666666D-01
-     $ ,2.000000000000000D+00/
+      DATA DENOM/3/
+      DATA (CF(I),I= 1, 12) /48,32,32,12,0,32,-4,0,-12,-4,-4,12/
 C     1 T(1,2,3,4) T(5,6)
-      DATA (CF(I, 2),I= 1, 6) /5.333333333333333D+00
-     $ ,1.600000000000000D+01,2.000000000000000D+00,5.333333333333333D
-     $ +00,5.333333333333333D+00,0.000000000000000D+00/
-      DATA (CF(I, 2),I= 7, 12) /0.000000000000000D+00,
-     $ -6.666666666666666D-01,-6.666666666666666D-01,
-     $ -2.000000000000000D+00,2.000000000000000D+00,
-     $ -6.666666666666666D-01/
+      DATA (CF(I),I= 13, 23) /48,12,32,32,0,0,-4,-4,-12,12,-4/
 C     1 T(1,2,3,6) T(5,4)
-      DATA (CF(I, 3),I= 1, 6) /5.333333333333333D+00
-     $ ,2.000000000000000D+00,1.600000000000000D+01,5.333333333333333D
-     $ +00,-6.666666666666666D-01,0.000000000000000D+00/
-      DATA (CF(I, 3),I= 7, 12) /0.000000000000000D+00
-     $ ,5.333333333333333D+00,-6.666666666666666D-01
-     $ ,2.000000000000000D+00,-2.000000000000000D+00,
-     $ -6.666666666666666D-01/
+      DATA (CF(I),I= 24, 33) /48,32,-4,0,0,32,-4,12,-12,-4/
 C     1 T(1,2,5,4) T(3,6)
-      DATA (CF(I, 4),I= 1, 6) /2.000000000000000D+00
-
$ ,5.333333333333333D+00,5.333333333333333D+00,1.600000000000000D - $ +01,0.000000000000000D+00,-6.666666666666666D-01/ - DATA (CF(I, 4),I= 7, 12) /5.333333333333333D+00 - $ ,0.000000000000000D+00,2.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01, - $ -2.000000000000000D+00/ + DATA (CF(I),I= 34, 42) /48,0,-4,32,0,12,-4,-4,-12/ C 1 T(1,2,5,6) T(3,4) - DATA (CF(I, 5),I= 1, 6) /0.000000000000000D+00 - $ ,5.333333333333333D+00,-6.666666666666666D-01 - $ ,0.000000000000000D+00,1.600000000000000D+01,5.333333333333333D - $ +00/ - DATA (CF(I, 5),I= 7, 12) /5.333333333333333D+00 - $ ,2.000000000000000D+00,0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D - $ +00/ + DATA (CF(I),I= 43, 50) /48,32,32,12,0,-4,32,0/ C 1 T(1,3,4) T(2,5,6) - DATA (CF(I, 6),I= 1, 6) /5.333333333333333D+00 - $ ,0.000000000000000D+00,0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,1.600000000000000D - $ +01/ - DATA (CF(I, 6),I= 7, 12) /2.000000000000000D+00 - $ ,5.333333333333333D+00,-6.666666666666666D-01 - $ ,0.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00/ + DATA (CF(I),I= 51, 57) /48,12,32,-4,0,0,32/ C 1 T(1,3,6) T(2,5,4) - DATA (CF(I, 7),I= 1, 6) /-6.666666666666666D-01 - $ ,0.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00,5.333333333333333D+00,2.000000000000000D+00/ - DATA (CF(I, 7),I= 7, 12) /1.600000000000000D+01 - $ ,5.333333333333333D+00,5.333333333333333D+00,0.000000000000000D - $ +00,0.000000000000000D+00,-6.666666666666666D-01/ + DATA (CF(I),I= 58, 63) /48,32,32,0,0,-4/ C 1 T(1,5,4) T(2,3,6) - DATA (CF(I, 8),I= 1, 6) /0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D - $ +00,2.000000000000000D+00,5.333333333333333D+00/ - DATA (CF(I, 8),I= 7, 12) /5.333333333333333D+00 - $ ,1.600000000000000D+01,0.000000000000000D+00,5.333333333333333D - $ +00,-6.666666666666666D-01,0.000000000000000D+00/ + DATA (CF(I),I= 64, 68) /48,0,32,-4,0/ C 1 T(1,5,6) T(2,3,4) - DATA (CF(I, 9),I= 1, 6) /-2.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01 - $ ,2.000000000000000D+00,0.000000000000000D+00, - $ -6.666666666666666D-01/ - DATA (CF(I, 9),I= 7, 12) /5.333333333333333D+00 - $ ,0.000000000000000D+00,1.600000000000000D+01,5.333333333333333D - $ +00,5.333333333333333D+00,2.000000000000000D+00/ + DATA (CF(I),I= 69, 72) /48,32,32,12/ C 1 T(2,1,3,4) T(5,6) - DATA (CF(I, 10),I= 1, 6) /-6.666666666666666D-01, - $ -2.000000000000000D+00,2.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01 - $ ,0.000000000000000D+00/ - DATA (CF(I, 10),I= 7, 12) /0.000000000000000D+00 - $ ,5.333333333333333D+00,5.333333333333333D+00,1.600000000000000D - $ +01,2.000000000000000D+00,5.333333333333333D+00/ + DATA (CF(I),I= 73, 75) /48,12,32/ C 1 T(2,1,3,6) T(5,4) - DATA (CF(I, 11),I= 1, 6) /-6.666666666666666D-01 - $ ,2.000000000000000D+00,-2.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,0.000000000000000D - $ +00/ - DATA (CF(I, 11),I= 7, 12) /0.000000000000000D+00, - $ -6.666666666666666D-01,5.333333333333333D+00,2.000000000000000D - $ +00,1.600000000000000D+01,5.333333333333333D+00/ + DATA (CF(I),I= 76, 77) /48,32/ C 1 T(2,1,5,4) T(3,6) - DATA (CF(I, 12),I= 1, 6) /2.000000000000000D+00, - $ -6.666666666666666D-01,-6.666666666666666D-01, - $ -2.000000000000000D+00,0.000000000000000D+00,5.333333333333333D - $ +00/ - DATA (CF(I, 12),I= 7, 12) /-6.666666666666666D-01 - $ 
,0.000000000000000D+00,2.000000000000000D+00,5.333333333333333D - $ +00,5.333333333333333D+00,1.600000000000000D+01/ + DATA (CF(I),I= 78, 78) /48/ C 1 T(2,1,5,6) T(3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -910,10 +844,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -922,6 +858,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/addmothers.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/addmothers.f index 9a31ed201d..6f32477b9e 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/addmothers.f @@ -21,7 +21,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, integer icol ! color selected integer isym(nexternal,99), jsym - integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,nc,ic + integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,ic integer mo_color,da_color(2),itmp integer ito(-nexternal+3:nexternal),iseed,maxcolor,maxorg integer icolalt(2,-nexternal+2:2*nexternal-3) @@ -113,14 +113,15 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif lconfig = vec_igraph1(ivec) endif - + is_LC=.true. + maxcolor=0 c c Choose a color flow which is certain to work with the propagator c structure of the chosen diagram and use that as an alternative c if (icol.eq.0) then do i=1,nexternal - icolalt(1,i)=0 + icolalt(1,i)=0 icolalt(2,i)=0 enddo else diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cluster.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cluster.f index b8995283ed..907894ea89 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cluster.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cluster.f @@ -556,6 +556,8 @@ logical function cluster(p, ivec) jwin = 0 cluster=.false. clustered=.false. + iwin =0 + jwin =0 do i=0,3 pcmsp(i)=0 enddo @@ -665,8 +667,11 @@ logical function cluster(p, ivec) c initialize graph storage igraphs(0)=0 nleft=nexternal -c cluster - if (iwin.eq.0.or.jwin.eq.0) stop 21 + if(iwin.eq.0.or.jwin.eq.0)then + cluster=.false. + return + endif +c cluster do n=1,nexternal-2 c combine winner imocl(n)=imap(iwin,2)+imap(jwin,2) diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/color_sum.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/color_sum.h new file mode 100644 index 0000000000..71b5b1eaad --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/color_sum.h @@ -0,0 +1,105 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
+
+#ifndef COLOR_SUM_H
+#define COLOR_SUM_H 1
+
+#include "mgOnGpuConfig.h"
+
+#include "mgOnGpuVectors.h"
+
+#include "CPPProcess.h"
+#include "GpuAbstraction.h"
+
+#ifdef MGONGPUCPP_GPUIMPL
+namespace mg5amcGpu
+#else
+namespace mg5amcCpu
+#endif
+{
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  class DeviceAccessJamp
+  {
+  public:
+    static __device__ inline cxtype_ref
+    kernelAccessIcol( fptype* buffer, const int icol )
+    {
+      const int ncolor = CPPProcess::ncolor; // the number of leading colors
+      const int nevt = gridDim.x * blockDim.x;
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last)
+      //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old"
+      // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last)
+      // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS
+      return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1"
+      // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last)
+      //return cxtype_ref( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2"
+    }
+    static __device__ inline const cxtype
+    kernelAccessIcolConst( const fptype* buffer, const int icol )
+    {
+      const int ncolor = CPPProcess::ncolor; // the number of leading colors
+      const int nevt = gridDim.x * blockDim.x;
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last)
+      //return cxtype( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old"
+      // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last)
+      // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS
+      return cxtype( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1"
+      // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last)
+      //return cxtype( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2"
+    }
+  };
+#else
+  class HostAccessJamp
+  {
+  public:
+    static inline cxtype_sv&
+    kernelAccessIcol( cxtype_sv* buffer, const int icol )
+    {
+      return buffer[icol];
+    }
+    static inline cxtype_sv&
+    kernelAccessIcol( fptype* buffer, const int icol )
+    {
+      return reinterpret_cast<cxtype_sv*>( buffer )[icol];
+    }
+  };
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  void createNormalizedColorMatrix();
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifndef MGONGPUCPP_GPUIMPL
+  void
+  color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity
+                 const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity
+                 const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid)
+#endif
+
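The commented-out "old" and "new2" alternatives above document the layouts that were tried; only "new1" is active: the buffer holds a full ncolor*nevt plane of real parts followed by a plane of imaginary parts, with the event index fastest. A standalone sketch of the index arithmetic, assuming plain host doubles (in the plugin the same arithmetic sits inside DeviceAccessJamp, with nevt taken from the CUDA grid):

  #include <cassert>
  #include <vector>

  // "new1" layout: first a ncolor*nevt matrix of real parts, then a
  // ncolor*nevt matrix of imaginary parts, with ievt fastest.
  inline int jampReIndex( int icol, int ievt, int ncolor, int nevt ) { return 0 * ncolor * nevt + icol * nevt + ievt; }
  inline int jampImIndex( int icol, int ievt, int ncolor, int nevt ) { return 1 * ncolor * nevt + icol * nevt + ievt; }

  int main()
  {
    const int ncolor = 12, nevt = 4;
    std::vector<double> jamps( 2 * ncolor * nevt, 0. );
    jamps[jampReIndex( 10, 3, ncolor, nevt )] += 0.5; // Re(jamp[10]) of event 3
    jamps[jampImIndex( 10, 3, ncolor, nevt )] -= 0.5; // Im(jamp[10]) of event 3
    assert( jamps[10 * nevt + 3] == 0.5 );
    assert( jamps[ncolor * nevt + 10 * nevt + 3] == -0.5 );
    return 0;
  }

Keeping ievt fastest makes every (real/imag, icol) plane contiguous across events, so the BLAS path can treat the buffer as two ncolor-by-nevt matrices without any repacking.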
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+  void
+  color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity
+                 const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity
+                 fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity
+                 gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream)
+                 gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+                 const int gpublocks, // input: cuda gpublocks
+                 const int gputhreads ); // input: cuda gputhreads
+#endif
+
+  //--------------------------------------------------------------------------
+}
+
+#endif // COLOR_SUM_H
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk
index 20d8ded718..e7360b29e2 100644
--- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk
@@ -1,7 +1,7 @@
-# Copyright (C) 2020-2024 CERN and UCLouvain.
+# Copyright (C) 2020-2025 CERN and UCLouvain.
 # Licensed under the GNU Lesser General Public License (version 3 or later).
 # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin.
-# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.

 #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html)
 #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts
@@ -114,7 +114,7 @@ export CXXFLAGS
 override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null))

 # Set HIP_HOME from the path to hipcc, if it exists
-override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null))
+override HIP_HOME = $(shell hipconfig --rocmpath)

 # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965)
 ifeq ($(CUDA_HOME),)
@@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda)

 else ifeq ($(BACKEND),hip)

+  # example architecture values MI200:gfx90a, MI300X:gfx942
+  MADGRAPH_HIP_ARCHITECTURE ?= gfx942
   # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists)
   GPUCC = $(HIP_HOME)/bin/hipcc
   XCOMPILERFLAG =
@@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip)
   ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY

   # AMD HIP architecture flags
-  GPUARCHFLAGS = --offload-arch=gfx90a
+  GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE}
   GPUFLAGS += $(GPUARCHFLAGS)

   # Other AMD-specific flags
@@ -477,6 +479,34 @@ endif

 #-------------------------------------------------------------------------------

+#=== Configure defaults and check if user-defined choices exist for HASBLAS
+
+# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS
+
+ifeq ($(HASBLAS),)
+  ifeq ($(GPUCC),) # CPU-only build
+    override HASBLAS = hasNoBlas
+  else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+    ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),)
+      # cuBLAS headers do not exist??
+      override HASBLAS = hasNoBlas
+    else
+      override HASBLAS = hasBlas
+    endif
+  else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build
+    ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),)
+      # hipBLAS headers do not exist??
+      override HASBLAS = hasNoBlas
+    else
+      override HASBLAS = hasBlas
+    endif
+  else
+    override HASBLAS = hasNoBlas
+  endif
+endif
+
+#-------------------------------------------------------------------------------
+
 #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD

 # Set the build flags appropriate to OMPFLAGS
@@ -597,6 +627,30 @@ endif
 #$(info RNDCXXFLAGS=$(RNDCXXFLAGS))
 #$(info RNDLIBFLAGS=$(RNDLIBFLAGS))

+#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS
+
+$(info HASBLAS=$(HASBLAS))
+override BLASCXXFLAGS=
+override BLASLIBFLAGS=
+
+# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas")
+ifeq ($(HASBLAS),hasNoBlas)
+  override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS
+else ifeq ($(HASBLAS),hasBlas)
+  ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+    override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas
+  else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build
+    override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas
+  endif
+else
+  $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported)
+endif
+CXXFLAGS += $(BLASCXXFLAGS)
+GPUFLAGS += $(BLASCXXFLAGS)
+
+#$(info BLASCXXFLAGS=$(BLASCXXFLAGS))
+#$(info BLASLIBFLAGS=$(BLASLIBFLAGS))
+
 #-------------------------------------------------------------------------------

 #=== Configure Position-Independent Code
@@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))

 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o

 ifneq ($(GPUCC),)
 MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX)
-gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o
+gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o
 gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o
 endif

@@ -799,7 +853,7 @@ ifneq ($(GPUCC),)
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib)
-	$(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB)
+	$(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS)
 # Bypass std::filesystem completely to ease portability on LUMI #803
 #ifneq ($(findstring hipcc,$(GPUCC)),)
 #	$(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs
@@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
 $(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc
 endif
 $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
+$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS)
 $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o
 	$(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS)
 endif
@@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin)
 $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375
 endif
 $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
+$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS)
 $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe)
 ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802
-	$(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64
+	$(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64
 else
 	$(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe)
 endif
@@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob
 else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o)
 ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN)
 $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
+$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS)
 $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS)
 ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802
-	$(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64
+	$(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64
 else
 	$(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda
 endif
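The BLASLIBFLAGS added above link cuBLAS or hipBLAS into the GPU shared library and test executables so that color_sum_gpu can offload the color sum. Per event, the quantity computed is ME += Re( J^dagger * CF * J ) over the ncolor jamps; the matrix1.f hunk earlier stores the same symmetric CF as an integer upper triangle (diagonal 3*16=48, with off-diagonal entries carrying an extra factor 2 because the triangular loop visits each pair only once) and divides the common DENOM=3 out at the end. Below is a minimal sketch of that quadratic form, assuming a plain triple loop in place of the batched GEMM, and a cfmat array with the denominator already divided out (cfmat and colorSumToy are illustrative names, not the plugin API):

  #include <complex>
  #include <vector>

  using cxtype = std::complex<double>;

  // For each event: ME += Re( J^dagger * CF * J ), with J = jamp[ncolor].
  void colorSumToy( std::vector<double>& allMEs,         // [nevt]
                    const std::vector<cxtype>& allJamps, // [nevt*ncolor], icol fastest
                    const std::vector<double>& cfmat,    // [ncolor*ncolor], symmetric
                    int ncolor )
  {
    const int nevt = (int)allMEs.size();
    for( int ievt = 0; ievt < nevt; ievt++ )
    {
      const cxtype* jamp = &allJamps[ievt * ncolor];
      double me = 0;
      for( int i = 0; i < ncolor; i++ )
      {
        cxtype ztemp = 0; // one row of T = CF * J; a GEMM computes all rows and all events at once
        for( int j = 0; j < ncolor; j++ ) ztemp += cfmat[i * ncolor + j] * jamp[j];
        me += std::real( std::conj( jamp[i] ) * ztemp );
      }
      allMEs[ievt] += me;
    }
  }

Because CF is the same for every event, the inner product over j for all i and all events is a single ncolor-by-ncolor times ncolor-by-nevt matrix product, which is exactly the shape a cuBLAS/hipBLAS GEMM accelerates.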
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cuts.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cuts.f
index 7898714201..bd50ab1357 100644
--- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cuts.f
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cuts.f
@@ -307,12 +307,18 @@ LOGICAL FUNCTION PASSCUTS(P, VECSIZE_USED)
 c
 c     Limit S_hat
 c
-      if (dsqrt_shat.ne.0d0)then
-         if (nincoming.eq.2.and.sumdot(p(0,1),p(0,2),1d0) .lt. dsqrt_shat**2) then
-            passcuts=.false.
-            return
-         endif
-      endif
+      if(nincoming.eq.2) then
+         if (dsqrt_shat.ne.0d0.or.dsqrt_shatmax.ne.-1d0)then
+            xvar = sumdot(p(0,1),p(0,2),1d0)
+            if (xvar .lt. dsqrt_shat**2)then
+               passcuts=.false.
+               return
+            else if (dsqrt_shatmax.ne.-1d0 .and. xvar .gt. dsqrt_shatmax**2)then
+               passcuts = .false.
+               return
+            endif
+         endif
+      endif
 C     $B$ DESACTIVATE_CUT $E$ !This is a tag for MadWeight
       if(debug) write (*,*) '============================='
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/diagram_boilerplate.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/diagram_boilerplate.h
new file mode 100644
index 0000000000..96a34fb1bf
--- /dev/null
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/diagram_boilerplate.h
@@ -0,0 +1,103 @@
+// Copyright (C) 2020-2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: A. Valassi (Nov 2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin.
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-local-typedefs" // for CI_ACCESS and CD_ACCESS
+
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+
+  //-------------
+  // GPU only
+  //-------------
+
+  //using namespace mg5amcGpu;
+  using W_ACCESS = DeviceAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events
+  using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event
+  using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events
+  using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event
+  using J_ACCESS = DeviceAccessJamp;
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events
+  using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events
+  // SCALAR channelId for the current event (CUDA)
+  unsigned int channelId = gpu_channelId( channelIds );
+#endif
+
+  // Wavefunctions
+  // Buffer wfs for one helicity and nevt events is a DeviceBufferSimple with ( nwf * nevt * nw6 * nx2 ) fptypes
+  // The striding between the nwf wavefunction buffers is ( nevt * nw6 * nx2 ) fptypes
+  // Internally diagramXXX methods pass a w_fp[iwf] to ixx/FFV methods (as argument 'fptype wavefunctions[]')
+  // Internally ixx/FFV methods call 'cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions )' and then use fi[iw6]
+  // This means that the fi pointer must point to a [RIRIRIRIRIRI] contiguous buffer of size nw6*nx2=12
+  // The striding between events is nw6*nx2=12 and this is what W_ACCESS::kernelAccess must respect
+  // (En passant, note that this means that events cannot be contiguous in the present code, memory is not coalesced)
+  const int nevt = gridDim.x * blockDim.x;
+  fptype* w_fp[nwf];
+  for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = wfs + iwf * nevt * nw6 * mgOnGpu::nx2;
+
+  // Couplings
+  constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup)
+  const fptype* allCOUPs[nxcoup];
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
+#pragma nv_diagnostic push
+#pragma nv_diag_suppress 186 // e.g. <<warning #186-D: pointless comparison of unsigned integer with zero>>
+#endif
+  // Dependent couplings, vary event-by-event
+  for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
+    allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup );
+  // Independent couplings, fixed for all events
+  for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup)
+    allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup );
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
+#pragma nv_diagnostic pop
+#endif
+  const fptype* COUPs[nxcoup];
+  for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
+
+#else
+
+  //-------------
+  // C++ only
+  //-------------
+
+  //using namespace mg5amcCpu;
+  using W_ACCESS = HostAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events
+  using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event
+  using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events
+  using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event
+  using J_ACCESS = HostAccessJamp;
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events
+  using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events
+  // SCALAR channelId for the current SIMD event page (C++)
+  unsigned int channelId = *channelIds;
+#endif
+
+  // Wavefunctions
+  // Reinterpret wfs as "cxtype_sv w_sv[nwf][nw6]" and build "fptype* w_fp[nwf]" where "w_fp[iwf] = (fptype*)( w_sv[iwf] )"
+  fptype (*w_fp)[nw6 * neppV * mgOnGpu::nx2] = (fptype (*)[nw6 * neppV * mgOnGpu::nx2])(wfs);
+
+#endif
+
+  //-------------
+  // GPU or C++
+  //-------------
+
+  // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV)
+  cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram
+  fptype* amp_fp; // proof of concept for using fptype* in the interface
+  amp_fp = reinterpret_cast<fptype*>( amp_sv );
+
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+  // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
+  fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+  fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
+#else
+  assert( channelIds == nullptr );
+  assert( numerators == nullptr );
+  assert( denominators == nullptr );
+#endif /* clang-format on */
+
+#pragma GCC diagnostic pop
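The genps.f hunk below clamps smax and TAUMAX so that phase space is only generated inside the new upper sqrt(s-hat) limit, while the cuts.f hunk earlier rejects any event that still escapes the window. A sketch of the combined acceptance logic, assuming the convention visible in these hunks that dsqrt_shatmax = -1 means "no upper limit" (passShatWindow and its arguments are illustrative names, not the Fortran variables):

  // Accept an event with invariant mass squared shat if it lies inside
  // the [sqrtShatMin^2, sqrtShatMax^2] window; sqrtShatMax = -1 disables
  // the upper cut, mirroring the dsqrt_shatmax convention above.
  bool passShatWindow( double shat, double sqrtShatMin, double sqrtShatMax )
  {
    if( shat < sqrtShatMin * sqrtShatMin ) return false;                       // below the lower cut
    if( sqrtShatMax != -1. && shat > sqrtShatMax * sqrtShatMax ) return false; // above the upper cut
    return true;
  }

Restricting the sampling range in genps.f and re-checking in cuts.f keeps the two consistent while avoiding wasted phase-space points.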
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/genps.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/genps.f
index 1c32e93f5d..8503bdbec8 100644
--- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/genps.f
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/genps.f
@@ -1373,6 +1373,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass)
       double precision smin,smax,spole,swidth,s,jac
       double precision x
       logical pass
+      include 'maxparticles.inc'
+      include '../../Source/vector.inc'
+      include 'run.inc'
+      include 'cuts.inc'
 c
 c     Local
 c
@@ -1384,6 +1388,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass)
 c-----
 c  Begin Code
 c-----
+      if (dsqrt_shatmax.ne.-1d0)then
+         smax = min(smax, dsqrt_shatmax**2)
+      endif
+
       pass=.true.
       if (jac .eq. 0 .and. .not. warned0) then
          print*,'Input jacobian 0 in genps'
@@ -1628,7 +1636,10 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI)
       DOUBLE PRECISION ETA,ETAMIN,ETAMAX
       logical warned
       data warned/.false./
-
+      include 'maxparticles.inc'
+      include '../../Source/vector.inc'
+      include 'run.inc'
+      include 'cuts.inc'
 C------------
 C  BEGIN CODE
 C------------
@@ -1645,7 +1656,11 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI)
 C     IF THERE IS NO S CHANNEL POLE USE BELOW:
       TAUMIN = 0d0 !SMIN/S !keep scale fix
-      TAUMAX = 1D0
+      if (dsqrt_shatmax.ne.-1d0)then
+         TAUMAX=dsqrt_shatmax**2/S
+      else
+         TAUMAX = 1D0
+      endif
       TAU = (TAUMAX-TAUMIN)*X(1)+TAUMIN
       SJACOBI= sjacobi*(TAUMAX-TAUMIN)
@@ -1915,7 +1930,7 @@ double precision function get_channel_cut(p, config)
          if(sde_strat.eq.2)then
            t = dot(ptemp(0,-i), ptemp(0,-i))
            Mass = prmass(-i, config)
-           get_channel_cut = get_channel_cut / ((t-Mass)*(t+Mass)+stot*1d-10)**2
+           get_channel_cut = get_channel_cut / (t-Mass**2+stot*1d-10)**2
          endif
c          write(*,*) i, "t, Mass, fact", t, Mass, ((t-Mass)*(t+Mass))**2,get_channel_cut
          t = t/stot
@@ -1930,9 +1945,9 @@ double precision function get_channel_cut(p, config)
            t = dot(ptemp(0,-i), ptemp(0,-i))
            Mass = prmass(-i, config)
            Width = prwidth(-i, config)
-           tmp = (t-Mass)*(t+Mass)
+           tmp = (t-Mass**2)
            tmp2 = Mass*Width
-           get_channel_cut = get_channel_cut* (tmp**2 - tmp2**2)/(tmp**2 + tmp2**2)**2
+           get_channel_cut = get_channel_cut/(tmp**2 + tmp2**2)
          endif
c          write(*,*) i, "s, Mass, Width, fact", t, Mass, Width, (((t-Mass)*(t+Mass) )**2 + Width**2*Mass**2), get_channel_cut
        endif
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/myamp.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/myamp.f
index 9e5f8d44dd..5360566ef4 100644
--- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/myamp.f
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/myamp.f
@@ -231,6 +231,7 @@ subroutine set_peaks
       double precision x1,x2,xk(nexternal)
       double precision dr,mtot,etot,xqfact
       double precision spmass
+      double precision stot ! technically the min of dsqrt_shatmax**2 and the physical stot
       integer i, iconfig, l1, l2, j, nt, nbw, iproc, k
       integer iden_part(-nexternal+1:nexternal)
@@ -285,8 +286,8 @@ subroutine set_peaks
       integer lbw(0:nexternal)  !Use of B.W.
       common /to_BW/ lbw
-      double precision stot,m1,m2
-      common/to_stot/stot,m1,m2
+      double precision real_stot,m1,m2
+      common/to_stot/real_stot,m1,m2
       include 'coupl.inc' !
needs VECSIZE_MEMMAX (defined in vector.inc) include 'cuts.inc' @@ -309,6 +310,12 @@ subroutine set_peaks c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1)then + stot = min(real_stot, dsqrt_shatmax**2) + else + stot = real_stot + endif + iconfig = this_config c needs to be initialise to avoid segfault do i = -nexternal,-1 diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/reweight.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/reweight.f index 0a0bafa7c1..189ba41449 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/reweight.f @@ -1186,7 +1186,7 @@ logical function setclscales(p, keepq2bck, ivec) if(nexternal.gt.3) pt2ijcl(nexternal-3)=q2fact(2) else if(.not.fixed_fac_scale1) q2fact(1)=scalefact**2*pt2ijcl(nexternal-2) - if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*q2fact(1) + if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*pt2ijcl(nexternal-2) endif elseif(jcentral(1).eq.0)then if(.not.fixed_fac_scale1) q2fact(1) = scalefact**2*pt2ijcl(jfirst(1)) diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/runTest.cc b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/runTest.cc index 4eec5db13c..216a90a302 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/symmetry.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/symmetry.f index 309540a0a2..2afd9e9f75 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/symmetry.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/symmetry.f @@ -232,7 +232,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, c write(*,*) 'mapping',ic,mapconfig(i),icode if (icode .eq. 
0) then c Create format string based on number of digits - write(formstr,'(a,i1,a)') '(I',nconf,'$)' + write(formstr,'(a,i1,a)') '(I',nconf,',$)' write(*,formstr) mapconfig(i) c Write symmetry factors write(formstr2,'(a,i2,a)') '(2i',nsym,')' @@ -242,10 +242,10 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode if(nconf+ncode+1.lt.10) then write(formstr,'(a,i1,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' else write(formstr,'(a,i2,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' endif write(*,formstr) dconfig c Write symmetry factors @@ -260,7 +260,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode write(27,formstr2) dconfig,use_config(i) endif - write(*,'(a$)') ' ' + write(*,'(a,$)') ' ' 100 call bw_increment_array(iarray,imax,ibase,done) enddo else diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/unwgt.f b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/unwgt.f index f602511c94..d1247f1849 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/unwgt.f +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/unwgt.f @@ -497,6 +497,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer ip, np, ic, nc integer ida(2),ito(-nexternal+3:nexternal),ns,nres,ires,icloop integer iseed + double precision beam_mass double precision pboost(0:3) double precision beta, get_betaz double precision ebi(0:3), ebo(0:3) @@ -506,7 +507,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer idup(nexternal,maxproc,maxsproc) integer mothup(2,nexternal) integer icolup(2,nexternal,maxflow,maxsproc) - + double precision eta integer nsym integer ievent @@ -638,21 +639,20 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) if (nincoming.eq.2) then if (xbk(1) .gt. 0d0 .and. xbk(1) .le. 1d0 .and. $ xbk(2) .gt. 0d0 .and. xbk(2) .le. 1d0) then - if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0).and.xbk(2).ne.1d0) then - ! construct the beam momenta in each frame and compute the related (z)boost - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4).and.ebeam(1).gt.10d0*m1)then - local_mass = 0d0 - else - local_mass = m1 - endif + if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0)) then + if((abs(lpp(1)).gt.2.and.abs(lpp(1)).ne.9).or.xbk(1).eq.1d0)then + beam_mass = pmass(1) + else + beam_mass = m1 + endif ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(1) ebo(1) = 0 ebo(2) = 0 - ebo(3) = DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(1).eq.1d0) then pb(0,isym(1,jsym)) = ebo(0) @@ -668,20 +668,19 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo else - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4.and.ebeam(2).gt.10d0*m2))then - local_mass = 0d0 - else - local_mass = m2 - endif - ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam + if((abs(lpp(2)).gt.2.and.abs(lpp(2)).ne.9).or.xbk(2).eq.1d0)then + beam_mass = pmass(2) + else + beam_mass = m2 + endif ebi(0) = p(0,2)/xbk(2) ! 
this assumes that particle 2 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = -1d0*DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = -1d0*DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(2) ebo(1) = 0 ebo(2) = 0 - ebo(3) = -1d0*DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = -1d0*DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(2).eq.1d0) then pb(0,isym(2,jsym)) = ebo(0) @@ -701,6 +700,21 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) write(*,*) 'Warning bad x1 or x2 in write_leshouche', $ xbk(1),xbk(2) endif + do j=1,nexternal + call zboost_with_beta(p(0,j),beta,pb(0,isym(j,jsym))) + pb(4,isym(j,jsym))=pmass(j) + enddo + + ! check for numerical_accuracy + if (pb(0,1).gt.ebeam(1).or.pb(0,2).gt.ebeam(2))then + ! go back to old method --more accurate when boosting with xbk close to one-- + eta = sqrt(xbk(1)*ebeam(1)/(xbk(2)*ebeam(2))) + pboost(0)=p(0,1)*(eta + 1d0/eta) + pboost(3)=p(0,1)*(eta - 1d0/eta) + do j=1,nexternal + call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) + enddo + endif else do j=1,nexternal call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) @@ -709,6 +723,8 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo endif + + if (IMIRROR.eq.2.and.pmass(1).ne.pmass(2)) then c Note that in this context isym(1,jsym) should never be "2" since the mass differ pb(4,isym(1,jsym))=pmass(2) diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/Gridpack/gridrun b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/Gridpack/gridrun index 8c8f7d3940..01d4ab53f5 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/Gridpack/gridrun +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/Gridpack/gridrun @@ -91,7 +91,7 @@ import internal.madevent_interface as cmd_interface try: - cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2]) + cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2], nprocs=args[3], maxevts=args[4]) except KeyboardInterrupt: print('Quit on KeyboardInterrupt') diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/Gridpack/run.sh b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/Gridpack/run.sh index 20adf572c2..2d149f96be 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/Gridpack/run.sh +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/Gridpack/run.sh @@ -14,6 +14,18 @@ # USAGE : run [num_events] [iseed] ## ############################################################################# +function usage() { + local retcode="${1:-1}" # default return code is 1 + echo "Usage:" + echo " run.sh [options] [num events] [seed]" + echo " run.sh [options] [num events] [seed] [granularity]" + echo "Options:" + echo " -h, --help print this message and exit" + echo " -p, --parallel [num procs] number of processes to run in parallel" + echo " -m, --maxevts [num events] maximum number of unweighted events per job" + exit $retcode +} + if [[ -d ./madevent ]]; then DIR='./madevent' else @@ -32,23 +44,46 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib # For Mac OS X export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib +pos_args=() +nprocs=1 +maxevts=2500 -if [[ ($1 != "") && ("$2" != "") && ("$3" == "") ]]; then - num_events=$1 - seed=$2 - gran=1 -elif [[ ($1 != "") && ("$2" != "") && ("$3" != "") ]]; then - num_events=$1 - seed=$2 - gran=$3 -else - echo "Warning: input is not correct. 
script requires two arguments: NB_EVENT SEED" -fi +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage 0 ;; + -p|--parallel) + nprocs="$2" && shift && shift ;; + -m|--maxevts) + maxevts="$2" && shift && shift ;; + -*) + echo "Error: Unknown option $1" && usage ;; + *) + pos_args+=("$1") && shift ;; + esac +done + +case `echo "${pos_args[@]}" | wc -w | tr -d " "` in + "2") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=1 + ;; + "3") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=${pos_args[2]} + ;; + *) + echo "Error: number of arguments is not correct" + usage + ;; +esac -echo "Now generating $num_events events with random seed $seed and granularity $gran" +echo "Now generating $num_events events with random seed $seed and granularity $gran using $nprocs processes" ############ RUN THE PYTHON CODE ##################### -${DIR}/bin/gridrun $num_events $seed $gran +${DIR}/bin/gridrun $num_events $seed $gran $nprocs $maxevts ######################################################## ########### POSTPROCESSING ##################### diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/banner.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/banner.py index 42d82818d0..2efb5954a6 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/banner.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/banner.py @@ -353,7 +353,7 @@ def modify_init_cross(self, cross, allow_zero=False): assert "init" in self cross = dict(cross) - for key in cross.keys(): + for key in list(cross.keys()): if isinstance(key, str) and key.isdigit() and int(key) not in cross: cross[int(key)] = cross[key] @@ -1991,6 +1991,11 @@ def default_setup(self): self.add_param("PartonLevel:FSRinResonances", True, hidden=True, always_write_to_card=False, comment="Do not allow shower to run from decay product of unstable particle") self.add_param("ProcessLevel:resonanceDecays", True, hidden=True, always_write_to_card=False, comment="Do not allow unstable particle to decay.") + # Parameters only needed for main164 type of run (not pythia8/MG5 interface) + self.add_param("Main:HepMC", True, hidden=True, always_write_to_card=False, + comment="""Specify the type of output to be used by the main164 run. """) + self.add_param("HepMC:output", 'hepmc.gz', hidden=True, always_write_to_card=False, + comment="Specify the HepMC output file to be used by the main164 run.") # Add parameters controlling the subruns execution flow. # These parameters should not be part of PY8SubRun daughter. self.add_default_subruns('parameters') @@ -2087,8 +2092,10 @@ def MadGraphSet(self, name, value, **opts): force = False if name.lower() not in self or (force or name.lower() not in self.user_set): self.__setitem__(name, value, change_userdefine=False, **opts) - self.system_set.add(name.lower()) - + self.system_set.add(name.lower()) + else: + raise Exception("The parameter %s is already set to %s. You can not change it." 
% (name, self[name])) + def defaultSet(self, name, value, **opts): self.__setitem__(name, value, change_userdefine=False, **opts) @@ -2144,9 +2151,19 @@ def pythia8_formatting(value, formatv=None): else: return ','.join([PY8Card.pythia8_formatting(arg) for arg in value]) + #change of name convention between MG5 old interface and main164 from Pythia8 + interface_to_164 = {'HEPMCoutput:file': 'HepMC:output', + 'SysCalc:fullCutVariation': '!SysCalc:fullCutVariation (not supported with 164)', + 'SysCalc:qCutList': '!SysCalc:qCutList (not supported with 164)', + 'SysCalc:qWeed': '!SysCalc:qWeed (not supported with 164)', + 'SysCalc:tmsList': '!SysCalc:tmsList (not supported with 164)', + 'HEPMCoutput:scaling' : '!HEPMCoutput :scaling (not supported with 164)', + 'LHEFInputs:nSubruns' : 'Main:numberOfSubruns'} + def write(self, output_file, template, read_subrun=False, - print_only_visible=False, direct_pythia_input=False, add_missing=True): + print_only_visible=False, direct_pythia_input=False, add_missing=True, + use_mg5amc_py8_interface=False): """ Write the card to output_file using a specific template. > 'print_only_visible' specifies whether or not the hidden parameters should be written out if they are in the hidden_params_to_always_write @@ -2155,7 +2172,12 @@ def write(self, output_file, template, read_subrun=False, in the self.visible_params_to_always_write list and are not user_set or system_set are commented. > If 'add_missing' is False then parameters that should be written_out but are absent - from the template will not be written out.""" + from the template will not be written out. + > use_mg5amc_py8_interface is a flag to indicate that the MG5aMC-PY8 interface is used or not + if not used some parameters need to be translated from the old convention to the new one + """ + + self.use_mg5amc_py8_interface = use_mg5amc_py8_interface # First list the visible parameters visible_param = [p for p in self if p.lower() not in self.hidden_param @@ -2297,7 +2319,16 @@ def group_params(params): else: # Just copy parameters which don't need to be specified if param.lower() not in self.params_to_never_write: - output.write(line) + + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param.strip()] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + output.write('%s=%s\n'%(param_entry,new_value)) + else: + output.write(line) else: output.write('! The following parameter was forced to be commented out by MG5aMC.\n') output.write('! 
%s'%line) @@ -2313,6 +2344,7 @@ def group_params(params): if ((not direct_pythia_input) or (param.lower() in self.visible_params_to_always_write) or (param.lower() in self.user_set) or + (param.lower() in self.hidden_params_to_always_write) or (param.lower() in self.system_set)): template = '%s=%s' else: @@ -2321,6 +2353,19 @@ def group_params(params): # then they shouldn't be passed to Pythia template = '!%s=%s' + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + if 'Main:InternalAnalysis'.lower() in self.user_set and \ + self['Main:InternalAnalysis'].lower() == 'on': + output.write('InternalAnalysis:output = ./djrs.dat\n') + + #elif param in self.interface_to_164.values() and not direct_pythia_input: + # misc.sprint(use_mg5amc_py8_interface, direct_pythia_input,param) + # raise Exception('The parameter %s is not supported in the MG5aMC-PY8 interface. Please use the new interface.'%param_entry output.write(template%(param_entry, value_entry.replace(value,new_value))) @@ -2365,6 +2410,8 @@ def group_params(params): comment = '\n'.join('! %s'%c for c in self.comments[param.lower()].split('\n')) output.write(comment+'\n') + if not use_mg5amc_py8_interface and param in self.interface_to_164: + continue output.write('%s=%s\n'%(param,PY8Card.pythia8_formatting(self[param]))) # Don't close the file if we were reading a subrun, but simply write @@ -3318,7 +3365,6 @@ def retro_compatible_custom_fct(lines, mode=None): for i,line in enumerate(lines[:]): if search and re.search(include_pat, line): name = re.findall(include_pat, line)[0] - misc.sprint('DETECTED INCLUDE', name) if 'vector.inc' in name: search = False if 'run.inc' in name: @@ -3326,7 +3372,6 @@ def retro_compatible_custom_fct(lines, mode=None): search = False sol.append(line) if re.search(function_pat, line): - misc.sprint("DETECTED FCT") search = True return sol @@ -4050,8 +4095,8 @@ def post_set_fixed_fac_scale(card, value, change_userdefine, raiseerror, **opt): if 'fixed_fac_scale2' in card.user_set: card.user_set.remove('fixed_fac_scale2') - # #card['pdlabel1'] = value - # #card['pdlabel2'] = value + dict.__setitem__(card, 'fixed_fac_scale1', card['fixed_fac_scale']) + dict.__setitem__(card, 'fixed_fac_scale2', card['fixed_fac_scale']) @staticmethod def post_set(card, value, change_userdefine, raiseerror, name='unknown', **opt): @@ -4201,6 +4246,7 @@ def default_setup(self): self.add_param("bwcutoff", 15.0) self.add_param("cut_decays", False, cut='d') self.add_param('dsqrt_shat',0., cut=True) + self.add_param('dsqrt_shatmax', -1, cut=True) self.add_param("nhel", 0, include=False) self.add_param("limhel", 1e-8, hidden=True, comment="threshold to determine if an helicity contributes when not MC over helicity.") #pt cut @@ -4451,11 +4497,11 @@ def check_validity(self): time.sleep(5) if self['drjj'] != 0: if 'drjj' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjj\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjj\' to 0') self['drjj'] = 0 if self['drjl'] != 0: if 'drjl' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjl\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjl\' to 0') self['drjl'] = 0 if not self['auto_ptj_mjj']: if self['mmjj'] > self['xqcut']: @@ -4753,7 +4799,6 @@ def create_default_for_process(self, 
proc_characteristic, history, proc_def): self['fixed_fac_scale1'] = True self['nhel'] = 1 for i in beam_id_split[1]: - exit if abs(i) == 11: self['lpp1'] = -math.copysign(3,i) self['lpp2'] = math.copysign(3,i) @@ -5577,6 +5622,9 @@ def default_setup(self): #technical self.add_param('folding', [1,1,1], include=False) + + #bias + self.add_param('flavour_bias',[5,1], hidden=True, comment="Example: '5,100' means that the probability to generate an event with a bottom (or anti-bottom) quark is increased by a factor 100, but the weight of those events is reduced by a factor 100. Requires that the 'event_norm' is set to 'bias'.") #merging self.add_param('ickkw', 0, allowed=[-1,0,3,4], comment=" - 0: No merging\n - 3: FxFx Merging : http://amcatnlo.cern.ch/FxFx_merging.htm\n - 4: UNLOPS merging (No interface within MG5aMC)\n - -1: NNLL+NLO jet-veto computation. See arxiv:1412.8408 [hep-ph]") @@ -5790,6 +5838,17 @@ def check_validity(self): if self['mcatnlo_delta'] and not self['parton_shower'].lower() == 'pythia8': raise InvalidRunCard("MC@NLO-DELTA only possible with matching to Pythia8") + # check that the flavour_bias is consistent + if len(self['flavour_bias']) != 2: + raise InvalidRunCard("'flavour_bias' should contain exactly two numbers: the abs(PDG) of the flavour to enhance, and the enhancement multiplication factor.") + for i in self['flavour_bias']: + if i < 0: + raise InvalidRunCard("flavour and multiplication factor should be positive in the flavour_bias parameter") + if self['flavour_bias'][1] != 1 and self['event_norm'] != 'bias': + logger.warning('Non-trivial flavour enhancement factor: setting event normalisation to "bias"') + self['event_norm']='bias' + + # check that ebeam is bigger than the proton mass. for i in [1,2]: # do not for proton mass if not proton PDF (or when scan initialization) diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/check_param_card.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/check_param_card.py index bc785b5de6..a34705f6bc 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/check_param_card.py @@ -1092,11 +1092,11 @@ def write_summary(self, path, order=None, lastline=False, nbcol=20): to_print = self.cross[-1:] for info in to_print: name = info['run_name'] - bench = info['bench'] + bench = [float(x) for x in info['bench']] data = [] for k in keys: if k in info: - data.append(info[k]) + data.append(float(info[k])) else: data.append(0.) 
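Note on the write_summary hunk here: the patch casts the stored 'bench' entries and per-key values to float before the numeric format string is applied in the ff.write call that follows. A minimal sketch of why the casts are needed; the run name and values below are hypothetical:

    # Values read back from the results record may be strings, not floats.
    bench = ["1.23e-2", "4.56e-3"]
    data = [0.]
    formatting = "%-20s %-15.6e %-15.6e %-15.6e"
    # "%e" applied to a str raises "TypeError: must be real number, not str",
    # hence the explicit float() casts added in write_summary.
    print(formatting % tuple(["run_01"] + [float(x) for x in bench] + data))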
ff.write(formatting % tuple([name] + bench + data)) diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/cluster.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/cluster.py index 95ef45b5f3..66069293d2 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/cluster.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/cluster.py @@ -602,6 +602,7 @@ def __init__(self, *args, **opt): self.submitted = six.moves.queue.Queue() # one entry by job submitted self.stoprequest = threading.Event() #flag to ensure everything to close self.demons = [] + self.gpus_list = [] self.nb_done =0 if 'nb_core' in opt: self.nb_core = opt['nb_core'] @@ -623,23 +624,46 @@ def __init__(self, *args, **opt): self.done_pid_queue = six.moves.queue.Queue() self.fail_msg = None + mg5_gpu_env_str = 'MG5_GPU_VISIBLE_DEVICES' + gpu_variables = [['NVIDIA_VISIBLE_DEVICES', 'CUDA_VISIBLE_DEVICES'], + ['ROCR_VISIBLE_DEVICES', 'HIP_VISIBLE_DEVICES'],] + if mg5_gpu_env_str in os.environ: + new_var = os.environ[mg5_gpu_env_str].split(',') + if len(new_var) == 2: + gpu_variables.insert(0, new_var) + else: + logger.error('Invalid format for %s=%s, it should be a comma-separated list of two elements' % (mg5_gpu_env_str, os.environ[mg5_gpu_env_str])) + for get_var,set_var in gpu_variables: + if get_var in os.environ: + self.gpus_list = os.environ.get(get_var).split(',') + self.gpu_set_var = set_var + self.gpus_count = len(self.gpus_list) + logger.info('Found %s GPUs: %s' % (self.gpus_count, self.gpus_list)) def start_demon(self): import threading - t = threading.Thread(target=self.worker) + env2 = None + if len(self.gpus_list): + env2 = os.environ.copy() + this_gpu_idx = len(self.demons) % self.gpus_count + env2[self.gpu_set_var] = self.gpus_list[this_gpu_idx] + t = threading.Thread(target=self.worker, kwargs={'env2': env2}) + else: + t = threading.Thread(target=self.worker) t.daemon = True t.start() self.demons.append(t) - def worker(self): + def worker(self, env2=None): import six.moves.queue import six.moves._thread while not self.stoprequest.isSet(): try: args = self.queue.get(timeout=10) tag, exe, arg, opt = args + opt['env'] = env2 try: # check for executable case if isinstance(exe,str): diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/common_run_interface.py index 9ff7390cf5..69291df0d4 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/common_run_interface.py @@ -750,8 +750,8 @@ def __init__(self, me_dir, options, *args, **opts): else: self.ninitial = self.proc_characteristics['ninitial'] - def make_make_all_html_results(self, folder_names = [], jobs=[]): - return sum_html.make_all_html_results(self, folder_names, jobs) + def make_make_all_html_results(self, folder_names = [], jobs=[], get_attr=None): + return sum_html.make_all_html_results(self, folder_names, jobs, get_attr) def write_RunWeb(self, me_dir): @@ -1463,11 +1463,15 @@ def create_plot(self, mode='parton', event_path=None, output=None, tag=None): self.run_name, '%s_pts.dat' % tag) for observable_name, data_path in [('djr',djr_path), ('pt',pt_path)]: - if not self.generate_Pythia8_HwU_plots( + try: + if not self.generate_Pythia8_HwU_plots( PY8_plots_root_path, merging_scale_name, observable_name,data_path): - return False - + return False + except Exception as error: + if os.path.exists(data_path): + logger.info('plot information present in %s' % data_path) + return True if mode == 'Pythia8': 
plot_files = glob.glob(pjoin(PY8_plots_root_path,'*.gnuplot')) if not misc.which('gnuplot'): @@ -1964,12 +1968,16 @@ def do_systematics(self, line): self.cluster.wait(os.path.dirname(output), update_status, update_first=update_status) except Exception: self.cluster.remove() + for i in range(nb_submit): + os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) old_run_mode = self.options['run_mode'] self.options['run_mode'] =0 + out =False try: out = self.do_systematics(line) finally: self.options['run_mode'] = old_run_mode + return out #collect the data all_cross = [] for i in range(nb_submit): @@ -1995,18 +2003,21 @@ def do_systematics(self, line): self.run_card['event_norm'] in ['unity']: all_cross= [cross/nb_event for cross in all_cross] - sys_obj = systematics.call_systematics([input, None] + opts, - log=lambda x: logger.info(str(x)), - result=result_file, - running=False - ) + + sys_obj = systematics.call_systematics([input, None] + opts, + log=lambda x: logger.info(str(x)), + result=result_file, + running=False + ) + sys_obj.print_cross_sections(all_cross, nb_event, result_file) - + #concatenate the output file subprocess.call(['cat']+\ ['./tmp_%s_%s' % (i, os.path.basename(output)) for i in range(nb_submit)], stdout=open(output,'w'), cwd=os.path.dirname(output)) + for i in range(nb_submit): os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) # os.remove('%s/log_sys_%s.txt' % (os.path.dirname(output),i)) @@ -3831,7 +3842,7 @@ def store_scan_result(self): """return the information that need to be kept for the scan summary. Auto-width are automatically added.""" - return {'cross': self.results.current['cross']} + return {'cross': self.results.current['cross'], 'error': self.results.current['error']} def add_error_log_in_html(self, errortype=None): @@ -5135,10 +5146,10 @@ def init_run(self, cards): self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), - 'lhc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), - 'lcc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), @@ -6740,7 +6751,15 @@ def postcmd(self, stop, line): return ending_question - + def help_update(self): + logger.info(""" syntax: update dependent: Change the mass/width of particles which are not free parameter for the model. + update missing: add to the current param_card missing blocks/parameters. + update to_slha1: pass SLHA2 card to SLHA1 convention. (beta) + update to_slha2: pass SLHA1 card to SLHA2 convention. 
(beta) + update to_full [run_card] + update XXX [where XXX correspond to a hidden block of the run_card]: + supported block are %s + """, ', '.join(self.update_block)) def do_update(self, line, timer=0): @@ -6756,6 +6775,8 @@ def do_update(self, line, timer=0): logger.warning('miss an argument (dependent or missing). Please retry') return + args[0] = args[0].lower() + if args[0] == 'dependent': if not self.mother_interface: logger.warning('Failed to update dependent parameter. This might create trouble for external program (like MadSpin/shower/...)') @@ -6805,10 +6826,11 @@ def do_update(self, line, timer=0): self.modified_card.add('run') # delay writting of the run_card logger.info('add optional block %s to the run_card', args[0]) else: - self.help_update() + self.do_help('update') logger.warning('unvalid options for update command. Please retry') + def update_to_full(self, line): """ trigger via update to_full LINE""" diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/extended_cmd.py index 789976beee..c321fd88e5 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/extended_cmd.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/extended_cmd.py @@ -1317,6 +1317,8 @@ def nice_error_handling(self, error, line): debug_file = open(self.debug_output, 'a') traceback.print_exc(file=debug_file) + if __debug__: + traceback.print_exc() if hasattr(error, 'filename'): debug_file.write("Related File: %s\n" % error.filename) # Create a nice error output @@ -1928,7 +1930,8 @@ def do_display(self, line, output=sys.stdout): for i, name in enumerate(split): try: __import__('.'.join(split[:i+1])) - exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1]))) + tmp = {} + exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1])), globals(),tmp) except ImportError: try: var = eval(args[1]) @@ -1939,7 +1942,7 @@ def do_display(self, line, output=sys.stdout): outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) else: - var = eval(args[1]) + var = eval(args[1], globals(), tmp) outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/file_writers.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/file_writers.py index 526756129f..74ba0d195c 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/file_writers.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/file_writers.py @@ -140,10 +140,6 @@ def preprocess_template(self, input_lines, context={}): else: raise self.FileWriterError("%s not string" % repr(input_lines)) - # Setup the contextual environment - for contextual_variable, value in context.items(): - exec('%s=%s'%(str(contextual_variable),repr(value))) - res = [] # The variable below tracks the conditional statements structure if_stack = [] @@ -166,7 +162,7 @@ def preprocess_template(self, input_lines, context={}): # Treat an if statement elif preproc_command.group('command')=='if': try: - if_stack.append(eval(preproc_command.group('body'))==True) + if_stack.append(eval(preproc_command.group('body'), globals(), context)==True) except Exception as e: raise self.FilePreProcessingError('Could not evaluate'+\ "python expression '%s' given the context %s provided."%\ diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/files.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/files.py index 551b71ddb6..3061b007e7 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/files.py +++ 
b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/files.py @@ -147,9 +147,14 @@ def cp(path1, path2, log=True, error=False): path2 = format_path(path2) try: shutil.copy(path1, path2) + except shutil.Error as why: + logger.debug('no cp since identical: %s', why) + return except IOError as why: import madgraph.various.misc as misc try: + if 'same file' in str(why): + return if os.path.exists(path2): path2 = os.path.join(path2, os.path.split(path1)[1]) misc.copytree(path1, path2) @@ -157,12 +162,10 @@ def cp(path1, path2, log=True, error=False): if error: raise if log: - logger.warning(why) + logger.warning("fail to cp", path1, path2, why) else: - misc.sprint("fail to cp", why) - except shutil.Error: - # idetical file - pass + misc.sprint("fail to cp",path1,path2, why) + def rm(path, log=True): """removes path, that can be a single element or a list""" diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_cardhtml-pl b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_cardhtml-pl index 1810c6c082..6e0e06533d 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_cardhtml-pl +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_cardhtml-pl @@ -137,7 +137,7 @@ until($listpos>$#incard){ print PAGE " Model: $model \n"; print PAGE " \n \n
\n"; print PAGE " \n"; - print PAGE "\"\" \n"; + print PAGE "\"\" \n"; print PAGE "
\n"; print PAGE " \n \n \n"; print PAGE " \n"; diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_crossxhtml.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_crossxhtml.py index 681bf9d09b..3114a4350c 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_crossxhtml.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_crossxhtml.py @@ -133,7 +133,7 @@ class AllResults(dict): web = False - _run_entries = ['cross', 'error','nb_event_pythia','run_mode','run_statistics', + _run_entries = ['cross', 'error','axsec','nb_event_pythia','run_mode','run_statistics', 'nb_event','cross_pythia','error_pythia', 'nb_event_pythia8','cross_pythia8','error_pythia8', 'shower_dir'] diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_jpeg-pl b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_jpeg-pl index 87d03da394..31b7e9fe55 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_jpeg-pl +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_jpeg-pl @@ -1,16 +1,16 @@ #!/usr/bin/perl -w #--------------------------------------------------------------------- -# Run GS to create jpeg files defined as $gs +# Run GS to create PNG files defined as $gs #--------------------------------------------------------------------- -system("/bin/bash -c \"rm -f matrix*.jpg\" "); +system("/bin/bash -c \"rm -f matrix*.png\" "); $imatrix = ""; if (! -e "matrix.ps") {$imatrix = 1;} -$max_jpg = 2; -if ($imatrix eq "") {$max_jpg = 5;} -# add 1 to max_jpg, to get max_jpg pages -$max_jpg += 1; +$max_png = 2; +if ($imatrix eq "") {$max_png = 5;} +# add 1 to max_png, to get max_png pages +$max_png += 1; open(PAGE,"> diagrams.html") || die "Error creating diagrams.html"; print PAGE "\ \n"; print PAGE "\ \n"; @@ -21,22 +21,22 @@ while ( -e "matrix$imatrix.ps"){ open(IN, "< matrix$imatrix.ps") || die "No file matrix$imatrix.ps"; open(OUT, "> matrix-1.ps") || die "Could not open file matrix-1.ps"; while () { - if ($_ =~ m/^%%Page: $max_jpg $max_jpg/) {last;} + if ($_ =~ m/^%%Page: $max_png $max_png/) {last;} else {print OUT $_, "\n";} } close(OUT); close(IN); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=matrix$imatrix\%00d.jpg \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-r150 \-sOutputFile\=matrix$imatrix\%00d.png \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; system "rm -f matrix-1.ps"; -# Determine how many jpg files we have +# Determine how many png files we have $pages=1; - while(-e "matrix$imatrix$pages.jpg"){ + while(-e "matrix$imatrix$pages.png"){ $pages++; }#end of while #reduce it by one - if ($pages > $max_jpg){ + if ($pages > $max_png){ $pages -= 1; } # Find name of process @@ -45,24 +45,24 @@ while ( -e "matrix$imatrix.ps"){ if ($proc =~ /Process: (.+?)(\s\w+=\d+)*$/) { $proc = $1; } print PAGE "

$proc \n"; - if (-e "matrix$imatrix$max_jpg.jpg" ) { - print PAGE "<br><br> To save bandwidth not all diagrams were converted to jpeg."; + if (-e "matrix$imatrix$max_png.png" ) { + print PAGE "<br><br> To save bandwidth not all diagrams were converted to PNG."; print PAGE "<br><br>
To view all diagrams click on "; print PAGE "\ postscript. \<\/A\> \ \n"; # # Delete files which aren't included in diagrams.html # - system ("/bin/bash -c \"rm -f matrix$max_jpg.jpg\" "); + system ("/bin/bash -c \"rm -f matrix$max_png.png\" "); } # -# Now create jpeg file for card +# Now create PNG file for card # - if (! -e "../../HTML/card.jpg") { + if (! -e "../../HTML/card.png") { system ("/bin/bash -c \"head -352 matrix$imatrix.ps >& junk.ps\" "); open(JUNK,">> junk.ps") || die "Error opening junk.ps"; @@ -72,7 +72,7 @@ while ( -e "matrix$imatrix.ps"){ system ("/bin/bash -c \"cat matrix$imatrix.ps | sed 1,352d >> junk.ps\" "); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=card.jpg \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.jpg ../../HTML/card.jpg > /dev/null\" "; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-sOutputFile\=card.png \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.png ../../HTML/card.png > /dev/null\" "; } if ($imatrix eq "") {$imatrix = 0;} $imatrix = $imatrix + 1; @@ -82,3 +82,4 @@ print PAGE "\n"; print PAGE "\<\/BODY\> \n"; print PAGE "\<\/HTML\> \n"; close(PAGE); + diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_ximprove.py index 415ecc9de0..d5d7fc8faf 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/gen_ximprove.py @@ -30,6 +30,7 @@ import stat import sys import six +import time from six.moves import range from six.moves import zip @@ -304,6 +305,7 @@ def get_helicity(self, to_submit=True, clean=True): logger.debug('(%s) nb_hel: %s zero amp: %s bad_amps_hel: %s/%s', split_file[-1], len(good_hels),len(bad_amps),len(bad_amps_perhel), len(good_hels)*nb_amp ) if len(good_hels) == 1: files.cp(matrix_file, matrix_file.replace('orig','optim')) + files.cp(matrix_file.replace('.f','.o'), matrix_file.replace('orig','optim').replace('.f','.o')) continue # avoid optimization if onlye one helicity gauge = self.cmd.proc_characteristics['gauge'] @@ -1059,6 +1061,7 @@ def __init__(self, cmd, opt=None): # parameter for the gridpack run self.nreq = 2000 self.iseed = 4321 + self.maxevts = 2500 # placeholder for information self.results = 0 #updated in launch/update_html @@ -1200,6 +1203,10 @@ def reset_multijob(self): def write_multijob(self, Channel, nb_split): """ """ if nb_split <=1: + try: + os.remove(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat')) + except OSError: + pass return f = open(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat'), 'w') f.write('%i\n' % nb_split) @@ -1828,17 +1835,17 @@ class gen_ximprove_gridpack(gen_ximprove_v4): max_request_event = 1e12 # split jobs if a channel if it needs more than that max_event_in_iter = 4000 min_event_in_iter = 500 - combining_job = sys.maxsize gen_events_security = 1.00 - def __new__(cls, *args, **opts): + def __new__(cls, cmd, opts): cls.force_class = 'gridpack' - return super(gen_ximprove_gridpack, cls).__new__(cls, *args, **opts) + return super(gen_ximprove_gridpack, cls).__new__(cls, cmd, opts) - def __init__(self, *args, **opts): + def __init__(self, cmd, opts): self.ngran = -1 + self.nprocs = 1 self.gscalefact = {} self.readonly = False if 'ngran' in opts: @@ -1846,9 +1853,18 @@ def __init__(self, *args, **opts): # del opts['ngran'] if 'readonly' in opts: self.readonly = opts['readonly'] - super(gen_ximprove_gridpack,self).__init__(*args, **opts) 
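The gridpack refine logic continued below splits each channel into jobs of at most max_request_event unweighted events (set from the new maxevts option when nprocs > 1). A minimal sketch of the ceiling-division used in get_job_for_event below; the max_splitting cap exists in gen_ximprove, but its default value here is hypothetical:

    # One job per block of max_request_event events, capped by max_splitting.
    def n_split(needed_event, max_request_event, max_splitting=400):
        nb_split = max(1, (needed_event - 1) // max_request_event + 1)
        return min(nb_split, max_splitting)

    print(n_split(9000, 2500))  # -> 4 jobs of roughly 2250 events each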
+ if 'nprocs' in opts: + self.nprocs = int(opts['nprocs']) + if 'maxevts' in opts and self.nprocs > 1: + self.max_request_event = int(opts['maxevts']) + super(gen_ximprove_gridpack,self).__init__(cmd, opts) if self.ngran == -1: self.ngran = 1 + + if self.nprocs > 1: + self.combining_job = 0 + else: + self.combining_job = sys.maxsize def find_job_for_event(self): """return the list of channel that need to be improved""" @@ -1876,8 +1892,8 @@ def find_job_for_event(self): continue # no event to generate events self.gscalefact[tag] = max(1, 1/(goal_lum * C.get('axsec')/ self.ngran)) #need to generate events - logger.debug('request events for ', C.get('name'), 'cross=', - C.get('axsec'), 'needed events = ', goal_lum * C.get('axsec')) + logger.debug('request events for %s cross=%d needed events = %d', + C.get('name'), C.get('axsec'), goal_lum * C.get('axsec')) to_refine.append(C) logger.info('need to improve %s channels' % len(to_refine)) @@ -1897,8 +1913,13 @@ def get_job_for_event(self): for C in to_refine: #1. Compute the number of points are needed to reach target needed_event = max(goal_lum*C.get('axsec'), self.ngran) - nb_split = 1 - + nb_split = int(max(1,((needed_event-1)// self.max_request_event) +1)) + if not self.split_channels: + nb_split = 1 + if nb_split > self.max_splitting: + nb_split = self.max_splitting + nb_split=max(1, nb_split) + #2. estimate how many points we need in each iteration if C.get('nunwgt') > 0: nevents = needed_event / nb_split * (C.get('nevents') / C.get('nunwgt')) @@ -1908,13 +1929,16 @@ def get_job_for_event(self): nevents = self.max_event_in_iter if nevents < self.min_event_in_iter: + nb_split = int(nb_split * nevents / self.min_event_in_iter) + 1 # sr dangerous? nevents = self.min_event_in_iter # # forbid too low/too large value nevents = max(self.min_event_in_iter, min(self.max_event_in_iter, nevents)) logger.debug("%s : need %s event. Need %s split job of %s points", C.name, needed_event, nb_split, nevents) - + # write the multi-job information + self.write_multijob(C, nb_split) + #create the info dict assume no splitting for the default info = {'name': self.cmd.results.current['run_name'], 'script_name': 'unknown', @@ -1925,7 +1949,7 @@ def get_job_for_event(self): 'nevents': nevents, #int(nevents*self.gen_events_security)+1, 'maxiter': self.max_iter, 'miniter': self.min_iter, - 'precision': -1*int(needed_event)/C.get('axsec'), + 'precision': -goal_lum/nb_split, # -1*int(needed_event)/C.get('axsec'), 'requested_event': needed_event, 'nhel': self.run_card['nhel'], 'channel': C.name.replace('G',''), @@ -1938,27 +1962,59 @@ def get_job_for_event(self): basedir = pjoin(os.path.dirname(__file__), '..','..','SubProcesses', info['P_dir'], info['directory']) info['base_directory'] = basedir - jobs.append(info) - + if nb_split == 1: + jobs.append(info) + else: + for i in range(nb_split): + new_info = dict(info) + new_info['offset'] = i+1 + new_info['directory'] += self.alphabet[i % 26] + str((i+1)//26) + new_info['base_directory'] = info['directory'] + jobs.append(new_info) write_dir = '.' 
if self.readonly else None self.create_ajob(pjoin(self.me_dir, 'SubProcesses', 'refine.sh'), jobs, write_dir) + if self.nprocs > 1: + nprocs_cluster = cluster.MultiCore(nb_core=self.nprocs) + gridpack_start = time.time() + def gridpack_wait_monitoring(Idle, Running, Done): + if Idle+Running+Done == 0: + return + logger.info("Gridpack event generation: %s Idle, %s Running, %s Done [%s]" + % (Idle, Running, Done, misc.format_time(time.time()-gridpack_start))) + done = [] for j in jobs: - if j['P_dir'] in done: - continue - done.append(j['P_dir']) + if self.nprocs == 1: + if j['P_dir'] in done: + continue + done.append(j['P_dir']) + # Give a little status. Sometimes these jobs run very long, and having hours without any + # console output can be a bit frightening and make users think we are looping. + if len(done)%5==0: + logger.info(f"Working on job {len(done)} of {len(jobs)}") + # set the working directory path. pwd = pjoin(os.getcwd(),j['P_dir']) if self.readonly else pjoin(self.me_dir, 'SubProcesses', j['P_dir']) - exe = pjoin(pwd, 'ajob1') + exe = pjoin(pwd, j['script_name']) st = os.stat(exe) os.chmod(exe, st.st_mode | stat.S_IEXEC) # run the code\ - cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + if self.nprocs == 1: + cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + else: + nprocs_cluster.cluster_submit(exe, cwd=pwd, packet_member=j['packet']) write_dir = '.' if self.readonly else pjoin(self.me_dir, 'SubProcesses') + if self.nprocs > 1: + nprocs_cluster.wait(self.me_dir, gridpack_wait_monitoring) + + if self.readonly: + combine_runs.CombineRuns(write_dir) + else: + combine_runs.CombineRuns(self.me_dir) self.check_events(goal_lum, to_refine, jobs, write_dir) def check_events(self, goal_lum, to_refine, jobs, Sdir): diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/hel_recycle.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/hel_recycle.py index 1471de4bcb..978ba6575e 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/hel_recycle.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/hel_recycle.py @@ -550,7 +550,7 @@ def get_jamp_lines(self, line): def get_amp2_lines(self, line): if line.startswith(' DO I = 1, NCOLOR'): self.in_amp2 = False - elif not line.isspace(): + elif not line.isspace() and 'DENOM' not in line: self.template_dict['amp2_lines'] += f'{line[0:6]} {self.add_indices(line[6:])}' def prepare_bools(self): diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/histograms.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/histograms.py index 51ae2914fc..6fbdd98100 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/histograms.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/histograms.py @@ -1149,11 +1149,8 @@ def parse_one_histo_from_stream(self, stream, all_weight_header, boundaries = [0.0,0.0] for j, weight in \ enumerate(HwU.histo_bin_weight_re.finditer(line_bin)): - if (j == len(weight_header)): - continue - if j == len(all_weight_header): - raise HwU.ParseError("There is more bin weights"+\ - " specified than expected (%i)"%len(weight_header)) + #if (j == len(weight_header)): + # continue if selected_central_weight == all_weight_header[j]: bin_weights['central'] = float(weight.group('weight')) if all_weight_header[j] == 'boundary_xmin': @@ -1858,6 +1855,8 @@ def parse_histos_from_PY8_XML_stream(self, stream, run_id=None, # If merging cut is negative, then pick only the one of the central scale # If not specified, then take them all but use the PDF and scale weight # of the central 
merging_scale for the variation. + if not all_weights: + raise MadGraph5Error('No weights were found in the HwU XML source.') if merging_scale is None or merging_scale < 0.0: merging_scale_chosen = all_weights[2]['MERGING'] else: @@ -2480,14 +2479,14 @@ def get_main_central_plot_lines(HwU_name, block_position, color_index, # return [template_no_stat%rep_dic]+\ # ([template%rep_dic] if show_mc_uncertainties else []) - # The use of sqrt(-1) is just a trick to prevent the line to display + # The use of 1/0 is just a trick to prevent the line to display res = [] - rep_dic['data'] = '($3 < 0 ? sqrt(-1) : $3)' + rep_dic['data'] = '($3 < 0 ? 1/0 : $3)' res.append(template_no_stat%rep_dic) rep_dic['title'] = " title ''" if show_mc_uncertainties: res.append(template%rep_dic) - rep_dic['data'] = '($3 >= 0 ? sqrt(-1) : abs($3))' + rep_dic['data'] = '($3 >= 0 ? 1/0 : abs($3))' rep_dic['ls'] = ' ls %d'%(100+color_index) res.append(template_no_stat%rep_dic) if show_mc_uncertainties: @@ -2739,13 +2738,13 @@ def ratio_no_correlations(wgtsA, wgtsB): """#-- rendering subhistograms '%(subhistogram_type)s' %(unset label)s %(set_format_y)s +%(set_yscale)s set yrange [%(ymin).4e:%(ymax).4e] set origin %(origin_x).4e, %(origin_y).4e set size %(size_x).4e, %(size_y).4e set mytics %(mytics)d %(set_ytics)s %(set_format_x)s -%(set_yscale)s %(set_ylabel)s %(set_histo_label)s plot \\""" @@ -2878,7 +2877,7 @@ def ratio_no_correlations(wgtsA, wgtsB): # We decide to show uncertainties in the main plot only if they # are part of a monocolor band. Otherwise, they will only be - # shown in the first subplot. Notice that plotting 'sqrt(-1)' + # shown in the first subplot. Notice that plotting '1/0' # is just a trick so as to have only the key printed with no # line @@ -2890,7 +2889,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, scale variation'%title, band='scale' in use_band) else: uncertainty_plot_lines[-1]['scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] # And now PDF_variation if available if not PDF_var_pos is None and len(PDF_var_pos)>0: if 'pdf' in use_band: @@ -2899,7 +2898,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, PDF variation'%title, band='pdf' in use_band) else: uncertainty_plot_lines[-1]['pdf'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] # And now merging variation if available if not merging_var_pos is None and len(merging_var_pos)>0: if 'merging_scale' in use_band: @@ -2908,7 +2907,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, merging scale variation'%title, band='merging_scale' in use_band) else: uncertainty_plot_lines[-1]['merging_scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] # And now alpsfact variation if available if not alpsfact_var_pos is None and len(alpsfact_var_pos)>0: if 'alpsfact' in use_band: @@ -2917,7 +2916,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, alpsfact variation'%title, band='alpsfact' in use_band) else: uncertainty_plot_lines[-1]['alpsfact'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] # plot_lines.append( # "'%s' index %d using (($1+$2)/2):3 ls %d title '%s'"\ diff --git 
a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/launch_plugin.py index 0924927785..262d39a736 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/launch_plugin.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Aug 2023) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. import logging import os @@ -33,7 +33,7 @@ def compile(self, *args, **opts): if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') common_run_interface.CommonRunCmd.update_make_opts_full(path, - {'FPTYPE': self.run_card['floating_type'] }) + {'override FPTYPE': self.run_card['floating_type'] }) misc.sprint('FPTYPE checked') cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): @@ -76,7 +76,7 @@ def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + common_run_interface.CommonRunCmd.update_make_opts_full({'override FPTYPE': new_value}) else: raise Exception Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') @@ -133,7 +133,8 @@ def default_setup(self): super().default_setup() # change default value: self['cudacpp_backend'] = 'cuda' - self['vector_size'] = 16384 # already setup in default class (just change value) + self['vector_size'] = 32 # ZW: default to 32, might want to change to 64 to utilise AMD GPUs better as well # 16384 # already setup in default class (just change value) + self['nb_warp'] = 512 # number of warps per kernel call, for now setting to 16 384 / vector_size MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/lhe_parser.py index f6e47956cd..81d17f7cb1 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/lhe_parser.py @@ -1035,12 +1035,12 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): from_init = True if not from_init: - if group in grouped_cross: - grouped_cross[group] += self.allcross[i] - grouped_error[group] += self.error[i]**2 + if int(group) in grouped_cross: + grouped_cross[int(group)] += self.allcross[i] + grouped_error[int(group)] += self.error[i]**2 else: - grouped_cross[group] = self.allcross[i] - grouped_error[group] = self.error[i]**2 + grouped_cross[int(group)] = self.allcross[i] + grouped_error[int(group)] = self.error[i]**2 else: ban = banner_mod.Banner(ff.banner) for line in ban['init'].split('\n'): @@ -1048,11 +1048,11 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): if len(splitline)==4: cross, error, _, group = splitline if int(group) in grouped_cross: - grouped_cross[group] += float(cross) - grouped_error[group] += float(error)**2 + grouped_cross[int(group)] += float(cross) + grouped_error[int(group)] += 
float(error)**2 else: - grouped_cross[group] = float(cross) - grouped_error[group] = float(error)**2 + grouped_cross[int(group)] = float(cross) + grouped_error[int(group)] = float(error)**2 nb_group = len(grouped_cross) # compute the information for the first line @@ -1086,6 +1086,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): self.seek(0) if init_information["idbmup2"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup2"] = event[1].pdg self.seek(0) @@ -2166,10 +2168,13 @@ def check(self): abspz += abs(particle.pz) # check mass fourmass = FourMomentum(particle).mass - - if particle.mass and (abs(particle.mass) - fourmass)/ abs(particle.mass) > threshold: - raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) - + if particle.mass: + expected = (particle.E - math.sqrt(particle.E**2 -particle.mass**2))/particle.E + if expected > 1e-8: + mass_threshold = particle.E**2 - (particle.E-threshold)**2 + if (abs(particle.mass) - fourmass)/ mass_threshold > 5: + raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) + if E/absE > threshold: logger.critical(self) @@ -2953,8 +2958,8 @@ def pt(self): @property def pseudorapidity(self): - norm = math.sqrt(self.px**2 + self.py**2+self.pz**2) - return 0.5* math.log((norm - self.pz) / (norm + self.pz)) + norm = math.sqrt(self.px**2 + self.py**2 + self.pz**2) + return 0.5* math.log((norm + self.pz) / (norm - self.pz)) @property def rapidity(self): diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/madevent_interface.py index 85e5bcf5e3..4d5597c722 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/madevent_interface.py @@ -1171,10 +1171,10 @@ def check_survey(self, args, cmd='survey'): for opt,value in self._survey_options.items(): if arg.startswith('--%s=' % opt): exec('self.opts[\'%s\'] = %s(arg.split(\'=\')[-1])' % \ - (opt, value[0])) + (opt, value[0]), globals(), {'self':self, 'arg':arg}) arg = "" if arg != "": raise Exception - except Exception: + except Exception as error: self.help_survey() raise self.InvalidCmd('invalid %s argument'% arg) @@ -2827,10 +2827,10 @@ def print_results_in_shell(self, data): logger.info(" Nb of events after matching/merging : %d" % int(data['nb_event_pythia'])) if self.run_card['use_syst'] in self.true and \ (int(self.run_card['ickkw'])==1 or self.run_card['ktdurham']>0.0 - or self.run_card['ptlund']>0.0): + or self.run_card['ptlund']>0.0) and data['cross_pythia'] == -1: logger.info(" Notice that because Systematics computation is turned on, the merging did not veto events but modified their weights instead.\n"+\ " The resulting hepmc/stdhep file should therefore be use with those weights.") - else: + elif data['cross_pythia'] == -1: logger.info(" Nb of events after merging : %s" % data['nb_event_pythia']) logger.info(" " ) @@ -3055,6 +3055,7 @@ def do_multi_run(self, line): crossoversig = 0 inv_sq_err = 0 nb_event = 0 + madspin = False for i in range(nb_run): self.nb_refine = 0 self.exec_cmd('generate_events %s_%s -f' % (main_name, i), postcmd=False) @@ -3067,6 +3068,8 @@ def do_multi_run(self, line): inv_sq_err+=1.0/error**2 self.results[main_name][-1]['cross'] = 
crossoversig/inv_sq_err self.results[main_name][-1]['error'] = math.sqrt(1.0/inv_sq_err) + if 'decayed' in self.run_name: + madspin = True self.results.def_current(main_name) self.run_name = main_name self.update_status("Merging LHE files", level='parton') @@ -3074,9 +3077,12 @@ def do_multi_run(self, line): os.mkdir(pjoin(self.me_dir,'Events', self.run_name)) except Exception: pass - os.system('%(bin)s/merge.pl %(event)s/%(name)s_*/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' + + os.system('%(bin)s/merge.pl %(event)s/%(name)s_*%(madspin)s/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' % {'bin': self.dirbin, 'event': pjoin(self.me_dir,'Events'), - 'name': self.run_name}) + 'name': self.run_name, + 'madspin': '_decayed_*' if madspin else '' + }) eradir = self.options['exrootanalysis_path'] if eradir and misc.is_executable(pjoin(eradir,'ExRootLHEFConverter')): @@ -3656,9 +3662,11 @@ def do_refine(self, line): else: self.refine_mode = "new" - cross, error = self.make_make_all_html_results() + cross, error, across = self.make_make_all_html_results(get_attr=('xsec','xerru','axsec')) + self.results.add_detail('cross', cross) self.results.add_detail('error', error) + self.results.add_detail('axsec', across) self.results.add_detail('run_statistics', dict(self.results.get_detail('run_statistics'))) @@ -3757,6 +3765,8 @@ def split(a, n): k, m = divmod(len(a), n) return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + Gdirs = self.remove_empty_events(Gdirs) + partials_info = [] if len(Gdirs) >= max_G: start_unweight= time.perf_counter() @@ -3786,7 +3796,7 @@ def split(a, n): for i, local_G in enumerate(split(Gdirs, nb_chunk)): line = [pjoin(self.me_dir, "Events", self.run_name, "partials%d.lhe.gz" % i)] line.append(pjoin(self.me_dir, 'Events', self.run_name, '%s_%s_banner.txt' % (self.run_name, tag))) - line.append(str(self.results.current['cross'])) + line.append(str(self.results.current.get('axsec'))) line += local_G partials_info.append(self.do_combine_events_partial(' '.join(line), preprocess_only=True)) mycluster.submit(sys.executable, @@ -4223,7 +4233,7 @@ def mg5amc_py8_interface_consistency_warning(options): return None - def setup_Pythia8RunAndCard(self, PY8_Card, run_type): + def setup_Pythia8RunAndCard(self, PY8_Card, run_type, use_mg5amc_py8_interface): """ Setup the Pythia8 Run environment and card. In particular all the process and run specific parameters of the card are automatically set here. This function returns the path where HEPMC events will be output, if any.""" @@ -4338,10 +4348,10 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.systemSet('Beams:setProductionScalesFromLHEF',True) # Automatically set qWeed to xqcut if not defined by the user. 
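The use_mg5amc_py8_interface flag threaded through do_pythia8 and setup_Pythia8RunAndCard selects between the old MG5aMC_PY8_interface executable and Pythia8's standalone main164 driver; with main164, settings are renamed via the interface_to_164 map added to banner.py earlier in this patch. A minimal sketch of that renaming step; translate_for_main164 is a hypothetical helper, not part of the patch:

    # Subset of the interface_to_164 map from banner.py.
    INTERFACE_TO_164 = {
        'HEPMCoutput:file': 'HepMC:output',
        'LHEFInputs:nSubruns': 'Main:numberOfSubruns',
    }

    def translate_for_main164(param, value):
        target = INTERFACE_TO_164.get(param, param)
        lines = []
        if target == 'HepMC:output':
            # main164 needs Main:HepMC switched on before the output file is set.
            lines.append('Main:HepMC = on')
        lines.append('%s = %s' % (target, value))
        return lines

    print(translate_for_main164('HEPMCoutput:file', 'events.hepmc.gz'))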
- if PY8_Card['SysCalc:qWeed']==-1.0: + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qWeed']==-1.0: PY8_Card.MadGraphSet('SysCalc:qWeed',self.run_card['xqcut'], force=True) - if PY8_Card['SysCalc:qCutList']=='auto': + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qCutList']=='auto': if self.run_card['use_syst']: if self.run_card['sys_matchscale']=='auto': qcut = PY8_Card['JetMatching:qCut'] @@ -4368,7 +4378,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): # Specific MLM settings # PY8 should not implement the MLM veto since the driver should do it # if merging scale variation is turned on - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('JetMatching:doVeto',False) @@ -4444,7 +4454,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.MadGraphSet('SpaceShower:pTmaxMatch',1) PY8_Card.MadGraphSet('SpaceShower:rapidityOrder',False) # PY8 should not implement the CKKW veto since the driver should do it. - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('Merging:applyVeto',False) @@ -4516,6 +4526,12 @@ def do_pythia8(self, line): else: no_default = False + if '--old_interface' in args: + use_mg5amc_py8_interface = True + args.remove('--old_interface') + else: + use_mg5amc_py8_interface = False + if not self.run_name: self.check_pythia8(args) self.configure_directory(html_opening =False) @@ -4545,20 +4561,27 @@ def do_pythia8(self, line): #"Please use 'event_norm = average' in the run_card to avoid this problem.") - - if not self.options['mg5amc_py8_interface_path'] or not \ - os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface')): - raise self.InvalidCmd( -"""The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. -Please install this tool with the following MG5_aMC command: - MG5_aMC> install mg5amc_py8_interface_path""") + if use_mg5amc_py8_interface: + if not self.options['mg5amc_py8_interface_path'] or not \ + os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface')): + raise self.InvalidCmd( + """The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. + Please install this tool with the following MG5_aMC command: + MG5_aMC> install mg5amc_py8_interface_path""") + else: + pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface') + warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) + if warnings: + logger.warning(warnings) else: - pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface') - warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) - if warnings: - logger.warning(warnings) + pythia_main = pjoin(self.options['pythia8_path'], 'share', 'Pythia8', 'examples', 'main164') + if not os.path.exists(pythia_main): + pythia_main = pjoin(self.options['pythia8_path'], 'examples', 'main164') + if not os.path.exists(pythia_main): + logger.warning('main164 not found (or not compiled). 
Will try the old interface instead.')
+                    return self.do_pythia8(line + ' --old_interface')

         self.results.add_detail('run_mode', 'madevent')

@@ -4583,14 +4606,19 @@ def do_pythia8(self, line):
             run_type = 'CKKW'

         # Edit the card and run environment according to the run specification
-        HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type)
+        HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type, use_mg5amc_py8_interface=use_mg5amc_py8_interface)
+
+        if not use_mg5amc_py8_interface and (self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1)):
+            PY8_Card['Main:numberOfEvents'] = self.run_card['nevents']
+
         # Now write the card.
         pythia_cmd_card = pjoin(self.me_dir, 'Events', self.run_name, '%s_pythia8.cmd' % tag)
         cmd_card = StringIO.StringIO()
         PY8_Card.write(cmd_card,pjoin(self.me_dir,'Cards','pythia8_card_default.dat'),
-                       direct_pythia_input=True)
+                       direct_pythia_input=True,
+                       use_mg5amc_py8_interface=use_mg5amc_py8_interface)

         # Now setup the preamble to make sure that everything will use the locally
         # installed tools (if present) even if the user did not add it to its

@@ -4632,7 +4660,7 @@ def do_pythia8(self, line):
                 " command '/usr/bin/env %s' exists and returns a valid path."%shell)
         exe_cmd = "#!%s\n%s"%(shell_exe,' '.join(
-            [preamble+pythia_main,
+            [preamble+pythia_main, '' if use_mg5amc_py8_interface else '-c',
             os.path.basename(pythia_cmd_card)]))
         wrapper.write(exe_cmd)

@@ -4699,6 +4727,7 @@ def do_pythia8(self, line):
             n_cores = max(min(min_n_core,n_cores),1)
         if self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1):
+            # No need for parallelization anymore
             self.cluster = None
             logger.info('Follow Pythia8 shower by running the '+

@@ -4744,20 +4773,22 @@ def do_pythia8(self, line):
             ParallelPY8Card.subruns[0].systemSet('Beams:LHEF','events.lhe.gz')
             ParallelPY8Card.write(pjoin(parallelization_dir,'PY8Card.dat'),
                 pjoin(self.me_dir,'Cards','pythia8_card_default.dat'),
-                direct_pythia_input=True)
+                direct_pythia_input=True,
+                use_mg5amc_py8_interface=use_mg5amc_py8_interface)
             # Write the wrapper
             wrapper_path = pjoin(parallelization_dir,'run_PY8.sh')
             wrapper = open(wrapper_path,'w')
             if self.options['cluster_temp_path'] is None:
                 exe_cmd = \
-"""#!%s
-./%s PY8Card.dat >& PY8_log.txt
-"""
+"""#!%%s
+./%%s %s PY8Card.dat >& PY8_log.txt
+""" % ('' if use_mg5amc_py8_interface else '-c')
+
             else:
                 exe_cmd = \
-"""#!%s
+"""#!%%s
 ln -s ./events_$1.lhe.gz ./events.lhe.gz
-./%s PY8Card_$1.dat >& PY8_log.txt
+./%%s %s PY8Card_$1.dat >& PY8_log.txt
 mkdir split_$1

 if [ -f ./events.hepmc ]; then
@@ -4776,7 +4807,7 @@ def do_pythia8(self, line):
     mv ./PY8_log.txt ./split_$1/
 fi
 tar -czf split_$1.tar.gz split_$1
-"""
+""" % ('' if use_mg5amc_py8_interface else '-c')
             exe_cmd = exe_cmd%(shell_exe,os.path.basename(pythia_main))
             wrapper.write(exe_cmd)
             wrapper.close()

@@ -4812,19 +4843,27 @@ def do_pythia8(self, line):
                     pjoin(parallelization_dir,split_files[-1]))

             logger.info('Submitting Pythia8 jobs...')
+
             for i, split_file in enumerate(split_files):
                 # We must write a PY8Card tailored for each split so as to correct the normalization
                 # HEPMCoutput:scaling of each weight, since the showered LHE file will no longer contain the
                 # same original number of events
-                split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat'))
+                split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat'), setter='user')
+                assert split_PY8_Card['JetMatching:nJetMax'] == PY8_Card['JetMatching:nJetMax']
+
+
+                # Make sure to use the number of split_events determined during the splitting.
-                split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i])
+                split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i], force=True)
+                assert split_PY8_Card['Main:numberOfEvents'] == partition_for_PY8[i]
                 split_PY8_Card.systemSet('HEPMCoutput:scaling',split_PY8_Card['HEPMCoutput:scaling']*
-                    (float(partition_for_PY8[i])))
+                    (float(partition_for_PY8[i])), force=True)
                 # Add_missing set to False so as to be sure not to add any additional parameter w.r.t
                 # the ones in the original PY8 param_card copied.
                 split_PY8_Card.write(pjoin(parallelization_dir,'PY8Card_%d.dat'%i),
-                    pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False)
+                    pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False,
+                    direct_pythia_input=True,
+                    use_mg5amc_py8_interface=use_mg5amc_py8_interface)
                 in_files = [pjoin(parallelization_dir,os.path.basename(pythia_main)),
                             pjoin(parallelization_dir,'PY8Card_%d.dat'%i),
                             pjoin(parallelization_dir,split_file)]

@@ -5073,7 +5112,7 @@ def wait_monitoring(Idle, Running, Done):
             # works both for fixed number of generated events and fixed accepted events
             self.results.add_detail('error_pythia', error_m)

-            if self.run_card['use_syst']:
+            if self.run_card['use_syst'] and use_mg5amc_py8_interface:
                 self.results.add_detail('cross_pythia', -1)
                 self.results.add_detail('error_pythia', 0)

@@ -6132,7 +6171,102 @@ def get_Gdir(self, Pdir=None, symfact=None):
                 mfactors[pjoin(P, "G%s" % tag)] = mfactor
         self.Gdirs = (Gdirs, mfactors)
         return self.get_Gdir(Pdir, symfact=symfact)
+
+    ############################################################################
+    def remove_empty_events(self, Gdir):
+        """return Gdir stripped of the directories providing empty events.lhe files."""
+
+        reasons = collections.defaultdict(list)
+        Gdirs = Gdir[:]
+        for G in Gdirs[:]:
+            try:
+                size = os.path.getsize(pjoin(G, 'events.lhe'))
+            except Exception as error:
+                size = 0
+            if size <10:
+                Gdirs.remove(G)
+                try:
+                    log = misc.BackRead(pjoin(G, 'log.txt'))
+                except Exception as error:
+                    log = misc.BackRead(pjoin(G, 'run1_app.log'))
+                found = -1
+                for line in log:
+                    if 'Deleting file events.lhe' in line:
+                        found = 0
+                    elif "Impossible BW configuration" in line:
+                        reasons['bwconfig'].append(G)
+                        break
+                    elif found < -150:
+                        reasons['not found'].append(G)
+                        Gdirs.append(G)
+                        break
+                    elif found < 0:
+                        found -= 1
+                    elif 'Loosen cuts or increase max_events' in line:
+                        reasons['cuts'].append(G)
+                        break
+                    elif 'all returned zero' in line:
+                        reasons['zero'].append(G)
+                        break
+                    elif found > 5:
+                        reasons['unknown'].append(G)
+                        break
+                    else:
+                        found += 1
+
+        if len(reasons):
+            logger.debug('Reasons for empty events.lhe:')
+            if len(reasons['unknown']):
+                logger.debug(' - unknown: %s' % len(reasons['unknown']))
+                logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['unknown'][:10]]))
+            if len(reasons['not found']):
+                logger.debug(' - not found in log: %s' % len(reasons['not found']))
+                logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['not found'][:10]]))
+            if len(reasons['zero']):
+                logger.debug(' - zero amplitudes: %s' % len(reasons['zero']))
+                logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['zero'][:10]]))
+            if len(reasons['bwconfig']):
+                critical_bwconfig = set()
+                for G in reasons['bwconfig']:
+                    base = G.rsplit('.',1)[0]
+                    if any(G2.startswith(base) for G2 in Gdirs):
+                        continue
+                    else:
+                        critical_bwconfig.add(os.sep.join(base.rsplit(os.sep)[-2:]))
+                for G in critical_bwconfig:
+                    logger.warning('Gdirectory %s has no events.lhe file (no possible BW configuration found).' % G)
+
+                logger.debug(' - impossible BW configuration: %s' % len(reasons['bwconfig']))
+                logger.debug(' - channel with no possible BW configuration: %s' % len(critical_bwconfig))
+
+            if len(reasons['cuts']):
+                critical_nb_cuts = collections.defaultdict(int)
+                for G in reasons['cuts']:
+                    if '.' in os.path.basename(G):
+                        base = G.rsplit('.',1)[0]
+                        if any(G2.startswith(base) for G2 in Gdirs):
+                            continue
+                        else:
+                            critical_nb_cuts[os.sep.join(base.rsplit(os.sep)[-2:])] += 1
+                    else:
+                        critical_nb_cuts[''] += 1
+                        logger.warning('Gdirectory %s has no events.lhe file (no points passed cuts).' % G)
+                for G, nb in critical_nb_cuts.items():
+                    if not G:
+                        continue
+                    else:
+                        logger.warning('%s channel %s.XXX has no events.lhe file (no points passed cuts); no %s with events detected' % (nb, G, G))
+                logger.debug(' - no points passed cuts: %s' % len(reasons['cuts']))
+                logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['cuts'][:10]]))
+                logger.debug(' - without any BW handling (critical): %s' % critical_nb_cuts[''])
+                logger.debug(' - with BW but all zero (critical): %s' % sum([nb for v, nb in critical_nb_cuts.items() if v!=''], 0))
+                #logger.debug(' - cuts (with BW conflict where other channel contributes): %s' % (len(reasons['cuts'])- critical_nb_cuts))
+
+
+        return Gdirs
+
+    ############################################################################

     def set_run_name(self, name, tag=None, level='parton', reload_card=False,
                      allow_new_tag=True):

@@ -6749,7 +6883,7 @@ def get_subP_ids(path):
 class GridPackCmd(MadEventCmd):
     """The command for the gridpack --These are not supposed to be used interactively--"""

-    def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **stdin):
+    def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, nprocs=1, maxevts=2500, *completekey, **stdin):
         """Initialize the command and directly run"""

         # Initialize properly
@@ -6759,6 +6893,8 @@ def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **s
         self.random = seed
         self.random_orig = self.random
         self.granularity = gran
+        self.nprocs = nprocs
+        self.maxevts = maxevts
         self.options['automatic_html_opening'] = False

         #write the grid_card.dat on disk
@@ -6874,7 +7010,7 @@ def launch(self, nb_event, seed):
             #misc.call([pjoin(self.me_dir,'bin','refine4grid'),
             #    str(nb_event), '0', 'Madevent','1','GridRun_%s' % seed],
             #    cwd=self.me_dir)
-            self.refine4grid(nb_event)
+            self.gridpack_cross = self.refine4grid(nb_event)

             # 3) Combine the events/pythia/...
             self.exec_cmd('combine_events')
@@ -6902,6 +7038,8 @@ def refine4grid(self, nb_event):

         precision = nb_event

+        across = self.make_make_all_html_results(get_attr='axsec')
+
         self.opts = dict([(key,value[1]) for (key,value) in \
                           self._survey_options.items()])

@@ -6915,8 +7053,9 @@ def refine4grid(self, nb_event):
         self.update_status('Refine results to %s' % precision, level=None)
         logger.info("Using random number seed offset = %s" % self.random)

-        refine_opt = {'err_goal': nb_event, 'split_channels': False,
-                      'ngran':self.granularity, 'readonly': self.readonly}
+        refine_opt = {'err_goal': nb_event, 'split_channels': True,
+                      'ngran':self.granularity, 'readonly': self.readonly,
+                      'nprocs': self.nprocs, 'maxevts': self.maxevts}
         x_improve = gen_ximprove.gen_ximprove_gridpack(self, refine_opt)
         x_improve.launch() # create the ajob for the refinement and run those!
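# ---------------------------------------------------------------------------
# [Aside, not part of the patch] The remove_empty_events helper added above
# decides, for each G directory whose events.lhe is missing or nearly empty,
# why no events were produced: it scans the channel log backwards for known
# sentinel messages. A minimal self-contained sketch of that classification
# idea (simplified: a plain reversed list stands in for misc.BackRead, and the
# directory name is made up):
# ---------------------------------------------------------------------------
import collections

SENTINELS = [('Impossible BW configuration', 'bwconfig'),
             ('Loosen cuts or increase max_events', 'cuts'),
             ('all returned zero', 'zero')]

def classify_empty_gdir(log_lines):
    """Return a reason tag for an empty events.lhe file."""
    for line in reversed(log_lines):  # like misc.BackRead: read the tail first
        for needle, tag in SENTINELS:
            if needle in line:
                return tag
    return 'not found'

reasons = collections.defaultdict(list)
reasons[classify_empty_gdir(['INFO: done', 'Loosen cuts or increase max_events'])].append('P1_gg_ttx/G1')
assert dict(reasons) == {'cuts': ['P1_gg_ttx/G1']}
# ---------------------------------------------------------------------------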
self.gscalefact = x_improve.gscalefact #store jacobian associate to the gridpack @@ -6926,7 +7065,7 @@ def refine4grid(self, nb_event): #print 'run combine!!!' #combine_runs.CombineRuns(self.me_dir) - return + return across #update html output Presults = sum_html.collect_result(self) cross, error = Presults.xsec, Presults.xerru @@ -7051,10 +7190,13 @@ def do_combine_events(self, line): sum_axsec += result.get('axsec')*gscalefact[Gdir] if len(AllEvent) >= 80: #perform a partial unweighting - if self.results.current['cross'] == 0 and self.run_card['gridpack']: - nb_event= self.nb_event + if not self.results.current.get('axsec'): + if self.run_card['gridpack'] and self.gridpack_cross: + nb_event = min(abs(1.05*self.nb_event*sum_axsec/self.gridpack_cross),self.nb_event) + else: + nb_event= self.nb_event else: - nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current['cross']),self.run_card['nevents']) + nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current.get('axsec')),self.run_card['nevents'], self.nb_event, self.gridpack_cross, sum_axsec) AllEvent.unweight(pjoin(outdir, self.run_name, "partials%s.lhe.gz" % partials), get_wgt, log_level=5, trunc_error=1e-2, event_target=nb_event) AllEvent = lhe_parser.MultiEventFile() @@ -7068,6 +7210,7 @@ def do_combine_events(self, line): for data in partials_info: AllEvent.add(*data) + sum_xsec += data[1] if not hasattr(self,'proc_characteristic'): self.proc_characteristic = self.get_characteristics() diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/restore_data b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/restore_data index 6205bb9567..407ed7aa91 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/restore_data +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/restore_data @@ -48,8 +48,17 @@ for i in `cat subproc.mg` ; do cd ../ done +# check if we are on a Mac, otherwise assume Linux +if [[ "$OSTYPE" == "darwin"* ]]; then + # no nproc on Mac, so use sysctl instead + # use -S1024 because there is a limit on the length of the command + xargs_opts="-P $(sysctl -n hw.ncpu) -S1024" +else + xargs_opts="-P $(nproc --all)" +fi + find . -mindepth 2 -maxdepth 2 -type d -name 'G*' -print0 \ - | xargs --null -P "$(nproc --all)" -I{} bash -c " + | xargs --null ${xargs_opts} -I{} bash -c " cd {} for j in $1_results.dat ; do if [[ -e \$j ]] ; then diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/sum_html.py b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/sum_html.py index 9dd5826f71..fb8dd3a74a 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/sum_html.py +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/internal/sum_html.py @@ -770,7 +770,7 @@ def collect_result(cmd, folder_names=[], jobs=None, main_dir=None): return all -def make_all_html_results(cmd, folder_names = [], jobs=[]): +def make_all_html_results(cmd, folder_names = [], jobs=[], get_attr=None): """ folder_names and jobs have been added for the amcatnlo runs """ run = cmd.results.current['run_name'] if not os.path.exists(pjoin(cmd.me_dir, 'HTML', run)): @@ -794,7 +794,12 @@ def make_all_html_results(cmd, folder_names = [], jobs=[]): fsock.write('%s

' % Presults.get_html(run, unit, cmd.me_dir)) fsock.write('%s
' % P_text) - return Presults.xsec, Presults.xerru + if not get_attr: + return Presults.xsec, Presults.xerru + else: + if isinstance(get_attr, tuple): + return [getattr(Presults, _) for _ in get_attr] + return getattr(Presults, get_attr) diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/bin/madevent b/epochX/cudacpp/smeft_gg_tttt.mad/bin/madevent index dff9711b73..9c5363e682 100755 --- a/epochX/cudacpp/smeft_gg_tttt.mad/bin/madevent +++ b/epochX/cudacpp/smeft_gg_tttt.mad/bin/madevent @@ -178,6 +178,17 @@ force_run = False if (args and args[0] == 'treatcards'): force_run=True + +# check that madgraph is not in PYTHONPATH +try: + import madgraph +except ImportError: + pass +else: + logger.getLogger('madgraph').error('Looks like you do have madgraph in your PYTHONPATH (or you run this executable from the main MG5aMC directory). This executable will likely not work in such case.') + + + # Call the cmd interface main loop try: if '-h' in args or '--help' in args: diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/src/HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h b/epochX/cudacpp/smeft_gg_tttt.mad/src/HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h index 98fc59d3ea..d523fcab47 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/src/HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/src/HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h @@ -2,13 +2,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Sep 2010) for the MG5aMC backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -860,7 +860,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6] - template + template __device__ INLINE void VVV5_0( const fptype allV1[], const fptype allV2[], @@ -872,7 +872,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ INLINE void VVV5P0_1( const fptype allV2[], const fptype allV3[], @@ -885,7 +885,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ INLINE void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -897,7 +897,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ INLINE void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -910,7 +910,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ INLINE void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -923,7 +923,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ INLINE void FFV1P0_3( const fptype allF1[], const fptype allF2[], @@ -936,7 +936,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV1_0( const fptype allV1[], const fptype allV2[], @@ -949,7 +949,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV9_0( const fptype allV1[], const fptype allV2[], @@ -962,7 +962,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV10_0( const fptype allV1[], const fptype allV2[], @@ -975,7 +975,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6] - template + template __device__ void VVV5_0( const fptype allV1[], const fptype allV2[], @@ -988,7 +988,7 @@ namespace mg5amcCpu const cxtype_sv* V1 = W_ACCESS::kernelAccessConst( allV1 ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = 
CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P1[4] = { +cxreal( V1[0] ), +cxreal( V1[1] ), +cximag( V1[1] ), +cximag( V1[0] ) }; @@ -1011,7 +1011,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ void VVV5P0_1( const fptype allV2[], const fptype allV3[], @@ -1024,7 +1024,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P2[4] = { +cxreal( V2[0] ), +cxreal( V2[1] ), +cximag( V2[1] ), +cximag( V2[0] ) }; @@ -1049,7 +1049,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -1062,7 +1062,7 @@ namespace mg5amcCpu const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const cxtype_sv TMP9 = ( F1[2] * ( F2[4] * ( V3[2] + V3[5] ) + F2[5] * ( V3[3] + cI * V3[4] ) ) + ( F1[3] * ( F2[4] * ( V3[3] - cI * V3[4] ) + F2[5] * ( V3[2] - V3[5] ) ) + ( F1[4] * ( F2[2] * ( V3[2] - V3[5] ) - F2[3] * ( V3[3] + cI * V3[4] ) ) + F1[5] * ( F2[2] * ( -V3[3] + cI * V3[4] ) + F2[3] * ( V3[2] + V3[5] ) ) ) ) ); @@ -1074,7 +1074,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -1087,7 +1087,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F1 = W_ACCESS::kernelAccess( allF1 ); const cxtype cI = cxmake( 0., 1. ); F1[0] = +F2[0] + V3[0]; @@ -1106,7 +1106,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -1119,7 +1119,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F2 = W_ACCESS::kernelAccess( allF2 ); const cxtype cI = cxmake( 0., 1. 
); F2[0] = +F1[0] + V3[0]; @@ -1138,7 +1138,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ void FFV1P0_3( const fptype allF1[], const fptype allF2[], @@ -1151,7 +1151,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V3 = W_ACCESS::kernelAccess( allV3 ); const cxtype cI = cxmake( 0., 1. ); V3[0] = +F1[0] + F2[0]; @@ -1169,7 +1169,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ void VVVV1_0( const fptype allV1[], const fptype allV2[], @@ -1184,7 +1184,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const cxtype_sv TMP10 = ( V1[2] * V4[2] - V1[3] * V4[3] - V1[4] * V4[4] - V1[5] * V4[5] ); @@ -1199,7 +1199,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ void VVVV9_0( const fptype allV1[], const fptype allV2[], @@ -1214,7 +1214,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const cxtype_sv TMP1 = ( V2[2] * V1[2] - V2[3] * V1[3] - V2[4] * V1[4] - V2[5] * V1[5] ); @@ -1229,7 +1229,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ void VVVV10_0( const fptype allV1[], const fptype allV2[], @@ -1244,7 +1244,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. 
); const cxtype_sv TMP1 = ( V2[2] * V1[2] - V2[3] * V1[3] - V2[4] * V1[4] - V2[5] * V1[5] ); diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc b/epochX/cudacpp/smeft_gg_tttt.mad/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc index e394058ac8..eb2e5744ce 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc +++ b/epochX/cudacpp/smeft_gg_tttt.mad/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.h b/epochX/cudacpp/smeft_gg_tttt.mad/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.h index 6d053c0d16..3f22a38896 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -657,7 +657,7 @@ namespace mg5amcCpu #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #endif // Compute the output couplings (e.g. 
gc10 and gc11) from the input gs
-  template<class G_ACCESS, class C_ACCESS>
+  template<class G_ACCESS, class CD_ACCESS>
  __device__ inline void
  G2COUP( const fptype gs[],
          fptype couplings[],
@@ -667,12 +667,12 @@ namespace mg5amcCpu
    using namespace Parameters_SMEFTsim_topU3l_MwScheme_UFO_dependentCouplings;
    const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs );
    DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr );
-    fptype* GC_7s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_7 );
-    fptype* GC_6s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_6 );
-    fptype* GC_8s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_8 );
-    cxtype_sv_ref GC_7s_sv = C_ACCESS::kernelAccess( GC_7s );
-    cxtype_sv_ref GC_6s_sv = C_ACCESS::kernelAccess( GC_6s );
-    cxtype_sv_ref GC_8s_sv = C_ACCESS::kernelAccess( GC_8s );
+    fptype* GC_7s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_7 );
+    fptype* GC_6s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_6 );
+    fptype* GC_8s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_8 );
+    cxtype_sv_ref GC_7s_sv = CD_ACCESS::kernelAccess( GC_7s );
+    cxtype_sv_ref GC_6s_sv = CD_ACCESS::kernelAccess( GC_6s );
+    cxtype_sv_ref GC_8s_sv = CD_ACCESS::kernelAccess( GC_8s );
    GC_7s_sv = couplings_sv.GC_7;
    GC_6s_sv = couplings_sv.GC_6;
    GC_8s_sv = couplings_sv.GC_8;
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuConfig.h
index 7c6a082392..ca859a602e 100644
--- a/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuConfig.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.

 #ifndef MGONGPUCONFIG_H
 #define MGONGPUCONFIG_H 1

@@ -74,6 +74,7 @@
 #define MGONGPU_FPTYPE2_DOUBLE 1 // default
 //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster
 #endif
+
 // Choose whether to inline all HelAmps functions
 // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229)
 // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS
@@ -108,10 +109,23 @@
 #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float)
 #endif

+// Choose whether cuBLAS and hipBLAS are supported
+// For both CUDA and HIP, by default, assume that BLAS is available, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS
+// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?)
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
+//#undef MGONGPU_HAS_NO_BLAS // default
+////#define MGONGPU_HAS_NO_BLAS 1
+#elif defined __HIPCC__
+//#undef MGONGPU_HAS_NO_BLAS // default
+////#define MGONGPU_HAS_NO_BLAS 1
+#else
+#define MGONGPU_HAS_NO_BLAS 1
+#endif
+
 // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
 #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
 #undef MGONGPU_NSIGHT_DEBUG // default in CUDA
-//#define MGONGPU_NSIGHT_DEBUG 1
+//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED!
#else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuCxtypes.h index 92d74fd6db..e98e925f2a 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -717,12 +717,24 @@ namespace mg5amcCpu : m_preal( &r ), m_pimag( &i ) {} // copy (create from) const refs cxtype_ref& operator=( const cxtype_ref& ) = delete; //__host__ __device__ cxtype_ref& operator=( cxtype_ref&& c ) {...} // REMOVED! Should copy refs or copy values? 
No longer needed in cxternary - __host__ __device__ cxtype_ref& operator=( const cxtype& c ) + __host__ __device__ cxtype_ref& operator=( const cxtype& c ) // copy (assign) const values { *m_preal = cxreal( c ); *m_pimag = cximag( c ); return *this; - } // copy (assign) non-const values + } + __host__ __device__ cxtype_ref& operator+=( const cxtype& c ) + { + *m_preal += cxreal( c ); + *m_pimag += cximag( c ); + return *this; + } + __host__ __device__ cxtype_ref& operator-=( const cxtype& c ) + { + *m_preal -= cxreal( c ); + *m_pimag -= cximag( c ); + return *this; + } __host__ __device__ operator cxtype() const { return cxmake( *m_preal, *m_pimag ); } private: fptype* const m_preal; // const pointer to non-const fptype R diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/test/cudacpp_test.mk b/epochX/cudacpp/smeft_gg_tttt.mad/test/cudacpp_test.mk index f703a1ae7c..1f9f8bbc46 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/smeft_gg_tttt.mad/test/cudacpp_test.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) @@ -19,7 +19,7 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt b/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt index 5444229389..ad8d58b375 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt +++ b/epochX/cudacpp/smeft_gg_tttt.sa/CODEGEN_cudacpp_smeft_gg_tttt_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.3 2025-06-12 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -49,14 +49,14 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F set auto_convert_model T save options auto_convert_model -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model SMEFTsim_topU3l_MwScheme_UFO -massless_4t INFO: load particles INFO: load vertices @@ -73,7 +73,7 @@ INFO: load vertices DEBUG: MG5 converter defines FFFF26 to Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjP(-5,1)*ProjP(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjP(-5,3)*ProjP(-3,1) + Gamma(-2,-4,-3)*Gamma(-2,2,-6)*Gamma(-1,-6,-5)*Gamma(-1,4,-4)*ProjM(-5,1)*ProjM(-3,3) + Gamma(-2,-4,-3)*Gamma(-2,4,-6)*Gamma(-1,-6,-5)*Gamma(-1,2,-4)*ProjM(-5,3)*ProjM(-3,1)  DEBUG: MG5 converter defines FFFF27 to ProjP(2,1)*ProjP(4,3) + ProjM(2,1)*ProjM(4,3)  DEBUG: MG5 converter defines FFFF112 to ProjM(2,3)*ProjM(4,1) + ProjP(2,3)*ProjP(4,1)  -DEBUG: model prefixing takes 0.12831377983093262  +DEBUG: model prefixing takes 0.1275167465209961  INFO: Change particles name to pass to MG5 convention Defined multiparticle p = g u c d s u~ c~ d~ s~ Defined multiparticle j = g u c d s u~ c~ d~ s~ @@ -88,33 +88,33 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED+99*SMHLOOP+99*NP+99*NPshifts+99*NPprop+99*NPcpv+NPcbb+NPcbB+NPcbBB+NPcbd1+NPcbd8+NPcbe+NPcbG+NPcbH+NPcbj1+NPcbj8+NPcbl+NPcbu1+NPcbu8+NPcbW+NPcdB+NPcdd1+NPcdd8+NPcdG+NPcdH+NPcdW+NPceB+NPced+NPcee+NPceH+NPceu+NPceW+NPcG+NPcGtil+NPcH+NPcHB+NPcHbox+NPcHbq+NPcHBtil+NPcHd+NPcHDD+NPcHe+NPcHG+NPcHGtil+NPcHj1+NPcHj3+NPcHl1+NPcHl3+NPcHQ1+NPcHQ3+NPcHt+NPcHtb+NPcHu+NPcHud+NPcHW+NPcHWB+NPcHWBtil+NPcHWtil+NPcjd1+NPcjd8+NPcje+NPcjj11+NPcjj18+NPcjj31+NPcjj38+NPcjQbd1+NPcjQbd8+NPcjQtu1+NPcjQtu8+NPcjtQd1+NPcjtQd8+NPcju1+NPcju8+NPcjujd1+NPcjujd11+NPcjujd8+NPcjujd81+NPcjuQb1+NPcjuQb8+NPcld+NPcle+NPclebQ+NPcledj+NPcleju1+NPcleju3+NPcleQt1+NPcleQt3+NPclj1+NPclj3+NPcll+NPcll1+NPclu+NPcQb1+NPcQb8+NPcQd1+NPcQd8+NPcQe+NPcQj11+NPcQj18+NPcQj31+NPcQj38+NPcQl1+NPcQl3+NPcQQ1+NPcQQ8+NPcQt1+NPcQt8+NPcQtjd1+NPcQtjd8+NPcQtQb1+NPcQtQb8+NPcQu1+NPcQu8+NPcQujb1+NPcQujb8+NPctB+NPctb1+NPctb8+NPctd1+NPctd8+NPcte+NPctG+NPctH+NPctj1+NPctj8+NPctl+NPctt+NPctu1+NPctu8+NPctW+NPcuB+NPcud1+NPcud8+NPcuG+NPcuH+NPcutbd1+NPcutbd8+NPcuu1+NPcuu8+NPcuW+NPcW+NPcWtil+NPQjujb8 INFO: Trying process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Process has 72 diagrams -1 processes with 72 diagrams generated in 3.671 s +1 processes with 72 diagrams generated in 3.713 s Total: 1 processes with 72 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  
+DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ t t~ WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ t t~ @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 218]  -DEBUG: type(subproc_group)= [output.py at line 219]  -DEBUG: type(fortran_model)= [output.py at line 220]  -DEBUG: type(me)= me=0 [output.py at line 221]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 222]  -INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/. 
-Generated helas calls for 1 subprocesses (72 diagrams) in 0.186 s +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  +DEBUG: type(subproc_group)= [output.py at line 223]  +DEBUG: type(fortran_model)= [output.py at line 224]  +DEBUG: type(me)= me=0 [output.py at line 225]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'diagram_boilerplate.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  +INFO: Creating files in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/. +Generated helas calls for 1 subprocesses (72 diagrams) in 0.184 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV5 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV9 routines ALOHA: aloha creates VVVV10 routines -ALOHA: aloha creates 5 routines in 0.316 s +ALOHA: aloha creates 5 routines in 0.305 s VVV5 VVV5 FFV1 @@ -124,17 +124,17 @@ ALOHA: aloha creates 5 routines in 0.316 s VVVV1 VVVV9 VVVV10 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h -INFO: Created file HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h +INFO: Created file HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/./Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc INFO: Created files Parameters_SMEFTsim_topU3l_MwScheme_UFO.h and Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_smeft_gg_tttt/src/. quit -real 0m5.073s -user 0m4.975s -sys 0m0.073s +real 0m5.095s +user 0m4.971s +sys 0m0.077s Code generation completed in 5 seconds diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/Bridge.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/Bridge.h index 87aa648dd2..a45024704a 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -255,11 +255,15 @@ namespace mg5amcCpu throw std::logic_error( "Bridge constructor: FIXME! cannot choose gputhreads" ); // this should never happen! m_gpublocks = m_nevt / m_gputhreads; } +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters @@ -290,8 +294,10 @@ namespace mg5amcCpu throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! 
Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -347,7 +353,9 @@ namespace mg5amcCpu if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -400,7 +408,9 @@ namespace mg5amcCpu } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ )
+
+#define gpuStream_t hipStream_t
+#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) )
+#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) )
+
+#define gpuBlasStatus_t hipblasStatus_t
+#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
+#ifndef MGONGPU_HAS_NO_BLAS
+#define gpuBlasHandle_t hipblasHandle_t
+#else
+#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds
+#endif
+#define gpuBlasCreate hipblasCreate
+#define gpuBlasDestroy hipblasDestroy
+#define gpuBlasSetStream hipblasSetStream
+
+#define gpuBlasSaxpy hipblasSaxpy
+#define gpuBlasSdot hipblasSdot
+#define gpuBlasSgemv hipblasSgemv
+#define gpuBlasSgemm hipblasSgemm
+#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched
+#define gpuBlasDaxpy hipblasDaxpy
+#define gpuBlasDdot hipblasDdot
+#define gpuBlasDgemv hipblasDgemv
+#define gpuBlasDgemm hipblasDgemm
+#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched
+#define GPUBLAS_OP_N HIPBLAS_OP_N
+#define GPUBLAS_OP_T HIPBLAS_OP_T
+
+#endif
 //--------------------------------------------------------------------------
+#ifdef MGONGPU_FPTYPE2_FLOAT
+#define gpuBlasTaxpy gpuBlasSaxpy
+#define gpuBlasTdot gpuBlasSdot
+#define gpuBlasTgemv gpuBlasSgemv
+#define gpuBlasTgemm gpuBlasSgemm
+#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched
+#else
+#define gpuBlasTaxpy gpuBlasDaxpy
+#define gpuBlasTdot gpuBlasDdot
+#define gpuBlasTgemv gpuBlasDgemv
+#define gpuBlasTgemm gpuBlasDgemm
+#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched
 #endif
 #endif // MG5AMC_GPUABSTRACTION_H
diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/GpuRuntime.h
index 860c7fde16..086aa6a616 100644
--- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/GpuRuntime.h
+++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/GpuRuntime.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin.
-// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin.
 #ifndef MG5AMC_GPURUNTIME_H
 #define MG5AMC_GPURUNTIME_H 1
@@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort =
 //--------------------------------------------------------------------------
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#ifndef MGONGPU_HAS_NO_BLAS
+#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); }
+inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true )
+{
+  if ( code != GPUBLAS_STATUS_SUCCESS )
+  {
+    printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.cc index f463977c1a..a68ae314eb 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
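As an illustration of how the new stream and BLAS plumbing above composes, here is a minimal usage sketch. It is not part of the patch: the function name and arguments are illustrative, and it assumes a CUDA build with BLAS enabled (MGONGPU_HAS_NO_BLAS undefined) and fptype2=double, so that gpuBlasTgemv dispatches to cublasDgemv.

#include "GpuAbstraction.h"
#include "GpuRuntime.h"
void sketchBlasOnStream( int n, const double* devA, const double* devX, double* devY )
{
  gpuStream_t stream;
  gpuStreamCreate( &stream ); // expands to checkGpu( cudaStreamCreate( &stream ) )
  gpuBlasHandle_t handle; // cublasHandle_t in CUDA BLAS builds
  checkGpuBlas( gpuBlasCreate( &handle ) );
  checkGpuBlas( gpuBlasSetStream( handle, stream ) ); // handle operations now enqueue on this stream
  const double alpha = 1., beta = 0.;
  // y = A * x via the precision-dispatched gemv (gpuBlasTgemv -> gpuBlasDgemv -> cublasDgemv here)
  checkGpuBlas( gpuBlasTgemv( handle, GPUBLAS_OP_N, n, n, &alpha, devA, n, devX, 1, &beta, devY, 1 ) );
  checkGpuBlas( gpuBlasDestroy( handle ) );
  gpuStreamDestroy( stream ); // expands to checkGpu( cudaStreamDestroy( stream ) )
}

This is exactly the pattern MatrixElementKernelDevice follows below, with one stream and one handle per good helicity.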
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,28 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() + , m_pHelWfs() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_helBlasHandles() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +353,82 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); + // Create the "one-helicity" wavefunction buffer that will be used for helicity filtering + m_pHelWfs.reset( new DeviceBufferSimple( CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? 
+ std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { +#ifndef MGONGPU_HAS_NO_BLAS + if( m_helBlasHandles[ihel] ) gpuBlasDestroy( m_helBlasHandles[ihel] ); +#endif + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +445,61 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. 
Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity + // Attach a different stream to each cuBLAS/hipBLAS handle + if( m_blasColorSum ) + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + checkGpuBlas( gpuBlasCreate( &m_helBlasHandles[ighel] ) ); + checkGpuBlas( gpuBlasSetStream( m_helBlasHandles[ighel], m_helStreams[ighel] ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_helBlasHandles[ighel], CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelWfs.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +507,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* ghelBlasHandles = ( m_blasColorSum ? m_helBlasHandles : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* ghelBlasHandles = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +527,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? 
m_hstChannelIds.data() : nullptr );
 MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() );
 #endif
-    checkGpu( gpuPeekAtLastError() );
-    checkGpu( gpuDeviceSynchronize() );
+    checkGpu( gpuPeekAtLastError() ); // is this needed?
+    checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places...
 }
 //--------------------------------------------------------------------------
diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.h
index 7acff4b308..c901874333 100644
--- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.h
@@ -1,16 +1,19 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin.
 #ifndef MATRIXELEMENTKERNELS_H
 #define MATRIXELEMENTKERNELS_H 1
 #include "mgOnGpuConfig.h"
+#include "CPPProcess.h"
+#include "GpuAbstraction.h"
 #include "MemoryBuffers.h"
 #include
+#include <memory>
 #ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
@@ -134,7 +137,7 @@
 // Does this host system support the SIMD used in the matrix element calculation?
 // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!]
-    static bool hostSupportsSIMD( const bool verbose = true );
+    static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false
 private:
@@ -191,12 +194,24 @@ namespace mg5amcCpu
 // The buffer for the event-by-event couplings that depends on alphas QCD
 DeviceBufferCouplings m_couplings;
+    // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime)
+    std::unique_ptr<DeviceBufferSimple> m_pHelMEs;
+
+    // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime)
+    std::unique_ptr<DeviceBufferSimple> m_pHelJamps;
+
+    // The super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime)
+    std::unique_ptr<DeviceBufferSimple> m_pHelWfs;
+
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // The buffer for the event-by-event numerators of multichannel factors
-    DeviceBufferNumerators m_numerators;
+    // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime)
+    std::unique_ptr<DeviceBufferSimple> m_pHelNumerators;
-    // The buffer for the event-by-event denominators of multichannel factors
-    DeviceBufferDenominators m_denominators;
+    // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime)
+    std::unique_ptr<DeviceBufferSimple> m_pHelDenominators;
+
+    // The super-buffer of ncolor jamp2 buffers
+    DeviceBufferSimple m_colJamp2s;
 #endif
 #ifdef MGONGPU_CHANNELID_DEBUG
@@ -205,6 +220,23 @@ namespace mg5amcCpu
 PinnedHostBufferChannelIds m_hstChannelIds;
 #endif
+#ifndef MGONGPU_HAS_NO_BLAS
+    // Decide at runtime whether to use BLAS for color sums
+    bool m_blasColorSum;
+
+    // Decide at runtime whether TF32TENSOR math should be used in cuBLAS
+    bool m_blasTf32Tensor;
+
+    // The super-buffer of nGoodHel 
cuBLAS/hipBLAS temporary buffers
+    std::unique_ptr<DeviceBufferSimple2> m_pHelBlasTmp;
+
+    // The array of cuBLAS/hipBLAS handles (one for each good helicity)
+    gpuBlasHandle_t m_helBlasHandles[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used)
+#endif
+
+    // The array of GPU streams (one for each good helicity)
+    gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used)
+
 // The number of blocks in the GPU grid
 size_t m_gpublocks;
diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessAmplitudes.h
index 0d92f69c43..a49f041e05 100644
--- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessAmplitudes.h
+++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessAmplitudes.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin.
 #ifndef MemoryAccessAmplitudes_H
 #define MemoryAccessAmplitudes_H 1
@@ -10,10 +10,6 @@
 #include "mgOnGpuCxtypes.h"
-#include "MemoryAccessHelpers.h"
-
-#define MGONGPU_TRIVIAL_AMPLITUDES 1
-
 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725)
 #ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
@@ -23,120 +19,11 @@ namespace mg5amcCpu
 {
 //----------------------------------------------------------------------------
-#ifndef MGONGPU_TRIVIAL_AMPLITUDES
-
-  // A class describing the internal layout of memory buffers for amplitudes
-  // This implementation uses an AOSOA[npagA][nx2][neppA] where nevt=npagA*neppA
-  // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name]
-  class MemoryAccessAmplitudesBase //_AOSOAv1
-  {
-  public:
-
-    // Number of Events Per Page in the amplitude AOSOA memory buffer layout
-    static constexpr int neppA = 1; // AOS (just a test...) 
-
-  private:
-
-    friend class MemoryAccessHelper<MemoryAccessAmplitudesBase>;
-    friend class KernelAccessHelper<MemoryAccessAmplitudesBase, true>;
-    friend class KernelAccessHelper<MemoryAccessAmplitudesBase, false>;
-
-    // The number of floating point components of a complex number
-    static constexpr int nx2 = mgOnGpu::nx2;
-
-    //--------------------------------------------------------------------------
-    // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )"
-    // (in other words: first locate the event record for a given event, then locate an element in that record)
-    //--------------------------------------------------------------------------
-
-    // Locate an event record (output) in a memory buffer (input) from the given event number (input)
-    // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===]
-    static __host__ __device__ inline fptype*
-    ieventAccessRecord( fptype* buffer,
-                        const int ievt )
-    {
-      const int ipagA = ievt / neppA; // #event "A-page"
-      const int ieppA = ievt % neppA; // #event in the current event A-page
-      constexpr int ix2 = 0;
-      return &( buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA] ); // AOSOA[ipagA][ix2][ieppA]
-    }
-
-    //--------------------------------------------------------------------------
-
-    // Locate a field (output) of an event record (input) from the given field indexes (input)
-    // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===]
-    // [NB: expand variadic template "Ts... args" to "const int ix2" and rename "Field" as "Ix2"]
-    static __host__ __device__ inline fptype&
-    decodeRecord( fptype* buffer,
-                  const int ix2 )
-    {
-      constexpr int ipagA = 0;
-      constexpr int ieppA = 0;
-      return buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA]; // AOSOA[ipagA][ix2][ieppA]
-    }
-  };
-
-  //----------------------------------------------------------------------------
-
-  // A class providing access to memory buffers for a given event, based on explicit event numbers
-  // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations
-  class MemoryAccessAmplitudes : public MemoryAccessAmplitudesBase
-  {
-  public:
-
-    // Locate an event record (output) in a memory buffer (input) from the given event number (input)
-    // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===]
-    static constexpr auto ieventAccessRecord = MemoryAccessHelper<MemoryAccessAmplitudesBase>::ieventAccessRecord;
-
-    // Locate an event record (output) in a memory buffer (input) from the given event number (input)
-    // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===]
-    static constexpr auto ieventAccessRecordConst = MemoryAccessHelper<MemoryAccessAmplitudesBase>::ieventAccessRecordConst;
-
-    // Locate a field (output) of an event record (input) from the given field indexes (input)
-    // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===]
-    static constexpr auto decodeRecordIx2 = MemoryAccessHelper<MemoryAccessAmplitudesBase>::decodeRecord;
-
-    // Locate a field (output) of an event record (input) from the given field indexes (input)
-    // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===]
-    static constexpr auto decodeRecordIx2Const =
-      MemoryAccessHelper<MemoryAccessAmplitudesBase>::template decodeRecordConst<int>;
-
-    // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input)
-    // [Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt, 
const int ix2 ) <===]
-    static constexpr auto ieventAccessIx2 =
-      MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessField<int>;
-
-    // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input)
-    // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===]
-    static constexpr auto ieventAccessIx2Const =
-      MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessFieldConst<int>;
-  };
-
-#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES
-
-  //----------------------------------------------------------------------------
-
-  // A class providing access to memory buffers for a given event, based on implicit kernel rules
-  // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations
+  // A class providing trivial access to amplitude memory buffers
   template<bool onDevice>
   class KernelAccessAmplitudes
   {
   public:
-
-#ifndef MGONGPU_TRIVIAL_AMPLITUDES
-
-    // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
-    // [Signature (non-const) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===]
-    static constexpr auto kernelAccessIx2 =
-      KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessField<int>;
-
-    // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
-    // [Signature (const) ===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===]
-    static constexpr auto kernelAccessIx2Const =
-      KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessFieldConst<int>;
-
-#else
    static __host__ __device__ inline cxtype_sv*
    kernelAccess( fptype* buffer )
    {
@@ -148,8 +35,6 @@ namespace mg5amcCpu
    {
      return reinterpret_cast<cxtype_sv*>( buffer );
    }
-
-#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES
  };
 //----------------------------------------------------------------------------
diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessCouplings.h
index a4f3a481bb..84c20a1f30 100644
--- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessCouplings.h
+++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessCouplings.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
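For reference, the trivial access pattern that survives the cleanup above amounts to reinterpreting the amplitude buffer in place. A minimal sketch (not part of the patch; it assumes a scalar build where cxtype_sv is a plain complex type rather than a SIMD vector):

fptype amp[2] = { 1., 0. }; // one amplitude as interleaved [re, im]
cxtype_sv* camp = KernelAccessAmplitudes<false>::kernelAccess( amp ); // same memory viewed as one complex value, no per-event indexing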
#ifndef MemoryAccessCouplings_H #define MemoryAccessCouplings_H 1 @@ -235,7 +235,7 @@ namespace mg5amcCpu /* fptype_sv& real = kernelAccessIx2( buffer, 0 ); fptype_sv& imag = kernelAccessIx2( buffer, 1 ); - printf( "C_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv_ref( real, imag ); */ return cxtype_sv_ref( kernelAccessIx2( buffer, 0 ), @@ -250,7 +250,7 @@ namespace mg5amcCpu /* const fptype_sv& real = kernelAccessIx2Const( buffer, 0 ); const fptype_sv& imag = kernelAccessIx2Const( buffer, 1 ); - printf( "C_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv( real, imag ); */ return cxtype_sv( kernelAccessIx2Const( buffer, 0 ), diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessWavefunctions.h index 9f4c620bc7..bbffc1fb36 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryAccessWavefunctions.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessWavefunctions_H #define MemoryAccessWavefunctions_H 1 @@ -10,9 +10,7 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 +#include "CPPProcess.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL @@ -23,147 +21,44 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // A class describing the internal layout of memory buffers for wavefunctions - // This implementation uses an AOSOA[npagW][nw6][nx2][neppW] where nevt=npagW*neppW - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessWavefunctionsBase //_AOSOAv1 +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessWavefunctions { public: - - // Number of Events Per Page in the wavefunction AOSOA memory buffer layout - static constexpr int neppW = 1; // AOS (just a test...) 
-
-  private:
-
-    friend class MemoryAccessHelper<MemoryAccessWavefunctionsBase>;
-    friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, true>;
-    friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, false>;
-
-    // The number of components of a (fermion or vector) wavefunction
-    static constexpr int nw6 = mgOnGpu::nw6;
-
-    // The number of floating point components of a complex number
-    static constexpr int nx2 = mgOnGpu::nx2;
-
-    //--------------------------------------------------------------------------
-    // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )"
-    // (in other words: first locate the event record for a given event, then locate an element in that record)
-    //--------------------------------------------------------------------------
-
-    // Locate an event record (output) in a memory buffer (input) from the given event number (input)
-    // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===]
-    static __host__ __device__ inline fptype*
-    ieventAccessRecord( fptype* buffer,
-                        const int ievt )
+    static __host__ __device__ inline cxtype_sv*
+    kernelAccess( fptype* buffer )
    {
-      const int ipagW = ievt / neppW; // #event "W-page"
-      const int ieppW = ievt % neppW; // #event in the current event W-page
-      constexpr int iw6 = 0;
-      constexpr int ix2 = 0;
-      return &( buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW] ); // AOSOA[ipagW][iw6][ix2][ieppW]
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      return reinterpret_cast<cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 );
    }
-
-    //--------------------------------------------------------------------------
-
-    // Locate a field (output) of an event record (input) from the given field indexes (input)
-    // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===]
-    // [NB: expand variadic template "Ts... 
args" to "const int iw6, const int ix2" and rename "Field" as "Iw6Ix2"]
-    static __host__ __device__ inline fptype&
-    decodeRecord( fptype* buffer,
-                  const int iw6,
-                  const int ix2 )
+    static __host__ __device__ inline const cxtype_sv*
+    kernelAccessConst( const fptype* buffer )
    {
-      constexpr int ipagW = 0;
-      constexpr int ieppW = 0;
-      return buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW]; // AOSOA[ipagW][iw6][ix2][ieppW]
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      return reinterpret_cast<const cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 );
    }
  };
+#endif
 //----------------------------------------------------------------------------
-  // A class providing access to memory buffers for a given event, based on explicit event numbers
-  // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations
-  class MemoryAccessWavefunctions : public MemoryAccessWavefunctionsBase
-  {
-  public:
-
-    // Locate an event record (output) in a memory buffer (input) from the given event number (input)
-    // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===]
-    static constexpr auto ieventAccessRecord = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecord;
-
-    // Locate an event record (output) in a memory buffer (input) from the given event number (input)
-    // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===]
-    static constexpr auto ieventAccessRecordConst = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecordConst;
-
-    // Locate a field (output) of an event record (input) from the given field indexes (input)
-    // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int iw6, const int ix2 ) <===]
-    static constexpr auto decodeRecordIw6Ix2 = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::decodeRecord;
-
-    // Locate a field (output) of an event record (input) from the given field indexes (input)
-    // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int iw6, const int ix2 ) <===]
-    static constexpr auto decodeRecordIw6Ix2Const =
-      MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template decodeRecordConst<int, int>;
-
-    // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input)
-    // [Signature (non-const) ===> fptype& ieventAccessIw6Ix2( fptype* buffer, const ievt, const int iw6, const int ix2 ) <===]
-    static constexpr auto ieventAccessIw6Ix2 =
-      MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessField<int, int>;
-
-    // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input)
-    // [Signature (const) ===> const fptype& ieventAccessIw6Ix2Const( const fptype* buffer, const ievt, const int iw6, const int ix2 ) <===]
-    static constexpr auto ieventAccessIw6Ix2Const =
-      MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessFieldConst<int, int>;
-  };
-
-#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS
-
-  //----------------------------------------------------------------------------
-
-  // A class providing access to memory buffers for a given event, based on implicit kernel rules
-  // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations
-  template<bool onDevice>
-  class KernelAccessWavefunctions
+  class HostAccessWavefunctions
  {
  public:
-
-#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS
-
-    // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes 
(input)
-    // [Signature (non-const) ===> fptype& kernelAccessIw6Ix2( fptype* buffer, const int iw6, const int ix2 ) <===]
-    static constexpr auto kernelAccessIw6Ix2 =
-      KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessField<int, int>;
-
-    // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
-    // [Signature (const) ===> const fptype& kernelAccessIw6Ix2Const( const fptype* buffer, const int iw6, const int ix2 ) <===]
-    static constexpr auto kernelAccessIw6Ix2Const =
-      KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessFieldConst<int, int>;
-
-#else
-
    static __host__ __device__ inline cxtype_sv*
    kernelAccess( fptype* buffer )
    {
      return reinterpret_cast<cxtype_sv*>( buffer );
    }
    static __host__ __device__ inline const cxtype_sv*
    kernelAccessConst( const fptype* buffer )
    {
      return reinterpret_cast<const cxtype_sv*>( buffer );
    }
-
-#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS
  };
 //----------------------------------------------------------------------------
-  typedef KernelAccessWavefunctions<false> HostAccessWavefunctions;
-  typedef KernelAccessWavefunctions<true> DeviceAccessWavefunctions;
-
-  //----------------------------------------------------------------------------
-
 } // end namespace mg5amcGpu/mg5amcCpu
 #endif // MemoryAccessWavefunctions_H
diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryBuffers.h
index 2f711d8cc1..7f3a4e3dca 100644
--- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryBuffers.h
+++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MemoryBuffers.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
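The device-side wavefunction accessors introduced above boil down to a per-thread offset into an AOS layout wf[ievt][nw6][nx2], while the host side keeps the trivial one-event view. A host analogue of the same offset arithmetic, as a sketch (not part of the patch; the function name is illustrative):

inline fptype* eventWfRecord( fptype* buffer, int ievt )
{
  // each event owns nw6 complex components, i.e. nw6 * nx2 fptype values
  return buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2;
}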
#ifndef MemoryBuffers_H
 #define MemoryBuffers_H 1
@@ -34,6 +34,7 @@ namespace mg5amcCpu
 static constexpr size_t nparf = CPPProcess::nparf;
 static constexpr size_t npar = CPPProcess::npar;
 static constexpr size_t ndcoup = Parameters_SMEFTsim_topU3l_MwScheme_UFO_dependentCouplings::ndcoup;
+    static constexpr size_t ncolor = CPPProcess::ncolor;
 }
 //--------------------------------------------------------------------------
@@ -69,8 +70,8 @@ namespace mg5amcCpu
 protected:
 BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {}
-    virtual ~BufferBase() {}
 public:
+    virtual ~BufferBase() {}
 T* data() { return m_data; }
 const T* data() const { return m_data; }
 T& operator[]( const size_t index ) { return m_data[index]; }
@@ -167,8 +168,14 @@ namespace mg5amcCpu
 public:
 HostBuffer( const size_t nevt )
 : NumberOfEvents( nevt )
-      , HostBufferBase<T>( sizePerEvent * nevt ) {}
-    virtual ~HostBuffer() {}
+      , HostBufferBase<T>( sizePerEvent * nevt )
+    {
+      //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl;
+    }
+    virtual ~HostBuffer()
+    {
+      //std::cout << "HostBuffer::dtor " << this << std::endl;
+    }
 virtual size_t nevt() const override final { return NumberOfEvents::nevt(); }
 };
 #endif
@@ -194,19 +201,33 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
 // A class encapsulating a CUDA device buffer for a given number of events
 template<typename T, size_t sizePerEvent>
-  class DeviceBuffer : public DeviceBufferBase<T>, virtual private NumberOfEvents
+  class DeviceBuffer : public DeviceBufferBase<T>, virtual protected NumberOfEvents
 {
 public:
 DeviceBuffer( const size_t nevt )
 : NumberOfEvents( nevt )
-      , DeviceBufferBase<T>( sizePerEvent * nevt ) {}
-    virtual ~DeviceBuffer() {}
+      , DeviceBufferBase<T>( sizePerEvent * nevt )
+    {
+      //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl;
+    }
+    virtual ~DeviceBuffer()
+    {
+      //std::cout << "DeviceBuffer::dtor " << this << std::endl;
+    }
 virtual size_t nevt() const override final { return NumberOfEvents::nevt(); }
 };
 #endif
 //--------------------------------------------------------------------------
+#ifdef MGONGPUCPP_GPUIMPL
+  // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis
+  typedef DeviceBuffer<fptype, 1> DeviceBufferSimple;
+  typedef DeviceBuffer<fptype2, 1> DeviceBufferSimple2;
+#endif
+
+  //--------------------------------------------------------------------------
+
 // A base class encapsulating a memory buffer for momenta random numbers
 typedef BufferBase<fptype> BufferRndNumMomenta;
@@ -277,12 +298,12 @@ namespace mg5amcCpu
 constexpr size_t sizePerEventNumerators = 1;
 #ifndef MGONGPUCPP_GPUIMPL
-  // A class encapsulating a C++ host buffer for gs
+  // A class encapsulating a C++ host buffer for numerators
 typedef HostBuffer<fptype, sizePerEventNumerators> HostBufferNumerators;
 #else
-  // A class encapsulating a CUDA pinned host buffer for gs
+  // A class encapsulating a CUDA pinned host buffer for numerators
 typedef PinnedHostBuffer<fptype, sizePerEventNumerators> PinnedHostBufferNumerators;
-  // A class encapsulating a CUDA device buffer for gs
+  // A class encapsulating a CUDA device buffer for numerators
 typedef DeviceBuffer<fptype, sizePerEventNumerators> DeviceBufferNumerators;
 #endif
 #endif
@@ -297,12 +318,12 @@ namespace mg5amcCpu
 #ifndef MGONGPUCPP_GPUIMPL
-  // A class encapsulating a C++ host buffer for gs
+  // A class encapsulating a C++ host buffer for denominators
 typedef HostBuffer<fptype, sizePerEventDenominators> HostBufferDenominators;
 #else
-  // A class encapsulating a CUDA pinned host buffer for gs
+  // A class encapsulating a CUDA pinned host buffer for denominators 
typedef PinnedHostBuffer<fptype, sizePerEventDenominators> PinnedHostBufferDenominators;
-  // A class encapsulating a CUDA device buffer for gs
+  // A class encapsulating a CUDA device buffer for denominators
 typedef DeviceBuffer<fptype, sizePerEventDenominators> DeviceBufferDenominators;
 #endif
 #endif
@@ -316,12 +337,12 @@ namespace mg5amcCpu
 constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2;
 #ifndef MGONGPUCPP_GPUIMPL
-  // A class encapsulating a C++ host buffer for gs
+  // A class encapsulating a C++ host buffer for couplings
 typedef HostBuffer<fptype, sizePerEventCouplings> HostBufferCouplings;
 #else
-  // A class encapsulating a CUDA pinned host buffer for gs
+  // A class encapsulating a CUDA pinned host buffer for couplings
 typedef PinnedHostBuffer<fptype, sizePerEventCouplings> PinnedHostBufferCouplings;
-  // A class encapsulating a CUDA device buffer for gs
+  // A class encapsulating a CUDA device buffer for couplings
 typedef DeviceBuffer<fptype, sizePerEventCouplings> DeviceBufferCouplings;
 #endif
@@ -505,6 +526,16 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
+#ifdef MGONGPUCPP_GPUIMPL
+  // The size (number of elements) per event in a memory buffer for jamps
+  constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2;
+
+  // A class encapsulating a CUDA device buffer for color selection
+  typedef DeviceBuffer<fptype, sizePerEventJamps> DeviceBufferJamps;
+#endif
+
+  //--------------------------------------------------------------------------
+
 #ifdef MGONGPUCPP_GPUIMPL
 template<class Tdst, class Tsrc>
 void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy
diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.cc
index 6a64c39915..42eaa96778 100644
--- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.cc
+++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.cc
@@ -1,13 +1,13 @@
 // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors.
 // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend.
 //==========================================================================
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,20 +98,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_SMEFTsim_topU3l_MwScheme_UFO_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_SMEFTsim_topU3l_MwScheme_UFO_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 12; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,57 +166,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer 
includes all events + // SCALAR channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a 
+ + // Evaluate the QCD partial amplitudes (jamps) for this given helicity from the Feynman diagrams + // Also compute the running sums over helicities, adding the jamp2, numerator and denominator contributions for this helicity + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! In CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or two, in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef MGONGPUCPP_GPUIMPL - using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -226,1281 +279,277 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 18; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structures for wavefunctions/amplitudes in all events (no kernel splitting yet)! - //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif
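
The reinterpret_cast above relies only on the guarantee that an array of complex SIMD values is laid out as a contiguous block of floating-point values. A minimal standalone sketch of that layout assumption (with double standing in for fptype, a plain real/imaginary pair standing in for cxtype_sv in the scalar neppV=1 case, and nwf=18, nw6=6 as in this subprocess):

  struct ComplexPair { double r, i; };   // stand-in for cxtype_sv (scalar case)
  ComplexPair w_example[18][6];          // like w_sv: nwf=18 wavefunctions of nw6=6 components each
  static_assert( sizeof( w_example ) == 18 * 6 * 2 * sizeof( double ), "unexpected layout" );
  double* wfs_example = reinterpret_cast<double*>( w_example ); // flat fptype*-style view, as in the C++ branch above

In the CUDA branch the same flat view instead points at the allWfs global-memory buffer of size nwf*nw6*2*nevt, since with kernel splitting the wavefunctions of all events must persist across per-diagram kernel launches.
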
// === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++) === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g. 
<> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 72 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][4], +1, w_fp[4], 4 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][5], -1, w_fp[5], 5 ); - - VVV5P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[6] ); - FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[7] ); - FFV1_1( w_fp[4], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( 
w_fp[5], w_fp[8], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 2 OF 72 *** - - // Wavefunction(s) for diagram number 2 - FFV1_2( w_fp[5], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[9], w_fp[4], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 3 OF 72 *** - - // Wavefunction(s) for diagram number 3 - FFV1P0_3( w_fp[5], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[10] ); - - // Amplitude(s) for diagram number 3 - VVV5_0( w_fp[6], w_fp[7], w_fp[10], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += 1. / 2. * amp_sv[0]; - jamp_sv[2] -= 1. / 2. * amp_sv[0]; - jamp_sv[9] -= 1. / 2. * amp_sv[0]; - jamp_sv[10] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 4 OF 72 *** - - // Wavefunction(s) for diagram number 4 - FFV1P0_3( w_fp[5], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[11] ); - FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); - - // Amplitude(s) for diagram number 4 - FFV1_0( w_fp[12], w_fp[4], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 5 OF 72 *** - - // Wavefunction(s) for diagram number 5 - // (none) - - // Amplitude(s) for diagram number 5 - FFV1_0( w_fp[3], w_fp[8], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 6 OF 72 *** - - // Wavefunction(s) for diagram number 6 - FFV1P0_3( w_fp[3], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[8] ); - - // Amplitude(s) for diagram number 6 - VVV5_0( w_fp[6], w_fp[11], w_fp[8], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[3] += 1. / 2. * amp_sv[0]; - jamp_sv[8] += 1. / 2. * amp_sv[0]; - jamp_sv[11] -= 1. / 2. 
* amp_sv[0]; - - // *** DIAGRAM 7 OF 72 *** - - // Wavefunction(s) for diagram number 7 - FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); - - // Amplitude(s) for diagram number 7 - FFV1_0( w_fp[5], w_fp[13], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 8 OF 72 *** - - // Wavefunction(s) for diagram number 8 - // (none) - - // Amplitude(s) for diagram number 8 - FFV1_0( w_fp[9], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 9 OF 72 *** - - // Wavefunction(s) for diagram number 9 - // (none) - - // Amplitude(s) for diagram number 9 - FFV1_0( w_fp[3], w_fp[13], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 10 OF 72 *** - - // Wavefunction(s) for diagram number 10 - // (none) - - // Amplitude(s) for diagram number 10 - FFV1_0( w_fp[12], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 11 OF 72 *** - - // Wavefunction(s) for diagram number 11 - FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); - FFV1_2( w_fp[3], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); - FFV1P0_3( w_fp[5], w_fp[12], COUPs[1], 1.0, 0., 0., w_fp[9] ); - - // Amplitude(s) for diagram number 11 - FFV1_0( w_fp[13], w_fp[4], w_fp[9], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[5] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 12 OF 72 *** - - // Wavefunction(s) for diagram number 12 - // (none) - - // Amplitude(s) for diagram number 12 - FFV1_0( w_fp[13], w_fp[12], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= 1. / 6. * amp_sv[0]; - jamp_sv[5] += 1. / 2. 
* amp_sv[0]; - - // *** DIAGRAM 13 OF 72 *** - - // Wavefunction(s) for diagram number 13 - FFV1_1( w_fp[4], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[6] ); - FFV1P0_3( w_fp[3], w_fp[12], COUPs[1], 1.0, 0., 0., w_fp[14] ); - - // Amplitude(s) for diagram number 13 - FFV1_0( w_fp[5], w_fp[6], w_fp[14], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] -= 1. / 6. * amp_sv[0]; - jamp_sv[5] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 14 OF 72 *** - - // Wavefunction(s) for diagram number 14 - // (none) - - // Amplitude(s) for diagram number 14 - FFV1_0( w_fp[3], w_fp[6], w_fp[9], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] -= 1. / 2. * amp_sv[0]; - jamp_sv[5] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 15 OF 72 *** - - // Wavefunction(s) for diagram number 15 - FFV1_2( w_fp[5], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[15] ); - - // Amplitude(s) for diagram number 15 - FFV1_0( w_fp[15], w_fp[4], w_fp[14], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += 1. / 2. * amp_sv[0]; - jamp_sv[4] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 16 OF 72 *** - - // Wavefunction(s) for diagram number 16 - // (none) - - // Amplitude(s) for diagram number 16 - FFV1_0( w_fp[15], w_fp[12], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += 1. / 6. * amp_sv[0]; - jamp_sv[4] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 17 OF 72 *** - - // Wavefunction(s) for diagram number 17 - FFV1_1( w_fp[12], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); - - // Amplitude(s) for diagram number 17 - FFV1_0( w_fp[5], w_fp[16], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[1] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 18 OF 72 *** - - // Wavefunction(s) for diagram number 18 - // (none) - - // Amplitude(s) for diagram number 18 - VVV5_0( w_fp[1], w_fp[8], w_fp[9], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 19 OF 72 *** - - // Wavefunction(s) for diagram number 19 - // (none) - - // Amplitude(s) for diagram number 19 - FFV1_0( w_fp[3], w_fp[16], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= 1. / 6. * amp_sv[0]; - jamp_sv[1] += 1. / 2. 
* amp_sv[0]; - - // *** DIAGRAM 20 OF 72 *** - - // Wavefunction(s) for diagram number 20 - // (none) - - // Amplitude(s) for diagram number 20 - VVV5_0( w_fp[1], w_fp[10], w_fp[14], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 21 OF 72 *** - - // Wavefunction(s) for diagram number 21 - FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] ); - FFV1_1( w_fp[2], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); - FFV1P0_3( w_fp[14], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[9] ); - - // Amplitude(s) for diagram number 21 - FFV1_0( w_fp[5], w_fp[16], w_fp[9], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[6] += 1. / 6. * amp_sv[0]; - jamp_sv[8] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 22 OF 72 *** - - // Wavefunction(s) for diagram number 22 - // (none) - - // Amplitude(s) for diagram number 22 - FFV1_0( w_fp[14], w_fp[16], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[6] += 1. / 2. * amp_sv[0]; - jamp_sv[8] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 23 OF 72 *** - - // Wavefunction(s) for diagram number 23 - FFV1P0_3( w_fp[14], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[12] ); - - // Amplitude(s) for diagram number 23 - FFV1_0( w_fp[5], w_fp[6], w_fp[12], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] -= 1. / 6. * amp_sv[0]; - jamp_sv[10] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 24 OF 72 *** - - // Wavefunction(s) for diagram number 24 - // (none) - - // Amplitude(s) for diagram number 24 - FFV1_0( w_fp[14], w_fp[6], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] -= 1. / 2. * amp_sv[0]; - jamp_sv[10] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 25 OF 72 *** - - // Wavefunction(s) for diagram number 25 - // (none) - - // Amplitude(s) for diagram number 25 - FFV1_0( w_fp[15], w_fp[4], w_fp[12], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] -= 1. / 6. * amp_sv[0]; - jamp_sv[6] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 26 OF 72 *** - - // Wavefunction(s) for diagram number 26 - // (none) - - // Amplitude(s) for diagram number 26 - FFV1_0( w_fp[15], w_fp[2], w_fp[9], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] -= 1. / 2. * amp_sv[0]; - jamp_sv[6] += 1. / 6. 
* amp_sv[0]; - - // *** DIAGRAM 27 OF 72 *** - - // Wavefunction(s) for diagram number 27 - FFV1_2( w_fp[14], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[17] ); - - // Amplitude(s) for diagram number 27 - FFV1_0( w_fp[17], w_fp[4], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[8] -= 1. / 2. * amp_sv[0]; - jamp_sv[10] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 28 OF 72 *** - - // Wavefunction(s) for diagram number 28 - // (none) - - // Amplitude(s) for diagram number 28 - VVV5_0( w_fp[1], w_fp[11], w_fp[9], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 29 OF 72 *** - - // Wavefunction(s) for diagram number 29 - // (none) - - // Amplitude(s) for diagram number 29 - FFV1_0( w_fp[17], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[8] -= 1. / 6. * amp_sv[0]; - jamp_sv[10] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 30 OF 72 *** - - // Wavefunction(s) for diagram number 30 - // (none) - - // Amplitude(s) for diagram number 30 - VVV5_0( w_fp[1], w_fp[10], w_fp[12], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[6] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 31 OF 72 *** - - // Wavefunction(s) for diagram number 31 - FFV1_1( w_fp[4], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); - FFV1P0_3( w_fp[3], w_fp[12], COUPs[1], 1.0, 0., 0., w_fp[17] ); - - // Amplitude(s) for diagram number 31 - FFV1_0( w_fp[5], w_fp[16], w_fp[17], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[6] += 1. / 6. * amp_sv[0]; - jamp_sv[7] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 32 OF 72 *** - - // Wavefunction(s) for diagram number 32 - FFV1P0_3( w_fp[5], w_fp[12], COUPs[1], 1.0, 0., 0., w_fp[9] ); - - // Amplitude(s) for diagram number 32 - FFV1_0( w_fp[3], w_fp[16], w_fp[9], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[6] += 1. / 2. * amp_sv[0]; - jamp_sv[7] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 33 OF 72 *** - - // Wavefunction(s) for diagram number 33 - // (none) - - // Amplitude(s) for diagram number 33 - FFV1_0( w_fp[13], w_fp[2], w_fp[9], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += 1. / 2. * amp_sv[0]; - jamp_sv[7] -= 1. / 6. 
* amp_sv[0]; - - // *** DIAGRAM 34 OF 72 *** - - // Wavefunction(s) for diagram number 34 - // (none) - - // Amplitude(s) for diagram number 34 - FFV1_0( w_fp[13], w_fp[12], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += 1. / 6. * amp_sv[0]; - jamp_sv[7] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 35 OF 72 *** - - // Wavefunction(s) for diagram number 35 - // (none) - - // Amplitude(s) for diagram number 35 - FFV1_0( w_fp[15], w_fp[2], w_fp[17], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] -= 1. / 2. * amp_sv[0]; - jamp_sv[6] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 36 OF 72 *** - - // Wavefunction(s) for diagram number 36 - // (none) - - // Amplitude(s) for diagram number 36 - FFV1_0( w_fp[15], w_fp[12], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] -= 1. / 6. * amp_sv[0]; - jamp_sv[6] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 37 OF 72 *** - - // Wavefunction(s) for diagram number 37 - FFV1_1( w_fp[12], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] ); - - // Amplitude(s) for diagram number 37 - FFV1_0( w_fp[5], w_fp[14], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += 1. / 2. * amp_sv[0]; - jamp_sv[3] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 38 OF 72 *** - - // Wavefunction(s) for diagram number 38 - // (none) - - // Amplitude(s) for diagram number 38 - VVV5_0( w_fp[1], w_fp[7], w_fp[9], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 39 OF 72 *** - - // Wavefunction(s) for diagram number 39 - // (none) - - // Amplitude(s) for diagram number 39 - FFV1_0( w_fp[3], w_fp[14], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += 1. / 6. * amp_sv[0]; - jamp_sv[3] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 40 OF 72 *** - - // Wavefunction(s) for diagram number 40 - // (none) - - // Amplitude(s) for diagram number 40 - VVV5_0( w_fp[1], w_fp[11], w_fp[17], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 41 OF 72 *** - - // Wavefunction(s) for diagram number 41 - FFV1_2( w_fp[5], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[17] ); - FFV1P0_3( w_fp[17], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[14] ); - - // Amplitude(s) for diagram number 41 - FFV1_0( w_fp[3], w_fp[16], w_fp[14], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[7] -= 1. / 6. * amp_sv[0]; - jamp_sv[9] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 42 OF 72 *** - - // Wavefunction(s) for diagram number 42 - // (none) - - // Amplitude(s) for diagram number 42 - FFV1_0( w_fp[17], w_fp[16], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[7] -= 1. / 2. * amp_sv[0]; - jamp_sv[9] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 43 OF 72 *** - - // Wavefunction(s) for diagram number 43 - FFV1P0_3( w_fp[17], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[9] ); - - // Amplitude(s) for diagram number 43 - FFV1_0( w_fp[13], w_fp[4], w_fp[9], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[5] += 1. / 6. * amp_sv[0]; - jamp_sv[7] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 44 OF 72 *** - - // Wavefunction(s) for diagram number 44 - // (none) - - // Amplitude(s) for diagram number 44 - FFV1_0( w_fp[13], w_fp[2], w_fp[14], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[5] += 1. / 2. * amp_sv[0]; - jamp_sv[7] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 45 OF 72 *** - - // Wavefunction(s) for diagram number 45 - // (none) - - // Amplitude(s) for diagram number 45 - FFV1_0( w_fp[3], w_fp[6], w_fp[9], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[5] += 1. / 6. * amp_sv[0]; - jamp_sv[11] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 46 OF 72 *** - - // Wavefunction(s) for diagram number 46 - // (none) - - // Amplitude(s) for diagram number 46 - FFV1_0( w_fp[17], w_fp[6], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[5] += 1. / 2. * amp_sv[0]; - jamp_sv[11] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 47 OF 72 *** - - // Wavefunction(s) for diagram number 47 - FFV1_2( w_fp[17], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); - - // Amplitude(s) for diagram number 47 - FFV1_0( w_fp[12], w_fp[4], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] += 1. / 2. * amp_sv[0]; - jamp_sv[11] -= 1. / 6. 
* amp_sv[0]; - - // *** DIAGRAM 48 OF 72 *** - - // Wavefunction(s) for diagram number 48 - // (none) - - // Amplitude(s) for diagram number 48 - VVV5_0( w_fp[1], w_fp[7], w_fp[14], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[5] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 49 OF 72 *** - - // Wavefunction(s) for diagram number 49 - // (none) - - // Amplitude(s) for diagram number 49 - FFV1_0( w_fp[12], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[9] += 1. / 6. * amp_sv[0]; - jamp_sv[11] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 50 OF 72 *** - - // Wavefunction(s) for diagram number 50 - // (none) - - // Amplitude(s) for diagram number 50 - VVV5_0( w_fp[1], w_fp[8], w_fp[9], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[7] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 51 OF 72 *** - - // Wavefunction(s) for diagram number 51 - FFV1_1( w_fp[16], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); - - // Amplitude(s) for diagram number 51 - FFV1_0( w_fp[5], w_fp[9], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[8] -= 1. / 2. * amp_sv[0]; - jamp_sv[9] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 52 OF 72 *** - - // Wavefunction(s) for diagram number 52 - VVV5P0_1( w_fp[0], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[12] ); - - // Amplitude(s) for diagram number 52 - FFV1_0( w_fp[5], w_fp[16], w_fp[12], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[7] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[8] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 53 OF 72 *** - - // Wavefunction(s) for diagram number 53 - // (none) - - // Amplitude(s) for diagram number 53 - FFV1_0( w_fp[3], w_fp[9], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[8] -= 1. / 6. * amp_sv[0]; - jamp_sv[9] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 54 OF 72 *** - - // Wavefunction(s) for diagram number 54 - VVV5P0_1( w_fp[0], w_fp[10], COUPs[0], 1.0, 0., 0., w_fp[9] ); - - // Amplitude(s) for diagram number 54 - FFV1_0( w_fp[3], w_fp[16], w_fp[9], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[6] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[9] -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 55 OF 72 *** - - // Wavefunction(s) for diagram number 55 - FFV1_2( w_fp[13], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); - - // Amplitude(s) for diagram number 55 - FFV1_0( w_fp[16], w_fp[4], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[2] += 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 56 OF 72 *** - - // Wavefunction(s) for diagram number 56 - VVV5P0_1( w_fp[0], w_fp[11], COUPs[0], 1.0, 0., 0., w_fp[14] ); - - // Amplitude(s) for diagram number 56 - FFV1_0( w_fp[13], w_fp[4], w_fp[14], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[7] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 57 OF 72 *** - - // Wavefunction(s) for diagram number 57 - // (none) - - // Amplitude(s) for diagram number 57 - FFV1_0( w_fp[16], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= 1. / 6. * amp_sv[0]; - jamp_sv[2] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 58 OF 72 *** - - // Wavefunction(s) for diagram number 58 - // (none) - - // Amplitude(s) for diagram number 58 - FFV1_0( w_fp[13], w_fp[2], w_fp[9], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[5] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 59 OF 72 *** - - // Wavefunction(s) for diagram number 59 - FFV1_1( w_fp[6], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); - - // Amplitude(s) for diagram number 59 - FFV1_0( w_fp[5], w_fp[13], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[10] += 1. / 2. * amp_sv[0]; - jamp_sv[11] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 60 OF 72 *** - - // Wavefunction(s) for diagram number 60 - VVV5P0_1( w_fp[0], w_fp[7], COUPs[0], 1.0, 0., 0., w_fp[16] ); - - // Amplitude(s) for diagram number 60 - FFV1_0( w_fp[5], w_fp[6], w_fp[16], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[5] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[10] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 61 OF 72 *** - - // Wavefunction(s) for diagram number 61 - // (none) - - // Amplitude(s) for diagram number 61 - FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[10] += 1. / 6. * amp_sv[0]; - jamp_sv[11] -= 1. / 2. 
* amp_sv[0]; - - // *** DIAGRAM 62 OF 72 *** - - // Wavefunction(s) for diagram number 62 - // (none) - - // Amplitude(s) for diagram number 62 - FFV1_0( w_fp[3], w_fp[6], w_fp[14], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[4] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[11] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 63 OF 72 *** - - // Wavefunction(s) for diagram number 63 - FFV1_2( w_fp[15], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[6] ); - - // Amplitude(s) for diagram number 63 - FFV1_0( w_fp[6], w_fp[4], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += 1. / 2. * amp_sv[0]; - jamp_sv[3] -= 1. / 6. * amp_sv[0]; - - // *** DIAGRAM 64 OF 72 *** - - // Wavefunction(s) for diagram number 64 - // (none) - - // Amplitude(s) for diagram number 64 - FFV1_0( w_fp[15], w_fp[4], w_fp[16], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[6] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 65 OF 72 *** - - // Wavefunction(s) for diagram number 65 - // (none) - - // Amplitude(s) for diagram number 65 - FFV1_0( w_fp[6], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += 1. / 6. * amp_sv[0]; - jamp_sv[3] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 66 OF 72 *** - - // Wavefunction(s) for diagram number 66 - // (none) - - // Amplitude(s) for diagram number 66 - FFV1_0( w_fp[15], w_fp[2], w_fp[12], COUPs[1], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[4] += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 67 OF 72 *** - - // Wavefunction(s) for diagram number 67 - // (none) - - // Amplitude(s) for diagram number 67 - VVVV1_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += 1. / 2. * amp_sv[0]; - jamp_sv[2] -= 1. / 2. * amp_sv[0]; - jamp_sv[9] -= 1. / 2. * amp_sv[0]; - jamp_sv[10] += 1. / 2. * amp_sv[0]; - VVVV9_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += 1. / 2. * amp_sv[0]; - jamp_sv[5] -= 1. / 2. * amp_sv[0]; - jamp_sv[6] -= 1. / 2. * amp_sv[0]; - jamp_sv[10] += 1. / 2. * amp_sv[0]; - VVVV10_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += 1. / 2. * amp_sv[0]; - jamp_sv[5] -= 1. / 2. * amp_sv[0]; - jamp_sv[6] -= 1. / 2. * amp_sv[0]; - jamp_sv[9] += 1. / 2. 
* amp_sv[0]; - - // *** DIAGRAM 68 OF 72 *** - - // Wavefunction(s) for diagram number 68 - // (none) - - // Amplitude(s) for diagram number 68 - VVV5_0( w_fp[1], w_fp[10], w_fp[16], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += 1. / 2. * amp_sv[0]; - jamp_sv[5] -= 1. / 2. * amp_sv[0]; - jamp_sv[6] -= 1. / 2. * amp_sv[0]; - jamp_sv[10] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 69 OF 72 *** - - // Wavefunction(s) for diagram number 69 - // (none) - - // Amplitude(s) for diagram number 69 - VVV5_0( w_fp[1], w_fp[7], w_fp[9], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[2] += 1. / 2. * amp_sv[0]; - jamp_sv[5] -= 1. / 2. * amp_sv[0]; - jamp_sv[6] -= 1. / 2. * amp_sv[0]; - jamp_sv[9] += 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 70 OF 72 *** - - // Wavefunction(s) for diagram number 70 - // (none) - - // Amplitude(s) for diagram number 70 - VVVV1_0( w_fp[0], w_fp[1], w_fp[11], w_fp[8], COUPs[2], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[3] += 1. / 2. * amp_sv[0]; - jamp_sv[8] += 1. / 2. * amp_sv[0]; - jamp_sv[11] -= 1. / 2. * amp_sv[0]; - VVVV9_0( w_fp[0], w_fp[1], w_fp[11], w_fp[8], COUPs[2], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[4] += 1. / 2. * amp_sv[0]; - jamp_sv[7] += 1. / 2. * amp_sv[0]; - jamp_sv[11] -= 1. / 2. * amp_sv[0]; - VVVV10_0( w_fp[0], w_fp[1], w_fp[11], w_fp[8], COUPs[2], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] -= 1. / 2. * amp_sv[0]; - jamp_sv[4] += 1. / 2. * amp_sv[0]; - jamp_sv[7] += 1. / 2. * amp_sv[0]; - jamp_sv[8] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 71 OF 72 *** - - // Wavefunction(s) for diagram number 71 - // (none) - - // Amplitude(s) for diagram number 71 - VVV5_0( w_fp[1], w_fp[8], w_fp[14], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= 1. / 2. * amp_sv[0]; - jamp_sv[4] += 1. / 2. * amp_sv[0]; - jamp_sv[7] += 1. / 2. * amp_sv[0]; - jamp_sv[11] -= 1. / 2. * amp_sv[0]; - - // *** DIAGRAM 72 OF 72 *** - - // Wavefunction(s) for diagram number 72 - // (none) - - // Amplitude(s) for diagram number 72 - VVV5_0( w_fp[1], w_fp[11], w_fp[12], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[3] -= 1. / 2. * amp_sv[0]; - jamp_sv[4] += 1. / 2. * amp_sv[0]; - jamp_sv[7] += 1. / 2. * amp_sv[0]; - jamp_sv[8] -= 1. / 2. 
* amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_ttxttx()?) - - // The color denominators (initialize all array elements, with ncolor=12) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }; // 1-D array[12] - - // The color matrix (initialize all array elements, with ncolor=12) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 48, 16, 16, 6, 0, 16, -2, 0, -6, -2, -2, 6 }, - { 16, 48, 6, 16, 16, 0, 0, -2, -2, -6, 6, -2 }, - { 16, 6, 48, 16, -2, 0, 0, 16, -2, 6, -6, -2 }, - { 6, 16, 16, 48, 0, -2, 16, 0, 6, -2, -2, -6 }, - { 0, 16, -2, 0, 48, 16, 16, 6, 0, -2, 16, 0 }, - { 16, 0, 0, -2, 16, 48, 6, 16, -2, 0, 0, 16 }, - { -2, 0, 0, 16, 16, 6, 48, 16, 16, 0, 0, -2 }, - { 0, -2, 16, 0, 6, 16, 16, 48, 0, 16, -2, 0 }, - { -6, -2, -2, 6, 0, -2, 16, 0, 48, 16, 16, 6 }, - { -2, -6, 6, -2, -2, 0, 0, 16, 16, 48, 6, 16 }, - { -2, 6, -6, -2, 16, 0, 0, -2, 16, 6, 48, 16 }, - { 6, -2, -2, -6, 0, 16, -2, 0, 6, 16, 16, 48 } }; // 2-D array[12][12] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! 
skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA + iAMB - iBMA + BMB = AMA + BMB (the cross terms cancel for real symmetric M) - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the upper-diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif
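
The identity invoked in these removed comments, and the upper-triangle fold that exploits it, can be checked in isolation. Below is a minimal standalone sketch (a toy ncolor=2 case with illustrative values; plain doubles stand in for the SIMD fptype2_sv types, and the fold is valid here because all color denominators are equal, as in the denom[12] array of threes above):

  inline bool colorSumFoldAgrees()
  {
    constexpr int nc = 2;
    const double cfx[nc][nc] = { { 48, 16 }, { 16, 48 } };    // real symmetric, like cf
    const double den[nc] = { 3, 3 };                          // equal denominators, like denom
    const double A[nc] = { 0.3, -1.2 }, B[nc] = { 0.7, 0.4 }; // Re and Im parts of jamp
    double me1 = 0; // full double sum over the color matrix (as in the CUDA branch below)
    for( int i = 0; i < nc; i++ )
    {
      double ztR = 0, ztI = 0;
      for( int j = 0; j < nc; j++ ) { ztR += cfx[i][j] * A[j]; ztI += cfx[i][j] * B[j]; }
      me1 += ( ztR * A[i] + ztI * B[i] ) / den[i];
    }
    double me2 = 0; // upper-triangle fold with precomputed 2*cf/denom (as in the C++ branch below)
    for( int i = 0; i < nc; i++ )
    {
      double ztR = cfx[i][i] / den[i] * A[i], ztI = cfx[i][i] / den[i] * B[i];
      for( int j = i + 1; j < nc; j++ ) { ztR += 2 * cfx[i][j] / den[i] * A[j]; ztI += 2 * cfx[i][j] / den[i] * B[j]; }
      me2 += A[i] * ztR + B[i] * ztI;
    }
    return ( me1 - me2 ) * ( me1 - me2 ) < 1e-24; // identical up to floating-point rounding
  }
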
- for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif
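
In both branches the jamps pointer is then indexed with the structure-of-arrays convention of the DeviceAccessJamp2 helper defined earlier, where all events of a given color index are contiguous. A minimal sketch of that indexing (a hypothetical standalone helper, with double standing in for fptype):

  // Component icol of event ievt lives at [icol * nevt + ievt]: consecutive GPU threads
  // (consecutive ievt, same icol) therefore read and write consecutive, coalesced addresses.
  inline double& jampAt( double* buffer, const int icol, const int ievt, const int nevt )
  {
    return buffer[icol * nevt + ievt];
  }
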
- ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------ + // --- CHANNELIDS --- + // ------------------ +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIds for all events + const unsigned int* channelIds = allChannelIds; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with a single SCALAR channelId for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - } - // *** STORE THE RESULTS *** - - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); +#endif #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + fptype* numerators = 
nullptr; + fptype* denominators = nullptr; #endif + + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ + + // *** DIAGRAMS 1 TO 72 *** +#ifdef MGONGPUCPP_GPUIMPL + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram7, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram8, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram9, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram10, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram11, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram12, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram13, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram14, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram15, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram16, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram17, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram18, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram19, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram20, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram21, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram22, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram23, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram24, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram25, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + 
gpuLaunchKernelStream( diagram26, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram27, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram28, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram29, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram30, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram31, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram32, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram33, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram34, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram35, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram36, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram37, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram38, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram39, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram40, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram41, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram42, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram43, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram44, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram45, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram46, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram47, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram48, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram49, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram50, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram51, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram52, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, 
couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram53, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram54, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram55, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram56, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram57, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram58, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram59, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram60, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram61, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram62, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram63, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram64, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram65, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram66, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram67, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram68, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram69, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram70, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram71, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram72, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); +#else + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram7( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram8( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram9( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram10( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram11( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram12( wfs, jamps, channelIds, COUPs, 
numerators, denominators ); + diagram13( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram14( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram15( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram16( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram17( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram18( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram19( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram20( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram21( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram22( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram23( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram24( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram25( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram26( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram27( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram28( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram29( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram30( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram31( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram32( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram33( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram34( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram35( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram36( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram37( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram38( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram39( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram40( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram41( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram42( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram43( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram44( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram45( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram46( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram47( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram48( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram49( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram50( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram51( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram52( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram53( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram54( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram55( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram56( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram57( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram58( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram59( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram60( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram61( wfs, jamps, channelIds, COUPs, numerators, denominators 
); + diagram62( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram63( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram64( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram65( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram66( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram67( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram68( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram69( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram70( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram71( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram72( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -1587,7 +636,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -1622,6 +675,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -1664,6 +721,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_SMEFTsim_topU3l_MwScheme_UFO::mdl_MT ); m_masses.push_back( Parameters_SMEFTsim_topU3l_MwScheme_UFO::mdl_MT ); m_masses.push_back( Parameters_SMEFTsim_topU3l_MwScheme_UFO::mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -1766,26 +827,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -1793,25 +854,40 @@ namespace 
mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) + assert( nevt >= neppV ); + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* 
ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -1956,13 +1221,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 1024 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; 
// non-trivial access: buffer includes all events @@ -1974,17 +1233,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -2010,93 +1272,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], 
gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } - } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) Then, in multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? 
&( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this until after the helicity loop to avoid breaking stream parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -2138,7 +1370,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -2161,7 +1393,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -2170,25 +1402,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -2198,8 +1436,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -2215,11 +1455,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -2321,14 +1562,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.h index d207c3303f..b147b40b3b 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.h +++ 
b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_SMEFTsim_topU3l_MwScheme_UFO.h" #include <vector> @@ -75,17 +76,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 64; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 72; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 12; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 18; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 6; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 76; //static const int ncomb = 64; // CPPProcess::ncomb @@ -122,23 +123,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -152,34 +156,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators 
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/color_sum.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/color_sum.cc new file mode 100644 index 0000000000..065151d9f1 --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/color_sum.cc @@ -0,0 +1,393 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
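For orientation on the new color_sum.cc that follows: per event and per helicity it ADDS to the running |M|^2 the quadratic form jamp^dagger * (colorMatrix/colorDenom) * jamp over the ncolor=12 leading-color amplitudes; since the color matrix is real, only the two real quadratic forms in Re(jamp) and Im(jamp) survive (see #475). A minimal standalone sketch of this reference computation, illustrative only and not part of the generated file (the helper name colorSumNaive and the std::complex signature are ours):

#include <complex>
// Reference color sum for one event and one helicity (hypothetical helper, not plugin code):
//   me2 += sum_i sum_j conj(jamp[i]) * ( colorMatrix[i][j] / colorDenom[i] ) * jamp[j]
// which, for a real color matrix, reduces to two real quadratic forms (see #475).
template<int NCOLOR>
double colorSumNaive( const std::complex<double> ( &jamp )[NCOLOR],
                      const double ( &colorMatrix )[NCOLOR][NCOLOR],
                      const double ( &colorDenom )[NCOLOR] )
{
  double me2 = 0;
  for( int icol = 0; icol < NCOLOR; icol++ )
  {
    double ztempR = 0, ztempI = 0; // row icol of the color matrix times Re(jamp) and Im(jamp)
    for( int jcol = 0; jcol < NCOLOR; jcol++ )
    {
      ztempR += colorMatrix[icol][jcol] * jamp[jcol].real();
      ztempI += colorMatrix[icol][jcol] * jamp[jcol].imag();
    }
    me2 += ( ztempR * jamp[icol].real() + ztempI * jamp[icol].imag() ) / colorDenom[icol];
  }
  return me2; // the caller ADDS this to the running sum of |M|^2 over helicities
}

The generated color_sum_cpu below implements the same formula, but folds the per-row 1/colorDenom and the factor 2 for the off-diagonal terms of the symmetric matrix into a constexpr triangular matrix, and vectorizes over SIMD event pages.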
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=12) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }; // 1-D array[12] + + // The color matrix (initialize all array elements, with ncolor=12) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 48, 16, 16, 6, 0, 16, -2, 0, -6, -2, -2, 6 }, + { 16, 48, 6, 16, 16, 0, 0, -2, -2, -6, 6, -2 }, + { 16, 6, 48, 16, -2, 0, 0, 16, -2, 6, -6, -2 }, + { 6, 16, 16, 48, 0, -2, 16, 0, 6, -2, -2, -6 }, + { 0, 16, -2, 0, 48, 16, 16, 6, 0, -2, 16, 0 }, + { 16, 0, 0, -2, 16, 48, 6, 16, -2, 0, 0, 16 }, + { -2, 0, 0, 16, 16, 6, 48, 16, 16, 0, 0, -2 }, + { 0, -2, 16, 0, 6, 16, 16, 48, 0, 16, -2, 0 }, + { -6, -2, -2, 6, 0, -2, 16, 0, 48, 16, 16, 6 }, + { -2, -6, 6, -2, -2, 0, 0, 16, 16, 48, 6, 16 }, + { -2, 6, -6, -2, 16, 0, 0, -2, 16, 6, 48, 16 }, + { 6, -2, -2, -6, 0, 16, -2, 0, 6, 16, 16, 48 } }; // 2-D array[12][12] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each row by its denom) + template<typename T> + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = 
TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA + iAMB - iBMA + BMB = AMA + BMB (the imaginary cross terms cancel because M is symmetric) + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the upper-diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. + fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = 
+#ifdef MGONGPUCPP_GPUIMPL
+  __global__ void
+  color_sum_kernel( fptype* allMEs,          // output: allMEs[nevt], add |M|^2 for this specific helicity
+                    const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity
+  {
+    using J_ACCESS = DeviceAccessJamp;
+    fptype jampR[ncolor];
+    fptype jampI[ncolor];
+    for( int icol = 0; icol < ncolor; icol++ )
+    {
+      cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol );
+      jampR[icol] = jamp.real();
+      jampI[icol] = jamp.imag();
+    }
+    // Loop over icol
+    fptype deltaMEs = { 0 };
+    for( int icol = 0; icol < ncolor; icol++ )
+    {
+      fptype2 ztempR = { 0 };
+      fptype2 ztempI = { 0 };
+      // Loop over jcol
+      for( int jcol = 0; jcol < ncolor; jcol++ )
+      {
+        fptype2 jampRj = jampR[jcol];
+        fptype2 jampIj = jampI[jcol];
+        ztempR += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix
+        ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix
+      }
+      deltaMEs += ztempR * jampR[icol];
+      deltaMEs += ztempI * jampI[icol];
+    }
+    // *** STORE THE RESULTS ***
+    using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events
+    // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s)
+    E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
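[Editor's note] Stripped of the access-class machinery, color_sum_kernel is one thread per event evaluating the full (non-triangular) quadratic form against the device-resident normalized color matrix. A schematic restatement with a hypothetical flat layout (jampsR/jampsI of shape [ncolor][nevt], events contiguous per color; all names here are illustrative only, this compiles standalone with nvcc):

  __global__ void quadFormKernel( float* mes, const float* jampsR, const float* jampsI,
                                  const float* normColMat, int ncol, int nevt )
  {
    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one thread per event
    if( ievt >= nevt ) return;
    float delta = 0;
    for( int i = 0; i < ncol; i++ )
    {
      float ztR = 0, ztI = 0;
      for( int k = 0; k < ncol; k++ )
      {
        ztR += normColMat[i * ncol + k] * jampsR[k * nevt + ievt]; // "new1"-like striding
        ztI += normColMat[i * ncol + k] * jampsI[k * nevt + ievt];
      }
      delta += ztR * jampsR[i * nevt + ievt] + ztI * jampsI[i * nevt + ievt];
    }
    mes[ievt] += delta; // add this helicity's |M|^2 to the running sum
  }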
+#ifdef MGONGPUCPP_GPUIMPL
+#ifndef MGONGPU_HAS_NO_BLAS
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+  __global__ void
+  convertD2F_Jamps( fptype2* allJampsFpt2,   // output: jamp[ncolor*2*nevt] for one specific helicity
+                    const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity
+  {
+    const int nevt = gridDim.x * blockDim.x;
+    const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+    // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied!
+    // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here
+    for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ )
+      for( int icol = 0; icol < ncolor; icol++ )
+        allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding
+  }
+#endif
+#endif
+#endif
+
+  //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+#ifndef MGONGPU_HAS_NO_BLAS
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+  __global__ void
+  convertF2D_MEs( fptype* allMEs,             // output: allMEs[nevt] for one specific helicity
+                  const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity
+  {
+    const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+    allMEs[ievt] = allMEsFpt2[ievt];
+  }
+#endif
+#endif
+#endif
+
+  //--------------------------------------------------------------------------
+
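[Editor's note] Both conversion kernels, and the BLAS calls below, agree on the "new1" striding. The index arithmetic is easy to get wrong, so a small standalone check (plain C++, hypothetical sizes) may help: element (ix2, icol, ievt) lives at ix2*ncolor*nevt + icol*nevt + ievt, i.e. events are contiguous within a (plane, color) column.

  #include <cassert>

  int main()
  {
    const int ncolor = 12, nevt = 16;
    auto idx = [=]( int ix2, int icol, int ievt )
    { return ix2 * ncolor * nevt + icol * nevt + ievt; };
    // Consecutive events of the same (ix2, icol) are adjacent: coalesced loads on the GPU
    assert( idx( 1, 3, 5 ) + 1 == idx( 1, 3, 6 ) );
    // Stride between colors is nevt; stride between the real and imaginary planes is ncolor*nevt
    assert( idx( 0, 4, 0 ) - idx( 0, 3, 0 ) == nevt );
    assert( idx( 1, 0, 0 ) - idx( 0, 0, 0 ) == ncolor * nevt );
    return 0;
  }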
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#ifndef MGONGPU_HAS_NO_BLAS
+  void
+  color_sum_blas( fptype* allMEs,         // output: allMEs[nevt], add |M|^2 for this specific helicity
+                  const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity
+                  fptype2* allBlasTmp,    // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+                  gpuStream_t stream,     // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m)
+#else
+                  gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m)
+#endif
+                  gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+                  const int gpublocks,    // input: cuda gpublocks
+                  const int gputhreads )  // input: cuda gputhreads
+  {
+    const int nevt = gpublocks * gputhreads;
+
+    // Get the address associated with the normalized color matrix in device memory
+    static fptype2* devNormColMat = nullptr;
+    if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 );
+
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+    // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity
+    fptype2* allZtempBoth = allBlasTmp;                                  // start of first fptype2[ncolor*2*nevt] buffer
+    fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt;   // start of second fptype2[ncolor*2*nevt] buffer
+    fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer
+    // Convert jamps from double to float
+    gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps );
+    // Real and imaginary components
+    const fptype2* allJampsReal = allJampsFpt2;
+    const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt;
+#else
+    static_assert( std::is_same<fptype, fptype2>::value ); // sanity check
+    // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer
+    fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer
+    fptype2* allMEsFpt2 = allMEs;
+    // Real and imaginary components
+    const fptype2* allJampsReal = allJamps;                 // this is not a cast (the two types are identical)
+    const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical)
+#endif
+    // Real and imaginary components
+    fptype2* allZtempReal = allZtempBoth;
+    fptype2* allZtempImag = allZtempBoth + ncolor * nevt;
+
+    // Note, new striding for cuBLAS from DeviceAccessJamp:
+    // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1"
+    // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1"
+
+    // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag
+    // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp
+    fptype2 alpha1 = 1;
+    fptype2 beta1 = 0;
+    const int ncolorM = ncolor;
+    const int nevtN = nevt;
+    const int ncolorK = ncolor;
+    checkGpuBlas( gpuBlasTgemm( *pBlasHandle,
+                                GPUBLAS_OP_N, // do not transpose ColMat
+                                GPUBLAS_OP_T, // transpose JampsV (new1)
+                                ncolorM, nevtN, ncolorK,
+                                &alpha1,
+                                devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK
+                                allJampsReal, nevtN,    // JampsV is nevtN x ncolorK (new1)
+                                &beta1,
+                                allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN
+    checkGpuBlas( gpuBlasTgemm( *pBlasHandle,
+                                GPUBLAS_OP_N, // do not transpose ColMat
+                                GPUBLAS_OP_T, // transpose JampsV (new1)
+                                ncolorM, nevtN, ncolorK,
+                                &alpha1,
+                                devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK
+                                allJampsImag, nevtN,    // JampsV is nevtN x ncolorK (new1)
+                                &beta1,
+                                allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN
+
+    // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt]
+    // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME
+    // Use gpuBlasTgemmStridedBatched (e.g. cublasSgemmStridedBatched) to perform these batched dot products in one call
+    fptype2 alpha2 = 1;
+    fptype2 beta2 = 1;
+    checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle,
+                                              GPUBLAS_OP_N, // do not transpose JampsV (new1)
+                                              GPUBLAS_OP_N, // do not transpose Tmp
+                                              1, 1, ncolor, // result is 1x1 (dot product)
+                                              &alpha2,
+                                              allJampsReal, nevt, 1,        // allJamps is nevt x ncolor, stride 1 for each ievt column (new1)
+                                              allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column
+                                              &beta2,
+                                              allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt)
+                                              nevt ) ); // there are nevt "batches"
+    checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle,
+                                              GPUBLAS_OP_N, // do not transpose JampsV (new1)
+                                              GPUBLAS_OP_N, // do not transpose Tmp
+                                              1, 1, ncolor, // result is 1x1 (dot product)
+                                              &alpha2,
+                                              allJampsImag, nevt, 1,        // allJamps is nevt x ncolor, stride 1 for each ievt column (new1)
+                                              allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column
+                                              &beta2,
+                                              allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt)
+                                              nevt ) ); // there are nevt "batches"
+
+#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
+    // Convert MEs from float to double
+    gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 );
+#endif
+  }
+#endif /* clang-format on */
+#endif
+
+  //--------------------------------------------------------------------------
+
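[Editor's note] The two-step BLAS scheme in color_sum_blas replaces the per-event loop with one gemm per real/imag plane (Step 1) plus a batch of 1x1 gemms, i.e. per-event dot products accumulated into the MEs (Step 2). A dependency-free sketch of the same algebra (plain C++, toy sizes, one plane only; the real code additionally handles the transposed "new1" layout and mixed precision):

  #include <cassert>
  #include <cmath>
  #include <vector>

  int main()
  {
    const int ncol = 3, nevt = 4;
    const double C[3][3] = { { 2, 1, 0 }, { 1, 2, 1 }, { 0, 1, 2 } }; // toy normalized color matrix
    std::vector<double> J( ncol * nevt ), Z( ncol * nevt, 0. ), ME( nevt, 0. ), ref( nevt, 0. );
    for( int i = 0; i < ncol * nevt; i++ ) J[i] = 0.1 * i - 0.5; // arbitrary "jamp" values
    // Step 1: Z(i,e) = sum_k C(i,k) * J(k,e), one matrix-matrix product
    for( int e = 0; e < nevt; e++ )
      for( int i = 0; i < ncol; i++ )
        for( int k = 0; k < ncol; k++ )
          Z[i * nevt + e] += C[i][k] * J[k * nevt + e];
    // Step 2: ME(e) += sum_i J(i,e) * Z(i,e), one 1x1 "gemm" (dot product) per event
    for( int e = 0; e < nevt; e++ )
      for( int i = 0; i < ncol; i++ )
        ME[e] += J[i * nevt + e] * Z[i * nevt + e];
    // Reference: the direct quadratic form per event
    for( int e = 0; e < nevt; e++ )
      for( int i = 0; i < ncol; i++ )
        for( int k = 0; k < ncol; k++ )
          ref[e] += J[i * nevt + e] * C[i][k] * J[k * nevt + e];
    for( int e = 0; e < nevt; e++ ) assert( std::fabs( ME[e] - ref[e] ) < 1e-12 );
    return 0;
  }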
+#ifdef MGONGPUCPP_GPUIMPL
+  void
+  color_sum_gpu( fptype* allMEs,         // output: allMEs[nevt], add |M|^2 for this specific helicity
+                 const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity
+                 fptype2* allBlasTmp,    // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity
+                 gpuStream_t stream,     // input: cuda stream (nullptr indicates the default stream)
+                 gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+                 const int gpublocks,    // input: cuda gpublocks
+                 const int gputhreads )  // input: cuda gputhreads
+  {
+#ifdef MGONGPU_HAS_NO_BLAS
+    assert( allBlasTmp == nullptr );  // sanity check for HASBLAS=hasNoBlas
+    assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas
+#endif
+    if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
+    {
+      assert( allBlasTmp == nullptr );
+      gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps );
+    }
+#ifndef MGONGPU_HAS_NO_BLAS
+    else
+    {
+      assert( allBlasTmp != nullptr );
+      color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads );
+    }
+#endif
+  }
+#endif
+
+  //--------------------------------------------------------------------------
+
+} // end namespace
diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/color_sum.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/color_sum.h
new file mode 120000
index 0000000000..24b0157011
--- /dev/null
+++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/color_sum.h
@@ -0,0 +1 @@
+../color_sum.h
\ No newline at end of file
diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/diagram_boilerplate.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/diagram_boilerplate.h
new file mode 120000
index 0000000000..e657b15c20
--- /dev/null
+++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/diagram_boilerplate.h
@@ -0,0 +1 @@
+../diagram_boilerplate.h
\ No newline at end of file
diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/diagrams.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/diagrams.h
new file mode 100644
index 0000000000..9b06366348
--- /dev/null
+++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/P1_Sigma_SMEFTsim_topU3l_MwScheme_UFO_gg_ttxttx/diagrams.h
@@ -0,0 +1,2185 @@
+// Copyright (C) 2020-2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin.
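[Editor's note] Every diagramN kernel in the generated file below follows the same shape: compute or reuse wavefunctions, evaluate one amplitude via a HELAS-style call, then scatter that amplitude into the color-flow array with fixed rational coefficients. Schematically (a hypothetical two-color toy using this codebase's cxtype, not one of the 72 diagrams below):

  __device__ void accumulateToyDiagram( cxtype* jamp, const cxtype amp )
  {
    // +i/2 of the amplitude flows into color flow 0, -i/6 into color flow 1 (toy coefficients)
    jamp[0] += 1. / 2. * cxtype( 0, 1 ) * amp;
    jamp[1] -= 1. / 6. * cxtype( 0, 1 ) * amp;
  }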
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 72 *** + // Wavefunction(s) for diagram number 1 + vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); + vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + oxxxxx( momenta, cIPD[0], cHel[ihel][4], +1, w_fp[4], 4 ); + ixxxxx( momenta, cIPD[0], cHel[ihel][5], -1, w_fp[5], 5 ); + VVV5P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[6] ); + FFV1P0_3( w_fp[3], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[7] ); + FFV1_1( w_fp[4], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[8] ); + // Amplitude(s) for diagram number 1 + FFV1_0( w_fp[5], w_fp[8], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 6. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 72 *** + // Wavefunction(s) for diagram number 2 + FFV1_2( w_fp[5], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); + // Amplitude(s) for diagram number 2 + FFV1_0( w_fp[9], w_fp[4], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 72 *** + // Wavefunction(s) for diagram number 3 + FFV1P0_3( w_fp[5], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[10] ); + // Amplitude(s) for diagram number 3 + VVV5_0( w_fp[6], w_fp[7], w_fp[10], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= 1. / 2. 
* amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 72 *** + // Wavefunction(s) for diagram number 4 + FFV1P0_3( w_fp[5], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[11] ); + FFV1_2( w_fp[3], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); + // Amplitude(s) for diagram number 4 + FFV1_0( w_fp[12], w_fp[4], w_fp[11], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 72 *** + // Wavefunction(s) for diagram number 5 + // (none) + // Amplitude(s) for diagram number 5 + FFV1_0( w_fp[3], w_fp[8], w_fp[11], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 6 OF 72 *** + // Wavefunction(s) for diagram number 6 + FFV1P0_3( w_fp[3], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[8] ); + // Amplitude(s) for diagram number 6 + VVV5_0( w_fp[6], w_fp[11], w_fp[8], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram7( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 7 OF 72 *** + // Wavefunction(s) for diagram number 7 + FFV1_1( w_fp[2], w_fp[6], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); + // Amplitude(s) for diagram number 7 + FFV1_0( w_fp[5], w_fp[13], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 6. 
* cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram8( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 8 OF 72 *** + // Wavefunction(s) for diagram number 8 + // (none) + // Amplitude(s) for diagram number 8 + FFV1_0( w_fp[9], w_fp[2], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram9( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 9 OF 72 *** + // Wavefunction(s) for diagram number 9 + // (none) + // Amplitude(s) for diagram number 9 + FFV1_0( w_fp[3], w_fp[13], w_fp[10], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram10( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 10 OF 72 *** + // Wavefunction(s) for diagram number 10 + // (none) + // Amplitude(s) for diagram number 10 + FFV1_0( w_fp[12], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 6. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram11( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 11 OF 72 *** + // Wavefunction(s) for diagram number 11 + FFV1_1( w_fp[2], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); + FFV1_2( w_fp[3], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); + FFV1P0_3( w_fp[5], w_fp[12], COUPs[1], 1.0, 0., 0., w_fp[9] ); + // Amplitude(s) for diagram number 11 + FFV1_0( w_fp[13], w_fp[4], w_fp[9], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram12( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 12 OF 72 *** + // Wavefunction(s) for diagram number 12 + // (none) + // Amplitude(s) for diagram number 12 + FFV1_0( w_fp[13], w_fp[12], w_fp[10], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram13( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 13 OF 72 *** + // Wavefunction(s) for diagram number 13 + FFV1_1( w_fp[4], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[6] ); + FFV1P0_3( w_fp[3], w_fp[12], COUPs[1], 1.0, 0., 0., w_fp[14] ); + // Amplitude(s) for diagram number 13 + FFV1_0( w_fp[5], w_fp[6], w_fp[14], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram14( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 14 OF 72 *** + // Wavefunction(s) for diagram number 14 + // (none) + // Amplitude(s) for diagram number 14 + FFV1_0( w_fp[3], w_fp[6], w_fp[9], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram15( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 15 OF 72 *** + // Wavefunction(s) for diagram number 15 + FFV1_2( w_fp[5], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[15] ); + // Amplitude(s) for diagram number 15 + FFV1_0( w_fp[15], w_fp[4], w_fp[14], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram16( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 16 OF 72 *** + // Wavefunction(s) for diagram number 16 + // (none) + // Amplitude(s) for diagram number 16 + FFV1_0( w_fp[15], w_fp[12], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 2. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram17( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 17 OF 72 *** + // Wavefunction(s) for diagram number 17 + FFV1_1( w_fp[12], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); + // Amplitude(s) for diagram number 17 + FFV1_0( w_fp[5], w_fp[16], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram18( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 18 OF 72 *** + // Wavefunction(s) for diagram number 18 + // (none) + // Amplitude(s) for diagram number 18 + VVV5_0( w_fp[1], w_fp[8], w_fp[9], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram19( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 19 OF 72 *** + // Wavefunction(s) for diagram number 19 + // (none) + // Amplitude(s) for diagram number 19 + FFV1_0( w_fp[3], w_fp[16], w_fp[10], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram20( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 20 OF 72 *** + // Wavefunction(s) for diagram number 20 + // (none) + // Amplitude(s) for diagram number 20 + VVV5_0( w_fp[1], w_fp[10], w_fp[14], COUPs[0], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram21( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 21 OF 72 *** + // Wavefunction(s) for diagram number 21 + FFV1_2( w_fp[3], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] ); + FFV1_1( w_fp[2], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); + FFV1P0_3( w_fp[14], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[9] ); + // Amplitude(s) for diagram number 21 + FFV1_0( w_fp[5], w_fp[16], w_fp[9], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram22( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 22 OF 72 *** + // Wavefunction(s) for diagram number 22 + // (none) + // Amplitude(s) for diagram number 22 + FFV1_0( w_fp[14], w_fp[16], w_fp[10], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram23( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 23 OF 72 *** + // Wavefunction(s) for diagram number 23 + FFV1P0_3( w_fp[14], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[12] ); + // Amplitude(s) for diagram number 23 + FFV1_0( w_fp[5], w_fp[6], w_fp[12], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram24( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 24 OF 72 *** + // Wavefunction(s) for diagram number 24 + // (none) + // Amplitude(s) for diagram number 24 + FFV1_0( w_fp[14], w_fp[6], w_fp[11], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 6. 
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram25( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 25 OF 72 *** + // Wavefunction(s) for diagram number 25 + // (none) + // Amplitude(s) for diagram number 25 + FFV1_0( w_fp[15], w_fp[4], w_fp[12], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram26( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 26 OF 72 *** + // Wavefunction(s) for diagram number 26 + // (none) + // Amplitude(s) for diagram number 26 + FFV1_0( w_fp[15], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 6. 
+ + //-------------------------------------------------------------------------- + + __global__ void + diagram27( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 27 OF 72 *** + // Wavefunction(s) for diagram number 27 + FFV1_2( w_fp[14], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[17] ); + // Amplitude(s) for diagram number 27 + FFV1_0( w_fp[17], w_fp[4], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram28( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 28 OF 72 *** + // Wavefunction(s) for diagram number 28 + // (none) + // Amplitude(s) for diagram number 28 + VVV5_0( w_fp[1], w_fp[11], w_fp[9], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2.
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram29( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 29 OF 72 *** + // Wavefunction(s) for diagram number 29 + // (none) + // Amplitude(s) for diagram number 29 + FFV1_0( w_fp[17], w_fp[2], w_fp[10], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram30( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 30 OF 72 *** + // Wavefunction(s) for diagram number 30 + // (none) + // Amplitude(s) for diagram number 30 + VVV5_0( w_fp[1], w_fp[10], w_fp[12], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2.
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram31( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 31 OF 72 *** + // Wavefunction(s) for diagram number 31 + FFV1_1( w_fp[4], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); + FFV1P0_3( w_fp[3], w_fp[12], COUPs[1], 1.0, 0., 0., w_fp[17] ); + // Amplitude(s) for diagram number 31 + FFV1_0( w_fp[5], w_fp[16], w_fp[17], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram32( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 32 OF 72 *** + // Wavefunction(s) for diagram number 32 + FFV1P0_3( w_fp[5], w_fp[12], COUPs[1], 1.0, 0., 0., w_fp[9] ); + // Amplitude(s) for diagram number 32 + FFV1_0( w_fp[3], w_fp[16], w_fp[9], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 6.
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram33( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 33 OF 72 *** + // Wavefunction(s) for diagram number 33 + // (none) + // Amplitude(s) for diagram number 33 + FFV1_0( w_fp[13], w_fp[2], w_fp[9], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram34( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 34 OF 72 *** + // Wavefunction(s) for diagram number 34 + // (none) + // Amplitude(s) for diagram number 34 + FFV1_0( w_fp[13], w_fp[12], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 2.
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram35( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 35 OF 72 *** + // Wavefunction(s) for diagram number 35 + // (none) + // Amplitude(s) for diagram number 35 + FFV1_0( w_fp[15], w_fp[2], w_fp[17], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram36( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 36 OF 72 *** + // Wavefunction(s) for diagram number 36 + // (none) + // Amplitude(s) for diagram number 36 + FFV1_0( w_fp[15], w_fp[12], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 2.
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram37( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 37 OF 72 *** + // Wavefunction(s) for diagram number 37 + FFV1_1( w_fp[12], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[14] ); + // Amplitude(s) for diagram number 37 + FFV1_0( w_fp[5], w_fp[14], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram38( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 38 OF 72 *** + // Wavefunction(s) for diagram number 38 + // (none) + // Amplitude(s) for diagram number 38 + VVV5_0( w_fp[1], w_fp[7], w_fp[9], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + }
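
The #ifdef MGONGPUCPP_GPUIMPL split in every signature reflects two coupling layouts: the GPU build passes one flat buffer of dependent couplings for all events, while the C++ build passes an array of pointers, one per (dependent or independent) coupling, already positioned on the current event page, which is why the kernels can index COUPs[0], COUPs[1], COUPs[2] directly. The sketch below illustrates the difference under an assumed event-major packing of (real, imaginary) pairs; the plugin hides the real layout behind its memory-access classes, so the index arithmetic and the names coupGpu/coupCpp here are illustrative only.

  // Assumption-based sketch of the two coupling layouts implied by the signatures.
  #include <cstddef>
  using fptype = double;
  // GPU build: couplings[nevt*ndcoup*2], dependent couplings only; assumed
  // event-major packing with (real, imaginary) stored contiguously per coupling.
  inline const fptype* coupGpu( const fptype* couplings, std::size_t ndcoup, std::size_t ievt, std::size_t icoup )
  {
    return &couplings[( ievt * ndcoup + icoup ) * 2]; // [0] = real, [1] = imag
  }
  // C++ build: COUPs[nxcoup] holds one pointer per coupling, each already
  // positioned on the current event page, so no event index is needed here.
  inline const fptype* coupCpp( const fptype** COUPs, std::size_t icoup )
  {
    return COUPs[icoup];
  }
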
+ + //-------------------------------------------------------------------------- + + __global__ void + diagram39( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 39 OF 72 *** + // Wavefunction(s) for diagram number 39 + // (none) + // Amplitude(s) for diagram number 39 + FFV1_0( w_fp[3], w_fp[14], w_fp[11], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram40( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 40 OF 72 *** + // Wavefunction(s) for diagram number 40 + // (none) + // Amplitude(s) for diagram number 40 + VVV5_0( w_fp[1], w_fp[11], w_fp[17], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 2.
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram41( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 41 OF 72 *** + // Wavefunction(s) for diagram number 41 + FFV1_2( w_fp[5], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[17] ); + FFV1P0_3( w_fp[17], w_fp[4], COUPs[1], 1.0, 0., 0., w_fp[14] ); + // Amplitude(s) for diagram number 41 + FFV1_0( w_fp[3], w_fp[16], w_fp[14], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram42( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 42 OF 72 *** + // Wavefunction(s) for diagram number 42 + // (none) + // Amplitude(s) for diagram number 42 + FFV1_0( w_fp[17], w_fp[16], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 6.
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram43( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 43 OF 72 *** + // Wavefunction(s) for diagram number 43 + FFV1P0_3( w_fp[17], w_fp[2], COUPs[1], 1.0, 0., 0., w_fp[9] ); + // Amplitude(s) for diagram number 43 + FFV1_0( w_fp[13], w_fp[4], w_fp[9], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram44( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 44 OF 72 *** + // Wavefunction(s) for diagram number 44 + // (none) + // Amplitude(s) for diagram number 44 + FFV1_0( w_fp[13], w_fp[2], w_fp[14], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 6.
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram45( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 45 OF 72 *** + // Wavefunction(s) for diagram number 45 + // (none) + // Amplitude(s) for diagram number 45 + FFV1_0( w_fp[3], w_fp[6], w_fp[9], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram46( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 46 OF 72 *** + // Wavefunction(s) for diagram number 46 + // (none) + // Amplitude(s) for diagram number 46 + FFV1_0( w_fp[17], w_fp[6], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 6.
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram47( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 47 OF 72 *** + // Wavefunction(s) for diagram number 47 + FFV1_2( w_fp[17], w_fp[1], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[12] ); + // Amplitude(s) for diagram number 47 + FFV1_0( w_fp[12], w_fp[4], w_fp[7], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram48( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 48 OF 72 *** + // Wavefunction(s) for diagram number 48 + // (none) + // Amplitude(s) for diagram number 48 + VVV5_0( w_fp[1], w_fp[7], w_fp[14], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2.
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram49( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 49 OF 72 *** + // Wavefunction(s) for diagram number 49 + // (none) + // Amplitude(s) for diagram number 49 + FFV1_0( w_fp[12], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram50( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX, including channelIds, numerators and denominators, is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 50 OF 72 *** + // Wavefunction(s) for diagram number 50 + // (none) + // Amplitude(s) for diagram number 50 + VVV5_0( w_fp[1], w_fp[8], w_fp[9], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + }
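
Each kernel ends by folding its amplitude into a few color-ordered sums ("jamps") with small rational, and sometimes imaginary, color coefficients; the jamp indices used in this section run from 0 to 11, i.e. ncolor = 12 for this process. Below is a single-event toy version of the pattern, with a plain array standing in for the J_ACCESS::kernelAccessIcol accessor (jampIcol and accumulateDiagram49 are hypothetical names, not plugin code).

  // Single-event toy model of the jamp accumulation pattern (illustrative only).
  #include <complex>
  using cxtype = std::complex<double>;
  constexpr int ncolor = 12; // jamp indices 0..11 appear in the kernels above
  // Stand-in for J_ACCESS::kernelAccessIcol( jamps, icol ) for one event.
  inline cxtype& jampIcol( cxtype* jamps, int icol ) { return jamps[icol]; }
  // Example: the two updates performed by diagram49 above, for one event;
  // the running jamp sums are later contracted with the color matrix.
  void accumulateDiagram49( cxtype* jamps, const cxtype amp )
  {
    jampIcol( jamps, 9 ) += 1. / 6. * amp;
    jampIcol( jamps, 11 ) -= 1. / 2. * amp;
  }
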
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram51( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 51 OF 72 *** + // Wavefunction(s) for diagram number 51 + FFV1_1( w_fp[16], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[9] ); + // Amplitude(s) for diagram number 51 + FFV1_0( w_fp[5], w_fp[9], w_fp[8], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram52( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 52 OF 72 *** + // Wavefunction(s) for diagram number 52 + VVV5P0_1( w_fp[0], w_fp[8], COUPs[0], 1.0, 0., 0., w_fp[12] ); + // Amplitude(s) for diagram number 52 + FFV1_0( w_fp[5], w_fp[16], w_fp[12], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 7 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram53( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 53 OF 72 *** + // Wavefunction(s) for diagram number 53 + // (none) + // Amplitude(s) for diagram number 53 + FFV1_0( w_fp[3], w_fp[9], w_fp[10], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram54( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 54 OF 72 *** + // Wavefunction(s) for diagram number 54 + VVV5P0_1( w_fp[0], w_fp[10], COUPs[0], 1.0, 0., 0., w_fp[9] ); + // Amplitude(s) for diagram number 54 + FFV1_0( w_fp[3], w_fp[16], w_fp[9], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 6 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram55( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 55 OF 72 *** + // Wavefunction(s) for diagram number 55 + FFV1_2( w_fp[13], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[16] ); + // Amplitude(s) for diagram number 55 + FFV1_0( w_fp[16], w_fp[4], w_fp[11], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram56( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 56 OF 72 *** + // Wavefunction(s) for diagram number 56 + VVV5P0_1( w_fp[0], w_fp[11], COUPs[0], 1.0, 0., 0., w_fp[14] ); + // Amplitude(s) for diagram number 56 + FFV1_0( w_fp[13], w_fp[4], w_fp[14], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram57( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 57 OF 72 *** + // Wavefunction(s) for diagram number 57 + // (none) + // Amplitude(s) for diagram number 57 + FFV1_0( w_fp[16], w_fp[2], w_fp[10], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram58( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 58 OF 72 *** + // Wavefunction(s) for diagram number 58 + // (none) + // Amplitude(s) for diagram number 58 + FFV1_0( w_fp[13], w_fp[2], w_fp[9], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram59( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 59 OF 72 *** + // Wavefunction(s) for diagram number 59 + FFV1_1( w_fp[6], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[13] ); + // Amplitude(s) for diagram number 59 + FFV1_0( w_fp[5], w_fp[13], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram60( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 60 OF 72 *** + // Wavefunction(s) for diagram number 60 + VVV5P0_1( w_fp[0], w_fp[7], COUPs[0], 1.0, 0., 0., w_fp[16] ); + // Amplitude(s) for diagram number 60 + FFV1_0( w_fp[5], w_fp[6], w_fp[16], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 5 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram61( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 61 OF 72 *** + // Wavefunction(s) for diagram number 61 + // (none) + // Amplitude(s) for diagram number 61 + FFV1_0( w_fp[3], w_fp[13], w_fp[11], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram62( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 62 OF 72 *** + // Wavefunction(s) for diagram number 62 + // (none) + // Amplitude(s) for diagram number 62 + FFV1_0( w_fp[3], w_fp[6], w_fp[14], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 4 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) += 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram63( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 63 OF 72 *** + // Wavefunction(s) for diagram number 63 + FFV1_2( w_fp[15], w_fp[0], COUPs[1], 1.0, cIPD[0], cIPD[1], w_fp[6] ); + // Amplitude(s) for diagram number 63 + FFV1_0( w_fp[6], w_fp[4], w_fp[7], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 6. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram64( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 64 OF 72 *** + // Wavefunction(s) for diagram number 64 + // (none) + // Amplitude(s) for diagram number 64 + FFV1_0( w_fp[15], w_fp[4], w_fp[16], COUPs[1], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= 1. / 2. 
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram65( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 65 OF 72 *** + // Wavefunction(s) for diagram number 65 + // (none) + // Amplitude(s) for diagram number 65 + FFV1_0( w_fp[6], w_fp[2], w_fp[8], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 6. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram66( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 66 OF 72 *** + // Wavefunction(s) for diagram number 66 + // (none) + // Amplitude(s) for diagram number 66 + FFV1_0( w_fp[15], w_fp[2], w_fp[12], COUPs[1], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 2.
* cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram67( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 67 OF 72 *** + // Wavefunction(s) for diagram number 67 + // (none) + // Amplitude(s) for diagram number 67 + VVVV1_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 2 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * amp_sv[0]; + VVVV9_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * amp_sv[0]; + VVVV10_0( w_fp[0], w_fp[1], w_fp[7], w_fp[10], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2.
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram68( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 68 OF 72 *** + // Wavefunction(s) for diagram number 68 + // (none) + // Amplitude(s) for diagram number 68 + VVV5_0( w_fp[1], w_fp[10], w_fp[16], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 10 ) += 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram69( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 69 OF 72 *** + // Wavefunction(s) for diagram number 69 + // (none) + // Amplitude(s) for diagram number 69 + VVV5_0( w_fp[1], w_fp[7], w_fp[9], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 2 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 5 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 6 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 9 ) += 1. / 2.
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram70( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 70 OF 72 *** + // Wavefunction(s) for diagram number 70 + // (none) + // Amplitude(s) for diagram number 70 + VVVV1_0( w_fp[0], w_fp[1], w_fp[11], w_fp[8], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 3 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 2. * amp_sv[0]; + VVVV9_0( w_fp[0], w_fp[1], w_fp[11], w_fp[8], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 2. * amp_sv[0]; + VVVV10_0( w_fp[0], w_fp[1], w_fp[11], w_fp[8], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2.
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram71( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 71 OF 72 *** + // Wavefunction(s) for diagram number 71 + // (none) + // Amplitude(s) for diagram number 71 + VVV5_0( w_fp[1], w_fp[8], w_fp[14], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 11 ) -= 1. / 2. * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram72( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIds, numerators and denominators is used also when MGONGPU_SUPPORTS_MULTICHANNEL is not defined + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 72 OF 72 *** + // Wavefunction(s) for diagram number 72 + // (none) + // Amplitude(s) for diagram number 72 + VVV5_0( w_fp[1], w_fp[11], w_fp[12], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 3 ) -= 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 4 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 7 ) += 1. / 2. * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 8 ) -= 1. / 2.
* amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/color_sum.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/color_sum.h new file mode 100644 index 0000000000..71b5b1eaad --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/color_sum.h @@ -0,0 +1,105 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype_ref( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + static __device__ inline const cxtype + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + }; +#else + class HostAccessJamp + { + public: + static inline cxtype_sv& + kernelAccessIcol( cxtype_sv* buffer, const int icol ) + { + return buffer[icol]; + } + static inline cxtype_sv& + kernelAccessIcol( fptype* buffer, const int icol ) + { + return reinterpret_cast<cxtype_sv*>( buffer )[icol]; + } + }; +#endif + +
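The three jamp stridings discussed in DeviceAccessJamp::kernelAccessIcol above are easiest to compare as plain index arithmetic. The following standalone sketch (illustrative only, with hypothetical helper names, not part of the plugin) reproduces the "old", "new1" and "new2" layouts of a buffer of ncolor*2*nevt fptypes:

    // Index of the real (imag==false) or imaginary (imag==true) part of jamp[icol]
    // for event ievt, in each of the three layouts described in color_sum.h.
    #include <cstddef>
    // "old": ncolor blocks of 2*nevt (one Re plane then one Im plane per color), ievt fastest
    inline std::size_t jampIdxOld( std::size_t icol, std::size_t ievt, std::size_t nevt, bool imag )
    {
      return icol * 2 * nevt + ( imag ? nevt : 0 ) + ievt;
    }
    // "new1": one Re plane then one Im plane, each of ncolor*nevt, ievt fastest (cuBLAS-friendly)
    inline std::size_t jampIdxNew1( std::size_t icol, std::size_t ievt, std::size_t nevt, std::size_t ncolor, bool imag )
    {
      return ( imag ? 1 : 0 ) * ncolor * nevt + icol * nevt + ievt;
    }
    // "new2": one Re plane then one Im plane, each of nevt*ncolor, icol fastest
    inline std::size_t jampIdxNew2( std::size_t icol, std::size_t ievt, std::size_t nevt, std::size_t ncolor, bool imag )
    {
      return ( imag ? 1 : 0 ) * nevt * ncolor + ievt * ncolor + icol;
    }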
//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
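For reference, the quantity that the color_sum_cpu and color_sum_gpu functions declared above accumulate per event and helicity is the standard MG5aMC color sum, |M|^2 += sum_ij conj(jamp_i) * C_ij * jamp_j, with C the color matrix. The sketch below is a naive scalar version under that assumption (hypothetical colorSumNaive helper, std::complex types); the real code instead uses vectorized cxtype_sv on CPU and CUDA/HIP kernels or cuBLAS/hipBLAS on GPU, with the normalized color matrix prepared via createNormalizedColorMatrix():

    #include <complex>
    #include <cstddef>
    #include <vector>
    // Naive reference: return the color-summed |M|^2 for one event and one helicity.
    double colorSumNaive( const std::vector<std::complex<double>>& jamp,        // [ncolor] dual amplitudes
                          const std::vector<std::vector<double>>& colorMatrix ) // [ncolor][ncolor], real symmetric
    {
      double me2 = 0;
      for( std::size_t icol = 0; icol < jamp.size(); icol++ )
      {
        std::complex<double> ztemp = 0;
        for( std::size_t jcol = 0; jcol < jamp.size(); jcol++ )
          ztemp += colorMatrix[icol][jcol] * jamp[jcol]; // row-times-vector product C*jamp
        me2 += ( ztemp * std::conj( jamp[icol] ) ).real(); // contract with conj(jamp)
      }
      return me2;
    }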
#=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI350X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/diagram_boilerplate.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/diagram_boilerplate.h new file mode 100644 index 0000000000..96a34fb1bf --- /dev/null +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/diagram_boilerplate.h @@ -0,0 +1,103 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin. 
+ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-local-typedefs" // for CI_ACCESS and CD_ACCESS + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + + //------------- + // GPU only + //------------- + + //using namespace mg5amcGpu; + using W_ACCESS = DeviceAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = DeviceAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( channelIds ); +#endif + + // Wavefunctions + // Buffer wfs for one helicity and nevt events is a DeviceBufferSimple with ( nwf * nevt * nw6 * nx2 ) fptypes + // The striding between the nwf wavefunction buffers is ( nevt * nw6 * nx2 ) fptypes + // Internally diagramXXX methods pass a w_fp[iwf] to ixx/FFV methods (as argument 'fptype wavefunctions[]') + // Internally ixx/FFV methods call 'cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions )' and then use fi[iw6] + // This means that the fi pointer must point to a [RIRIRIRIRIRI] contiguous buffer of size nw6*nx2=12 + // The striding between events is nw6*nx2=12 and this is what W_ACCESS::kernelAccess must respect + // (En passant, note that this means that events cannot be contiguous in the present code, memory is not coalesced) + const int nevt = gridDim.x * blockDim.x; + fptype* w_fp[nwf]; + for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = wfs + iwf * nevt * nw6 * mgOnGpu::nx2; + + // Couplings + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic push +#pragma nv_diag_suppress 186 // e.g. 
<> +#endif + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic pop +#endif + const fptype* COUPs[nxcoup]; + for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; + +#else + + //------------- + // C++ only + //------------- + + //using namespace mg5amcCpu; + using W_ACCESS = HostAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = HostAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current SIMD event page (C++) + unsigned int channelId = *channelIds; +#endif + + // Wavefunctions + // Reinterpret wfs as "cxtype_sv w_sv[nwf][nw6]" and build "fptype* w_fp[nwf]" where "w_fp[iwf] = (fptype*)( w_sv[iwf] )" + fptype (*w_fp)[nw6 * neppV * mgOnGpu::nx2] = (fptype (*)[nw6 * neppV * mgOnGpu::nx2])(wfs); + +#endif + + //------------- + // GPU or C++ + //------------- + + // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) + cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram + fptype* amp_fp; // proof of concept for using fptype* in the interface + amp_fp = reinterpret_cast<fptype*>( amp_sv ); + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) + fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); +#else + assert( channelIds == nullptr ); + assert( numerators == nullptr ); + assert( denominators == nullptr ); +#endif /* clang-format on */ + +#pragma GCC diagnostic pop diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/runTest.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/runTest.cc index 4eec5db13c..216a90a302 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.
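One detail of the diagram_boilerplate.h contract above is worth spelling out: the diagramXXX functions always take channelIds, numerators and denominators, but in builds without MGONGPU_SUPPORTS_MULTICHANNEL the caller must pass nullptr for all three, which the boilerplate asserts. A minimal self-contained mock of that calling convention (hypothetical diagramMock, not plugin code):

    #include <cassert>
    #include <cstdio>
    typedef double fptype;
    // Mock with the same contract as a generated diagramXXX function.
    void diagramMock( fptype* jamps, const unsigned int* channelIds, fptype* numerators, fptype* denominators )
    {
    #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
      // Sanity checks mirroring diagram_boilerplate.h: unused multichannel pointers must be null
      assert( channelIds == nullptr );
      assert( numerators == nullptr );
      assert( denominators == nullptr );
    #endif
      jamps[0] += 1.; // stand-in for the real jamp updates
    }
    int main()
    {
      fptype jamps[2] = { 0, 0 };
      diagramMock( jamps, nullptr, nullptr, nullptr ); // valid call shape in a non-multichannel build
      printf( "jamps[0]=%f\n", jamps[0] );
      return 0;
    }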
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/src/HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h b/epochX/cudacpp/smeft_gg_tttt.sa/src/HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h index 98fc59d3ea..d523fcab47 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/src/HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/src/HelAmps_SMEFTsim_topU3l_MwScheme_UFO.h @@ -2,13 +2,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Sep 2010) for the MG5aMC backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -860,7 +860,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6] - template + template __device__ INLINE void VVV5_0( const fptype allV1[], const fptype allV2[], @@ -872,7 +872,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ INLINE void VVV5P0_1( const fptype allV2[], const fptype allV3[], @@ -885,7 +885,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ INLINE void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -897,7 +897,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ INLINE void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -910,7 +910,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ INLINE void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -923,7 +923,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ INLINE void FFV1P0_3( 
const fptype allF1[], const fptype allF2[], @@ -936,7 +936,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV1_0( const fptype allV1[], const fptype allV2[], @@ -949,7 +949,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV9_0( const fptype allV1[], const fptype allV2[], @@ -962,7 +962,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ INLINE void VVVV10_0( const fptype allV1[], const fptype allV2[], @@ -975,7 +975,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6] - template + template __device__ void VVV5_0( const fptype allV1[], const fptype allV2[], @@ -988,7 +988,7 @@ namespace mg5amcCpu const cxtype_sv* V1 = W_ACCESS::kernelAccessConst( allV1 ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P1[4] = { +cxreal( V1[0] ), +cxreal( V1[1] ), +cximag( V1[1] ), +cximag( V1[0] ) }; @@ -1011,7 +1011,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ void VVV5P0_1( const fptype allV2[], const fptype allV3[], @@ -1024,7 +1024,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P2[4] = { +cxreal( V2[0] ), +cxreal( V2[1] ), +cximag( V2[1] ), +cximag( V2[0] ) }; @@ -1049,7 +1049,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -1062,7 +1062,7 @@ namespace mg5amcCpu const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. 
); const cxtype_sv TMP9 = ( F1[2] * ( F2[4] * ( V3[2] + V3[5] ) + F2[5] * ( V3[3] + cI * V3[4] ) ) + ( F1[3] * ( F2[4] * ( V3[3] - cI * V3[4] ) + F2[5] * ( V3[2] - V3[5] ) ) + ( F1[4] * ( F2[2] * ( V3[2] - V3[5] ) - F2[3] * ( V3[3] + cI * V3[4] ) ) + F1[5] * ( F2[2] * ( -V3[3] + cI * V3[4] ) + F2[3] * ( V3[2] + V3[5] ) ) ) ) ); @@ -1074,7 +1074,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -1087,7 +1087,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F1 = W_ACCESS::kernelAccess( allF1 ); const cxtype cI = cxmake( 0., 1. ); F1[0] = +F2[0] + V3[0]; @@ -1106,7 +1106,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -1119,7 +1119,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F2 = W_ACCESS::kernelAccess( allF2 ); const cxtype cI = cxmake( 0., 1. ); F2[0] = +F1[0] + V3[0]; @@ -1138,7 +1138,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'V3[6]' from the input wavefunctions F1[6], F2[6] - template + template __device__ void FFV1P0_3( const fptype allF1[], const fptype allF2[], @@ -1151,7 +1151,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V3 = W_ACCESS::kernelAccess( allV3 ); const cxtype cI = cxmake( 0., 1. ); V3[0] = +F1[0] + F2[0]; @@ -1169,7 +1169,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ void VVVV1_0( const fptype allV1[], const fptype allV2[], @@ -1184,7 +1184,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. 
); const cxtype_sv TMP10 = ( V1[2] * V4[2] - V1[3] * V4[3] - V1[4] * V4[4] - V1[5] * V4[5] ); @@ -1199,7 +1199,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ void VVVV9_0( const fptype allV1[], const fptype allV2[], @@ -1214,7 +1214,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const cxtype_sv TMP1 = ( V2[2] * V1[2] - V2[3] * V1[3] - V2[4] * V1[4] - V2[5] * V1[5] ); @@ -1229,7 +1229,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], V3[6], V4[6] - template + template __device__ void VVVV10_0( const fptype allV1[], const fptype allV2[], @@ -1244,7 +1244,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); const cxtype_sv* V4 = W_ACCESS::kernelAccessConst( allV4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const cxtype_sv TMP1 = ( V2[2] * V1[2] - V2[3] * V1[3] - V2[4] * V1[4] - V2[5] * V1[5] ); diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc b/epochX/cudacpp/smeft_gg_tttt.sa/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc index e394058ac8..eb2e5744ce 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc +++ b/epochX/cudacpp/smeft_gg_tttt.sa/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.h b/epochX/cudacpp/smeft_gg_tttt.sa/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.h index 6d053c0d16..3f22a38896 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/src/Parameters_SMEFTsim_topU3l_MwScheme_UFO.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -657,7 +657,7 @@ namespace mg5amcCpu #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #endif // Compute the output couplings (e.g. gc10 and gc11) from the input gs - template + template __device__ inline void G2COUP( const fptype gs[], fptype couplings[], @@ -667,12 +667,12 @@ namespace mg5amcCpu using namespace Parameters_SMEFTsim_topU3l_MwScheme_UFO_dependentCouplings; const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr ); - fptype* GC_7s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_7 ); - fptype* GC_6s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_6 ); - fptype* GC_8s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_8 ); - cxtype_sv_ref GC_7s_sv = C_ACCESS::kernelAccess( GC_7s ); - cxtype_sv_ref GC_6s_sv = C_ACCESS::kernelAccess( GC_6s ); - cxtype_sv_ref GC_8s_sv = C_ACCESS::kernelAccess( GC_8s ); + fptype* GC_7s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_7 ); + fptype* GC_6s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_6 ); + fptype* GC_8s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_8 ); + cxtype_sv_ref GC_7s_sv = CD_ACCESS::kernelAccess( GC_7s ); + cxtype_sv_ref GC_6s_sv = CD_ACCESS::kernelAccess( GC_6s ); + cxtype_sv_ref GC_8s_sv = CD_ACCESS::kernelAccess( GC_8s ); GC_7s_sv = couplings_sv.GC_7; GC_6s_sv = couplings_sv.GC_6; GC_8s_sv = couplings_sv.GC_8; diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuConfig.h index d3c4ca5695..8a2804aa04 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). 
// Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for computing the color sums +// For both CUDA and HIP, by default, use BLAS, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!]
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuCxtypes.h index 92d74fd6db..e98e925f2a 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -717,12 +717,24 @@ namespace mg5amcCpu : m_preal( &r ), m_pimag( &i ) {} // copy (create from) const refs cxtype_ref& operator=( const cxtype_ref& ) = delete; //__host__ __device__ cxtype_ref& operator=( cxtype_ref&& c ) {...} // REMOVED! Should copy refs or copy values? 
No longer needed in cxternary - __host__ __device__ cxtype_ref& operator=( const cxtype& c ) + __host__ __device__ cxtype_ref& operator=( const cxtype& c ) // copy (assign) const values { *m_preal = cxreal( c ); *m_pimag = cximag( c ); return *this; - } // copy (assign) non-const values + } + __host__ __device__ cxtype_ref& operator+=( const cxtype& c ) + { + *m_preal += cxreal( c ); + *m_pimag += cximag( c ); + return *this; + } + __host__ __device__ cxtype_ref& operator-=( const cxtype& c ) + { + *m_preal -= cxreal( c ); + *m_pimag -= cximag( c ); + return *this; + } __host__ __device__ operator cxtype() const { return cxmake( *m_preal, *m_pimag ); } private: fptype* const m_preal; // const pointer to non-const fptype R diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/test/cudacpp_test.mk b/epochX/cudacpp/smeft_gg_tttt.sa/test/cudacpp_test.mk index f703a1ae7c..1f9f8bbc46 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/smeft_gg_tttt.sa/test/cudacpp_test.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) @@ -19,7 +19,7 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt b/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt index 1690ef1273..1aa898b488 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt +++ b/epochX/cudacpp/susy_gg_t1t1.mad/CODEGEN_mad_susy_gg_t1t1_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.3 2025-06-12 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -49,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -550,21 +550,21 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Process has 6 diagrams -1 processes with 6 diagrams generated in 0.124 s +1 processes with 6 diagrams generated in 0.113 s Total: 1 processes with 6 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_susy_gg_t1t1 --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4334]  +DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4166]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  INFO: initialize a new directory: CODEGEN_mad_susy_gg_t1t1 INFO: remove old information in CODEGEN_mad_susy_gg_t1t1 -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1 -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1 +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t1 t1~ @1 @@ -576,57 +576,57 @@ FileWriter t1 t1~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_t1t1x -DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  5 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 2, 2: 3, 3: 4, 4: 5, 5: 6} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {2: 1, 3: 2, 4: 3, 5: 4, 6: 5} [model_handling.py at line 1665]  Generated helas calls for 1 subprocesses (6 diagrams) in 0.009 s -Wrote files for 16 helas calls in 0.082 s +Wrote files for 16 helas calls in 0.084 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 3 routines in 0.186 s +ALOHA: aloha creates 3 routines in 0.177 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha 
creates 6 routines in 0.184 s +ALOHA: aloha creates 6 routines in 0.176 s VVV1 VSS1 VSS1 VSS1 VVSS1 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./HelAmps_MSSM_SLHA2.h -INFO: Created file HelAmps_MSSM_SLHA2.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./HelAmps_MSSM_SLHA2.h +INFO: Created file HelAmps_MSSM_SLHA2.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.cc INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1; patch -p4 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file SubProcesses/makefile -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/SubProcesses/P1_gg_t1t1x; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/SubProcesses/P1_gg_t1t1x; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f Hunk #2 succeeded at 215 (offset -12 lines). -DEBUG: p.returncode =  0 [output.py at line 263]  -Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1 done. 
+DEBUG: p.returncode =  0 [output.py at line 268]  +Output to directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1 done. Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/README +/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/README Run "open index.html" to see more information about this process. quit -real 0m2.996s -user 0m2.690s -sys 0m0.299s +real 0m2.968s +user 0m2.658s +sys 0m0.309s Code generation completed in 3 seconds ************************************************************ * * @@ -640,7 +640,7 @@ Code generation completed in 3 seconds * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.3 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -648,9 +648,9 @@ Code generation completed in 3 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt @@ -670,7 +670,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.3 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -678,9 +678,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_t1t1/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/me5_configuration.txt b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/me5_configuration.txt index 68b4c46295..07d8d59d1b 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/proc_card_mg5.dat index 9025117612..154187e345 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.6.0 2024-09-30 * +#* VERSION 3.6.3 2025-06-12 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/run_card.dat b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/run_card.dat index 6b82577032..000832aacd 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/run_card.dat +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/run_card.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/run_card_default.dat b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/run_card_default.dat index b8db871c35..85e1d39035 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Cards/run_card_default.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! 
maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/MGMEVersion.txt b/epochX/cudacpp/susy_gg_t1t1.mad/MGMEVersion.txt index 084e244cea..1ac53bb4bd 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/MGMEVersion.txt +++ b/epochX/cudacpp/susy_gg_t1t1.mad/MGMEVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.3 \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Source/.make_opts b/epochX/cudacpp/susy_gg_t1t1.mad/Source/.make_opts index de3864242b..56ba259c56 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Source/.make_opts +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Source/.make_opts @@ -102,6 +102,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf + alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -113,10 +114,11 @@ ifneq ($(lhapdf),) endif else alfas_functions=alfas_functions + alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif # Helper function to check MG5 version define CHECK_MG5AMC_VERSION python -c 'import re; from distutils.version import StrictVersion; print StrictVersion("$(MG5AMC_VERSION)") >= StrictVersion("$(1)") if re.match("^[\d\.]+$$","$(MG5AMC_VERSION)") else True;' -endef \ No newline at end of file +endef diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Source/alfas_functions.f b/epochX/cudacpp/susy_gg_t1t1.mad/Source/alfas_functions.f index bb69a6384e..84aeff369c 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Source/alfas_functions.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Source/alfas_functions.f @@ -188,6 +188,10 @@ SUBROUTINE NEWTON1(T,A_IN,A_OUT,NLOOP,NF) A_OUT=A_IN/(1D0+A_IN*B0(NF)*T) IF (NLOOP .EQ. 1) RETURN + if (1D0+A_IN*B0(NF)*T.le.0d0)THEN + A_OUT = 9d98 + RETURN + ENDIF A_OUT=A_IN/(1D0+B0(NF)*A_IN*T+C1(NF)*A_IN*LOG(1D0+A_IN*B0(NF)*T)) IF (A_OUT .LT. 
0D0) AS=0.3D0 30 AS=A_OUT diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Source/cuts.inc b/epochX/cudacpp/susy_gg_t1t1.mad/Source/cuts.inc index 23d099e5f7..a8ccc7420d 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Source/cuts.inc +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Source/cuts.inc @@ -37,7 +37,7 @@ C REAL*8 misset,missetmax,ptheavy REAL*8 ptllmin,ptllmax integer maxjetflavor - REAl*8 dsqrt_shat + REAl*8 dsqrt_shat,dsqrt_shatmax COMMON /to_min_max_cuts/ & PTJmax,PTBmax,PTAmax,PTLmax, @@ -60,7 +60,7 @@ C & ht2max,ht3max,ht4max, & htjmin,htjmax,ihtmin,ihtmax, & misset,missetmax,ptheavy, - & ptllmin,ptllmax,dsqrt_shat, + & ptllmin,ptllmax,dsqrt_shat,dsqrt_shatmax, & maxjetflavor C diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Source/make_opts b/epochX/cudacpp/susy_gg_t1t1.mad/Source/make_opts index e4b87ee6ad..f10336e42e 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Source/make_opts +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Source/make_opts @@ -103,6 +103,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf +alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -114,6 +115,7 @@ endif endif else alfas_functions=alfas_functions +alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Source/makefile b/epochX/cudacpp/susy_gg_t1t1.mad/Source/makefile index 291ca907ee..87a9e61723 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Source/makefile +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Source/makefile @@ -37,10 +37,12 @@ all: $(LIBRARIES) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDI $(LIBDIR)libdsample.$(libext): $(DSAMPLE) $(call CREATELIB, $@, $^) $(LIBDIR)libgeneric.$(libext): $(GENERIC) + rm -f $@ 2>/dev/null $(call CREATELIB, $@, $^) + rm -f $(alfas_to_clean) 2>/dev/null $(LIBDIR)libdhelas.$(libext): DHELAS cd DHELAS; make; cd .. -$(LIBDIR)libpdf.$(libext): PDF make_opts +$(LIBDIR)libpdf.$(libext): PDF $(alfas_functions).o cd PDF; make; cd .. ifneq (,$(filter edff chff, $(pdlabel1) $(pdlabel2))) $(LIBDIR)libgammaUPC.$(libext): PDF/gammaUPC @@ -73,6 +75,7 @@ $(BINDIR)gensudgrid: $(GENSUDGRID) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUP # Dependencies dsample.o: DiscreteSampler.o dsample.f genps.inc StringCast.o vector.inc +pawgraph.o: vector.inc DiscreteSampler.o: StringCast.o invarients.o: invarients.f genps.inc gen_ximprove.o: gen_ximprove.f run_config.inc run_card.inc diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/Source/run_card.inc b/epochX/cudacpp/susy_gg_t1t1.mad/Source/run_card.inc index 1a1bc782bd..8bd5f73840 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/Source/run_card.inc +++ b/epochX/cudacpp/susy_gg_t1t1.mad/Source/run_card.inc @@ -88,6 +88,8 @@ DSQRT_SHAT = 0.000000000000000D+00 + DSQRT_SHATMAX = -1 + LIMHEL = 0.000000000000000D+00 PTJ = 2.000000000000000D+01 diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/Bridge.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/Bridge.h index 87aa648dd2..a45024704a 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. 
Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -255,11 +255,15 @@ namespace mg5amcCpu throw std::logic_error( "Bridge constructor: FIXME! cannot choose gputhreads" ); // this should never happen! m_gpublocks = m_nevt / m_gputhreads; } +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters @@ -290,8 +294,10 @@ namespace mg5amcCpu throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -347,7 +353,9 @@ namespace mg5amcCpu if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -400,7 +408,9 @@ namespace mg5amcCpu } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
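The MGONGPUCPP_VERBOSE guards added in Bridge.h above compile the instantiation banners and the abnormal-ME flagging out of default builds. A minimal sketch of the pattern, assuming the macro is supplied on the compiler command line (the flag spelling and function name below are illustrative, not taken from this patch):

#include <iostream>
// Built with e.g. -DMGONGPUCPP_VERBOSE the banner is printed; without it the whole
// body compiles away (illustrative sketch of the guard used in Bridge.h)
inline void bridgeBannerSketch( int nevt )
{
#ifdef MGONGPUCPP_VERBOSE
  std::cout << "WARNING! Instantiate host Bridge (nevt=" << nevt << ")" << std::endl;
#else
  (void)nevt; // avoid an unused-parameter warning in quiet builds
#endif
}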
#ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //--------------------------------------------------------------------------
#elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin.
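The gpuBlasT* aliases at the end of GpuAbstraction.h above select the S- or D-precision cuBLAS/hipBLAS entry points at compile time via MGONGPU_FPTYPE2_FLOAT, so the color-sum code can be written once for both precisions. A minimal sketch of a GEMV-based color sum, assuming a column-major nevt-by-ncolor buffer of real jamp components and a real color vector (the function and buffer names are illustrative, not from this patch):

#ifndef MGONGPU_HAS_NO_BLAS
#include "GpuAbstraction.h"
#include <cassert>
// me2 += jampRe * colorVec in whichever precision fptype2 maps to (fptype2 from mgOnGpuConfig.h)
inline void colorSumGemvSketch( gpuBlasHandle_t handle, const fptype2* jampRe, const fptype2* colorVec, fptype2* me2, int nevt, int ncolor )
{
  const fptype2 alpha = 1; // scale of the matrix-vector product
  const fptype2 beta = 1;  // accumulate into the existing me2 contents
  gpuBlasStatus_t status = gpuBlasTgemv( handle, GPUBLAS_OP_N, nevt, ncolor, &alpha, jampRe, nevt, colorVec, 1, &beta, me2, 1 );
  assert( status == GPUBLAS_STATUS_SUCCESS );
}
#endif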
#ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MGVersion.txt index 084e244cea..1ac53bb4bd 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.3 \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.cc index f463977c1a..a68ae314eb 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
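The checkGpuBlas wrapper defined in GpuRuntime.h above mirrors the existing checkGpu macro: it prints the failing status with file and line and then asserts. A minimal usage sketch, modelled on the per-helicity handle setup that MatrixElementKernels.cc performs below (the function name is illustrative):

#if defined MGONGPUCPP_GPUIMPL && !defined MGONGPU_HAS_NO_BLAS
#include "GpuRuntime.h"
// Create one cuBLAS/hipBLAS handle and bind it to an existing stream, asserting on failure
inline void createBlasHandleOnStreamSketch( gpuBlasHandle_t& handle, gpuStream_t stream )
{
  checkGpuBlas( gpuBlasCreate( &handle ) );
  checkGpuBlas( gpuBlasSetStream( handle, stream ) );
}
#endif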
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,28 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() + , m_pHelWfs() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_helBlasHandles() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +353,82 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); + // Create the "one-helicity" wavefunction buffer that will be used for helicity filtering + m_pHelWfs.reset( new DeviceBufferSimple( CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? 
+ std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { +#ifndef MGONGPU_HAS_NO_BLAS + if( m_helBlasHandles[ihel] ) gpuBlasDestroy( m_helBlasHandles[ihel] ); +#endif + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +445,61 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. 
Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity + // Attach a different stream to each cuBLAS/hipBLAS handle + if( m_blasColorSum ) + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + checkGpuBlas( gpuBlasCreate( &m_helBlasHandles[ighel] ) ); + checkGpuBlas( gpuBlasSetStream( m_helBlasHandles[ighel], m_helStreams[ighel] ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_helBlasHandles[ighel], CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelWfs.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +507,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* ghelBlasHandles = ( m_blasColorSum ? m_helBlasHandles : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* ghelBlasHandles = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +527,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? 
m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.h index 7acff4b308..c901874333 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] - static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,24 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelJamps; + + // The super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelWfs; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +220,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel 
cuBLAS/hipBLAS temporary buffers + std::unique_ptr m_pHelBlasTmp; + + // The array of cuBLAS/hipBLAS handles (one for each good helicity) + gpuBlasHandle_t m_helBlasHandles[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryAccessAmplitudes.h index 0d92f69c43..a49f041e05 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessAmplitudes_H #define MemoryAccessAmplitudes_H 1 @@ -10,10 +10,6 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_AMPLITUDES 1 - // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -23,120 +19,11 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // A class describing the internal layout of memory buffers for amplitudes - // This implementation uses an AOSOA[npagA][nx2][neppA] where nevt=npagA*neppA - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessAmplitudesBase //_AOSOAv1 - { - public: - - // Number of Events Per Page in the amplitude AOSOA memory buffer layout - static constexpr int neppA = 1; // AOS (just a test...) 
-
-  private:
-
-    friend class MemoryAccessHelper<MemoryAccessAmplitudesBase>;
-    friend class KernelAccessHelper<MemoryAccessAmplitudesBase, true>;
-    friend class KernelAccessHelper<MemoryAccessAmplitudesBase, false>;
-
-    // The number of floating point components of a complex number
-    static constexpr int nx2 = mgOnGpu::nx2;
-
-    //--------------------------------------------------------------------------
-    // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )"
-    // (in other words: first locate the event record for a given event, then locate an element in that record)
-    //--------------------------------------------------------------------------
-
-    // Locate an event record (output) in a memory buffer (input) from the given event number (input)
-    // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===]
-    static __host__ __device__ inline fptype*
-    ieventAccessRecord( fptype* buffer,
-                        const int ievt )
-    {
-      const int ipagA = ievt / neppA; // #event "A-page"
-      const int ieppA = ievt % neppA; // #event in the current event A-page
-      constexpr int ix2 = 0;
-      return &( buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA] ); // AOSOA[ipagA][ix2][ieppA]
-    }
-
-    //--------------------------------------------------------------------------
-
-    // Locate a field (output) of an event record (input) from the given field indexes (input)
-    // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===]
-    // [NB: expand variadic template "Ts... args" to "const int ix2" and rename "Field" as "Ix2"]
-    static __host__ __device__ inline fptype&
-    decodeRecord( fptype* buffer,
-                  const int ix2 )
-    {
-      constexpr int ipagA = 0;
-      constexpr int ieppA = 0;
-      return buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA]; // AOSOA[ipagA][ix2][ieppA]
-    }
-  };
-
-  //----------------------------------------------------------------------------
-
-  // A class providing access to memory buffers for a given event, based on explicit event numbers
-  // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations
-  class MemoryAccessAmplitudes : public MemoryAccessAmplitudesBase
-  {
-  public:
-
-    // Locate an event record (output) in a memory buffer (input) from the given event number (input)
-    // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===]
-    static constexpr auto ieventAccessRecord = MemoryAccessHelper<MemoryAccessAmplitudesBase>::ieventAccessRecord;
-
-    // Locate an event record (output) in a memory buffer (input) from the given event number (input)
-    // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===]
-    static constexpr auto ieventAccessRecordConst = MemoryAccessHelper<MemoryAccessAmplitudesBase>::ieventAccessRecordConst;
-
-    // Locate a field (output) of an event record (input) from the given field indexes (input)
-    // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===]
-    static constexpr auto decodeRecordIx2 = MemoryAccessHelper<MemoryAccessAmplitudesBase>::decodeRecord;
-
-    // Locate a field (output) of an event record (input) from the given field indexes (input)
-    // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===]
-    static constexpr auto decodeRecordIx2Const =
-      MemoryAccessHelper<MemoryAccessAmplitudesBase>::template decodeRecordConst<int>;
-
-    // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input)
-    // [Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt, const int ix2 ) <===]
-    static constexpr auto ieventAccessIx2 =
-      MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessField<int>;
-
-    // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input)
-    // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===]
-    static constexpr auto ieventAccessIx2Const =
-      MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessFieldConst<int>;
-  };
-
-#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES
-
-  //----------------------------------------------------------------------------
-
-  // A class providing access to memory buffers for a given event, based on implicit kernel rules
-  // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations
+  // A class providing trivial access to amplitude memory buffers
   template<bool onDevice>
   class KernelAccessAmplitudes
   {
   public:
-
-#ifndef MGONGPU_TRIVIAL_AMPLITUDES
-
-    // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
-    // [Signature (non-const) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===]
-    static constexpr auto kernelAccessIx2 =
-      KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessField<int>;
-
-    // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
-    // [Signature (const) ===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===]
-    static constexpr auto kernelAccessIx2Const =
-      KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessFieldConst<int>;
-
-#else
-
     static __host__ __device__ inline cxtype_sv*
     kernelAccess( fptype* buffer )
     {
@@ -148,8 +35,6 @@ namespace mg5amcCpu
     {
       return reinterpret_cast<const cxtype_sv*>( buffer );
     }
-
-#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES
   };

   //----------------------------------------------------------------------------
diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryAccessCouplings.h
index 3802fa57c0..26345d4b43 100644
--- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryAccessCouplings.h
+++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryAccessCouplings.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
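As an aside on the AOSOA bookkeeping removed above: the sketch below is illustrative only (not part of the patch). It mirrors the removed MemoryAccessAmplitudesBase logic in stand-alone C++, with float standing in for fptype and all names local to the sketch, to show how "accessField = decodeRecord( accessRecord )" composes.

    #include <cassert>

    constexpr int neppA = 1; // events per "A-page" (neppA=1 degenerates to an AOS)
    constexpr int nx2 = 2;   // floating point components of a complex amplitude (Re, Im)

    // Step 1: locate the record (A-page slice) that contains event ievt
    inline float* ieventAccessRecord( float* buffer, int ievt )
    {
      const int ipagA = ievt / neppA; // index of the A-page
      const int ieppA = ievt % neppA; // index of the event inside that A-page
      return &buffer[ipagA * nx2 * neppA + 0 * neppA + ieppA]; // AOSOA[ipagA][0][ieppA]
    }

    // Step 2: locate one field (ix2=0 for Re, ix2=1 for Im) inside that record
    inline float& decodeRecord( float* record, int ix2 )
    {
      return record[ix2 * neppA]; // AOSOA[.][ix2][.] relative to the record base
    }

    // Composition, exactly as the removed comment states: accessField = decodeRecord( accessRecord )
    inline float& ieventAccessIx2( float* buffer, int ievt, int ix2 )
    {
      return decodeRecord( ieventAccessRecord( buffer, ievt ), ix2 );
    }

    int main()
    {
      float buf[8] = {};                 // 4 events x (Re,Im), with neppA=1
      ieventAccessIx2( buf, 2, 1 ) = 42; // Im part of event 2
      assert( buf[2 * nx2 + 1] == 42 );  // matches the flat AOS offset
      return 0;
    }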
 #ifndef MemoryAccessCouplings_H
 #define MemoryAccessCouplings_H 1
@@ -235,7 +235,7 @@ namespace mg5amcCpu
     /*
     fptype_sv& real = kernelAccessIx2( buffer, 0 );
     fptype_sv& imag = kernelAccessIx2( buffer, 1 );
-    printf( "C_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag );
+    printf( "CD_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag );
     return cxtype_sv_ref( real, imag );
     */
     return cxtype_sv_ref( kernelAccessIx2( buffer, 0 ),
@@ -250,7 +250,7 @@ namespace mg5amcCpu
     /*
     const fptype_sv& real = kernelAccessIx2Const( buffer, 0 );
     const fptype_sv& imag = kernelAccessIx2Const( buffer, 1 );
-    printf( "C_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag );
+    printf( "CD_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag );
     return cxtype_sv( real, imag );
     */
     return cxtype_sv( kernelAccessIx2Const( buffer, 0 ),
diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryAccessWavefunctions.h
index 9f4c620bc7..bbffc1fb36 100644
--- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryAccessWavefunctions.h
+++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryAccessWavefunctions.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin.

 #ifndef MemoryAccessWavefunctions_H
 #define MemoryAccessWavefunctions_H 1
@@ -10,9 +10,7 @@

 #include "mgOnGpuCxtypes.h"

-#include "MemoryAccessHelpers.h"
-
-#define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1
+#include "CPPProcess.h"

 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725)
 #ifdef MGONGPUCPP_GPUIMPL
@@ -23,147 +21,44 @@ namespace mg5amcCpu
 {
   //----------------------------------------------------------------------------

-#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS
-
-  // A class describing the internal layout of memory buffers for wavefunctions
-  // This implementation uses an AOSOA[npagW][nw6][nx2][neppW] where nevt=npagW*neppW
-  // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name]
-  class MemoryAccessWavefunctionsBase //_AOSOAv1
+#ifdef MGONGPUCPP_GPUIMPL
+  class DeviceAccessWavefunctions
   {
   public:
-
-    // Number of Events Per Page in the wavefunction AOSOA memory buffer layout
-    static constexpr int neppW = 1; // AOS (just a test...)
-
-  private:
-
-    friend class MemoryAccessHelper<MemoryAccessWavefunctionsBase>;
-    friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, true>;
-    friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, false>;
-
-    // The number of components of a (fermion or vector) wavefunction
-    static constexpr int nw6 = mgOnGpu::nw6;
-
-    // The number of floating point components of a complex number
-    static constexpr int nx2 = mgOnGpu::nx2;
-
-    //--------------------------------------------------------------------------
-    // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )"
-    // (in other words: first locate the event record for a given event, then locate an element in that record)
-    //--------------------------------------------------------------------------
-
-    // Locate an event record (output) in a memory buffer (input) from the given event number (input)
-    // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===]
-    static __host__ __device__ inline fptype*
-    ieventAccessRecord( fptype* buffer,
-                        const int ievt )
+    static __host__ __device__ inline cxtype_sv*
+    kernelAccess( fptype* buffer )
     {
-      const int ipagW = ievt / neppW; // #event "W-page"
-      const int ieppW = ievt % neppW; // #event in the current event W-page
-      constexpr int iw6 = 0;
-      constexpr int ix2 = 0;
-      return &( buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW] ); // AOSOA[ipagW][iw6][ix2][ieppW]
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      return reinterpret_cast<cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 );
     }
-
-    //--------------------------------------------------------------------------
-
-    // Locate a field (output) of an event record (input) from the given field indexes (input)
-    // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===]
-    // [NB: expand variadic template "Ts... args" to "const int iw6, const int ix2" and rename "Field" as "Iw6Ix2"]
-    static __host__ __device__ inline fptype&
-    decodeRecord( fptype* buffer,
-                  const int iw6,
-                  const int ix2 )
+    static __host__ __device__ inline const cxtype_sv*
+    kernelAccessConst( const fptype* buffer )
     {
-      constexpr int ipagW = 0;
-      constexpr int ieppW = 0;
-      return buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW]; // AOSOA[ipagW][iw6][ix2][ieppW]
+      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+      return reinterpret_cast<const cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 );
     }
   };
+#endif

   //----------------------------------------------------------------------------

-  // A class providing access to memory buffers for a given event, based on explicit event numbers
-  // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations
-  class MemoryAccessWavefunctions : public MemoryAccessWavefunctionsBase
-  {
-  public:
-
-    // Locate an event record (output) in a memory buffer (input) from the given event number (input)
-    // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===]
-    static constexpr auto ieventAccessRecord = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecord;
-
-    // Locate an event record (output) in a memory buffer (input) from the given event number (input)
-    // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===]
-    static constexpr auto ieventAccessRecordConst = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecordConst;
-
-    // Locate a field (output) of an event record (input) from the given field indexes (input)
-    // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int iw6, const int ix2 ) <===]
-    static constexpr auto decodeRecordIw6Ix2 = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::decodeRecord;
-
-    // Locate a field (output) of an event record (input) from the given field indexes (input)
-    // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int iw6, const int ix2 ) <===]
-    static constexpr auto decodeRecordIw6Ix2Const =
-      MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template decodeRecordConst<int, int>;
-
-    // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input)
-    // [Signature (non-const) ===> fptype& ieventAccessIw6Ix2( fptype* buffer, const ievt, const int iw6, const int ix2 ) <===]
-    static constexpr auto ieventAccessIw6Ix2 =
-      MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessField<int, int>;
-
-    // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input)
-    // [Signature (const) ===> const fptype& ieventAccessIw6Ix2Const( const fptype* buffer, const ievt, const int iw6, const int ix2 ) <===]
-    static constexpr auto ieventAccessIw6Ix2Const =
-      MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessFieldConst<int, int>;
-  };
-
-#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS
-
-  //----------------------------------------------------------------------------
-
-  // A class providing access to memory buffers for a given event, based on implicit kernel rules
-  // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations
-  template<bool onDevice>
-  class KernelAccessWavefunctions
+  class HostAccessWavefunctions
   {
   public:
-
-#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS
-
-    // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
-    // [Signature (non-const) ===> fptype& kernelAccessIw6Ix2( fptype* buffer, const int iw6, const int ix2 ) <===]
-    static constexpr auto kernelAccessIw6Ix2 =
-      KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessField<int, int>;
-
-    // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
-    // [Signature (const) ===> const fptype& kernelAccessIw6Ix2Const( const fptype* buffer, const int iw6, const int ix2 ) <===]
-    static constexpr auto kernelAccessIw6Ix2Const =
-      KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessFieldConst<int, int>;
-
-#else
-
     static __host__ __device__ inline cxtype_sv*
     kernelAccess( fptype* buffer )
     {
       return reinterpret_cast<cxtype_sv*>( buffer );
     }
-
     static __host__ __device__ inline const cxtype_sv*
     kernelAccessConst( const fptype* buffer )
     {
       return reinterpret_cast<const cxtype_sv*>( buffer );
     }
-
-#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS
   };

   //----------------------------------------------------------------------------

-  typedef KernelAccessWavefunctions<false> HostAccessWavefunctions;
-  typedef KernelAccessWavefunctions<true> DeviceAccessWavefunctions;
-
-  //----------------------------------------------------------------------------
-
 } // end namespace mg5amcGpu/mg5amcCpu

 #endif // MemoryAccessWavefunctions_H
diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryBuffers.h
index 5bd3053393..0ddc356e1a 100644
--- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryBuffers.h
+++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MemoryBuffers.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
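To recap the MemoryAccessWavefunctions.h change just above: wavefunction buffers now use a trivial per-event layout, where each event owns a contiguous slice of CPPProcess::nw6 * mgOnGpu::nx2 floating point numbers, and on the device the event index comes from the CUDA thread coordinates. The stand-alone CUDA-style sketch below (illustrative only, not part of the patch; compile with nvcc; hardcoded nw6=6, nx2=2; all names local to the sketch) shows the same offset arithmetic.

    #include <cstddef>

    constexpr int nw6 = 6; // components of a fermion or vector wavefunction
    constexpr int nx2 = 2; // real and imaginary parts

    // Flat offset of (event ievt, component iw6, part ix2) in the per-event AOS layout
    // selected by "buffer + ievt * nw6 * nx2" in the new DeviceAccessWavefunctions
    __host__ __device__ inline std::size_t wfOffset( int ievt, int iw6, int ix2 )
    {
      return static_cast<std::size_t>( ievt ) * nw6 * nx2 // skip earlier events
             + static_cast<std::size_t>( iw6 ) * nx2      // skip earlier components
             + ix2;                                       // Re (0) or Im (1)
    }

    // Device access: the event index is implicit in the thread coordinates (1D grid)
    __device__ inline double* deviceAccess( double* buffer )
    {
      const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one event per thread
      return buffer + wfOffset( ievt, 0, 0 ); // base of this event's nw6 complex values
    }

    // Host access: the caller has already positioned the pointer on a single event
    inline double* hostAccess( double* buffer ) { return buffer; }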
 #ifndef MemoryBuffers_H
 #define MemoryBuffers_H 1
@@ -34,6 +34,7 @@ namespace mg5amcCpu
     static constexpr size_t nparf = CPPProcess::nparf;
     static constexpr size_t npar = CPPProcess::npar;
     static constexpr size_t ndcoup = Parameters_MSSM_SLHA2_dependentCouplings::ndcoup;
+    static constexpr size_t ncolor = CPPProcess::ncolor;
   }

   //--------------------------------------------------------------------------
@@ -69,8 +70,8 @@ namespace mg5amcCpu
   protected:
     BufferBase( const size_t size, const bool onDevice )
       : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {}
-    virtual ~BufferBase() {}
   public:
+    virtual ~BufferBase() {}
     T* data() { return m_data; }
     const T* data() const { return m_data; }
     T& operator[]( const size_t index ) { return m_data[index]; }
@@ -167,8 +168,14 @@ namespace mg5amcCpu
   public:
     HostBuffer( const size_t nevt )
       : NumberOfEvents( nevt )
-      , HostBufferBase<T>( sizePerEvent * nevt ) {}
-    virtual ~HostBuffer() {}
+      , HostBufferBase<T>( sizePerEvent * nevt )
+    {
+      //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl;
+    }
+    virtual ~HostBuffer()
+    {
+      //std::cout << "HostBuffer::dtor " << this << std::endl;
+    }
     virtual size_t nevt() const override final { return NumberOfEvents::nevt(); }
   };
 #endif
@@ -194,19 +201,33 @@ namespace mg5amcCpu
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating a CUDA device buffer for a given number of events
   template<typename T, size_t sizePerEvent>
-  class DeviceBuffer : public DeviceBufferBase<T>, virtual private NumberOfEvents
+  class DeviceBuffer : public DeviceBufferBase<T>, virtual protected NumberOfEvents
   {
   public:
     DeviceBuffer( const size_t nevt )
       : NumberOfEvents( nevt )
-      , DeviceBufferBase<T>( sizePerEvent * nevt ) {}
-    virtual ~DeviceBuffer() {}
+      , DeviceBufferBase<T>( sizePerEvent * nevt )
+    {
+      //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl;
+    }
+    virtual ~DeviceBuffer()
+    {
+      //std::cout << "DeviceBuffer::dtor " << this << std::endl;
+    }
     virtual size_t nevt() const override final { return NumberOfEvents::nevt(); }
   };
 #endif

   //--------------------------------------------------------------------------

+#ifdef MGONGPUCPP_GPUIMPL
+  // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis
+  typedef DeviceBuffer DeviceBufferSimple;
+  typedef DeviceBuffer DeviceBufferSimple2;
+#endif
+
+  //--------------------------------------------------------------------------
+
   // A base class encapsulating a memory buffer for momenta random numbers
   typedef BufferBase<fptype> BufferRndNumMomenta;
@@ -277,12 +298,12 @@ namespace mg5amcCpu
   constexpr size_t sizePerEventNumerators = 1;
 #ifndef MGONGPUCPP_GPUIMPL
-  // A class encapsulating a C++ host buffer for gs
+  // A class encapsulating a C++ host buffer for numerators
   typedef HostBuffer<fptype, sizePerEventNumerators> HostBufferNumerators;
 #else
-  // A class encapsulating a CUDA pinned host buffer for gs
+  // A class encapsulating a CUDA pinned host buffer for numerators
   typedef PinnedHostBuffer<fptype, sizePerEventNumerators> PinnedHostBufferNumerators;
-  // A class encapsulating a CUDA device buffer for gs
+  // A class encapsulating a CUDA device buffer for numerators
   typedef DeviceBuffer<fptype, sizePerEventNumerators> DeviceBufferNumerators;
 #endif
 #endif
@@ -297,12 +318,12 @@ namespace mg5amcCpu
   constexpr size_t sizePerEventDenominators = 1;
 #ifndef MGONGPUCPP_GPUIMPL
-  // A class encapsulating a C++ host buffer for gs
+  // A class encapsulating a C++ host buffer for denominators
   typedef HostBuffer<fptype, sizePerEventDenominators> HostBufferDenominators;
 #else
-  // A class encapsulating a CUDA pinned host buffer for gs
+  // A class encapsulating a CUDA pinned host buffer for denominators
   typedef PinnedHostBuffer<fptype, sizePerEventDenominators> PinnedHostBufferDenominators;
-  // A class encapsulating a CUDA device buffer for gs
+  // A class encapsulating a CUDA device buffer for denominators
   typedef DeviceBuffer<fptype, sizePerEventDenominators> DeviceBufferDenominators;
 #endif
 #endif
@@ -316,12 +337,12 @@ namespace mg5amcCpu
   constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2;
 #ifndef MGONGPUCPP_GPUIMPL
-  // A class encapsulating a C++ host buffer for gs
+  // A class encapsulating a C++ host buffer for couplings
   typedef HostBuffer<fptype, sizePerEventCouplings> HostBufferCouplings;
 #else
-  // A class encapsulating a CUDA pinned host buffer for gs
+  // A class encapsulating a CUDA pinned host buffer for couplings
   typedef PinnedHostBuffer<fptype, sizePerEventCouplings> PinnedHostBufferCouplings;
-  // A class encapsulating a CUDA device buffer for gs
+  // A class encapsulating a CUDA device buffer for couplings
   typedef DeviceBuffer<fptype, sizePerEventCouplings> DeviceBufferCouplings;
 #endif
@@ -505,6 +526,16 @@ namespace mg5amcCpu

   //--------------------------------------------------------------------------

+#ifdef MGONGPUCPP_GPUIMPL
+  // The size (number of elements) per event in a memory buffer for jamps
+  constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2;
+
+  // A class encapsulating a CUDA device buffer for color selection
+  typedef DeviceBuffer<fptype, sizePerEventJamps> DeviceBufferJamps;
+#endif
+
+  //--------------------------------------------------------------------------
+
 #ifdef MGONGPUCPP_GPUIMPL
   template<class Tdst, class Tsrc>
   void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy
diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.cc b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.cc
index 1b3601c86b..4067d77373 100644
--- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.cc
+++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.cc
@@ -1,13 +1,13 @@
 // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors.
 // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend.
 //==========================================================================
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+// MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_MSSM_SLHA2.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,20 +98,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_MSSM_SLHA2_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_MSSM_SLHA2_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 2; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,57 +166,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR 
channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef 
MGONGPUCPP_GPUIMPL - using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -226,332 +279,145 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 5; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
-      //MemoryBufferWavefunctions w_buffer[nwf]{ neppV };
+      // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code
       cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles)
-      cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram
-
-      // Proof of concept for using fptype* in the interface
-      fptype* w_fp[nwf];
-      for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] );
-      fptype* amp_fp;
-      amp_fp = reinterpret_cast<fptype*>( amp_sv );
-
-      // Local variables for the given CUDA event (ievt) or C++ event page (ipagV)
-      // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination]
-      cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!)
+      fptype* wfs = reinterpret_cast<fptype*>( w_sv );
+#else
+      // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt)
+      // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting
+      fptype* wfs = allWfs;
+#endif

       // === Calculate wavefunctions and amplitudes for all diagrams in all processes ===
       // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ ===
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-      // Mixed fptypes #537: float for color algebra and double elsewhere
-      // Delay color algebra and ME updates (only on even pages)
-      cxtype_sv jamp_sv_previous[ncolor] = {};
-      fptype* MEs_previous = 0;
-#endif
+
+      // *****************************
+      // *** START LOOP ON IPARITY ***
+      // *****************************
       for( int iParity = 0; iParity < nParity; ++iParity )
-      { // START LOOP ON IPARITY
+      {
 #ifndef MGONGPUCPP_GPUIMPL
         const int ievt0 = ievt00 + iParity * neppV;
 #endif
-        //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823)
-        constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823)
-        const fptype* allCOUPs[nxcoup];
-#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
-#pragma nv_diagnostic push
-#pragma nv_diag_suppress 186 // e.g. <<pointless comparison of unsigned integer with zero>>
-#endif
-        for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
-          allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event
-        //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823
-        for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823
-          allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events
+
+        // -----------------
+        // --- COUPLINGS ---
+        // -----------------
 #ifdef MGONGPUCPP_GPUIMPL
-#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
-#pragma nv_diagnostic pop
-#endif
-        // CUDA kernels take input/output buffers with momenta/MEs for all events
-        const fptype* momenta = allmomenta;
-        const fptype* COUPs[nxcoup];
-        for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
-        fptype* MEs = allMEs;
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-        fptype* numerators = allNumerators;
-        fptype* denominators = allDenominators;
-#endif
+        // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events
+        const fptype* couplings = allcouplings;
 #else
-        // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page)
-        const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 );
+        // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector
+        constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup)
+        const fptype* allCOUPs[nxcoup];
         const fptype* COUPs[nxcoup];
+        // Dependent couplings, vary event-by-event
         for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
-          COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event
-        //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823
-        for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823
-          COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events
-        fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-        fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 );
-        fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 );
-#endif
-#endif
-
-        // Reset color flows (reset jamp_sv) at the beginning of a new event or event page
-        for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); }
-
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-        // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
-        fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
-        fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
-#endif
-
-        // *** DIAGRAM 1 OF 6 ***
-
-        // Wavefunction(s) for diagram number 1
-        vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 );
-
-        vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 );
-
-        sxxxxx( momenta, +1, w_fp[2], 2 );
-
-        sxxxxx( momenta, +1, w_fp[3], 3 );
-
-        // Amplitude(s) for diagram number 1
-        VVSS1_0( w_fp[0], w_fp[1], w_fp[3], w_fp[2], COUPs[0], 1.0, &amp_fp[0] );
-        jamp_sv[1] += amp_sv[0];
-        VVSS1_0( w_fp[0], w_fp[1], w_fp[3], w_fp[2], COUPs[0], 1.0, &amp_fp[0] );
-        jamp_sv[0] += amp_sv[0];
-
-        // *** DIAGRAM 2 OF 6 ***
-
-        // Wavefunction(s) for diagram number 2
-        VVV1P0_1( w_fp[0], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[4] );
-
-        // Amplitude(s) for diagram number 2
-        VSS1_0( w_fp[4], w_fp[3], w_fp[2], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-        if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] );
-        if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-        jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0];
-        jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0];
-
-        // *** DIAGRAM 3 OF 6 ***
-
-        // Wavefunction(s) for diagram number 3
-        VSS1_2( w_fp[0], w_fp[2], COUPs[2], 1.0, cIPD[0], cIPD[1], w_fp[4] );
-
-        // Amplitude(s) for diagram number 3
-        VSS1_0( w_fp[1], w_fp[3], w_fp[4], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-        if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] );
-        if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+          allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup );
+        // Independent couplings, fixed for all events
+        for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup)
+          allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup );
+        // Dependent couplings, vary event-by-event
+        for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
+          COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 );
+        // Independent couplings, fixed for all events
+        for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup)
+          COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup];
 #endif
-        jamp_sv[0] += amp_sv[0];
-
-        // *** DIAGRAM 4 OF 6 ***
-        // Wavefunction(s) for diagram number 4
-        VSS1_3( w_fp[0], w_fp[2], COUPs[3], -1.0, cIPD[2], cIPD[3], w_fp[4] );
-
-        // Amplitude(s) for diagram number 4
-        VSS1_0( w_fp[1], w_fp[3], w_fp[4], COUPs[3], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-        if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] );
-        if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+        // ---------------
+        // --- MOMENTA ---
+        // ---------------
+#ifdef MGONGPUCPP_GPUIMPL
+        // CUDA diagram kernels take input/output buffers with momenta for all events
+        const fptype* momenta = allmomenta;
+#else
+        // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector
+        const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 );
 #endif
-        jamp_sv[0] += amp_sv[0];
-
-        // *** DIAGRAM 5 OF 6 ***
-
-        // Wavefunction(s) for diagram number 5
-        VSS1_3( w_fp[0], w_fp[3], COUPs[2], 1.0, cIPD[0], cIPD[1], w_fp[4] );
-
-        // Amplitude(s) for diagram number 5
-        VSS1_0( w_fp[1], w_fp[4], w_fp[2], COUPs[2], 1.0, &amp_fp[0] );
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-        if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] );
-        if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
+        // -------------
+        // --- JAMPS ---
+        // -------------
+        // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel)
+#ifdef MGONGPUCPP_GPUIMPL
+        // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument
+        // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol])
+        fptype* jamps = allJamps;
+#else
+        // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument
+        // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol])
+        fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) );
 #endif
-        jamp_sv[1] += amp_sv[0];
-
-        // *** DIAGRAM 6 OF 6 ***
-        // Wavefunction(s) for diagram number 6
-        VSS1_3( w_fp[0], w_fp[3], COUPs[3], 1.0, cIPD[2], cIPD[3], w_fp[4] );
-
-        // Amplitude(s) for diagram number 6
-        VSS1_0( w_fp[1], w_fp[2], w_fp[4], COUPs[3], -1.0, &amp_fp[0] );
+        // ------------------
+        // --- CHANNELIDS ---
+        // ------------------
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-        if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] );
-        if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] );
-#endif
-        jamp_sv[1] += amp_sv[0];
-
-        // *** COLOR CHOICE BELOW ***
-        // Store the leading color flows for choice of color
-        if( jamp2_sv ) // disable color choice if nullptr
-          for( int icol = 0; icol < ncolor; icol++ )
-            jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831
-
-        // *** COLOR MATRIX BELOW ***
-        // (This method used to be called CPPProcess::matrix_1_gg_t1t1x()?)
-
-        // The color denominators (initialize all array elements, with ncolor=2)
-        // [NB do keep 'static' for these constexpr arrays, see issue #283]
-        static constexpr fptype2 denom[ncolor] = { 3, 3 }; // 1-D array[2]
-
-        // The color matrix (initialize all array elements, with ncolor=2)
-        // [NB do keep 'static' for these constexpr arrays, see issue #283]
-        static constexpr fptype2 cf[ncolor][ncolor] = {
-          { 16, -2 },
-          { -2, 16 } }; // 2-D array[2][2]
-
-#ifndef MGONGPUCPP_GPUIMPL
-        // Pre-compute a constexpr triangular color matrix properly normalized #475
-        struct TriangularNormalizedColorMatrix
-        {
-          // See https://stackoverflow.com/a/34465458
-          __host__ __device__ constexpr TriangularNormalizedColorMatrix()
-            : value()
-          {
-            for( int icol = 0; icol < ncolor; icol++ )
-            {
-              // Diagonal terms
-              value[icol][icol] = cf[icol][icol] / denom[icol];
-              // Off-diagonal terms
-              for( int jcol = icol + 1; jcol < ncolor; jcol++ )
-                value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol];
-            }
-          }
-          fptype2 value[ncolor][ncolor];
-        };
-        static constexpr auto cf2 = TriangularNormalizedColorMatrix();
-#endif
-
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
-        if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages
-        {
-          // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV
-          for( int icol = 0; icol < ncolor; icol++ )
-            jamp_sv_previous[icol] = jamp_sv[icol];
-          MEs_previous = MEs;
-          continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages
-        }
-        fptype_sv deltaMEs_previous = { 0 };
-#endif
-
-        // Sum and square the color flows to get the matrix element
-        // (compute |M|^2 by squaring |M|, taking into account colours)
-        fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes
-
-        // Use the property that M is a real matrix (see #475):
-        // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB
-        // In addition, on C++ use the property that M is symmetric (see #475),
-        // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time:
-        // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix.
-        // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
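The removed color sum, whose loop follows below, computes |M|^2 as a quadratic form jamp-dagger cf jamp normalized by denom, using the #475 tricks described in the comments just above. The stand-alone C++ sketch below (illustrative only, not part of the patch; names local to the sketch) reproduces that arithmetic for this process's ncolor=2 color matrix, in both the triangular C++ form and the plain double-loop CUDA form; the triangular folding of "2*" and "/denom" is valid here because the two row denominators are equal.

    #include <cassert>
    #include <cmath>
    #include <complex>

    constexpr int ncolor = 2;
    constexpr double denom[ncolor] = { 3, 3 };                        // color denominators
    constexpr double cf[ncolor][ncolor] = { { 16, -2 }, { -2, 16 } }; // color matrix

    // Upper-triangle color sum, mirroring the removed C++ path (#475)
    double colorSumTriangular( const std::complex<double> jamp[ncolor] )
    {
      double me2 = 0;
      for( int icol = 0; icol < ncolor; icol++ )
      {
        double ztempR = cf[icol][icol] / denom[icol] * jamp[icol].real(); // diagonal term
        double ztempI = cf[icol][icol] / denom[icol] * jamp[icol].imag();
        for( int jcol = icol + 1; jcol < ncolor; jcol++ ) // off-diagonal: factor 2 folds in the lower triangle
        {
          ztempR += 2 * cf[icol][jcol] / denom[icol] * jamp[jcol].real();
          ztempI += 2 * cf[icol][jcol] / denom[icol] * jamp[jcol].imag();
        }
        me2 += jamp[icol].real() * ztempR + jamp[icol].imag() * ztempI;
      }
      return me2;
    }

    // Full double-loop color sum, mirroring the removed CUDA path (no symmetry tricks)
    double colorSumFull( const std::complex<double> jamp[ncolor] )
    {
      double me2 = 0;
      for( int icol = 0; icol < ncolor; icol++ )
      {
        double ztempR = 0, ztempI = 0;
        for( int jcol = 0; jcol < ncolor; jcol++ )
        {
          ztempR += cf[icol][jcol] * jamp[jcol].real();
          ztempI += cf[icol][jcol] * jamp[jcol].imag();
        }
        me2 += ( ztempR * jamp[icol].real() + ztempI * jamp[icol].imag() ) / denom[icol];
      }
      return me2;
    }

    int main()
    {
      const std::complex<double> jamp[ncolor] = { { 1.5, -0.5 }, { 0.25, 2.0 } };
      assert( std::abs( colorSumTriangular( jamp ) - colorSumFull( jamp ) ) < 1e-12 );
      return 0;
    }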
-#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * 
cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 6 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif -#endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -578,7 +444,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -611,6 +481,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->mdl_Msu3 ); m_masses.push_back( m_pars->mdl_Msu3 ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + 
createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_Msu3, (fptype)m_pars->mdl_Wsu3, (fptype)m_pars->mdl_Msu6, (fptype)m_pars->mdl_Wsu6 }; @@ -651,6 +525,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_MSSM_SLHA2::ZERO ); m_masses.push_back( Parameters_MSSM_SLHA2::mdl_Msu3 ); m_masses.push_back( Parameters_MSSM_SLHA2::mdl_Msu3 ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -753,26 +631,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings, bsmIndepParam ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -780,25 +658,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran 
[1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -936,20 +1018,14 @@ namespace mg5amcCpu // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -961,17 +1037,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype )
); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -997,93 +1076,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators and denominators for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) Then, in multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream + for( int ighel = 0;
ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? 
ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1125,7 +1174,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1148,7 +1197,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1157,25 +1206,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1185,8 +1240,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1202,11 +1259,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1308,14 +1366,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.h index d48c729c48..704925d121 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The 
MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_MSSM_SLHA2.h" #include <vector> @@ -75,17 +76,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 4; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 6; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 2; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 5; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 4; // CPPProcess::npar (nexternal was nioparticles) //static const int nwavefuncs = 6; // (?!?!
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 7; //static const int ncomb = 4; // CPPProcess::ncomb @@ -122,23 +123,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array (GPU implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -152,34 +156,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/auto_dsig.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/auto_dsig.f index 28f44ab169..b5d6d679c1 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/auto_dsig.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/auto_dsig1.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/auto_dsig1.f index 40fbb596f2..cde448f79e 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/auto_dsig1.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -137,14 +137,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -219,7 +219,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -290,6 +290,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -373,12 +377,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -484,6 +488,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/color_sum.cc b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/color_sum.cc new file mode 100644 index 0000000000..b96d73fb5c --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/color_sum.cc @@ -0,0 +1,383 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
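For orientation before the new file body: for this P1 process ncolor=2, the color matrix is CF = {{16,-2},{-2,16}} with a common denominator 3, and each helicity adds a real quadratic form in the QCD partial amplitudes (jamps) to |M|^2. A minimal standalone sketch of that computation (plain C++ with illustrative names, not the plugin's fptype/SIMD accessor machinery), checking that the triangular form used by color_sum_cpu below matches the full symmetric-matrix form:

#include <cassert>
#include <cmath>
#include <complex>
#include <cstdio>

int main()
{
  constexpr int ncolor = 2;
  constexpr double colorDenom[ncolor] = { 3, 3 };
  constexpr double colorMatrix[ncolor][ncolor] = { { 16, -2 }, { -2, 16 } };
  const std::complex<double> jamp[ncolor] = { { 0.3, -1.1 }, { -0.7, 0.4 } }; // dummy partial amplitudes
  // Full quadratic form: sum_ij ( ReJi*ReJj + ImJi*ImJj ) * CF_ij / denom_i
  // (the imaginary cross terms cancel because CF is real and symmetric)
  double meFull = 0;
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ )
      meFull += ( jamp[i].real() * jamp[j].real() + jamp[i].imag() * jamp[j].imag() ) * colorMatrix[i][j] / colorDenom[i];
  // Triangular form (as in color_sum_cpu below): diagonal once, off-diagonal doubled
  double meTri = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    double ztR = colorMatrix[i][i] / colorDenom[i] * jamp[i].real();
    double ztI = colorMatrix[i][i] / colorDenom[i] * jamp[i].imag();
    for( int j = i + 1; j < ncolor; j++ )
    {
      ztR += 2 * colorMatrix[i][j] / colorDenom[i] * jamp[j].real();
      ztI += 2 * colorMatrix[i][j] / colorDenom[i] * jamp[j].imag();
    }
    meTri += jamp[i].real() * ztR + jamp[i].imag() * ztI;
  }
  assert( std::abs( meFull - meTri ) < 1e-12 );
  printf( "deltaME = %f\n", meFull );
  return 0;
}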
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3 }; // 1-D array[2] + + // The color matrix (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 16, -2 }, + { -2, 16 } }; // 2-D array[2][2] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 
s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = 
allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e.
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/color_sum.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/diagram_boilerplate.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/diagrams.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/diagrams.h new file mode 100644 index 0000000000..37e497fa4b --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/diagrams.h @@ -0,0 +1,193 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
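The new diagrams.h splits the former monolithic calculate_wavefunctions into one kernel per Feynman diagram: each kernel computes one amplitude from previously filled wavefunctions, optionally updates the single-diagram-enhancement numerator/denominator running sums, and accumulates the amplitude into the color-flow jamps. A schematic of that per-diagram pattern (illustrative scalar C++, not the plugin's accessor/SIMD code; cxabs2 stands in for the plugin's |z|^2 helper):

#include <complex>
using cxtype = std::complex<double>;
inline double cxabs2( const cxtype& z ) { return std::norm( z ); } // |z|^2

// Schematic update for one diagram (here "diagram 2", feeding both color flows)
void diagram_pattern( cxtype amp, unsigned int channelId, cxtype jamps[2], double& numerator, double& denominator )
{
  if( channelId == 2 ) numerator += cxabs2( amp );   // this diagram's |amp|^2 enhances its own channel
  if( channelId != 0 ) denominator += cxabs2( amp ); // running sum over all diagrams
  jamps[0] -= cxtype( 0, 1 ) * amp; // color-flow coefficients (-i and +i for this diagram)
  jamps[1] += cxtype( 0, 1 ) * amp;
}

int main()
{
  cxtype jamps[2] = {};
  double num = 0, den = 0;
  diagram_pattern( { 0.5, -0.2 }, 2, jamps, num, den );
  return 0;
}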
+ + /* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb-1) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 6 *** + // Wavefunction(s) for diagram number 1 + vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); + vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); + sxxxxx( momenta, +1, w_fp[2], 2 ); + sxxxxx( momenta, +1, w_fp[3], 3 ); + // Amplitude(s) for diagram number 1 + VVSS1_0( w_fp[0], w_fp[1], w_fp[3], w_fp[2], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + VVSS1_0( w_fp[0], w_fp[1], w_fp[3], w_fp[2], COUPs[0], 1.0, &amp_fp[0] ); + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 6 *** + // Wavefunction(s) for diagram number 2 + VVV1P0_1( w_fp[0], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[4] ); + // Amplitude(s) for diagram number 2 + VSS1_0( w_fp[4], w_fp[3], w_fp[2], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0]; +
J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 6 *** + // Wavefunction(s) for diagram number 3 + VSS1_2( w_fp[0], w_fp[2], COUPs[2], 1.0, cIPD[0], cIPD[1], w_fp[4] ); + // Amplitude(s) for diagram number 3 + VSS1_0( w_fp[1], w_fp[3], w_fp[4], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 6 *** + // Wavefunction(s) for diagram number 4 + VSS1_3( w_fp[0], w_fp[2], COUPs[3], -1.0, cIPD[2], cIPD[3], w_fp[4] ); + // Amplitude(s) for diagram number 4 + VSS1_0( w_fp[1], w_fp[3], w_fp[4], COUPs[3], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 4 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef
MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 6 *** + // Wavefunction(s) for diagram number 5 + VSS1_3( w_fp[0], w_fp[3], COUPs[2], 1.0, cIPD[0], cIPD[1], w_fp[4] ); + // Amplitude(s) for diagram number 5 + VSS1_0( w_fp[1], w_fp[4], w_fp[2], COUPs[2], 1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 5 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 6 OF 6 *** + // Wavefunction(s) for diagram number 6 + VSS1_3( w_fp[0], w_fp[3], COUPs[3], 1.0, cIPD[2], cIPD[3], w_fp[4] ); + // Amplitude(s) for diagram number 6 + VSS1_0( w_fp[1], w_fp[2], w_fp[4], COUPs[3], -1.0, &_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 6 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/driver.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/driver.f index 3fc552a31d..25dc37ef1e 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/driver.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. 
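A note on the pattern above: all six diagram kernels share the same multichannel bookkeeping guarded by MGONGPU_SUPPORTS_MULTICHANNEL, where every diagram adds |amp|^2 to the denominator and only the diagram whose number matches the selected channelId also adds it to the numerator. Below is a minimal scalar sketch of that single-diagram-enhancement (SDE) logic, with std::complex<double> standing in for cxtype_sv and all names hypothetical; the generated kernels apply the same two lines per diagram to SIMD event pages (C++) or per-thread events (CUDA). diagram1 carries no such block, consistent with the contact-interaction diagram not being used as an SDE channel.

#include <cassert>
#include <complex>

// Scalar sketch (illustrative only) of the SDE accumulation in diagram2..diagram6.
struct SdeAccumulator
{
  unsigned int channelId = 0; // 1 to #diagrams selects a channel, 0 disables SDE
  double numerator = 0.;      // |amp|^2 of the selected diagram only
  double denominator = 0.;    // |amp|^2 summed over all contributing diagrams
  void addDiagram( unsigned int idiag, const std::complex<double>& amp )
  {
    const double abs2 = std::norm( amp ); // plays the role of cxabs2( amp_sv[0] )
    if( channelId == idiag ) numerator += abs2;
    if( channelId != 0 ) denominator += abs2;
  }
  double channelWeight() const // the multichannel weight applied downstream
  {
    assert( channelId != 0 && denominator != 0. );
    return numerator / denominator;
  }
};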
diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/matrix1.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/matrix1.f index 1a1830b77a..60a896f60b 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/matrix1.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -295,7 +295,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -338,7 +338,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(0) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -383,23 +384,31 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 2) /5.333333333333333D+00, - $ -6.666666666666666D-01/ + DATA DENOM/3/ + DATA (CF(I),I= 1, 2) /16,-4/ C 1 T(1,2,3,4) - DATA (CF(I, 2),I= 1, 2) /-6.666666666666666D-01 - $ ,5.333333333333333D+00/ + DATA (CF(I),I= 3, 3) /16/ C 1 T(2,1,3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WSU3.NE.0D0) FK_MDL_WSU3 = SIGN(MAX(ABS(MDL_WSU3), - $ ABS(MDL_MSU3*SMALL_WIDTH_TREATMENT)), MDL_WSU3) - IF(MDL_WSU6.NE.0D0) FK_MDL_WSU6 = SIGN(MAX(ABS(MDL_WSU6), - $ ABS(MDL_MSU6*SMALL_WIDTH_TREATMENT)), MDL_WSU6) + FK_ZERO = 0D0 + IF(MDL_WSU3.NE.0D0) THEN + FK_MDL_WSU3 = SIGN(MAX(ABS(MDL_WSU3), ABS(MDL_MSU3 + $ *SMALL_WIDTH_TREATMENT)), MDL_WSU3) + ELSE + FK_MDL_WSU3 = 0D0 + ENDIF + + IF(MDL_WSU6.NE.0D0) THEN + FK_MDL_WSU6 = SIGN(MAX(ABS(MDL_WSU6), ABS(MDL_MSU6 + $ *SMALL_WIDTH_TREATMENT)), MDL_WSU6) + ELSE + FK_MDL_WSU6 = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -451,10 +460,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -463,6 +474,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(2)=AMP2(2)+AMP(3)*DCONJG(AMP(3)) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/addmothers.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/addmothers.f index 9a31ed201d..6f32477b9e 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/addmothers.f @@ -21,7 +21,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, integer icol ! 
color selected integer isym(nexternal,99), jsym - integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,nc,ic + integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,ic integer mo_color,da_color(2),itmp integer ito(-nexternal+3:nexternal),iseed,maxcolor,maxorg integer icolalt(2,-nexternal+2:2*nexternal-3) @@ -113,14 +113,15 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif lconfig = vec_igraph1(ivec) endif - + is_LC=.true. + maxcolor=0 c c Choose a color flow which is certain to work with the propagator c structure of the chosen diagram and use that as an alternative c if (icol.eq.0) then do i=1,nexternal - icolalt(1,i)=0 + icolalt(1,i)=0 icolalt(2,i)=0 enddo else diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cluster.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cluster.f index b8995283ed..907894ea89 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cluster.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cluster.f @@ -556,6 +556,8 @@ logical function cluster(p, ivec) jwin = 0 cluster=.false. clustered=.false. + iwin =0 + jwin =0 do i=0,3 pcmsp(i)=0 enddo @@ -665,8 +667,11 @@ logical function cluster(p, ivec) c initialize graph storage igraphs(0)=0 nleft=nexternal -c cluster - if (iwin.eq.0.or.jwin.eq.0) stop 21 + if(iwin.eq.0.or.jwin.eq.0)then + cluster=.false. + return + endif +c cluster do n=1,nexternal-2 c combine winner imocl(n)=imap(iwin,2)+imap(jwin,2) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/color_sum.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/color_sum.h new file mode 100644 index 0000000000..71b5b1eaad --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/color_sum.h @@ -0,0 +1,105 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
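The matrix1.f change above replaces the dense REAL*8 color matrix by an integer upper triangle CF(NCOLOR*(NCOLOR+1)/2) plus a single denominator DENOM: since the color matrix is symmetric, only the J >= I entries are stored, off-diagonal entries are kept pre-doubled, and one division by DENOM is applied at the very end. A rough C++ transcription of that reduction (illustrative names only; the Fortran additionally loops over NAMPSO split orders, omitted here):

#include <complex>
#include <vector>

// Color sum with a triangular integer color matrix (sketch).
// cf holds the upper triangle row by row with off-diagonal entries doubled,
// so taking the real part of conj(jamp[i]) * ztemp recovers the full
// symmetric bilinear form; the division by denom happens once at the end.
double colorSum( const std::vector<std::complex<double>>& jamp, // [ncolor]
                 const std::vector<int>& cf,                    // [ncolor*(ncolor+1)/2]
                 const int denom )
{
  const int ncolor = static_cast<int>( jamp.size() );
  double me = 0.;
  int cfIndex = 0;
  for( int i = 0; i < ncolor; i++ )
  {
    std::complex<double> ztemp = 0.;
    for( int j = i; j < ncolor; j++ ) ztemp += double( cf[cfIndex++] ) * jamp[j];
    me += ( ztemp * std::conj( jamp[i] ) ).real();
  }
  return me / denom;
}

For the two color flows of this process, cf = {16, -4, 16} with denom = 3 reproduces the dense matrix previously stored as 5.333.../-0.666..., since -4/3 is twice -2/3 and each off-diagonal pair is now counted only once.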
+ +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype_ref( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + static __device__ inline const cxtype + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + }; +#else + class HostAccessJamp + { + public: + static inline cxtype_sv& + kernelAccessIcol( cxtype_sv* buffer, const int icol ) + { + return buffer[icol]; + } + static inline cxtype_sv& + kernelAccessIcol( fptype* buffer, const int icol ) + { + return reinterpret_cast<cxtype_sv*>( buffer )[icol]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + 
//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI300X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? 
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cuts.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cuts.f index 7898714201..bd50ab1357 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cuts.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cuts.f @@ -307,12 +307,18 @@ LOGICAL FUNCTION PASSCUTS(P, VECSIZE_USED) c c Limit S_hat c - if (dsqrt_shat.ne.0d0)then - if (nincoming.eq.2.and.sumdot(p(0,1),p(0,2),1d0) .lt. dsqrt_shat**2) then - passcuts=.false. - return - endif - endif + if(nincoming.eq.2) then + if (dsqrt_shat.ne.0d0.or.dsqrt_shatmax.ne.-1d0)then + xvar = sumdot(p(0,1),p(0,2),1d0) + if (xvar .lt. 
dsqrt_shat**2)then + passcuts=.false. + return + else if (dsqrt_shatmax.ne.-1d0 .and. xvar .gt. dsqrt_shatmax**2)then + passcuts = .false. + return + endif + endif + endif C $B$ DESACTIVATE_CUT $E$ !This is a tag for MadWeight if(debug) write (*,*) '=============================' diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/diagram_boilerplate.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/diagram_boilerplate.h new file mode 100644 index 0000000000..96a34fb1bf --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/diagram_boilerplate.h @@ -0,0 +1,103 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin. + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-local-typedefs" // for CI_ACCESS and CD_ACCESS + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + + //------------- + // GPU only + //------------- + + //using namespace mg5amcGpu; + using W_ACCESS = DeviceAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = DeviceAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( channelIds ); +#endif + + // Wavefunctions + // Buffer wfs for one helicity and nevt events is a DeviceBufferSimple with ( nwf * nevt * nw6 * nx2 ) fptypes + // The striding between the nwf wavefunction buffers is ( nevt * nw6 * nx2 ) fptypes + // Internally diagramXXX methods pass a w_fp[iwf] to ixx/FFV methods (as argument 'fptype wavefunctions[]') + // Internally ixx/FFV methods call 'cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions )' and then use fi[iw6] + // This means that the fi pointer must point to a [RIRIRIRIRIRI] contiguous buffer of size nw6*nx2=12 + // The striding between events is nw6*nx2=12 and this is what W_ACCESS::kernelAccess must respect + // (En passant, note that this means that events cannot be contiguous in the present code, memory is not coalesced) + const int nevt = gridDim.x * blockDim.x; + fptype* w_fp[nwf]; + for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = wfs + iwf * nevt * nw6 * mgOnGpu::nx2; + + // Couplings + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic push +#pragma nv_diag_suppress 186 // e.g. 
<<warning #186-D: pointless comparison of unsigned integer with zero>> +#endif + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic pop +#endif + const fptype* COUPs[nxcoup]; + for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; + +#else + + //------------- + // C++ only + //------------- + + //using namespace mg5amcCpu; + using W_ACCESS = HostAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = HostAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current SIMD event page (C++) + unsigned int channelId = *channelIds; +#endif + + // Wavefunctions + // Reinterpret wfs as "cxtype_sv w_sv[nwf][nw6]" and build "fptype* w_fp[nwf]" where "w_fp[iwf] = (fptype*)( w_sv[iwf] )" + fptype (*w_fp)[nw6 * neppV * mgOnGpu::nx2] = (fptype (*)[nw6 * neppV * mgOnGpu::nx2])(wfs); + +#endif + + //------------- + // GPU or C++ + //------------- + + // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) + cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram + fptype* amp_fp; // proof of concept for using fptype* in the interface + amp_fp = reinterpret_cast<fptype*>( amp_sv ); + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) + fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); +#else + assert( channelIds == nullptr ); + assert( numerators == nullptr ); + assert( denominators == nullptr ); +#endif /* clang-format on */ + +#pragma GCC diagnostic pop diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/genps.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/genps.f index 1c32e93f5d..8503bdbec8 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/genps.f @@ -1373,6 +1373,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) double precision smin,smax,spole,swidth,s,jac double precision x logical pass + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' c c Local c @@ -1384,6 +1388,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1d0)then + smax = min(smax, dsqrt_shatmax**2) + endif + pass=.true. if (jac .eq. 0 .and. .not. 
warned0) then print*,'Input jacobian 0 in genps' @@ -1628,7 +1636,10 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) DOUBLE PRECISION ETA,ETAMIN,ETAMAX logical warned data warned/.false./ - + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' C------------ C BEGIN CODE C------------ @@ -1645,7 +1656,11 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) C IF THERE IS NO S CHANNEL POLE USE BELOW: TAUMIN = 0d0 !SMIN/S !keep scale fix - TAUMAX = 1D0 + if (dsqrt_shatmax.ne.-1d0)then + TAUMAX=dsqrt_shatmax**2/S + else + TAUMAX = 1D0 + endif TAU = (TAUMAX-TAUMIN)*X(1)+TAUMIN SJACOBI= sjacobi*(TAUMAX-TAUMIN) @@ -1915,7 +1930,7 @@ double precision function get_channel_cut(p, config) if(sde_strat.eq.2)then t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) - get_channel_cut = get_channel_cut / ((t-Mass)*(t+Mass)+stot*1d-10)**2 + get_channel_cut = get_channel_cut / (t-Mass**2+stot*1d-10)**2 endif c write(*,*) i, "t, Mass, fact", t, Mass, ((t-Mass)*(t+Mass))**2,get_channel_cut t = t/stot @@ -1930,9 +1945,9 @@ double precision function get_channel_cut(p, config) t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) Width = prwidth(-i, config) - tmp = (t-Mass)*(t+Mass) + tmp = (t-Mass**2) tmp2 = Mass*Width - get_channel_cut = get_channel_cut* (tmp**2 - tmp2**2)/(tmp**2 + tmp2**2)**2 + get_channel_cut = get_channel_cut/(tmp**2 + tmp2**2) endif c write(*,*) i, "s, Mass, Width, fact", t, Mass, Width, (((t-Mass)*(t+Mass) )**2 + Width**2*Mass**2), get_channel_cut endif diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/myamp.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/myamp.f index 9e5f8d44dd..5360566ef4 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/myamp.f @@ -231,6 +231,7 @@ subroutine set_peaks double precision x1,x2,xk(nexternal) double precision dr,mtot,etot,xqfact double precision spmass + double precision stot ! technically the min of dsqrt_shatmax**2 and the physical one integer i, iconfig, l1, l2, j, nt, nbw, iproc, k integer iden_part(-nexternal+1:nexternal) @@ -285,8 +286,8 @@ subroutine set_peaks integer lbw(0:nexternal) !Use of B.W. common /to_BW/ lbw - double precision stot,m1,m2 - common/to_stot/stot,m1,m2 + double precision real_stot,m1,m2 + common/to_stot/real_stot,m1,m2 include 'coupl.inc' ! 
needs VECSIZE_MEMMAX (defined in vector.inc) include 'cuts.inc' @@ -309,6 +310,12 @@ subroutine set_peaks c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1)then + stot = min(real_stot, dsqrt_shatmax**2) + else + stot = real_stot + endif + iconfig = this_config c needs to be initialise to avoid segfault do i = -nexternal,-1 diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/reweight.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/reweight.f index 0a0bafa7c1..189ba41449 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/reweight.f @@ -1186,7 +1186,7 @@ logical function setclscales(p, keepq2bck, ivec) if(nexternal.gt.3) pt2ijcl(nexternal-3)=q2fact(2) else if(.not.fixed_fac_scale1) q2fact(1)=scalefact**2*pt2ijcl(nexternal-2) - if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*q2fact(1) + if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*pt2ijcl(nexternal-2) endif elseif(jcentral(1).eq.0)then if(.not.fixed_fac_scale1) q2fact(1) = scalefact**2*pt2ijcl(jfirst(1)) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/runTest.cc b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/runTest.cc index 4eec5db13c..216a90a302 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/symmetry.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/symmetry.f index 309540a0a2..2afd9e9f75 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/symmetry.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/symmetry.f @@ -232,7 +232,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, c write(*,*) 'mapping',ic,mapconfig(i),icode if (icode .eq. 
0) then c Create format string based on number of digits - write(formstr,'(a,i1,a)') '(I',nconf,'$)' + write(formstr,'(a,i1,a)') '(I',nconf,',$)' write(*,formstr) mapconfig(i) c Write symmetry factors write(formstr2,'(a,i2,a)') '(2i',nsym,')' @@ -242,10 +242,10 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode if(nconf+ncode+1.lt.10) then write(formstr,'(a,i1,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' else write(formstr,'(a,i2,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' endif write(*,formstr) dconfig c Write symmetry factors @@ -260,7 +260,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode write(27,formstr2) dconfig,use_config(i) endif - write(*,'(a$)') ' ' + write(*,'(a,$)') ' ' 100 call bw_increment_array(iarray,imax,ibase,done) enddo else diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/unwgt.f b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/unwgt.f index f602511c94..d1247f1849 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/unwgt.f +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/unwgt.f @@ -497,6 +497,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer ip, np, ic, nc integer ida(2),ito(-nexternal+3:nexternal),ns,nres,ires,icloop integer iseed + double precision beam_mass double precision pboost(0:3) double precision beta, get_betaz double precision ebi(0:3), ebo(0:3) @@ -506,7 +507,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer idup(nexternal,maxproc,maxsproc) integer mothup(2,nexternal) integer icolup(2,nexternal,maxflow,maxsproc) - + double precision eta integer nsym integer ievent @@ -638,21 +639,20 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) if (nincoming.eq.2) then if (xbk(1) .gt. 0d0 .and. xbk(1) .le. 1d0 .and. $ xbk(2) .gt. 0d0 .and. xbk(2) .le. 1d0) then - if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0).and.xbk(2).ne.1d0) then - ! construct the beam momenta in each frame and compute the related (z)boost - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4).and.ebeam(1).gt.10d0*m1)then - local_mass = 0d0 - else - local_mass = m1 - endif + if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0)) then + if((abs(lpp(1)).gt.2.and.abs(lpp(1)).ne.9).or.xbk(1).eq.1d0)then + beam_mass = pmass(1) + else + beam_mass = m1 + endif ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(1) ebo(1) = 0 ebo(2) = 0 - ebo(3) = DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(1).eq.1d0) then pb(0,isym(1,jsym)) = ebo(0) @@ -668,20 +668,19 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo else - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4.and.ebeam(2).gt.10d0*m2))then - local_mass = 0d0 - else - local_mass = m2 - endif - ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam + if((abs(lpp(2)).gt.2.and.abs(lpp(2)).ne.9).or.xbk(2).eq.1d0)then + beam_mass = pmass(2) + else + beam_mass = m2 + endif ebi(0) = p(0,2)/xbk(2) ! 
this assumes that particle 2 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = -1d0*DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = -1d0*DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(2) ebo(1) = 0 ebo(2) = 0 - ebo(3) = -1d0*DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = -1d0*DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(2).eq.1d0) then pb(0,isym(2,jsym)) = ebo(0) @@ -701,6 +700,21 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) write(*,*) 'Warning bad x1 or x2 in write_leshouche', $ xbk(1),xbk(2) endif + do j=1,nexternal + call zboost_with_beta(p(0,j),beta,pb(0,isym(j,jsym))) + pb(4,isym(j,jsym))=pmass(j) + enddo + + ! check for numerical_accuracy + if (pb(0,1).gt.ebeam(1).or.pb(0,2).gt.ebeam(2))then + ! go back to old method --more accurate when boosting with xbk close to one-- + eta = sqrt(xbk(1)*ebeam(1)/(xbk(2)*ebeam(2))) + pboost(0)=p(0,1)*(eta + 1d0/eta) + pboost(3)=p(0,1)*(eta - 1d0/eta) + do j=1,nexternal + call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) + enddo + endif else do j=1,nexternal call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) @@ -709,6 +723,8 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo endif + + if (IMIRROR.eq.2.and.pmass(1).ne.pmass(2)) then c Note that in this context isym(1,jsym) should never be "2" since the mass differ pb(4,isym(1,jsym))=pmass(2) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/Gridpack/gridrun b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/Gridpack/gridrun index 8c8f7d3940..01d4ab53f5 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/Gridpack/gridrun +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/Gridpack/gridrun @@ -91,7 +91,7 @@ import internal.madevent_interface as cmd_interface try: - cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2]) + cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2], nprocs=args[3], maxevts=args[4]) except KeyboardInterrupt: print('Quit on KeyboardInterrupt') diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/Gridpack/run.sh b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/Gridpack/run.sh index 20adf572c2..2d149f96be 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/Gridpack/run.sh +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/Gridpack/run.sh @@ -14,6 +14,18 @@ # USAGE : run [num_events] [iseed] ## ############################################################################# +function usage() { + local retcode="${1:-1}" # default return code is 1 + echo "Usage:" + echo " run.sh [options] [num events] [seed]" + echo " run.sh [options] [num events] [seed] [granularity]" + echo "Options:" + echo " -h, --help print this message and exit" + echo " -p, --parallel [num procs] number of processes to run in parallel" + echo " -m, --maxevts [num events] maximum number of unweighted events per job" + exit $retcode +} + if [[ -d ./madevent ]]; then DIR='./madevent' else @@ -32,23 +44,46 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib # For Mac OS X export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib +pos_args=() +nprocs=1 +maxevts=2500 -if [[ ($1 != "") && ("$2" != "") && ("$3" == "") ]]; then - num_events=$1 - seed=$2 - gran=1 -elif [[ ($1 != "") && ("$2" != "") && ("$3" != "") ]]; then - num_events=$1 - seed=$2 - gran=$3 -else - echo "Warning: input is not correct. 
script requires two arguments: NB_EVENT SEED" -fi +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage 0 ;; + -p|--parallel) + nprocs="$2" && shift && shift ;; + -m|--maxevts) + maxevts="$2" && shift && shift ;; + -*) + echo "Error: Unknown option $1" && usage ;; + *) + pos_args+=("$1") && shift ;; + esac +done + +case `echo "${pos_args[@]}" | wc -w | tr -d " "` in + "2") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=1 + ;; + "3") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=${pos_args[2]} + ;; + *) + echo "Error: number of arguments is not correct" + usage + ;; +esac -echo "Now generating $num_events events with random seed $seed and granularity $gran" +echo "Now generating $num_events events with random seed $seed and granularity $gran using $nprocs processes" ############ RUN THE PYTHON CODE ##################### -${DIR}/bin/gridrun $num_events $seed $gran +${DIR}/bin/gridrun $num_events $seed $gran $nprocs $maxevts ######################################################## ########### POSTPROCESSING ##################### diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/banner.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/banner.py index 42d82818d0..2efb5954a6 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/banner.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/banner.py @@ -353,7 +353,7 @@ def modify_init_cross(self, cross, allow_zero=False): assert "init" in self cross = dict(cross) - for key in cross.keys(): + for key in list(cross.keys()): if isinstance(key, str) and key.isdigit() and int(key) not in cross: cross[int(key)] = cross[key] @@ -1991,6 +1991,11 @@ def default_setup(self): self.add_param("PartonLevel:FSRinResonances", True, hidden=True, always_write_to_card=False, comment="Do not allow shower to run from decay product of unstable particle") self.add_param("ProcessLevel:resonanceDecays", True, hidden=True, always_write_to_card=False, comment="Do not allow unstable particle to decay.") + # Parameters only needed for main164 type of run (not pythia8/MG5 interface) + self.add_param("Main:HepMC", True, hidden=True, always_write_to_card=False, + comment="""Specify the type of output to be used by the main164 run. """) + self.add_param("HepMC:output", 'hepmc.gz', hidden=True, always_write_to_card=False, + comment="Specify the HepMC output file to be used by the main164 run.") # Add parameters controlling the subruns execution flow. # These parameters should not be part of PY8SubRun daughter. self.add_default_subruns('parameters') @@ -2087,8 +2092,10 @@ def MadGraphSet(self, name, value, **opts): force = False if name.lower() not in self or (force or name.lower() not in self.user_set): self.__setitem__(name, value, change_userdefine=False, **opts) - self.system_set.add(name.lower()) - + self.system_set.add(name.lower()) + else: + raise Exception("The parameter %s is already set to %s. You can not change it." 
% (name, self[name])) + def defaultSet(self, name, value, **opts): self.__setitem__(name, value, change_userdefine=False, **opts) @@ -2144,9 +2151,19 @@ def pythia8_formatting(value, formatv=None): else: return ','.join([PY8Card.pythia8_formatting(arg) for arg in value]) + #change of name convention between MG5 old interface and main164 from Pythia8 + interface_to_164 = {'HEPMCoutput:file': 'HepMC:output', + 'SysCalc:fullCutVariation': '!SysCalc:fullCutVariation (not supported with 164)', + 'SysCalc:qCutList': '!SysCalc:qCutList (not supported with 164)', + 'SysCalc:qWeed': '!SysCalc:qWeed (not supported with 164)', + 'SysCalc:tmsList': '!SysCalc:tmsList (not supported with 164)', + 'HEPMCoutput:scaling' : '!HEPMCoutput :scaling (not supported with 164)', + 'LHEFInputs:nSubruns' : 'Main:numberOfSubruns'} + def write(self, output_file, template, read_subrun=False, - print_only_visible=False, direct_pythia_input=False, add_missing=True): + print_only_visible=False, direct_pythia_input=False, add_missing=True, + use_mg5amc_py8_interface=False): """ Write the card to output_file using a specific template. > 'print_only_visible' specifies whether or not the hidden parameters should be written out if they are in the hidden_params_to_always_write @@ -2155,7 +2172,12 @@ def write(self, output_file, template, read_subrun=False, in the self.visible_params_to_always_write list and are not user_set or system_set are commented. > If 'add_missing' is False then parameters that should be written_out but are absent - from the template will not be written out.""" + from the template will not be written out. + > use_mg5amc_py8_interface is a flag to indicate that the MG5aMC-PY8 interface is used or not + if not used some parameters need to be translated from the old convention to the new one + """ + + self.use_mg5amc_py8_interface = use_mg5amc_py8_interface # First list the visible parameters visible_param = [p for p in self if p.lower() not in self.hidden_param @@ -2297,7 +2319,16 @@ def group_params(params): else: # Just copy parameters which don't need to be specified if param.lower() not in self.params_to_never_write: - output.write(line) + + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param.strip()] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + output.write('%s=%s\n'%(param_entry,new_value)) + else: + output.write(line) else: output.write('! The following parameter was forced to be commented out by MG5aMC.\n') output.write('! 
%s'%line) @@ -2313,6 +2344,7 @@ def group_params(params): if ((not direct_pythia_input) or (param.lower() in self.visible_params_to_always_write) or (param.lower() in self.user_set) or + (param.lower() in self.hidden_params_to_always_write) or (param.lower() in self.system_set)): template = '%s=%s' else: @@ -2321,6 +2353,19 @@ def group_params(params): # then they shouldn't be passed to Pythia template = '!%s=%s' + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + if 'Main:InternalAnalysis'.lower() in self.user_set and \ + self['Main:InternalAnalysis'].lower() == 'on': + output.write('InternalAnalysis:output = ./djrs.dat\n') + + #elif param in self.interface_to_164.values() and not direct_pythia_input: + # misc.sprint(use_mg5amc_py8_interface, direct_pythia_input,param) + # raise Exception('The parameter %s is not supported in the MG5aMC-PY8 interface. Please use the new interface.'%param_entry output.write(template%(param_entry, value_entry.replace(value,new_value))) @@ -2365,6 +2410,8 @@ def group_params(params): comment = '\n'.join('! %s'%c for c in self.comments[param.lower()].split('\n')) output.write(comment+'\n') + if not use_mg5amc_py8_interface and param in self.interface_to_164: + continue output.write('%s=%s\n'%(param,PY8Card.pythia8_formatting(self[param]))) # Don't close the file if we were reading a subrun, but simply write @@ -3318,7 +3365,6 @@ def retro_compatible_custom_fct(lines, mode=None): for i,line in enumerate(lines[:]): if search and re.search(include_pat, line): name = re.findall(include_pat, line)[0] - misc.sprint('DETECTED INCLUDE', name) if 'vector.inc' in name: search = False if 'run.inc' in name: @@ -3326,7 +3372,6 @@ def retro_compatible_custom_fct(lines, mode=None): search = False sol.append(line) if re.search(function_pat, line): - misc.sprint("DETECTED FCT") search = True return sol @@ -4050,8 +4095,8 @@ def post_set_fixed_fac_scale(card, value, change_userdefine, raiseerror, **opt): if 'fixed_fac_scale2' in card.user_set: card.user_set.remove('fixed_fac_scale2') - # #card['pdlabel1'] = value - # #card['pdlabel2'] = value + dict.__setitem__(card, 'fixed_fac_scale1', card['fixed_fac_scale']) + dict.__setitem__(card, 'fixed_fac_scale2', card['fixed_fac_scale']) @staticmethod def post_set(card, value, change_userdefine, raiseerror, name='unknown', **opt): @@ -4201,6 +4246,7 @@ def default_setup(self): self.add_param("bwcutoff", 15.0) self.add_param("cut_decays", False, cut='d') self.add_param('dsqrt_shat',0., cut=True) + self.add_param('dsqrt_shatmax', -1, cut=True) self.add_param("nhel", 0, include=False) self.add_param("limhel", 1e-8, hidden=True, comment="threshold to determine if an helicity contributes when not MC over helicity.") #pt cut @@ -4451,11 +4497,11 @@ def check_validity(self): time.sleep(5) if self['drjj'] != 0: if 'drjj' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjj\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjj\' to 0') self['drjj'] = 0 if self['drjl'] != 0: if 'drjl' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjl\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjl\' to 0') self['drjl'] = 0 if not self['auto_ptj_mjj']: if self['mmjj'] > self['xqcut']: @@ -4753,7 +4799,6 @@ def create_default_for_process(self, 
proc_characteristic, history, proc_def): self['fixed_fac_scale1'] = True self['nhel'] = 1 for i in beam_id_split[1]: - exit if abs(i) == 11: self['lpp1'] = -math.copysign(3,i) self['lpp2'] = math.copysign(3,i) @@ -5577,6 +5622,9 @@ def default_setup(self): #technical self.add_param('folding', [1,1,1], include=False) + + #bias + self.add_param('flavour_bias',[5,1], hidden=True, comment="Example: '5,100' means that the probability to generate an event with a bottom (or anti-bottom) quark is increased by a factor 100, but the weight of those events is reduced by a factor 100. Requires that the 'event_norm' is set to 'bias'.") #merging self.add_param('ickkw', 0, allowed=[-1,0,3,4], comment=" - 0: No merging\n - 3: FxFx Merging : http://amcatnlo.cern.ch/FxFx_merging.htm\n - 4: UNLOPS merging (No interface within MG5aMC)\n - -1: NNLL+NLO jet-veto computation. See arxiv:1412.8408 [hep-ph]") @@ -5790,6 +5838,17 @@ def check_validity(self): if self['mcatnlo_delta'] and not self['parton_shower'].lower() == 'pythia8': raise InvalidRunCard("MC@NLO-DELTA only possible with matching to Pythia8") + # check that the flavour_bias is consistent + if len(self['flavour_bias']) != 2: + raise InvalidRunCard("'flavour_bias' should contain exactly two numbers: the abs(PDG) of the flavour to enhance, and the enhancement multiplication factor.") + for i in self['flavour_bias']: + if i < 0: + raise InvalidRunCard("flavour and multiplication factor should be positive in the flavour_bias parameter") + if self['flavour_bias'][1] != 1 and self['event_norm'] != 'bias': + logger.warning('Non-trivial flavour enhancement factor: setting event normalisation to "bias"') + self['event_norm']='bias' + + # check that ebeam is bigger than the proton mass. for i in [1,2]: # do not for proton mass if not proton PDF (or when scan initialization) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/check_param_card.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/check_param_card.py index bc785b5de6..a34705f6bc 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/check_param_card.py @@ -1092,11 +1092,11 @@ def write_summary(self, path, order=None, lastline=False, nbcol=20): to_print = self.cross[-1:] for info in to_print: name = info['run_name'] - bench = info['bench'] + bench = [float(x) for x in info['bench']] data = [] for k in keys: if k in info: - data.append(info[k]) + data.append(float(info[k])) else: data.append(0.) 
ff.write(formatting % tuple([name] + bench + data)) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/cluster.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/cluster.py index 95ef45b5f3..66069293d2 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/cluster.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/cluster.py @@ -602,6 +602,7 @@ def __init__(self, *args, **opt): self.submitted = six.moves.queue.Queue() # one entry by job submitted self.stoprequest = threading.Event() #flag to ensure everything to close self.demons = [] + self.gpus_list = [] self.nb_done =0 if 'nb_core' in opt: self.nb_core = opt['nb_core'] @@ -623,23 +624,46 @@ def __init__(self, *args, **opt): self.done_pid_queue = six.moves.queue.Queue() self.fail_msg = None + mg5_gpu_env_str = 'MG5_GPU_VISIBLE_DEVICES' + gpu_variables = [['NVIDIA_VISIBLE_DEVICES', 'CUDA_VISIBLE_DEVICES'], + ['ROCR_VISIBLE_DEVICES', 'HIP_VISIBLE_DEVICES'],] + if mg5_gpu_env_str in os.environ: + new_var = os.environ[mg5_gpu_env_str].split(',') + if len(new_var) == 2: + gpu_variables.insert(0, new_var) + else: + logger.error('Invalid format for %s=%s, it should be a comma-separated list of two elements' % (mg5_gpu_env_str, os.environ[mg5_gpu_env_str])) + for get_var,set_var in gpu_variables: + if get_var in os.environ: + self.gpus_list = os.environ.get(get_var).split(',') + self.gpu_set_var = set_var + self.gpus_count = len(self.gpus_list) + logger.info('Found %s GPUs: %s' % (self.gpus_count, self.gpus_list)) def start_demon(self): import threading - t = threading.Thread(target=self.worker) + env2 = None + if len(self.gpus_list): + env2 = os.environ.copy() + this_gpu_idx = len(self.demons) % self.gpus_count + env2[self.gpu_set_var] = self.gpus_list[this_gpu_idx] + t = threading.Thread(target=self.worker, kwargs={'env2': env2}) + else: + t = threading.Thread(target=self.worker) t.daemon = True t.start() self.demons.append(t) - def worker(self): + def worker(self, env2=None): import six.moves.queue import six.moves._thread while not self.stoprequest.isSet(): try: args = self.queue.get(timeout=10) tag, exe, arg, opt = args + opt['env'] = env2 try: # check for executable case if isinstance(exe,str): diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/common_run_interface.py index 9ff7390cf5..69291df0d4 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/common_run_interface.py @@ -750,8 +750,8 @@ def __init__(self, me_dir, options, *args, **opts): else: self.ninitial = self.proc_characteristics['ninitial'] - def make_make_all_html_results(self, folder_names = [], jobs=[]): - return sum_html.make_all_html_results(self, folder_names, jobs) + def make_make_all_html_results(self, folder_names = [], jobs=[], get_attr=None): + return sum_html.make_all_html_results(self, folder_names, jobs, get_attr) def write_RunWeb(self, me_dir): @@ -1463,11 +1463,15 @@ def create_plot(self, mode='parton', event_path=None, output=None, tag=None): self.run_name, '%s_pts.dat' % tag) for observable_name, data_path in [('djr',djr_path), ('pt',pt_path)]: - if not self.generate_Pythia8_HwU_plots( + try: + if not self.generate_Pythia8_HwU_plots( PY8_plots_root_path, merging_scale_name, observable_name,data_path): - return False - + return False + except Exception as error: + if os.path.exists(data_path): + logger.info('plot information present in %s' % data_path) + return True if mode == 'Pythia8': 
plot_files = glob.glob(pjoin(PY8_plots_root_path,'*.gnuplot')) if not misc.which('gnuplot'): @@ -1964,12 +1968,16 @@ def do_systematics(self, line): self.cluster.wait(os.path.dirname(output), update_status, update_first=update_status) except Exception: self.cluster.remove() + for i in range(nb_submit): + os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) old_run_mode = self.options['run_mode'] self.options['run_mode'] =0 + out =False try: out = self.do_systematics(line) finally: self.options['run_mode'] = old_run_mode + return out #collect the data all_cross = [] for i in range(nb_submit): @@ -1995,18 +2003,21 @@ def do_systematics(self, line): self.run_card['event_norm'] in ['unity']: all_cross= [cross/nb_event for cross in all_cross] - sys_obj = systematics.call_systematics([input, None] + opts, - log=lambda x: logger.info(str(x)), - result=result_file, - running=False - ) + + sys_obj = systematics.call_systematics([input, None] + opts, + log=lambda x: logger.info(str(x)), + result=result_file, + running=False + ) + sys_obj.print_cross_sections(all_cross, nb_event, result_file) - + #concatenate the output file subprocess.call(['cat']+\ ['./tmp_%s_%s' % (i, os.path.basename(output)) for i in range(nb_submit)], stdout=open(output,'w'), cwd=os.path.dirname(output)) + for i in range(nb_submit): os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) # os.remove('%s/log_sys_%s.txt' % (os.path.dirname(output),i)) @@ -3831,7 +3842,7 @@ def store_scan_result(self): """return the information that need to be kept for the scan summary. Auto-width are automatically added.""" - return {'cross': self.results.current['cross']} + return {'cross': self.results.current['cross'], 'error': self.results.current['error']} def add_error_log_in_html(self, errortype=None): @@ -5135,10 +5146,10 @@ def init_run(self, cards): self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), - 'lhc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), - 'lcc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), @@ -6740,7 +6751,15 @@ def postcmd(self, stop, line): return ending_question - + def help_update(self): + logger.info(""" syntax: update dependent: Change the mass/width of particles which are not free parameter for the model. + update missing: add to the current param_card missing blocks/parameters. + update to_slha1: pass SLHA2 card to SLHA1 convention. (beta) + update to_slha2: pass SLHA1 card to SLHA2 convention. 
(beta) + update to_full [run_card] + update XXX [where XXX corresponds to a hidden block of the run_card]: + supported blocks are %s + """, ', '.join(self.update_block)) def do_update(self, line, timer=0): @@ -6756,6 +6775,8 @@ def do_update(self, line, timer=0): logger.warning('missing an argument (dependent or missing). Please retry') return + args[0] = args[0].lower() + if args[0] == 'dependent': if not self.mother_interface: logger.warning('Failed to update dependent parameter. This might create trouble for external programs (like MadSpin/shower/...)') @@ -6805,10 +6826,11 @@ def do_update(self, line, timer=0): self.modified_card.add('run') # delay writing of the run_card logger.info('add optional block %s to the run_card', args[0]) else: - self.help_update() + self.do_help('update') logger.warning('invalid options for update command. Please retry') + def update_to_full(self, line): """ trigger via update to_full LINE""" diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/extended_cmd.py index 789976beee..c321fd88e5 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/extended_cmd.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/extended_cmd.py @@ -1317,6 +1317,8 @@ def nice_error_handling(self, error, line): debug_file = open(self.debug_output, 'a') traceback.print_exc(file=debug_file) + if __debug__: + traceback.print_exc() if hasattr(error, 'filename'): debug_file.write("Related File: %s\n" % error.filename) # Create a nice error output @@ -1928,7 +1930,8 @@ def do_display(self, line, output=sys.stdout): for i, name in enumerate(split): try: __import__('.'.join(split[:i+1])) - exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1]))) + tmp = {} + exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1])), globals(),tmp) except ImportError: try: var = eval(args[1]) @@ -1939,7 +1942,7 @@ def do_display(self, line, output=sys.stdout): outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) else: - var = eval(args[1]) + var = eval(args[1], globals(), tmp) outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/file_writers.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/file_writers.py index 526756129f..74ba0d195c 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/file_writers.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/file_writers.py @@ -140,10 +140,6 @@ def preprocess_template(self, input_lines, context={}): else: raise self.FileWriterError("%s not string" % repr(input_lines)) - # Setup the contextual environment - for contextual_variable, value in context.items(): - exec('%s=%s'%(str(contextual_variable),repr(value))) - res = [] # The variable below tracks the conditional statements structure if_stack = [] @@ -166,7 +162,7 @@ def preprocess_template(self, input_lines, context={}): # Treat an if statement elif preproc_command.group('command')=='if': try: - if_stack.append(eval(preproc_command.group('body'))==True) + if_stack.append(eval(preproc_command.group('body'), globals(), context)==True) except Exception as e: raise self.FilePreProcessingError('Could not evaluate'+\ "python expression '%s' given the context %s provided."%\ diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/files.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/files.py index 551b71ddb6..3061b007e7 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/files.py +++ 
b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/files.py @@ -147,9 +147,14 @@ def cp(path1, path2, log=True, error=False): path2 = format_path(path2) try: shutil.copy(path1, path2) + except shutil.Error as why: + logger.debug('no cp since identical: %s', why) + return except IOError as why: import madgraph.various.misc as misc try: + if 'same file' in str(why): + return if os.path.exists(path2): path2 = os.path.join(path2, os.path.split(path1)[1]) misc.copytree(path1, path2) @@ -157,12 +162,10 @@ if error: raise if log: - logger.warning(why) + logger.warning("fail to cp %s %s: %s", path1, path2, why) else: - misc.sprint("fail to cp", why) - except shutil.Error: - # idetical file - pass + misc.sprint("fail to cp", path1, path2, why) + def rm(path, log=True): """removes path, that can be a single element or a list""" diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_cardhtml-pl b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_cardhtml-pl index 1810c6c082..6e0e06533d 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_cardhtml-pl +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_cardhtml-pl @@ -137,7 +137,7 @@ until($listpos>$#incard){ print PAGE " Model: $model \n"; print PAGE " \n \n
\n"; print PAGE " \n"; - print PAGE "\"\" \n"; + print PAGE "\"\" \n"; print PAGE "
\n"; print PAGE " \n \n \n"; print PAGE " \n"; diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_crossxhtml.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_crossxhtml.py index 681bf9d09b..3114a4350c 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_crossxhtml.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_crossxhtml.py @@ -133,7 +133,7 @@ class AllResults(dict): web = False - _run_entries = ['cross', 'error','nb_event_pythia','run_mode','run_statistics', + _run_entries = ['cross', 'error','axsec','nb_event_pythia','run_mode','run_statistics', 'nb_event','cross_pythia','error_pythia', 'nb_event_pythia8','cross_pythia8','error_pythia8', 'shower_dir'] diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_jpeg-pl b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_jpeg-pl index 87d03da394..31b7e9fe55 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_jpeg-pl +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_jpeg-pl @@ -1,16 +1,16 @@ #!/usr/bin/perl -w #--------------------------------------------------------------------- -# Run GS to create jpeg files defined as $gs +# Run GS to create PNG files defined as $gs #--------------------------------------------------------------------- -system("/bin/bash -c \"rm -f matrix*.jpg\" "); +system("/bin/bash -c \"rm -f matrix*.png\" "); $imatrix = ""; if (! -e "matrix.ps") {$imatrix = 1;} -$max_jpg = 2; -if ($imatrix eq "") {$max_jpg = 5;} -# add 1 to max_jpg, to get max_jpg pages -$max_jpg += 1; +$max_png = 2; +if ($imatrix eq "") {$max_png = 5;} +# add 1 to max_png, to get max_png pages +$max_png += 1; open(PAGE,"> diagrams.html") || die "Error creating diagrams.html"; print PAGE "\ \n"; print PAGE "\ \n"; @@ -21,22 +21,22 @@ while ( -e "matrix$imatrix.ps"){ open(IN, "< matrix$imatrix.ps") || die "No file matrix$imatrix.ps"; open(OUT, "> matrix-1.ps") || die "Could not open file matrix-1.ps"; while () { - if ($_ =~ m/^%%Page: $max_jpg $max_jpg/) {last;} + if ($_ =~ m/^%%Page: $max_png $max_png/) {last;} else {print OUT $_, "\n";} } close(OUT); close(IN); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=matrix$imatrix\%00d.jpg \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-r150 \-sOutputFile\=matrix$imatrix\%00d.png \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; system "rm -f matrix-1.ps"; -# Determine how many jpg files we have +# Determine how many png files we have $pages=1; - while(-e "matrix$imatrix$pages.jpg"){ + while(-e "matrix$imatrix$pages.png"){ $pages++; }#end of while #reduce it by one - if ($pages > $max_jpg){ + if ($pages > $max_png){ $pages -= 1; } # Find name of process @@ -45,24 +45,24 @@ while ( -e "matrix$imatrix.ps"){ if ($proc =~ /Process: (.+?)(\s\w+=\d+)*$/) { $proc = $1; } print PAGE "
<BR><BR> To save bandwidth not all diagrams were converted to jpeg."; + if (-e "matrix$imatrix$max_png.png" ) { + print PAGE "<BR><BR> To save bandwidth not all diagrams were converted to PNG."; print PAGE "<BR><BR>
To view all diagrams click on "; print PAGE "\ postscript. \<\/A\> \ \n"; # # Delete files which aren't included in diagrams.html # - system ("/bin/bash -c \"rm -f matrix$max_jpg.jpg\" "); + system ("/bin/bash -c \"rm -f matrix$max_png.png\" "); } # -# Now create jpeg file for card +# Now create PNG file for card # - if (! -e "../../HTML/card.jpg") { + if (! -e "../../HTML/card.png") { system ("/bin/bash -c \"head -352 matrix$imatrix.ps >& junk.ps\" "); open(JUNK,">> junk.ps") || die "Error opening junk.ps"; @@ -72,7 +72,7 @@ while ( -e "matrix$imatrix.ps"){ system ("/bin/bash -c \"cat matrix$imatrix.ps | sed 1,352d >> junk.ps\" "); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=card.jpg \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.jpg ../../HTML/card.jpg > /dev/null\" "; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-sOutputFile\=card.png \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.png ../../HTML/card.png > /dev/null\" "; } if ($imatrix eq "") {$imatrix = 0;} $imatrix = $imatrix + 1; @@ -82,3 +82,4 @@ print PAGE "\n"; print PAGE "\<\/BODY\> \n"; print PAGE "\<\/HTML\> \n"; close(PAGE); + diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_ximprove.py index 415ecc9de0..d5d7fc8faf 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/gen_ximprove.py @@ -30,6 +30,7 @@ import stat import sys import six +import time from six.moves import range from six.moves import zip @@ -304,6 +305,7 @@ def get_helicity(self, to_submit=True, clean=True): logger.debug('(%s) nb_hel: %s zero amp: %s bad_amps_hel: %s/%s', split_file[-1], len(good_hels),len(bad_amps),len(bad_amps_perhel), len(good_hels)*nb_amp ) if len(good_hels) == 1: files.cp(matrix_file, matrix_file.replace('orig','optim')) + files.cp(matrix_file.replace('.f','.o'), matrix_file.replace('orig','optim').replace('.f','.o')) continue # avoid optimization if onlye one helicity gauge = self.cmd.proc_characteristics['gauge'] @@ -1059,6 +1061,7 @@ def __init__(self, cmd, opt=None): # parameter for the gridpack run self.nreq = 2000 self.iseed = 4321 + self.maxevts = 2500 # placeholder for information self.results = 0 #updated in launch/update_html @@ -1200,6 +1203,10 @@ def reset_multijob(self): def write_multijob(self, Channel, nb_split): """ """ if nb_split <=1: + try: + os.remove(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat')) + except OSError: + pass return f = open(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat'), 'w') f.write('%i\n' % nb_split) @@ -1828,17 +1835,17 @@ class gen_ximprove_gridpack(gen_ximprove_v4): max_request_event = 1e12 # split jobs if a channel if it needs more than that max_event_in_iter = 4000 min_event_in_iter = 500 - combining_job = sys.maxsize gen_events_security = 1.00 - def __new__(cls, *args, **opts): + def __new__(cls, cmd, opts): cls.force_class = 'gridpack' - return super(gen_ximprove_gridpack, cls).__new__(cls, *args, **opts) + return super(gen_ximprove_gridpack, cls).__new__(cls, cmd, opts) - def __init__(self, *args, **opts): + def __init__(self, cmd, opts): self.ngran = -1 + self.nprocs = 1 self.gscalefact = {} self.readonly = False if 'ngran' in opts: @@ -1846,9 +1853,18 @@ def __init__(self, *args, **opts): # del opts['ngran'] if 'readonly' in opts: self.readonly = opts['readonly'] - super(gen_ximprove_gridpack,self).__init__(*args, **opts) + if 
'nprocs' in opts: + self.nprocs = int(opts['nprocs']) + if 'maxevts' in opts and self.nprocs > 1: + self.max_request_event = int(opts['maxevts']) + super(gen_ximprove_gridpack,self).__init__(cmd, opts) if self.ngran == -1: self.ngran = 1 + + if self.nprocs > 1: + self.combining_job = 0 + else: + self.combining_job = sys.maxsize def find_job_for_event(self): """return the list of channel that need to be improved""" @@ -1876,8 +1892,8 @@ def find_job_for_event(self): continue # no event to generate events self.gscalefact[tag] = max(1, 1/(goal_lum * C.get('axsec')/ self.ngran)) #need to generate events - logger.debug('request events for ', C.get('name'), 'cross=', - C.get('axsec'), 'needed events = ', goal_lum * C.get('axsec')) + logger.debug('request events for %s cross=%d needed events = %d', + C.get('name'), C.get('axsec'), goal_lum * C.get('axsec')) to_refine.append(C) logger.info('need to improve %s channels' % len(to_refine)) @@ -1897,8 +1913,13 @@ def get_job_for_event(self): for C in to_refine: #1. Compute the number of points are needed to reach target needed_event = max(goal_lum*C.get('axsec'), self.ngran) - nb_split = 1 - + nb_split = int(max(1,((needed_event-1)// self.max_request_event) +1)) + if not self.split_channels: + nb_split = 1 + if nb_split > self.max_splitting: + nb_split = self.max_splitting + nb_split=max(1, nb_split) + #2. estimate how many points we need in each iteration if C.get('nunwgt') > 0: nevents = needed_event / nb_split * (C.get('nevents') / C.get('nunwgt')) @@ -1908,13 +1929,16 @@ def get_job_for_event(self): nevents = self.max_event_in_iter if nevents < self.min_event_in_iter: + nb_split = int(nb_split * nevents / self.min_event_in_iter) + 1 # sr dangerous? nevents = self.min_event_in_iter # # forbid too low/too large value nevents = max(self.min_event_in_iter, min(self.max_event_in_iter, nevents)) logger.debug("%s : need %s event. Need %s split job of %s points", C.name, needed_event, nb_split, nevents) - + # write the multi-job information + self.write_multijob(C, nb_split) + #create the info dict assume no splitting for the default info = {'name': self.cmd.results.current['run_name'], 'script_name': 'unknown', @@ -1925,7 +1949,7 @@ def get_job_for_event(self): 'nevents': nevents, #int(nevents*self.gen_events_security)+1, 'maxiter': self.max_iter, 'miniter': self.min_iter, - 'precision': -1*int(needed_event)/C.get('axsec'), + 'precision': -goal_lum/nb_split, # -1*int(needed_event)/C.get('axsec'), 'requested_event': needed_event, 'nhel': self.run_card['nhel'], 'channel': C.name.replace('G',''), @@ -1938,27 +1962,59 @@ def get_job_for_event(self): basedir = pjoin(os.path.dirname(__file__), '..','..','SubProcesses', info['P_dir'], info['directory']) info['base_directory'] = basedir - jobs.append(info) - + if nb_split == 1: + jobs.append(info) + else: + for i in range(nb_split): + new_info = dict(info) + new_info['offset'] = i+1 + new_info['directory'] += self.alphabet[i % 26] + str((i+1)//26) + new_info['base_directory'] = info['directory'] + jobs.append(new_info) write_dir = '.' 
if self.readonly else None self.create_ajob(pjoin(self.me_dir, 'SubProcesses', 'refine.sh'), jobs, write_dir) + if self.nprocs > 1: + nprocs_cluster = cluster.MultiCore(nb_core=self.nprocs) + gridpack_start = time.time() + def gridpack_wait_monitoring(Idle, Running, Done): + if Idle+Running+Done == 0: + return + logger.info("Gridpack event generation: %s Idle, %s Running, %s Done [%s]" + % (Idle, Running, Done, misc.format_time(time.time()-gridpack_start))) + done = [] for j in jobs: - if j['P_dir'] in done: - continue - done.append(j['P_dir']) + if self.nprocs == 1: + if j['P_dir'] in done: + continue + done.append(j['P_dir']) + # Give a little status. Sometimes these jobs run very long, and having hours without any + # console output can be a bit frightening and make users think we are looping. + if len(done)%5==0: + logger.info(f"Working on job {len(done)} of {len(jobs)}") + # set the working directory path. pwd = pjoin(os.getcwd(),j['P_dir']) if self.readonly else pjoin(self.me_dir, 'SubProcesses', j['P_dir']) - exe = pjoin(pwd, 'ajob1') + exe = pjoin(pwd, j['script_name']) st = os.stat(exe) os.chmod(exe, st.st_mode | stat.S_IEXEC) # run the code\ - cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + if self.nprocs == 1: + cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + else: + nprocs_cluster.cluster_submit(exe, cwd=pwd, packet_member=j['packet']) write_dir = '.' if self.readonly else pjoin(self.me_dir, 'SubProcesses') + if self.nprocs > 1: + nprocs_cluster.wait(self.me_dir, gridpack_wait_monitoring) + + if self.readonly: + combine_runs.CombineRuns(write_dir) + else: + combine_runs.CombineRuns(self.me_dir) self.check_events(goal_lum, to_refine, jobs, write_dir) def check_events(self, goal_lum, to_refine, jobs, Sdir): diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/hel_recycle.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/hel_recycle.py index 1471de4bcb..978ba6575e 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/hel_recycle.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/hel_recycle.py @@ -550,7 +550,7 @@ def get_jamp_lines(self, line): def get_amp2_lines(self, line): if line.startswith(' DO I = 1, NCOLOR'): self.in_amp2 = False - elif not line.isspace(): + elif not line.isspace() and 'DENOM' not in line: self.template_dict['amp2_lines'] += f'{line[0:6]} {self.add_indices(line[6:])}' def prepare_bools(self): diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/histograms.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/histograms.py index 51ae2914fc..6fbdd98100 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/histograms.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/histograms.py @@ -1149,11 +1149,8 @@ def parse_one_histo_from_stream(self, stream, all_weight_header, boundaries = [0.0,0.0] for j, weight in \ enumerate(HwU.histo_bin_weight_re.finditer(line_bin)): - if (j == len(weight_header)): - continue - if j == len(all_weight_header): - raise HwU.ParseError("There is more bin weights"+\ - " specified than expected (%i)"%len(weight_header)) + #if (j == len(weight_header)): + # continue if selected_central_weight == all_weight_header[j]: bin_weights['central'] = float(weight.group('weight')) if all_weight_header[j] == 'boundary_xmin': @@ -1858,6 +1855,8 @@ def parse_histos_from_PY8_XML_stream(self, stream, run_id=None, # If merging cut is negative, then pick only the one of the central scale # If not specified, then take them all but use the PDF and scale weight # of the central 
merging_scale for the variation. + if not all_weights: + raise MadGraph5Error('No weights were found in the HwU XML source.') if merging_scale is None or merging_scale < 0.0: merging_scale_chosen = all_weights[2]['MERGING'] else: @@ -2480,14 +2479,14 @@ def get_main_central_plot_lines(HwU_name, block_position, color_index, # return [template_no_stat%rep_dic]+\ # ([template%rep_dic] if show_mc_uncertainties else []) - # The use of sqrt(-1) is just a trick to prevent the line to display + # The use of 1/0 is just a trick to prevent the line to display res = [] - rep_dic['data'] = '($3 < 0 ? sqrt(-1) : $3)' + rep_dic['data'] = '($3 < 0 ? 1/0 : $3)' res.append(template_no_stat%rep_dic) rep_dic['title'] = " title ''" if show_mc_uncertainties: res.append(template%rep_dic) - rep_dic['data'] = '($3 >= 0 ? sqrt(-1) : abs($3))' + rep_dic['data'] = '($3 >= 0 ? 1/0 : abs($3))' rep_dic['ls'] = ' ls %d'%(100+color_index) res.append(template_no_stat%rep_dic) if show_mc_uncertainties: @@ -2739,13 +2738,13 @@ def ratio_no_correlations(wgtsA, wgtsB): """#-- rendering subhistograms '%(subhistogram_type)s' %(unset label)s %(set_format_y)s +%(set_yscale)s set yrange [%(ymin).4e:%(ymax).4e] set origin %(origin_x).4e, %(origin_y).4e set size %(size_x).4e, %(size_y).4e set mytics %(mytics)d %(set_ytics)s %(set_format_x)s -%(set_yscale)s %(set_ylabel)s %(set_histo_label)s plot \\""" @@ -2878,7 +2877,7 @@ def ratio_no_correlations(wgtsA, wgtsB): # We decide to show uncertainties in the main plot only if they # are part of a monocolor band. Otherwise, they will only be - # shown in the first subplot. Notice that plotting 'sqrt(-1)' + # shown in the first subplot. Notice that plotting '1/0' # is just a trick so as to have only the key printed with no # line @@ -2890,7 +2889,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, scale variation'%title, band='scale' in use_band) else: uncertainty_plot_lines[-1]['scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] # And now PDF_variation if available if not PDF_var_pos is None and len(PDF_var_pos)>0: if 'pdf' in use_band: @@ -2899,7 +2898,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, PDF variation'%title, band='pdf' in use_band) else: uncertainty_plot_lines[-1]['pdf'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] # And now merging variation if available if not merging_var_pos is None and len(merging_var_pos)>0: if 'merging_scale' in use_band: @@ -2908,7 +2907,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, merging scale variation'%title, band='merging_scale' in use_band) else: uncertainty_plot_lines[-1]['merging_scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] # And now alpsfact variation if available if not alpsfact_var_pos is None and len(alpsfact_var_pos)>0: if 'alpsfact' in use_band: @@ -2917,7 +2916,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, alpsfact variation'%title, band='alpsfact' in use_band) else: uncertainty_plot_lines[-1]['alpsfact'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] # plot_lines.append( # "'%s' index %d using (($1+$2)/2):3 ls %d title '%s'"\ diff --git 
a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/launch_plugin.py index 0924927785..262d39a736 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/launch_plugin.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Aug 2023) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. import logging import os @@ -33,7 +33,7 @@ def compile(self, *args, **opts): if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') common_run_interface.CommonRunCmd.update_make_opts_full(path, - {'FPTYPE': self.run_card['floating_type'] }) + {'override FPTYPE': self.run_card['floating_type'] }) misc.sprint('FPTYPE checked') cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): @@ -76,7 +76,7 @@ def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + common_run_interface.CommonRunCmd.update_make_opts_full({'override FPTYPE': new_value}) else: raise Exception Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') @@ -133,7 +133,8 @@ def default_setup(self): super().default_setup() # change default value: self['cudacpp_backend'] = 'cuda' - self['vector_size'] = 16384 # already setup in default class (just change value) + self['vector_size'] = 32 # ZW: default to 32, might want to change to 64 to utilise AMD GPUs better as well # 16384 # already setup in default class (just change value) + self['nb_warp'] = 512 # number of warps per kernel call, for now setting to 16 384 / vector_size MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/lhe_parser.py index f6e47956cd..81d17f7cb1 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/lhe_parser.py @@ -1035,12 +1035,12 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): from_init = True if not from_init: - if group in grouped_cross: - grouped_cross[group] += self.allcross[i] - grouped_error[group] += self.error[i]**2 + if int(group) in grouped_cross: + grouped_cross[int(group)] += self.allcross[i] + grouped_error[int(group)] += self.error[i]**2 else: - grouped_cross[group] = self.allcross[i] - grouped_error[group] = self.error[i]**2 + grouped_cross[int(group)] = self.allcross[i] + grouped_error[int(group)] = self.error[i]**2 else: ban = banner_mod.Banner(ff.banner) for line in ban['init'].split('\n'): @@ -1048,11 +1048,11 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): if len(splitline)==4: cross, error, _, group = splitline if int(group) in grouped_cross: - grouped_cross[group] += float(cross) - grouped_error[group] += float(error)**2 + grouped_cross[int(group)] += float(cross) + grouped_error[int(group)] += float(error)**2 else: 
- grouped_cross[group] = float(cross) - grouped_error[group] = float(error)**2 + grouped_cross[int(group)] = float(cross) + grouped_error[int(group)] = float(error)**2 nb_group = len(grouped_cross) # compute the information for the first line @@ -1086,6 +1086,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): self.seek(0) if init_information["idbmup2"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup2"] = event[1].pdg self.seek(0) @@ -2166,10 +2168,13 @@ def check(self): abspz += abs(particle.pz) # check mass fourmass = FourMomentum(particle).mass - - if particle.mass and (abs(particle.mass) - fourmass)/ abs(particle.mass) > threshold: - raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) - + if particle.mass: + expected = (particle.E - math.sqrt(particle.E**2 -particle.mass**2))/particle.E + if expected > 1e-8: + mass_threshold = particle.E**2 - (particle.E-threshold)**2 + if (abs(particle.mass) - fourmass)/ mass_threshold > 5: + raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) + if E/absE > threshold: logger.critical(self) @@ -2953,8 +2958,8 @@ def pt(self): @property def pseudorapidity(self): - norm = math.sqrt(self.px**2 + self.py**2+self.pz**2) - return 0.5* math.log((norm - self.pz) / (norm + self.pz)) + norm = math.sqrt(self.px**2 + self.py**2 + self.pz**2) + return 0.5* math.log((norm + self.pz) / (norm - self.pz)) @property def rapidity(self): diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/madevent_interface.py index 85e5bcf5e3..4d5597c722 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/madevent_interface.py @@ -1171,10 +1171,10 @@ def check_survey(self, args, cmd='survey'): for opt,value in self._survey_options.items(): if arg.startswith('--%s=' % opt): exec('self.opts[\'%s\'] = %s(arg.split(\'=\')[-1])' % \ - (opt, value[0])) + (opt, value[0]), globals(), {'self':self, 'arg':arg}) arg = "" if arg != "": raise Exception - except Exception: + except Exception as error: self.help_survey() raise self.InvalidCmd('invalid %s argument'% arg) @@ -2827,10 +2827,10 @@ def print_results_in_shell(self, data): logger.info(" Nb of events after matching/merging : %d" % int(data['nb_event_pythia'])) if self.run_card['use_syst'] in self.true and \ (int(self.run_card['ickkw'])==1 or self.run_card['ktdurham']>0.0 - or self.run_card['ptlund']>0.0): + or self.run_card['ptlund']>0.0) and data['cross_pythia'] == -1: logger.info(" Notice that because Systematics computation is turned on, the merging did not veto events but modified their weights instead.\n"+\ " The resulting hepmc/stdhep file should therefore be use with those weights.") - else: + elif data['cross_pythia'] == -1: logger.info(" Nb of events after merging : %s" % data['nb_event_pythia']) logger.info(" " ) @@ -3055,6 +3055,7 @@ def do_multi_run(self, line): crossoversig = 0 inv_sq_err = 0 nb_event = 0 + madspin = False for i in range(nb_run): self.nb_refine = 0 self.exec_cmd('generate_events %s_%s -f' % (main_name, i), postcmd=False) @@ -3067,6 +3068,8 @@ def do_multi_run(self, line): inv_sq_err+=1.0/error**2 self.results[main_name][-1]['cross'] = crossoversig/inv_sq_err 
self.results[main_name][-1]['error'] = math.sqrt(1.0/inv_sq_err) + if 'decayed' in self.run_name: + madspin = True self.results.def_current(main_name) self.run_name = main_name self.update_status("Merging LHE files", level='parton') @@ -3074,9 +3077,12 @@ def do_multi_run(self, line): os.mkdir(pjoin(self.me_dir,'Events', self.run_name)) except Exception: pass - os.system('%(bin)s/merge.pl %(event)s/%(name)s_*/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' + + os.system('%(bin)s/merge.pl %(event)s/%(name)s_*%(madspin)s/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' % {'bin': self.dirbin, 'event': pjoin(self.me_dir,'Events'), - 'name': self.run_name}) + 'name': self.run_name, + 'madspin': '_decayed_*' if madspin else '' + }) eradir = self.options['exrootanalysis_path'] if eradir and misc.is_executable(pjoin(eradir,'ExRootLHEFConverter')): @@ -3656,9 +3662,11 @@ def do_refine(self, line): else: self.refine_mode = "new" - cross, error = self.make_make_all_html_results() + cross, error, across = self.make_make_all_html_results(get_attr=('xsec','xerru','axsec')) + self.results.add_detail('cross', cross) self.results.add_detail('error', error) + self.results.add_detail('axsec', across) self.results.add_detail('run_statistics', dict(self.results.get_detail('run_statistics'))) @@ -3757,6 +3765,8 @@ def split(a, n): k, m = divmod(len(a), n) return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + Gdirs = self.remove_empty_events(Gdirs) + partials_info = [] if len(Gdirs) >= max_G: start_unweight= time.perf_counter() @@ -3786,7 +3796,7 @@ def split(a, n): for i, local_G in enumerate(split(Gdirs, nb_chunk)): line = [pjoin(self.me_dir, "Events", self.run_name, "partials%d.lhe.gz" % i)] line.append(pjoin(self.me_dir, 'Events', self.run_name, '%s_%s_banner.txt' % (self.run_name, tag))) - line.append(str(self.results.current['cross'])) + line.append(str(self.results.current.get('axsec'))) line += local_G partials_info.append(self.do_combine_events_partial(' '.join(line), preprocess_only=True)) mycluster.submit(sys.executable, @@ -4223,7 +4233,7 @@ def mg5amc_py8_interface_consistency_warning(options): return None - def setup_Pythia8RunAndCard(self, PY8_Card, run_type): + def setup_Pythia8RunAndCard(self, PY8_Card, run_type, use_mg5amc_py8_interface): """ Setup the Pythia8 Run environment and card. In particular all the process and run specific parameters of the card are automatically set here. This function returns the path where HEPMC events will be output, if any.""" @@ -4338,10 +4348,10 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.systemSet('Beams:setProductionScalesFromLHEF',True) # Automatically set qWeed to xqcut if not defined by the user. 
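For the lhe_parser.py pseudorapidity fix a few hunks above: the corrected expression is eta = 0.5*ln((|p|+pz)/(|p|-pz)), which is equivalent to -ln(tan(theta/2)); the old code had the ratio inverted and so returned -eta. A short self-contained check of the corrected formula (the helper below is illustrative, not the class property itself):

    import math

    def pseudorapidity(px, py, pz):
        # eta = 0.5 * ln((|p| + pz) / (|p| - pz))
        norm = math.sqrt(px**2 + py**2 + pz**2)
        return 0.5 * math.log((norm + pz) / (norm - pz))

    # cross-check against the equivalent definition -ln(tan(theta/2))
    px, py, pz = 1.0, 2.0, 3.0
    theta = math.acos(pz / math.sqrt(px**2 + py**2 + pz**2))
    assert abs(pseudorapidity(px, py, pz) + math.log(math.tan(theta / 2))) < 1e-12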
- if PY8_Card['SysCalc:qWeed']==-1.0: + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qWeed']==-1.0: PY8_Card.MadGraphSet('SysCalc:qWeed',self.run_card['xqcut'], force=True) - if PY8_Card['SysCalc:qCutList']=='auto': + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qCutList']=='auto': if self.run_card['use_syst']: if self.run_card['sys_matchscale']=='auto': qcut = PY8_Card['JetMatching:qCut'] @@ -4368,7 +4378,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): # Specific MLM settings # PY8 should not implement the MLM veto since the driver should do it # if merging scale variation is turned on - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('JetMatching:doVeto',False) @@ -4444,7 +4454,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.MadGraphSet('SpaceShower:pTmaxMatch',1) PY8_Card.MadGraphSet('SpaceShower:rapidityOrder',False) # PY8 should not implement the CKKW veto since the driver should do it. - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('Merging:applyVeto',False) @@ -4516,6 +4526,12 @@ def do_pythia8(self, line): else: no_default = False + if '--old_interface' in args: + use_mg5amc_py8_interface = True + args.remove('--old_interface') + else: + use_mg5amc_py8_interface = False + if not self.run_name: self.check_pythia8(args) self.configure_directory(html_opening =False) @@ -4545,20 +4561,27 @@ def do_pythia8(self, line): #"Please use 'event_norm = average' in the run_card to avoid this problem.") - - if not self.options['mg5amc_py8_interface_path'] or not \ - os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface')): - raise self.InvalidCmd( -"""The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. -Please install this tool with the following MG5_aMC command: - MG5_aMC> install mg5amc_py8_interface_path""") + if use_mg5amc_py8_interface: + if not self.options['mg5amc_py8_interface_path'] or not \ + os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface')): + raise self.InvalidCmd( + """The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. + Please install this tool with the following MG5_aMC command: + MG5_aMC> install mg5amc_py8_interface_path""") + else: + pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface') + warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) + if warnings: + logger.warning(warnings) else: - pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface') - warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) - if warnings: - logger.warning(warnings) + pythia_main = pjoin(self.options['pythia8_path'], 'share', 'Pythia8', 'examples', 'main164') + if not os.path.exists(pythia_main): + pythia_main = pjoin(self.options['pythia8_path'], 'examples', 'main164') + if not os.path.exists(pythia_main): + logger.warning('main164 not found (or not compiled). 
Will try the old interface instead.') + return self.do_pythia8(line + ' --old_interface') self.results.add_detail('run_mode', 'madevent') @@ -4583,14 +4606,19 @@ def do_pythia8(self, line): run_type = 'CKKW' # Edit the card and run environment according to the run specification - HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type) + HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type, use_mg5amc_py8_interface=use_mg5amc_py8_interface) + + if not use_mg5amc_py8_interface and self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + PY8_Card['Main:numberOfEvents']= self.run_card['nevents'] + # Now write the card. pythia_cmd_card = pjoin(self.me_dir, 'Events', self.run_name , '%s_pythia8.cmd' % tag) cmd_card = StringIO.StringIO() PY8_Card.write(cmd_card,pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Now setup the preamble to make sure that everything will use the locally # installed tools (if present) even if the user did not add it to its @@ -4632,7 +4660,7 @@ def do_pythia8(self, line): " command '/usr/bin/env %s' exists and returns a valid path."%shell) exe_cmd = "#!%s\n%s"%(shell_exe,' '.join( - [preamble+pythia_main, + [preamble+pythia_main, '' if use_mg5amc_py8_interface else '-c', os.path.basename(pythia_cmd_card)])) wrapper.write(exe_cmd) @@ -4699,6 +4727,7 @@ def do_pythia8(self, line): n_cores = max(min(min_n_core,n_cores),1) if self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + # No need for parallelization anymore self.cluster = None logger.info('Follow Pythia8 shower by running the '+ @@ -4744,20 +4773,22 @@ def do_pythia8(self, line): ParallelPY8Card.subruns[0].systemSet('Beams:LHEF','events.lhe.gz') ParallelPY8Card.write(pjoin(parallelization_dir,'PY8Card.dat'), pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Write the wrapper wrapper_path = pjoin(parallelization_dir,'run_PY8.sh') wrapper = open(wrapper_path,'w') if self.options['cluster_temp_path'] is None: exe_cmd = \ -"""#!%s -./%s PY8Card.dat >& PY8_log.txt -""" +"""#!%%s +./%%s %s PY8Card.dat >& PY8_log.txt +""" % ('' if use_mg5amc_py8_interface else '-c') + else: exe_cmd = \ -"""#!%s +"""#!%%s ln -s ./events_$1.lhe.gz ./events.lhe.gz -./%s PY8Card_$1.dat >& PY8_log.txt +./%%s %s PY8Card_$1.dat >& PY8_log.txt mkdir split_$1 if [ -f ./events.hepmc ]; then @@ -4776,7 +4807,7 @@ def do_pythia8(self, line): mv ./PY8_log.txt ./split_$1/ fi tar -czf split_$1.tar.gz split_$1 -""" +""" % ('' if use_mg5amc_py8_interface else '-c') exe_cmd = exe_cmd%(shell_exe,os.path.basename(pythia_main)) wrapper.write(exe_cmd) wrapper.close() @@ -4812,19 +4843,27 @@ def do_pythia8(self, line): pjoin(parallelization_dir,split_files[-1])) logger.info('Submitting Pythia8 jobs...') + for i, split_file in enumerate(split_files): # We must write a PY8Card tailored for each split so as to correct the normalization # HEPMCoutput:scaling of each weight since the lhe showered will not longer contain the # same original number of events - split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat')) + split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat'), setter='user') + assert split_PY8_Card['JetMatching:nJetMax'] == PY8_Card['JetMatching:nJetMax'] + + + # Make sure to 
use the number of split_events determined during the splitting. - split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i]) + split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i], force=True) + assert split_PY8_Card['Main:numberOfEvents'] == partition_for_PY8[i] split_PY8_Card.systemSet('HEPMCoutput:scaling',split_PY8_Card['HEPMCoutput:scaling']* - (float(partition_for_PY8[i]))) + (float(partition_for_PY8[i])), force=True) # Add_missing set to False so as to be sure not to add any additional parameter w.r.t # the ones in the original PY8 param_card copied. split_PY8_Card.write(pjoin(parallelization_dir,'PY8Card_%d.dat'%i), - pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False) + pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False, + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) in_files = [pjoin(parallelization_dir,os.path.basename(pythia_main)), pjoin(parallelization_dir,'PY8Card_%d.dat'%i), pjoin(parallelization_dir,split_file)] @@ -5073,7 +5112,7 @@ def wait_monitoring(Idle, Running, Done): # works both for fixed number of generated events and fixed accepted events self.results.add_detail('error_pythia', error_m) - if self.run_card['use_syst']: + if self.run_card['use_syst'] and use_mg5amc_py8_interface: self.results.add_detail('cross_pythia', -1) self.results.add_detail('error_pythia', 0) @@ -6132,7 +6171,102 @@ def get_Gdir(self, Pdir=None, symfact=None): mfactors[pjoin(P, "G%s" % tag)] = mfactor self.Gdirs = (Gdirs, mfactors) return self.get_Gdir(Pdir, symfact=symfact) + + ############################################################################ + def remove_empty_events(self, Gdir): + """return Gdirs stripped of the ones providing empty events.lhe files.""" + + reasons = collections.defaultdict(list) + Gdirs = Gdir[:] + for G in Gdirs[:]: + try: + size = os.path.getsize(pjoin(G, 'events.lhe')) + except Exception as error: + size = 0 + if size <10: + Gdirs.remove(G) + try: + log = misc.BackRead(pjoin(G, 'log.txt')) + except Exception as error: + log = misc.BackRead(pjoin(G, 'run1_app.log')) + found = -1 + for line in log: + if 'Deleting file events.lhe' in line: + found = 0 + elif "Impossible BW configuration" in line: + reasons['bwconfig'].append(G) + break + elif found < -150: + reasons['not found'].append(G) + Gdirs.append(G) + break + elif found < 0: + found -= 1 + elif 'Loosen cuts or increase max_events' in line: + reasons['cuts'].append(G) + break + elif 'all returned zero' in line: + reasons['zero'].append(G) + break + elif found > 5: + reasons['unknown'].append(G) + break + else: + found += 1 + + if len(reasons): + logger.debug('Reasons for empty events.lhe:') + if len(reasons['unknown']): + logger.debug(' - unknown: %s' % len(reasons['unknown'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['unknown'][:10]])) + if len(reasons['not found']): + logger.debug(' - not found in log: %s' % len(reasons['not found'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['not found'][:10]])) + if len(reasons['zero']): + logger.debug(' - zero amplitudes: %s' % len(reasons['zero'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit( os.sep)[-2:]) for G in reasons['zero'][:10]])) + if len(reasons['bwconfig']): + critical_bwconfig = set() + for G in reasons['bwconfig']: + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_bwconfig.add(os.sep.join(base.rsplit(os.sep)[-2:])) + 
for G in critical_bwconfig: + logger.warning('Gdirectory %s has no events.lhe file (no possible BW configuration found)' % G) + + logger.debug(' - impossible BW configuration: %s' % len(reasons['bwconfig'])) + logger.debug(' - channel with no possible BW configuration: %s' % len(critical_bwconfig)) + + if len(reasons['cuts']): + critical_nb_cuts = collections.defaultdict(int) + for G in reasons['cuts']: + if '.' in os.path.basename(G): + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_nb_cuts[os.sep.join(base.rsplit(os.sep)[-2:])] += 1 + else: + critical_nb_cuts[''] += 1 + logger.warning('Gdirectory %s has no events.lhe file. (no points passed cuts found)' % G) + for G, nb in critical_nb_cuts.items(): + if not G: + continue + else: + logger.warning('%s channel %s.XXX has no events.lhe file. (no points passed cuts). No %s with events detected' % (nb, G, G)) + logger.debug(' - no points passed cuts: %s' % len(reasons['cuts'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['cuts'][:10]])) + logger.debug(' - without any BW handling (critical): %s' % critical_nb_cuts['']) + logger.debug(' - with BW but all zero (critical): %s' % sum([nb for v, nb in critical_nb_cuts.items() if v!=''], 0)) + #logger.debug(' - cuts (with BW conflict where other channel contributes): %s' % (len(reasons['cuts'])- critical_nb_cuts)) + + + return Gdirs + + + ############################################################################ def set_run_name(self, name, tag=None, level='parton', reload_card=False, allow_new_tag=True): @@ -6749,7 +6883,7 @@ def get_subP_ids(path): class GridPackCmd(MadEventCmd): """The command for the gridpack --Those are not supposed to be used interactively--""" - def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **stdin): + def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, nprocs=1, maxevts=2500, *completekey, **stdin): """Initialize the command and directly run""" # Initialize properly @@ -6759,6 +6893,8 @@ def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **s self.random = seed self.random_orig = self.random self.granularity = gran + self.nprocs = nprocs + self.maxevts = maxevts self.options['automatic_html_opening'] = False #write the grid_card.dat on disk @@ -6874,7 +7010,7 @@ def launch(self, nb_event, seed): #misc.call([pjoin(self.me_dir,'bin','refine4grid'), # str(nb_event), '0', 'Madevent','1','GridRun_%s' % seed], # cwd=self.me_dir) - self.refine4grid(nb_event) + self.gridpack_cross = self.refine4grid(nb_event) # 3) Combine the events/pythia/... self.exec_cmd('combine_events') @@ -6902,6 +7038,8 @@ def refine4grid(self, nb_event): precision = nb_event + across = self.make_make_all_html_results(get_attr='axsec') + self.opts = dict([(key,value[1]) for (key,value) in \ self._survey_options.items()]) @@ -6915,8 +7053,9 @@ self.update_status('Refine results to %s' % precision, level=None) logger.info("Using random number seed offset = %s" % self.random) - refine_opt = {'err_goal': nb_event, 'split_channels': False, - 'ngran':self.granularity, 'readonly': self.readonly} + refine_opt = {'err_goal': nb_event, 'split_channels': True, + 'ngran':self.granularity, 'readonly': self.readonly, + 'nprocs': self.nprocs, 'maxevts': self.maxevts} x_improve = gen_ximprove.gen_ximprove_gridpack(self, refine_opt) x_improve.launch() # create the ajob for the refinement and run those! 
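A note on the gridpack refinement above: when nprocs > 1 the patched gen_ximprove_gridpack enables split_channels and caps each job at maxevts events, so a channel needing needed_event events is divided into ceil(needed_event/max_request_event) jobs, clamped between 1 and max_splitting. A minimal sketch of that arithmetic (the default values here are illustrative, not necessarily the class attributes):

    def n_split_jobs(needed_event, max_request_event=2500, max_splitting=100):
        # ceiling division via (n - 1) // d + 1, as in get_job_for_event
        nb_split = (int(needed_event) - 1) // max_request_event + 1
        return max(1, min(nb_split, max_splitting))

    assert n_split_jobs(2500) == 1  # fits in a single job
    assert n_split_jobs(2501) == 2  # one event over the cap adds a job

Each split job then targets precision -goal_lum/nb_split rather than the whole channel's goal, and write_multijob() records nb_split so the combiner can reassemble the pieces afterwards.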
self.gscalefact = x_improve.gscalefact #store jacobian associate to the gridpack @@ -6926,7 +7065,7 @@ def refine4grid(self, nb_event): #print 'run combine!!!' #combine_runs.CombineRuns(self.me_dir) - return + return across #update html output Presults = sum_html.collect_result(self) cross, error = Presults.xsec, Presults.xerru @@ -7051,10 +7190,13 @@ def do_combine_events(self, line): sum_axsec += result.get('axsec')*gscalefact[Gdir] if len(AllEvent) >= 80: #perform a partial unweighting - if self.results.current['cross'] == 0 and self.run_card['gridpack']: - nb_event= self.nb_event + if not self.results.current.get('axsec'): + if self.run_card['gridpack'] and self.gridpack_cross: + nb_event = min(abs(1.05*self.nb_event*sum_axsec/self.gridpack_cross),self.nb_event) + else: + nb_event= self.nb_event else: - nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current['cross']),self.run_card['nevents']) + nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current.get('axsec')),self.run_card['nevents'], self.nb_event, self.gridpack_cross, sum_axsec) AllEvent.unweight(pjoin(outdir, self.run_name, "partials%s.lhe.gz" % partials), get_wgt, log_level=5, trunc_error=1e-2, event_target=nb_event) AllEvent = lhe_parser.MultiEventFile() @@ -7068,6 +7210,7 @@ def do_combine_events(self, line): for data in partials_info: AllEvent.add(*data) + sum_xsec += data[1] if not hasattr(self,'proc_characteristic'): self.proc_characteristic = self.get_characteristics() diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/restore_data b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/restore_data index 6205bb9567..407ed7aa91 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/restore_data +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/restore_data @@ -48,8 +48,17 @@ for i in `cat subproc.mg` ; do cd ../ done +# check if we are on a Mac, otherwise assume Linux +if [[ "$OSTYPE" == "darwin"* ]]; then + # no nproc on Mac, so use sysctl instead + # use -S1024 because there is a limit on the length of the command + xargs_opts="-P $(sysctl -n hw.ncpu) -S1024" +else + xargs_opts="-P $(nproc --all)" +fi + find . -mindepth 2 -maxdepth 2 -type d -name 'G*' -print0 \ - | xargs --null -P "$(nproc --all)" -I{} bash -c " + | xargs --null ${xargs_opts} -I{} bash -c " cd {} for j in $1_results.dat ; do if [[ -e \$j ]] ; then diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/sum_html.py b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/sum_html.py index 9dd5826f71..fb8dd3a74a 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/sum_html.py +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/internal/sum_html.py @@ -770,7 +770,7 @@ def collect_result(cmd, folder_names=[], jobs=None, main_dir=None): return all -def make_all_html_results(cmd, folder_names = [], jobs=[]): +def make_all_html_results(cmd, folder_names = [], jobs=[], get_attr=None): """ folder_names and jobs have been added for the amcatnlo runs """ run = cmd.results.current['run_name'] if not os.path.exists(pjoin(cmd.me_dir, 'HTML', run)): @@ -794,7 +794,12 @@ def make_all_html_results(cmd, folder_names = [], jobs=[]): fsock.write('%s

' % Presults.get_html(run, unit, cmd.me_dir)) fsock.write('%s
' % P_text) - return Presults.xsec, Presults.xerru + if not get_attr: + return Presults.xsec, Presults.xerru + else: + if isinstance(get_attr, tuple): + return [getattr(Presults, _) for _ in get_attr] + return getattr(Presults, get_attr) diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/bin/madevent b/epochX/cudacpp/susy_gg_t1t1.mad/bin/madevent index dff9711b73..9c5363e682 100755 --- a/epochX/cudacpp/susy_gg_t1t1.mad/bin/madevent +++ b/epochX/cudacpp/susy_gg_t1t1.mad/bin/madevent @@ -178,6 +178,17 @@ force_run = False if (args and args[0] == 'treatcards'): force_run=True + +# check that madgraph is not in PYTHONPATH +try: + import madgraph +except ImportError: + pass +else: + logger.getLogger('madgraph').error('Looks like you do have madgraph in your PYTHONPATH (or you run this executable from the main MG5aMC directory). This executable will likely not work in such case.') + + + # Call the cmd interface main loop try: if '-h' in args or '--help' in args: diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/src/HelAmps_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_t1t1.mad/src/HelAmps_MSSM_SLHA2.h index ec627d7759..be48b2942a 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/src/HelAmps_MSSM_SLHA2.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/src/HelAmps_MSSM_SLHA2.h @@ -2,13 +2,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Sep 2010) for the MG5aMC backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -860,7 +860,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ INLINE void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -873,7 +873,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], S2[6], S3[6] - template + template __device__ INLINE void VSS1_0( const fptype allV1[], const fptype allS2[], @@ -885,7 +885,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'S2[6]' from the input wavefunctions V1[6], S3[6] - template + template __device__ INLINE void VSS1_2( const fptype allV1[], const fptype allS3[], @@ -898,7 +898,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'S3[6]' from the input wavefunctions V1[6], S2[6] - template + template __device__ INLINE void VSS1_3( const fptype allV1[], const fptype allS2[], @@ -911,7 +911,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], S3[6], S4[6] - template + template __device__ INLINE void VVSS1_0( const fptype allV1[], const fptype allV2[], @@ -924,7 +924,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -937,7 +937,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P2[4] = { +cxreal( V2[0] ), +cxreal( V2[1] ), +cximag( V2[1] ), +cximag( V2[0] ) }; @@ -962,7 +962,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], S2[6], S3[6] - template + template __device__ void VSS1_0( const fptype allV1[], const fptype allS2[], @@ -975,7 +975,7 @@ namespace mg5amcCpu const cxtype_sv* V1 = W_ACCESS::kernelAccessConst( allV1 ); const cxtype_sv* S2 = W_ACCESS::kernelAccessConst( allS2 ); const cxtype_sv* S3 = W_ACCESS::kernelAccessConst( allS3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. 
); const fptype_sv P2[4] = { +cxreal( S2[0] ), +cxreal( S2[1] ), +cximag( S2[1] ), +cximag( S2[0] ) }; @@ -990,7 +990,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'S2[6]' from the input wavefunctions V1[6], S3[6] - template + template __device__ void VSS1_2( const fptype allV1[], const fptype allS3[], @@ -1003,7 +1003,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* V1 = W_ACCESS::kernelAccessConst( allV1 ); const cxtype_sv* S3 = W_ACCESS::kernelAccessConst( allS3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* S2 = W_ACCESS::kernelAccess( allS2 ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P3[4] = { +cxreal( S3[0] ), +cxreal( S3[1] ), +cximag( S3[1] ), +cximag( S3[0] ) }; @@ -1021,7 +1021,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'S3[6]' from the input wavefunctions V1[6], S2[6] - template + template __device__ void VSS1_3( const fptype allV1[], const fptype allS2[], @@ -1034,7 +1034,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* V1 = W_ACCESS::kernelAccessConst( allV1 ); const cxtype_sv* S2 = W_ACCESS::kernelAccessConst( allS2 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* S3 = W_ACCESS::kernelAccess( allS3 ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P2[4] = { +cxreal( S2[0] ), +cxreal( S2[1] ), +cximag( S2[1] ), +cximag( S2[0] ) }; @@ -1052,7 +1052,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], S3[6], S4[6] - template + template __device__ void VVSS1_0( const fptype allV1[], const fptype allV2[], @@ -1067,7 +1067,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* S3 = W_ACCESS::kernelAccessConst( allS3 ); const cxtype_sv* S4 = W_ACCESS::kernelAccessConst( allS4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const cxtype_sv TMP7 = ( V2[2] * V1[2] - V2[3] * V1[3] - V2[4] * V1[4] - V2[5] * V1[5] ); diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/src/Parameters_MSSM_SLHA2.cc b/epochX/cudacpp/susy_gg_t1t1.mad/src/Parameters_MSSM_SLHA2.cc index d596fdf1ec..232fd37777 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/src/Parameters_MSSM_SLHA2.cc +++ b/epochX/cudacpp/susy_gg_t1t1.mad/src/Parameters_MSSM_SLHA2.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. 
//========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/src/Parameters_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_t1t1.mad/src/Parameters_MSSM_SLHA2.h index 26a532156c..faf4bea26d 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/src/Parameters_MSSM_SLHA2.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/src/Parameters_MSSM_SLHA2.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -903,7 +903,7 @@ namespace mg5amcCpu #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #endif // Compute the output couplings (e.g. 
gc10 and gc11) from the input gs
-  template<class G_ACCESS, class C_ACCESS>
+  template<class G_ACCESS, class CD_ACCESS>
   __device__ inline void
   G2COUP( const fptype gs[],
           fptype couplings[],
@@ -913,14 +913,14 @@ namespace mg5amcCpu
     using namespace Parameters_MSSM_SLHA2_dependentCouplings;
     const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs );
     DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr );
-    fptype* GC_90s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_90 );
-    fptype* GC_6s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_6 );
-    fptype* GC_55s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_55 );
-    fptype* GC_57s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_57 );
-    cxtype_sv_ref GC_90s_sv = C_ACCESS::kernelAccess( GC_90s );
-    cxtype_sv_ref GC_6s_sv = C_ACCESS::kernelAccess( GC_6s );
-    cxtype_sv_ref GC_55s_sv = C_ACCESS::kernelAccess( GC_55s );
-    cxtype_sv_ref GC_57s_sv = C_ACCESS::kernelAccess( GC_57s );
+    fptype* GC_90s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_90 );
+    fptype* GC_6s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_6 );
+    fptype* GC_55s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_55 );
+    fptype* GC_57s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_57 );
+    cxtype_sv_ref GC_90s_sv = CD_ACCESS::kernelAccess( GC_90s );
+    cxtype_sv_ref GC_6s_sv = CD_ACCESS::kernelAccess( GC_6s );
+    cxtype_sv_ref GC_55s_sv = CD_ACCESS::kernelAccess( GC_55s );
+    cxtype_sv_ref GC_57s_sv = CD_ACCESS::kernelAccess( GC_57s );
     GC_90s_sv = couplings_sv.GC_90;
     GC_6s_sv = couplings_sv.GC_6;
     GC_55s_sv = couplings_sv.GC_55;
diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuConfig.h
index 7c6a082392..ca859a602e 100644
--- a/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuConfig.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.

 #ifndef MGONGPUCONFIG_H
 #define MGONGPUCONFIG_H 1

@@ -74,6 +74,7 @@
 #define MGONGPU_FPTYPE2_DOUBLE 1 // default
 //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster
 #endif
+
 // Choose whether to inline all HelAmps functions
 // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229)
 // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS

@@ -108,10 +109,23 @@
 #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float)
 #endif

+// Choose if cuBLAS and hipBLAS are supported for the color sums
+// For both CUDA and HIP, by default, do not define MGONGPU_HAS_NO_BLAS (i.e. assume that BLAS is available), but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS
+// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?)
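As an illustrative aside (not part of the diff itself; the helper name colorSumDispatch is hypothetical), downstream code is expected to consume this build-time switch as follows:

#include "mgOnGpuConfig.h"
inline void colorSumDispatch()
{
#ifndef MGONGPU_HAS_NO_BLAS
  // cuBLAS/hipBLAS color-sum path (the default in CUDA and HIP builds)
#else
  // kernel-only fallback (always selected in C++ builds, where MGONGPU_HAS_NO_BLAS is defined unconditionally above)
#endif
}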
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuCxtypes.h index 92d74fd6db..e98e925f2a 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -717,12 +717,24 @@ namespace mg5amcCpu : m_preal( &r ), m_pimag( &i ) {} // copy (create from) const refs cxtype_ref& operator=( const cxtype_ref& ) = delete; //__host__ __device__ cxtype_ref& operator=( cxtype_ref&& c ) {...} // REMOVED! Should copy refs or copy values? 
No longer needed in cxternary - __host__ __device__ cxtype_ref& operator=( const cxtype& c ) + __host__ __device__ cxtype_ref& operator=( const cxtype& c ) // copy (assign) const values { *m_preal = cxreal( c ); *m_pimag = cximag( c ); return *this; - } // copy (assign) non-const values + } + __host__ __device__ cxtype_ref& operator+=( const cxtype& c ) + { + *m_preal += cxreal( c ); + *m_pimag += cximag( c ); + return *this; + } + __host__ __device__ cxtype_ref& operator-=( const cxtype& c ) + { + *m_preal -= cxreal( c ); + *m_pimag -= cximag( c ); + return *this; + } __host__ __device__ operator cxtype() const { return cxmake( *m_preal, *m_pimag ); } private: fptype* const m_preal; // const pointer to non-const fptype R diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/test/cudacpp_test.mk b/epochX/cudacpp/susy_gg_t1t1.mad/test/cudacpp_test.mk index f703a1ae7c..1f9f8bbc46 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/susy_gg_t1t1.mad/test/cudacpp_test.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) @@ -19,7 +19,7 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt b/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt index 45c009959b..e4054b7ac8 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt +++ b/epochX/cudacpp/susy_gg_t1t1.sa/CODEGEN_cudacpp_susy_gg_t1t1_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.3 2025-06-12 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -49,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -550,47 +550,47 @@ INFO: Please specify coupling orders to bypass this step. 
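As an illustrative aside on the cxtype_ref compound assignments added above (values are made up; cxtype_ref, cxmake and fptype are the plugin's own types): the new operators accumulate directly into the referenced real and imaginary storage, avoiding a read-modify-write through a temporary cxtype.

fptype re = 1.;
fptype im = 2.;
cxtype_ref ref( re, im );
ref += cxmake( 0.5, -1. ); // now re == 1.5, im == 1.
ref -= cxmake( 1.5, 1. );  // now re == 0., im == 0.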
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Process has 6 diagrams -1 processes with 6 diagrams generated in 0.122 s +1 processes with 6 diagrams generated in 0.114 s Total: 1 processes with 6 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1 Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1 +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1 INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t1 t1~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t1 t1~ @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 218]  -DEBUG: type(subproc_group)= [output.py at line 219]  -DEBUG: type(fortran_model)= [output.py at line 220]  -DEBUG: type(me)= me=0 [output.py at line 221]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 222]  -INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/. 
+DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  +DEBUG: type(subproc_group)= [output.py at line 223]  +DEBUG: type(fortran_model)= [output.py at line 224]  +DEBUG: type(me)= me=0 [output.py at line 225]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'diagram_boilerplate.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  +INFO: Creating files in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/. Generated helas calls for 1 subprocesses (6 diagrams) in 0.008 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates VSS1 routines ALOHA: aloha creates VVSS1 routines -ALOHA: aloha creates 3 routines in 0.182 s +ALOHA: aloha creates 3 routines in 0.178 s VVV1 VSS1 VSS1 VSS1 VVSS1 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./HelAmps_MSSM_SLHA2.h -INFO: Created file HelAmps_MSSM_SLHA2.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./HelAmps_MSSM_SLHA2.h +INFO: Created file HelAmps_MSSM_SLHA2.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/./Parameters_MSSM_SLHA2.cc INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_t1t1/src/. quit -real 0m1.324s -user 0m1.250s -sys 0m0.065s +real 0m1.300s +user 0m1.221s +sys 0m0.067s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/Bridge.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/Bridge.h index 87aa648dd2..a45024704a 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -255,11 +255,15 @@ namespace mg5amcCpu throw std::logic_error( "Bridge constructor: FIXME! cannot choose gputhreads" ); // this should never happen! m_gpublocks = m_nevt / m_gputhreads; } +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters @@ -290,8 +294,10 @@ namespace mg5amcCpu throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! 
Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl;
+#endif
     m_pmek->setGrid( m_gpublocks, m_gputhreads );
   }
 #endif
@@ -347,7 +353,9 @@
     if( goodHelOnly ) return;
     m_pmek->computeMatrixElements( useChannelIds );
     copyHostFromDevice( m_hstMEs, m_devMEs );
+#ifdef MGONGPUCPP_VERBOSE
     flagAbnormalMEs( m_hstMEs.data(), m_nevt );
+#endif
     copyHostFromDevice( m_hstSelHel, m_devSelHel );
     copyHostFromDevice( m_hstSelCol, m_devSelCol );
     if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> )
@@ -400,7 +408,9 @@
     }
     if( goodHelOnly ) return;
     m_pmek->computeMatrixElements( useChannelIds );
+#ifdef MGONGPUCPP_VERBOSE
     flagAbnormalMEs( m_hstMEs.data(), m_nevt );
+#endif
     if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> )
     {
       memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() );
diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/GpuAbstraction.h
index 1afb14d668..8a37d1f947 100644
--- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/GpuAbstraction.h
+++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/GpuAbstraction.h
@@ -1,17 +1,23 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.

 #ifndef MG5AMC_GPUABSTRACTION_H
 #define MG5AMC_GPUABSTRACTION_H 1

+#include "mgOnGpuConfig.h"
+
 #include <cassert>

 //--------------------------------------------------------------------------

 #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)

+#ifndef MGONGPU_HAS_NO_BLAS
+#include "cublas_v2.h"
+#endif
+
 #define gpuError_t cudaError_t
 #define gpuPeekAtLastError cudaPeekAtLastError
 #define gpuGetErrorString cudaGetErrorString
@@ -21,24 +27,61 @@
 #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) )

 #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) )
 #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
 #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
+#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice
 #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) )

 #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
 #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
+#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) )
+
 #define gpuSetDevice cudaSetDevice
 #define gpuDeviceSynchronize cudaDeviceSynchronize
 #define gpuDeviceReset cudaDeviceReset

 #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
-#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ )
+
+#define gpuStream_t cudaStream_t
+#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) )
+#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) )
+
+#define gpuBlasStatus_t cublasStatus_t
+#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS
+#ifndef MGONGPU_HAS_NO_BLAS
+#define gpuBlasHandle_t cublasHandle_t
+#else
+#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds
+#endif
+#define gpuBlasCreate cublasCreate
+#define gpuBlasDestroy cublasDestroy
+#define gpuBlasSetStream cublasSetStream
+
+#define gpuBlasSaxpy cublasSaxpy
+#define gpuBlasSdot cublasSdot
+#define gpuBlasSgemv cublasSgemv
+#define gpuBlasSgemm cublasSgemm
+#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched
+#define gpuBlasDaxpy cublasDaxpy
+#define gpuBlasDdot cublasDdot
+#define gpuBlasDgemv cublasDgemv
+#define gpuBlasDgemm cublasDgemm
+#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched
+#define GPUBLAS_OP_N CUBLAS_OP_N
+#define GPUBLAS_OP_T CUBLAS_OP_T

 //--------------------------------------------------------------------------

 #elif defined __HIPCC__

+#ifndef MGONGPU_HAS_NO_BLAS
+#include "hipblas/hipblas.h"
+#endif
+
 #define gpuError_t hipError_t
 #define gpuPeekAtLastError hipPeekAtLastError
 #define gpuGetErrorString hipGetErrorString
@@ -48,22 +91,69 @@
 #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )

 #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) )
 #define gpuMemcpyHostToDevice hipMemcpyHostToDevice
 #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice
 #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) )

 #define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
 #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
+#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) )
+
 #define gpuSetDevice hipSetDevice
 #define gpuDeviceSynchronize hipDeviceSynchronize
 #define gpuDeviceReset hipDeviceReset

 #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
-#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ )
+
+#define gpuStream_t hipStream_t
+#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) )
+#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) )
+
+#define gpuBlasStatus_t hipblasStatus_t
+#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
+#ifndef MGONGPU_HAS_NO_BLAS
+#define gpuBlasHandle_t hipblasHandle_t
+#else
+#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds
+#endif
+#define gpuBlasCreate hipblasCreate
+#define gpuBlasDestroy hipblasDestroy
+#define gpuBlasSetStream hipblasSetStream
+
+#define gpuBlasSaxpy hipblasSaxpy
+#define gpuBlasSdot hipblasSdot
+#define gpuBlasSgemv hipblasSgemv
+#define gpuBlasSgemm hipblasSgemm
+#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched
+#define gpuBlasDaxpy hipblasDaxpy
+#define gpuBlasDdot hipblasDdot
+#define gpuBlasDgemv hipblasDgemv
+#define gpuBlasDgemm hipblasDgemm
+#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched
+#define GPUBLAS_OP_N HIPBLAS_OP_N
+#define GPUBLAS_OP_T HIPBLAS_OP_T
+
+#endif

 //--------------------------------------------------------------------------

+#ifdef MGONGPU_FPTYPE2_FLOAT
+#define gpuBlasTaxpy gpuBlasSaxpy
+#define gpuBlasTdot gpuBlasSdot
+#define gpuBlasTgemv gpuBlasSgemv
+#define gpuBlasTgemm gpuBlasSgemm
+#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched
+#else
+#define gpuBlasTaxpy gpuBlasDaxpy
+#define gpuBlasTdot gpuBlasDdot
+#define gpuBlasTgemv gpuBlasDgemv
+#define gpuBlasTgemm gpuBlasDgemm
+#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched
 #endif

 #endif // MG5AMC_GPUABSTRACTION_H
diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/GpuRuntime.h
index 860c7fde16..086aa6a616 100644
--- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/GpuRuntime.h
+++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/GpuRuntime.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin.
-// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin.

 #ifndef MG5AMC_GPURUNTIME_H
 #define MG5AMC_GPURUNTIME_H 1
@@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort =

 //--------------------------------------------------------------------------

+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#ifndef MGONGPU_HAS_NO_BLAS
+#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); }
+inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true )
+{
+  if ( code != GPUBLAS_STATUS_SUCCESS )
+  {
+    printf( "ERROR!
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.cc index f463977c1a..a68ae314eb 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
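As an illustrative aside on the gpuBlasT* precision aliases and the checkGpuBlas macro introduced above (this helper is not part of the diff; its name, the column-major layout and the dimension names are assumptions), a checked, precision-agnostic GEMM over nevt events could be written once as:

#ifndef MGONGPU_HAS_NO_BLAS
// Y = A * X, where A is ncol x ncol and X, Y are ncol x nevt (column-major).
// gpuBlasTgemm resolves to cublasDgemm/hipblasDgemm by default, or to the
// Sgemm variants when MGONGPU_FPTYPE2_FLOAT is defined.
inline void colorMatrixTimesJamps( gpuBlasHandle_t handle, const fptype2* A, const fptype2* X, fptype2* Y, int ncol, int nevt )
{
  const fptype2 alpha = 1;
  const fptype2 beta = 0;
  checkGpuBlas( gpuBlasTgemm( handle, GPUBLAS_OP_N, GPUBLAS_OP_N, ncol, nevt, ncol, &alpha, A, ncol, X, ncol, &beta, Y, ncol ) );
}
#endif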
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,28 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() + , m_pHelWfs() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_helBlasHandles() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +353,82 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); + // Create the "one-helicity" wavefunction buffer that will be used for helicity filtering + m_pHelWfs.reset( new DeviceBufferSimple( CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? 
+ std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { +#ifndef MGONGPU_HAS_NO_BLAS + if( m_helBlasHandles[ihel] ) gpuBlasDestroy( m_helBlasHandles[ihel] ); +#endif + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +445,61 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. 
Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity + // Attach a different stream to each cuBLAS/hipBLAS handle + if( m_blasColorSum ) + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + checkGpuBlas( gpuBlasCreate( &m_helBlasHandles[ighel] ) ); + checkGpuBlas( gpuBlasSetStream( m_helBlasHandles[ighel], m_helStreams[ighel] ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_helBlasHandles[ighel], CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelWfs.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +507,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* ghelBlasHandles = ( m_blasColorSum ? m_helBlasHandles : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* ghelBlasHandles = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +527,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? 
m_hstChannelIds.data() : nullptr );
     MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() );
 #endif
-    checkGpu( gpuPeekAtLastError() );
-    checkGpu( gpuDeviceSynchronize() );
+    checkGpu( gpuPeekAtLastError() ); // is this needed?
+    checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places...
   }

 //--------------------------------------------------------------------------

diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.h
index 7acff4b308..c901874333 100644
--- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.h
@@ -1,16 +1,19 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin.

 #ifndef MATRIXELEMENTKERNELS_H
 #define MATRIXELEMENTKERNELS_H 1

 #include "mgOnGpuConfig.h"

+#include "CPPProcess.h"
+#include "GpuAbstraction.h"
 #include "MemoryBuffers.h"

 #include
+#include

 #ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
@@ -134,7 +137,7 @@
   // Does this host system support the SIMD used in the matrix element calculation?
   // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!]
-  static bool hostSupportsSIMD( const bool verbose = true );
+  static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false

   private:

@@ -191,12 +194,24 @@
   // The buffer for the event-by-event couplings that depends on alphas QCD
   DeviceBufferCouplings m_couplings;

+  // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime)
+  std::unique_ptr<DeviceBufferSimple> m_pHelMEs;
+
+  // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime)
+  std::unique_ptr<DeviceBufferSimple> m_pHelJamps;
+
+  // The super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime)
+  std::unique_ptr<DeviceBufferSimple> m_pHelWfs;
+
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-  // The buffer for the event-by-event numerators of multichannel factors
-  DeviceBufferNumerators m_numerators;
+  // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime)
+  std::unique_ptr<DeviceBufferSimple> m_pHelNumerators;

-  // The buffer for the event-by-event denominators of multichannel factors
-  DeviceBufferDenominators m_denominators;
+  // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime)
+  std::unique_ptr<DeviceBufferSimple> m_pHelDenominators;
+
+  // The super-buffer of ncolor jamp2 buffers
+  DeviceBufferSimple m_colJamp2s;
 #endif

 #ifdef MGONGPU_CHANNELID_DEBUG
@@ -205,6 +220,23 @@
   PinnedHostBufferChannelIds m_hstChannelIds;
 #endif

+#ifndef MGONGPU_HAS_NO_BLAS
+  // Decide at runtime whether to use BLAS for color sums
+  bool m_blasColorSum;
+
+  // Decide at runtime whether TF32TENSOR math should be used in cuBLAS
+  bool m_blasTf32Tensor;
+
+  // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers
+  std::unique_ptr<DeviceBufferSimple2> m_pHelBlasTmp;
+
+  // The array of cuBLAS/hipBLAS handles (one for each good helicity)
+  gpuBlasHandle_t m_helBlasHandles[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used)
+#endif
+
+  // The array of GPU streams (one for each good helicity)
+  gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used)
+
   // The number of blocks in the GPU grid
   size_t m_gpublocks;

diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryAccessAmplitudes.h
index 0d92f69c43..a49f041e05 100644
--- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryAccessAmplitudes.h
+++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryAccessAmplitudes.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin.

 #ifndef MemoryAccessAmplitudes_H
 #define MemoryAccessAmplitudes_H 1

@@ -10,10 +10,6 @@
 #include "mgOnGpuCxtypes.h"

-#include "MemoryAccessHelpers.h"
-
-#define MGONGPU_TRIVIAL_AMPLITUDES 1
-
 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725)
 #ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
@@ -23,120 +19,11 @@ namespace mg5amcCpu
 {

 //----------------------------------------------------------------------------

-#ifndef MGONGPU_TRIVIAL_AMPLITUDES
-
-  // A class describing the internal layout of memory buffers for amplitudes
-  // This implementation uses an AOSOA[npagA][nx2][neppA] where nevt=npagA*neppA
-  // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name]
-  class MemoryAccessAmplitudesBase //_AOSOAv1
-  {
-  public:
-
-    // Number of Events Per Page in the amplitude AOSOA memory buffer layout
-    static constexpr int neppA = 1; // AOS (just a test...)
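For clarity, a worked example of the AOSOA[ipagA][ix2][ieppA] indexing implemented by this (now deleted) accessor, assuming neppA = 4 for illustration rather than the AOS value neppA = 1 used above (nx2 = 2 in both cases):

// ievt = 6  =>  ipagA = 6 / 4 = 1 and ieppA = 6 % 4 = 2
// real part (ix2 = 0): buffer[1 * 2 * 4 + 0 * 4 + 2] = buffer[10]
// imag part (ix2 = 1): buffer[1 * 2 * 4 + 1 * 4 + 2] = buffer[14]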
-
-  private:
-
-    friend class MemoryAccessHelper<MemoryAccessAmplitudesBase>;
-    friend class KernelAccessHelper<MemoryAccessAmplitudesBase, true>;
-    friend class KernelAccessHelper<MemoryAccessAmplitudesBase, false>;
-
-    // The number of floating point components of a complex number
-    static constexpr int nx2 = mgOnGpu::nx2;
-
-    //--------------------------------------------------------------------------
-    // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )"
-    // (in other words: first locate the event record for a given event, then locate an element in that record)
-    //--------------------------------------------------------------------------
-
-    // Locate an event record (output) in a memory buffer (input) from the given event number (input)
-    // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===]
-    static __host__ __device__ inline fptype*
-    ieventAccessRecord( fptype* buffer,
-                        const int ievt )
-    {
-      const int ipagA = ievt / neppA; // #event "A-page"
-      const int ieppA = ievt % neppA; // #event in the current event A-page
-      constexpr int ix2 = 0;
-      return &( buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA] ); // AOSOA[ipagA][ix2][ieppA]
-    }
-
-    //--------------------------------------------------------------------------
-
-    // Locate a field (output) of an event record (input) from the given field indexes (input)
-    // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===]
-    // [NB: expand variadic template "Ts... args" to "const int ix2" and rename "Field" as "Ix2"]
-    static __host__ __device__ inline fptype&
-    decodeRecord( fptype* buffer,
-                  const int ix2 )
-    {
-      constexpr int ipagA = 0;
-      constexpr int ieppA = 0;
-      return buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA]; // AOSOA[ipagA][ix2][ieppA]
-    }
-  };
-
-  //----------------------------------------------------------------------------
-
-  // A class providing access to memory buffers for a given event, based on explicit event numbers
-  // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations
-  class MemoryAccessAmplitudes : public MemoryAccessAmplitudesBase
-  {
-  public:
-
-    // Locate an event record (output) in a memory buffer (input) from the given event number (input)
-    // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===]
-    static constexpr auto ieventAccessRecord = MemoryAccessHelper<MemoryAccessAmplitudesBase>::ieventAccessRecord;
-
-    // Locate an event record (output) in a memory buffer (input) from the given event number (input)
-    // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===]
-    static constexpr auto ieventAccessRecordConst = MemoryAccessHelper<MemoryAccessAmplitudesBase>::ieventAccessRecordConst;
-
-    // Locate a field (output) of an event record (input) from the given field indexes (input)
-    // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===]
-    static constexpr auto decodeRecordIx2 = MemoryAccessHelper<MemoryAccessAmplitudesBase>::decodeRecord;
-
-    // Locate a field (output) of an event record (input) from the given field indexes (input)
-    // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===]
-    static constexpr auto decodeRecordIx2Const =
-      MemoryAccessHelper<MemoryAccessAmplitudesBase>::template decodeRecordConst<int>;
-
-    // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input)
-    // [Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt, const int ix2 ) <===]
-    static constexpr auto ieventAccessIx2 =
-      MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessField<int>;
-
-    // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input)
-    // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===]
-    static constexpr auto ieventAccessIx2Const =
-      MemoryAccessHelper<MemoryAccessAmplitudesBase>::template ieventAccessFieldConst<int>;
-  };
-
-#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES
-
-  //----------------------------------------------------------------------------
-
-  // A class providing access to memory buffers for a given event, based on implicit kernel rules
-  // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations
+  // A class providing trivial access to amplitude memory buffers
   template<bool onDevice>
   class KernelAccessAmplitudes
   {
   public:
-
-#ifndef MGONGPU_TRIVIAL_AMPLITUDES
-
-    // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
-    // [Signature (non-const) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===]
-    static constexpr auto kernelAccessIx2 =
-      KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessField<int>;
-
-    // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input)
-    // [Signature (const) ===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===]
-    static constexpr auto kernelAccessIx2Const =
-      KernelAccessHelper<MemoryAccessAmplitudesBase, onDevice>::template kernelAccessFieldConst<int>;
-
-#else
-
     static __host__ __device__ inline cxtype_sv*
     kernelAccess( fptype* buffer )
@@ -148,8 +35,6 @@
     {
       return reinterpret_cast<cxtype_sv*>( buffer );
     }
-
-#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES
   };

 //----------------------------------------------------------------------------

diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryAccessCouplings.h
index 3802fa57c0..26345d4b43 100644
--- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryAccessCouplings.h
+++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryAccessCouplings.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
#ifndef MemoryAccessCouplings_H #define MemoryAccessCouplings_H 1 @@ -235,7 +235,7 @@ namespace mg5amcCpu /* fptype_sv& real = kernelAccessIx2( buffer, 0 ); fptype_sv& imag = kernelAccessIx2( buffer, 1 ); - printf( "C_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv_ref( real, imag ); */ return cxtype_sv_ref( kernelAccessIx2( buffer, 0 ), @@ -250,7 +250,7 @@ namespace mg5amcCpu /* const fptype_sv& real = kernelAccessIx2Const( buffer, 0 ); const fptype_sv& imag = kernelAccessIx2Const( buffer, 1 ); - printf( "C_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv( real, imag ); */ return cxtype_sv( kernelAccessIx2Const( buffer, 0 ), diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryAccessWavefunctions.h index 9f4c620bc7..bbffc1fb36 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryAccessWavefunctions.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessWavefunctions_H #define MemoryAccessWavefunctions_H 1 @@ -10,9 +10,7 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 +#include "CPPProcess.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL @@ -23,147 +21,44 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // A class describing the internal layout of memory buffers for wavefunctions - // This implementation uses an AOSOA[npagW][nw6][nx2][neppW] where nevt=npagW*neppW - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessWavefunctionsBase //_AOSOAv1 +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessWavefunctions { public: - - // Number of Events Per Page in the wavefunction AOSOA memory buffer layout - static constexpr int neppW = 1; // AOS (just a test...) 
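// [Editor's sketch, not part of the patch] The new DeviceAccessWavefunctions class that
// opens above replaces the AOSOA helper machinery with trivial per-event access: in the
// kernelAccess methods just below, each CUDA thread (event) owns one contiguous slab of
// nw6 * nx2 fptype values per wavefunction buffer. An illustrative standalone sketch:
//
//   using fptype = double;            // illustrative floating point type
//   constexpr int nw6 = 6, nx2 = 2;   // 6 wavefunction components, re/im parts
//   inline fptype* eventSlab( fptype* buffer, const int ievt )
//   {
//     return buffer + ievt * nw6 * nx2; // event 0 at offset 0, event 1 at 12, ...
//   }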
- - private: - - friend class MemoryAccessHelper<MemoryAccessWavefunctionsBase>; - friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, true>; - friend class KernelAccessHelper<MemoryAccessWavefunctionsBase, false>; - - // The number of components of a (fermion or vector) wavefunction - static constexpr int nw6 = mgOnGpu::nw6; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) + static __host__ __device__ inline cxtype_sv* + kernelAccess( fptype* buffer ) { - const int ipagW = ievt / neppW; // #event "W-page" - const int ieppW = ievt % neppW; // #event in the current event W-page - constexpr int iw6 = 0; - constexpr int ix2 = 0; - return &( buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW] ); // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast<cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts...
args" to "const int iw6, const int ix2" and rename "Field" as "Iw6Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int iw6, - const int ix2 ) + static __host__ __device__ inline const cxtype_sv* + kernelAccessConst( const fptype* buffer ) { - constexpr int ipagW = 0; - constexpr int ieppW = 0; - return buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW]; // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast<const cxtype_sv*>( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } }; +#endif //---------------------------------------------------------------------------- - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessWavefunctions : public MemoryAccessWavefunctionsBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2 = MemoryAccessHelper<MemoryAccessWavefunctionsBase>::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2Const = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template decodeRecordConst<int, int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIw6Ix2( fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2 = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessField<int, int>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIw6Ix2Const( const fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2Const = - MemoryAccessHelper<MemoryAccessWavefunctionsBase>::template ieventAccessFieldConst<int, int>; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations - template<bool onDevice> - class KernelAccessWavefunctions + class HostAccessWavefunctions { public: - -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes
(input) - // [Signature (non-const) ===> fptype& kernelAccessIw6Ix2( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2 = - KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessField<int, int>; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIw6Ix2Const( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2Const = - KernelAccessHelper<MemoryAccessWavefunctionsBase, onDevice>::template kernelAccessFieldConst<int, int>; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { return reinterpret_cast<cxtype_sv*>( buffer ); } - static __host__ __device__ inline const cxtype_sv* kernelAccessConst( const fptype* buffer ) { return reinterpret_cast<const cxtype_sv*>( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS }; //---------------------------------------------------------------------------- - typedef KernelAccessWavefunctions<false> HostAccessWavefunctions; - typedef KernelAccessWavefunctions<true> DeviceAccessWavefunctions; - - //---------------------------------------------------------------------------- - } // end namespace mg5amcGpu/mg5amcCpu #endif // MemoryAccessWavefunctions_H diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryBuffers.h index 5bd3053393..0ddc356e1a 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
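// [Editor's note, not part of the patch] One hunk below moves the virtual ~BufferBase()
// destructor from the protected to the public section. A minimal sketch of why this
// matters (illustrative names): deleting a derived buffer through a base-class pointer,
// e.g. via std::unique_ptr<Base>, requires an accessible (public) virtual destructor.
//
//   #include <memory>
//   struct Base
//   {
//   public:
//     virtual ~Base() {} // public and virtual: deleting through Base* is well defined
//   };
//   struct Derived : public Base {};
//   int main()
//   {
//     std::unique_ptr<Base> b{ new Derived }; // ill-formed if ~Base() were protected
//     return 0;
//   }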
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_MSSM_SLHA2_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef 
PinnedHostBuffer PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.cc b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.cc index 1d53b4a535..4067d77373 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.cc +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_MSSM_SLHA2.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,20 +98,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_MSSM_SLHA2_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_MSSM_SLHA2_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 2; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,57 +166,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR 
channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef 
MGONGPUCPP_GPUIMPL - using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -226,333 +279,145 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 5; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
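// [Editor's sketch, not part of the patch] A minimal CUDA illustration (hypothetical
// kernel names) of why the kernel splitting mentioned in the surrounding hunk forces
// intermediate wavefunctions into a global-memory buffer: stage1 and stage2 are separate
// launches, so stage1's output must outlive the first kernel, unlike a stack-local array.
//
//   __global__ void stage1( float* wfs ) { wfs[blockDim.x * blockIdx.x + threadIdx.x] = 1.f; }
//   __global__ void stage2( const float* wfs, float* out )
//   {
//     const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
//     out[ievt] = 2.f * wfs[ievt]; // reads what stage1 wrote, via global memory
//   }
//   // ... cudaMalloc wfs/out for nevt events, then:
//   // stage1<<<gpublocks, gputhreads>>>( wfs );      // analogous to diagram1
//   // stage2<<<gpublocks, gputhreads>>>( wfs, out ); // analogous to diagram2..6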
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g.
<<pointless comparison of unsigned integer with zero>> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); -#endif - - // *** DIAGRAM 1 OF 6 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - sxxxxx( momenta, +1, w_fp[2], 2 ); - - sxxxxx( momenta, +1, w_fp[3], 3 ); - - // Amplitude(s) for diagram number 1 - VVSS1_0( w_fp[0], w_fp[1], w_fp[3], w_fp[2], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += amp_sv[0]; - VVSS1_0( w_fp[0], w_fp[1], w_fp[3], w_fp[2], COUPs[0], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base
generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] += amp_sv[0]; - - // *** DIAGRAM 2 OF 6 *** - - // Wavefunction(s) for diagram number 2 - VVV1P0_1( w_fp[0], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[4] ); - - // Amplitude(s) for diagram number 2 - VSS1_0( w_fp[4], w_fp[3], w_fp[2], COUPs[2], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[0] -= cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] += cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 3 OF 6 *** - - // Wavefunction(s) for diagram number 3 - VSS1_2( w_fp[0], w_fp[2], COUPs[2], 1.0, cIPD[0], cIPD[1], w_fp[4] ); - - // Amplitude(s) for diagram number 3 - VSS1_0( w_fp[1], w_fp[3], w_fp[4], COUPs[2], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - jamp_sv[0] += amp_sv[0]; - - // *** DIAGRAM 4 OF 6 *** - - // Wavefunction(s) for diagram number 4 - VSS1_3( w_fp[0], w_fp[2], COUPs[3], -1.0, cIPD[2], cIPD[3], w_fp[4] ); - // Amplitude(s) for diagram number 4 - VSS1_0( w_fp[1], w_fp[3], w_fp[4], COUPs[3], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - jamp_sv[0] += amp_sv[0]; - - // *** DIAGRAM 5 OF 6 *** - - // Wavefunction(s) for diagram number 5 - VSS1_3( w_fp[0], w_fp[3], COUPs[2], 1.0, cIPD[0], cIPD[1], w_fp[4] ); - - // Amplitude(s) for diagram number 5 - VSS1_0( w_fp[1], w_fp[4], w_fp[2], COUPs[2], 1.0, &amp_fp[0] ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e.
zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[1] += amp_sv[0]; - - // *** DIAGRAM 6 OF 6 *** - // Wavefunction(s) for diagram number 6 - VSS1_3( w_fp[0], w_fp[3], COUPs[3], 1.0, cIPD[2], cIPD[3], w_fp[4] ); - - // Amplitude(s) for diagram number 6 - VSS1_0( w_fp[1], w_fp[2], w_fp[4], COUPs[3], -1.0, &amp_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] += amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_t1t1x()?) - - // The color denominators (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3 }; // 1-D array[2] - - // The color matrix (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 16, -2 }, - { -2, 16 } }; // 2-D array[2][2] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0!
skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined
MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 6 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram4, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram5, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram6, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram4( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram5( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram6( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv );
-#endif #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** return; } @@ -579,7 +444,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -612,6 +481,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->mdl_Msu3 ); m_masses.push_back( m_pars->mdl_Msu3 ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_Msu3, (fptype)m_pars->mdl_Wsu3, (fptype)m_pars->mdl_Msu6, (fptype)m_pars->mdl_Wsu6 }; @@ -652,6 +525,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_MSSM_SLHA2::ZERO ); m_masses.push_back( Parameters_MSSM_SLHA2::mdl_Msu3 ); m_masses.push_back( Parameters_MSSM_SLHA2::mdl_Msu3 ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -754,26 +631,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x
* blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream! + atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) 
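// [Editor's sketch, not part of the patch] The color choice below is inverse-CDF
// sampling: targetamp[] accumulates the allowed jamp2 weights, and the first color
// whose cumulative fraction exceeds the random number allrndcol[ievt] in [0,1) is
// selected (helicity selection in select_hel above follows the same idiom).
//
//   inline int selectIndex( const double* w, const int n, const double rnd )
//   {
//     double cum = 0, tot = 0;
//     for( int i = 0; i < n; i++ ) tot += w[i]; // w[i] >= 0, rnd in [0,1)
//     for( int i = 0; i < n; i++ )
//     {
//       cum += w[i];
//       if( rnd < cum / tot ) return i;
//     }
//     return n - 1; // guard against rounding
//   }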
+ fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and
helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -944,13 +1025,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -962,17 +1037,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running
sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -998,93 +1076,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; - } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); - for( int ighel = 0; ighel < cNGoodHel; ighel++ ) - { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - 
const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + // (1b) Then, in multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp
!= nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1126,7 +1174,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1149,7 +1197,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1158,25 +1206,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1186,8 +1240,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1203,11 +1259,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1309,14 +1366,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.h index d48c729c48..704925d121 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.h +++ 
b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_MSSM_SLHA2.h" #include @@ -75,17 +76,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 4; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 6; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 2; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 5; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 4; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 7; //static const int ncomb = 4; // CPPProcess::ncomb @@ -122,23 +123,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -152,34 +156,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: helicity selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/color_sum.cc b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/color_sum.cc new file mode 100644 index 0000000000..b96d73fb5c --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/color_sum.cc @@ -0,0 +1,383 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
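+ // For reference, the quantity implemented in this file (in C++ SIMD, CUDA/HIP kernel and BLAS variants)
+ // is the color sum of one helicity's contribution to |M|^2; in the notation of the code below,
+ //   deltaME = sum_{i,j} Re( conj(jamp[i]) * ( colorMatrix[i][j] / colorDenom[i] ) * jamp[j] )
+ // where the jamp[i] are the ncolor QCD partial amplitudes. The BLAS variant evaluates this in two
+ // steps per helicity, separately for the real and imaginary parts of the jamps: first Z = Mhat x J
+ // (one gemm), then ME_e += J_e . Z_e for each event e (one strided-batched gemm). This is a summary
+ // of the functions that follow, using the names they define.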
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3 }; // 1-D array[2] + + // The color matrix (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 16, -2 }, + { -2, 16 } }; // 2-D array[2][2] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template<typename T> + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA + iAMB - iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the upper diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
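+ // Concretely, writing jamp = A + i*B with A, B real and Mhat = colorMatrix/colorDenom:
+ //   (A - i*B)^T Mhat (A + i*B) = A^T Mhat A + B^T Mhat B + i*( A^T Mhat B - B^T Mhat A )
+ // where the imaginary cross terms cancel because Mhat is symmetric (A^T Mhat B = B^T Mhat A),
+ // so only the two real quadratic forms A^T Mhat A and B^T Mhat B are accumulated below.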
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 
s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = 
allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e.
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/color_sum.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/diagram_boilerplate.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/diagrams.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/diagrams.h new file mode 100644 index 0000000000..9b25d7def9 --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_t1t1x/diagrams.h @@ -0,0 +1,194 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. 
Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb-1) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 6 *** + // Wavefunction(s) for diagram number 1 + vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); + vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); + sxxxxx( momenta, +1, w_fp[2], 2 ); + sxxxxx( momenta, +1, w_fp[3], 3 ); + // Amplitude(s) for diagram number 1 + VVSS1_0( w_fp[0], w_fp[1], w_fp[3], w_fp[2], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + VVSS1_0( w_fp[0], w_fp[1], w_fp[3], w_fp[2], COUPs[0], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 6 *** + // Wavefunction(s) for
diagram number 2 + VVV1P0_1( w_fp[0], w_fp[1], COUPs[1], 1.0, 0., 0., w_fp[4] ); + // Amplitude(s) for diagram number 2 + VSS1_0( w_fp[4], w_fp[3], w_fp[2], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) += cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 6 *** + // Wavefunction(s) for diagram number 3 + VSS1_2( w_fp[0], w_fp[2], COUPs[2], 1.0, cIPD[0], cIPD[1], w_fp[4] ); + // Amplitude(s) for diagram number 3 + VSS1_0( w_fp[1], w_fp[3], w_fp[4], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram4( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 4 OF 6 *** + // Wavefunction(s) for diagram number 4 + VSS1_3( w_fp[0], w_fp[2], COUPs[3], -1.0, cIPD[2], cIPD[3], w_fp[4] ); + // Amplitude(s) for diagram number 4 + VSS1_0( w_fp[1], w_fp[3], w_fp[4], COUPs[3], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += amp_sv[0]; + 
} + + //-------------------------------------------------------------------------- + + __global__ void + diagram5( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 5 OF 6 *** + // Wavefunction(s) for diagram number 5 + VSS1_3( w_fp[0], w_fp[3], COUPs[2], 1.0, cIPD[0], cIPD[1], w_fp[4] ); + // Amplitude(s) for diagram number 5 + VSS1_0( w_fp[1], w_fp[4], w_fp[2], COUPs[2], 1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram6( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 6 OF 6 *** + // Wavefunction(s) for diagram number 6 + VSS1_3( w_fp[0], w_fp[3], COUPs[3], 1.0, cIPD[2], cIPD[3], w_fp[4] ); + // Amplitude(s) for diagram number 6 + VSS1_0( w_fp[1], w_fp[2], w_fp[4], COUPs[3], -1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) += amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/color_sum.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/color_sum.h new file mode 100644 index 0000000000..71b5b1eaad --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/color_sum.h @@ -0,0 +1,105 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype_ref( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + static __device__ inline const cxtype + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + }; +#else + class HostAccessJamp + { + public: + static inline cxtype_sv& + kernelAccessIcol( cxtype_sv* buffer, const int icol ) + { + return buffer[icol]; + } + static inline cxtype_sv& + kernelAccessIcol( fptype* buffer, const int icol ) + { + return reinterpret_cast<cxtype_sv*>( buffer )[icol]; + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or
jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI300X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist??
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring
hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/diagram_boilerplate.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/diagram_boilerplate.h new file mode 100644 index 0000000000..96a34fb1bf --- /dev/null +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/diagram_boilerplate.h @@ -0,0 +1,103 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). 
+// Created by: A. Valassi (Nov 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin. + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-local-typedefs" // for CI_ACCESS and CD_ACCESS + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + + //------------- + // GPU only + //------------- + + //using namespace mg5amcGpu; + using W_ACCESS = DeviceAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = DeviceAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( channelIds ); +#endif + + // Wavefunctions + // Buffer wfs for one helicity and nevt events is a DeviceBufferSimple with ( nwf * nevt * nw6 * nx2 ) fptypes + // The striding between the nwf wavefunction buffers is ( nevt * nw6 * nx2 ) fptypes + // Internally diagramXXX methods pass a w_fp[iwf] to ixx/FFV methods (as argument 'fptype wavefunctions[]') + // Internally ixx/FFV methods call 'cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions )' and then use fi[iw6] + // This means that the fi pointer must point to a [RIRIRIRIRIRI] contiguous buffer of size nw6*nx2=12 + // The striding between events is nw6*nx2=12 and this is what W_ACCESS::kernelAccess must respect + // (En passant, note that this means that events cannot be contiguous in the present code, memory is not coalesced) + const int nevt = gridDim.x * blockDim.x; + fptype* w_fp[nwf]; + for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = wfs + iwf * nevt * nw6 * mgOnGpu::nx2; + + // Couplings + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic push +#pragma nv_diag_suppress 186 // e.g. 
<> +#endif + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic pop +#endif + const fptype* COUPs[nxcoup]; + for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; + +#else + + //------------- + // C++ only + //------------- + + //using namespace mg5amcCpu; + using W_ACCESS = HostAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = HostAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current SIMD event page (C++) + unsigned int channelId = *channelIds; +#endif + + // Wavefunctions + // Reinterpret wfs as "cxtype_sv w_sv[nwf][nw6]" and build "fptype* w_fp[nwf]" where "w_fp[iwf] = (fptype*)( w_sv[iwf] )" + fptype (*w_fp)[nw6 * neppV * mgOnGpu::nx2] = (fptype (*)[nw6 * neppV * mgOnGpu::nx2])(wfs); + +#endif + + //------------- + // GPU or C++ + //------------- + + // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) + cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram + fptype* amp_fp; // proof of concept for using fptype* in the interface + amp_fp = reinterpret_cast<fptype*>( amp_sv ); + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) + fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); +#else + assert( channelIds == nullptr ); + assert( numerators == nullptr ); + assert( denominators == nullptr ); +#endif /* clang-format on */ + +#pragma GCC diagnostic pop diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/runTest.cc b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/runTest.cc index 4eec5db13c..216a90a302 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.
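For reference, the "new1" jamp striding documented in DeviceAccessJamp above amounts to two contiguous ncolor*nevt matrices, real parts first and imaginary parts second, with the event index ievt running fastest. A minimal standalone C++ sketch of that indexing and of the in-place accumulation it must support (illustrative names and sizes only, not the plugin's actual buffers or types):

#include <cassert>
#include <vector>
int main()
{
  const int ncolor = 2, nevt = 4; // illustrative sizes
  std::vector<double> jamps( 2 * ncolor * nevt, 0. ); // [Re matrix][Im matrix]
  auto re = [=]( int icol, int ievt ) { return 0 * ncolor * nevt + icol * nevt + ievt; }; // "new1" real index
  auto im = [=]( int icol, int ievt ) { return 1 * ncolor * nevt + icol * nevt + ievt; }; // "new1" imag index
  jamps[re( 1, 3 )] += 0.5; // accumulate Re(jamp[icol=1]) for event 3
  jamps[im( 1, 3 )] -= 0.5; // accumulate Im(jamp[icol=1]) for event 3
  assert( jamps[1 * nevt + 3] == 0.5 && jamps[ncolor * nevt + 1 * nevt + 3] == -0.5 );
  return 0;
}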
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/src/HelAmps_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_t1t1.sa/src/HelAmps_MSSM_SLHA2.h index ec627d7759..be48b2942a 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/src/HelAmps_MSSM_SLHA2.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/src/HelAmps_MSSM_SLHA2.h @@ -2,13 +2,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Sep 2010) for the MG5aMC backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -860,7 +860,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template<class W_ACCESS, class C_ACCESS> + template<class W_ACCESS, class CD_ACCESS> __device__ INLINE void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -873,7 +873,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], S2[6], S3[6] - template<class W_ACCESS, class A_ACCESS, class C_ACCESS> + template<class W_ACCESS, class A_ACCESS, class CD_ACCESS> __device__ INLINE void VSS1_0( const fptype allV1[], const fptype allS2[], @@ -885,7 +885,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'S2[6]' from the input wavefunctions V1[6], S3[6] - template<class W_ACCESS, class C_ACCESS> + template<class W_ACCESS, class CD_ACCESS> __device__ INLINE void VSS1_2( const fptype allV1[], const fptype allS3[], @@ -898,7 +898,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'S3[6]' from the input wavefunctions V1[6], S2[6] - template<class W_ACCESS, class C_ACCESS> + template<class W_ACCESS, class CD_ACCESS> __device__ INLINE void VSS1_3( const fptype allV1[], const fptype allS2[], @@ -911,7 +911,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], S3[6], S4[6] - template<class W_ACCESS, class A_ACCESS, class C_ACCESS> + template<class W_ACCESS, class A_ACCESS, class CD_ACCESS> __device__ INLINE void VVSS1_0( const fptype allV1[], const fptype allV2[], @@ -924,7 +924,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template<class W_ACCESS, class C_ACCESS> + template<class W_ACCESS, class CD_ACCESS> __device__ void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -937,7 +937,7 @@ namespace
mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P2[4] = { +cxreal( V2[0] ), +cxreal( V2[1] ), +cximag( V2[1] ), +cximag( V2[0] ) }; @@ -962,7 +962,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], S2[6], S3[6] - template<class W_ACCESS, class A_ACCESS, class C_ACCESS> + template<class W_ACCESS, class A_ACCESS, class CD_ACCESS> __device__ void VSS1_0( const fptype allV1[], const fptype allS2[], @@ -975,7 +975,7 @@ namespace mg5amcCpu const cxtype_sv* V1 = W_ACCESS::kernelAccessConst( allV1 ); const cxtype_sv* S2 = W_ACCESS::kernelAccessConst( allS2 ); const cxtype_sv* S3 = W_ACCESS::kernelAccessConst( allS3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P2[4] = { +cxreal( S2[0] ), +cxreal( S2[1] ), +cximag( S2[1] ), +cximag( S2[0] ) }; @@ -990,7 +990,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'S2[6]' from the input wavefunctions V1[6], S3[6] - template<class W_ACCESS, class C_ACCESS> + template<class W_ACCESS, class CD_ACCESS> __device__ void VSS1_2( const fptype allV1[], const fptype allS3[], @@ -1003,7 +1003,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* V1 = W_ACCESS::kernelAccessConst( allV1 ); const cxtype_sv* S3 = W_ACCESS::kernelAccessConst( allS3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* S2 = W_ACCESS::kernelAccess( allS2 ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P3[4] = { +cxreal( S3[0] ), +cxreal( S3[1] ), +cximag( S3[1] ), +cximag( S3[0] ) }; @@ -1021,7 +1021,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'S3[6]' from the input wavefunctions V1[6], S2[6] - template<class W_ACCESS, class C_ACCESS> + template<class W_ACCESS, class CD_ACCESS> __device__ void VSS1_3( const fptype allV1[], const fptype allS2[], @@ -1034,7 +1034,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* V1 = W_ACCESS::kernelAccessConst( allV1 ); const cxtype_sv* S2 = W_ACCESS::kernelAccessConst( allS2 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* S3 = W_ACCESS::kernelAccess( allS3 ); const cxtype cI = cxmake( 0., 1.
); const fptype_sv P2[4] = { +cxreal( S2[0] ), +cxreal( S2[1] ), +cximag( S2[1] ), +cximag( S2[0] ) }; @@ -1052,7 +1052,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions V1[6], V2[6], S3[6], S4[6] - template<class W_ACCESS, class A_ACCESS, class C_ACCESS> + template<class W_ACCESS, class A_ACCESS, class CD_ACCESS> __device__ void VVSS1_0( const fptype allV1[], const fptype allV2[], @@ -1067,7 +1067,7 @@ namespace mg5amcCpu const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* S3 = W_ACCESS::kernelAccessConst( allS3 ); const cxtype_sv* S4 = W_ACCESS::kernelAccessConst( allS4 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. ); const cxtype_sv TMP7 = ( V2[2] * V1[2] - V2[3] * V1[3] - V2[4] * V1[4] - V2[5] * V1[5] ); diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/src/Parameters_MSSM_SLHA2.cc b/epochX/cudacpp/susy_gg_t1t1.sa/src/Parameters_MSSM_SLHA2.cc index d596fdf1ec..232fd37777 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/src/Parameters_MSSM_SLHA2.cc +++ b/epochX/cudacpp/susy_gg_t1t1.sa/src/Parameters_MSSM_SLHA2.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/src/Parameters_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_t1t1.sa/src/Parameters_MSSM_SLHA2.h index 26a532156c..faf4bea26d 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/src/Parameters_MSSM_SLHA2.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/src/Parameters_MSSM_SLHA2.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -903,7 +903,7 @@ namespace mg5amcCpu #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #endif // Compute the output couplings (e.g. gc10 and gc11) from the input gs - template<class G_ACCESS, class C_ACCESS> + template<class G_ACCESS, class CD_ACCESS> __device__ inline void G2COUP( const fptype gs[], fptype couplings[], @@ -913,14 +913,14 @@ namespace mg5amcCpu using namespace Parameters_MSSM_SLHA2_dependentCouplings; const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr ); - fptype* GC_90s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_90 ); - fptype* GC_6s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_6 ); - fptype* GC_55s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_55 ); - fptype* GC_57s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_57 ); - cxtype_sv_ref GC_90s_sv = C_ACCESS::kernelAccess( GC_90s ); - cxtype_sv_ref GC_6s_sv = C_ACCESS::kernelAccess( GC_6s ); - cxtype_sv_ref GC_55s_sv = C_ACCESS::kernelAccess( GC_55s ); - cxtype_sv_ref GC_57s_sv = C_ACCESS::kernelAccess( GC_57s ); + fptype* GC_90s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_90 ); + fptype* GC_6s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_6 ); + fptype* GC_55s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_55 ); + fptype* GC_57s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_57 ); + cxtype_sv_ref GC_90s_sv = CD_ACCESS::kernelAccess( GC_90s ); + cxtype_sv_ref GC_6s_sv = CD_ACCESS::kernelAccess( GC_6s ); + cxtype_sv_ref GC_55s_sv = CD_ACCESS::kernelAccess( GC_55s ); + cxtype_sv_ref GC_57s_sv = CD_ACCESS::kernelAccess( GC_57s ); GC_90s_sv = couplings_sv.GC_90; GC_6s_sv = couplings_sv.GC_6; GC_55s_sv = couplings_sv.GC_55; diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuConfig.h index d3c4ca5695..8a2804aa04 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose whether cuBLAS and hipBLAS are supported for computing the color sums +// For both CUDA and HIP, by default, use cuBLAS/hipBLAS, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?)
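The comment above motivates the MGONGPU_HAS_NO_BLAS escape hatch, whose per-compiler defaults follow below. As a hedged, standalone sketch of how such a compile-time guard is typically consumed (the function name here is made up for illustration, it is not the plugin's code):

#include <cstdio>
// Illustrative only: build with -DMGONGPU_HAS_NO_BLAS to emulate a noBLAS build
static const char* colorSumPath()
{
#ifndef MGONGPU_HAS_NO_BLAS
  return "BLAS (cuBLAS/hipBLAS) path"; // assumes BLAS headers and libraries are available
#else
  return "custom-kernel fallback path"; // no BLAS dependency at all
#endif
}
int main()
{
  std::printf( "color sum uses the %s\n", colorSumPath() );
  return 0;
}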
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuCxtypes.h index 92d74fd6db..e98e925f2a 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -717,12 +717,24 @@ namespace mg5amcCpu : m_preal( &r ), m_pimag( &i ) {} // copy (create from) const refs cxtype_ref& operator=( const cxtype_ref& ) = delete; //__host__ __device__ cxtype_ref& operator=( cxtype_ref&& c ) {...} // REMOVED! Should copy refs or copy values? 
No longer needed in cxternary - __host__ __device__ cxtype_ref& operator=( const cxtype& c ) + __host__ __device__ cxtype_ref& operator=( const cxtype& c ) // copy (assign) const values { *m_preal = cxreal( c ); *m_pimag = cximag( c ); return *this; - } // copy (assign) non-const values + } + __host__ __device__ cxtype_ref& operator+=( const cxtype& c ) + { + *m_preal += cxreal( c ); + *m_pimag += cximag( c ); + return *this; + } + __host__ __device__ cxtype_ref& operator-=( const cxtype& c ) + { + *m_preal -= cxreal( c ); + *m_pimag -= cximag( c ); + return *this; + } __host__ __device__ operator cxtype() const { return cxmake( *m_preal, *m_pimag ); } private: fptype* const m_preal; // const pointer to non-const fptype R diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/test/cudacpp_test.mk b/epochX/cudacpp/susy_gg_t1t1.sa/test/cudacpp_test.mk index f703a1ae7c..1f9f8bbc46 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/susy_gg_t1t1.sa/test/cudacpp_test.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) @@ -19,7 +19,7 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt b/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt index 9e7dad46ce..f293ba7e7c 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt +++ b/epochX/cudacpp/susy_gg_tt.mad/CODEGEN_mad_susy_gg_tt_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.3 2025-06-12 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -49,7 +49,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 @@ -550,21 +550,21 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.118 s +1 processes with 3 diagrams generated in 0.109 s Total: 1 processes with 3 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_susy_gg_tt --hel_recycling=False --vector_size=32 Output will be done with PLUGIN: CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4334]  +DEBUG: opt['output_options']['vector_size'] =  32 [export_v4.py at line 4166]  Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  INFO: initialize a new directory: CODEGEN_mad_susy_gg_tt INFO: remove old information in CODEGEN_mad_susy_gg_tt -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards  -WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/SubProcesses  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards  +WARNING: File exists /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/SubProcesses  INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 @@ -576,53 +576,53 @@ FileWriter t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1527]  -DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1551]  -DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1552]  +DEBUG: len(subproc_diagrams_for_config) =  3 [model_handling.py at line 1640]  +DEBUG: iconfig_to_diag =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1664]  +DEBUG: diag_to_iconfig =  {1: 1, 2: 2, 3: 3} [model_handling.py at line 1665]  Generated helas calls for 1 subprocesses (3 diagrams) in 0.007 s Wrote files for 10 helas calls in 0.076 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.137 s +ALOHA: aloha creates 2 routines in 0.132 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.133 s +ALOHA: aloha creates 4 routines in 0.129 s VVV1 FFV1 FFV1 FFV1 -FileWriter for 
/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h -INFO: Created file HelAmps_MSSM_SLHA2.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h +INFO: Created file HelAmps_MSSM_SLHA2.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./Parameters_MSSM_SLHA2.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/./Parameters_MSSM_SLHA2.cc INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/src/. The option zerowidth_tchannel is modified [True] but will not be written in the configuration files. If you want to make this value the default for future session, you can run 'save options --all' -save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +save configuration file to /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate jpeg diagrams INFO: Generate web pages -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt; patch -p4 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file SubProcesses/makefile -DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/SubProcesses/P1_gg_ttx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 +DEBUG: cd /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/SubProcesses/P1_gg_ttx; patch -p6 -i /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1 patching file driver.f patching file matrix1.f -DEBUG: p.returncode =  0 [output.py at line 263]  -Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt done. +DEBUG: p.returncode =  0 [output.py at line 268]  +Output to directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt done. 
Type "launch" to generate events from this process, or see -/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/README +/data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/README Run "open index.html" to see more information about this process. quit -real 0m2.854s -user 0m2.558s -sys 0m0.284s +real 0m3.019s +user 0m2.541s +sys 0m0.329s Code generation completed in 3 seconds ************************************************************ * * @@ -636,7 +636,7 @@ Code generation completed in 3 seconds * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.3 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -644,9 +644,9 @@ Code generation completed in 3 seconds * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt @@ -666,7 +666,7 @@ launch in debug mode * * * * * * * * * * * * -* VERSION 3.6.0 * +* VERSION 3.6.3 * * * * The MadGraph5_aMC@NLO Development Team - Find us at * * https://server06.fynu.ucl.ac.be/projects/madgraph * @@ -674,9 +674,9 @@ launch in debug mode * Type 'help' for in-line help. * * * ************************************************************ -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt -INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo/input/mg5_configuration.txt +INFO: load configuration from /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_mad_susy_gg_tt/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt diff --git a/epochX/cudacpp/susy_gg_tt.mad/Cards/me5_configuration.txt b/epochX/cudacpp/susy_gg_tt.mad/Cards/me5_configuration.txt index 68b4c46295..07d8d59d1b 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Cards/me5_configuration.txt +++ b/epochX/cudacpp/susy_gg_tt.mad/Cards/me5_configuration.txt @@ -235,7 +235,7 @@ # pineappl = pineappl -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo # MG5 MAIN DIRECTORY -#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo +#mg5_path = /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/mg5amcnlo diff --git a/epochX/cudacpp/susy_gg_tt.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/susy_gg_tt.mad/Cards/proc_card_mg5.dat index 25f63a3016..265ec11c03 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/susy_gg_tt.mad/Cards/proc_card_mg5.dat @@ -8,7 +8,7 @@ #* * * * #* * #* * -#* VERSION 3.6.0 2024-09-30 * +#* VERSION 3.6.3 2025-06-12 * #* * #* WARNING: UNKNOWN DEVELOPMENT VERSION. * #* WARNING: DO NOT USE FOR PRODUCTION * diff --git a/epochX/cudacpp/susy_gg_tt.mad/Cards/run_card.dat b/epochX/cudacpp/susy_gg_tt.mad/Cards/run_card.dat index 6b82577032..000832aacd 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Cards/run_card.dat +++ b/epochX/cudacpp/susy_gg_tt.mad/Cards/run_card.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/susy_gg_tt.mad/Cards/run_card_default.dat b/epochX/cudacpp/susy_gg_tt.mad/Cards/run_card_default.dat index b8db871c35..85e1d39035 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/susy_gg_tt.mad/Cards/run_card_default.dat @@ -107,6 +107,7 @@ # Parton level cuts definition * #******************************* 0.0 = dsqrt_shat ! minimal shat for full process + -1 = dsqrt_shatmax ! 
maximum shat for full process # # #********************************************************************* diff --git a/epochX/cudacpp/susy_gg_tt.mad/MGMEVersion.txt b/epochX/cudacpp/susy_gg_tt.mad/MGMEVersion.txt index 084e244cea..1ac53bb4bd 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/MGMEVersion.txt +++ b/epochX/cudacpp/susy_gg_tt.mad/MGMEVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.3 \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.mad/Source/.make_opts b/epochX/cudacpp/susy_gg_tt.mad/Source/.make_opts index de3864242b..56ba259c56 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Source/.make_opts +++ b/epochX/cudacpp/susy_gg_tt.mad/Source/.make_opts @@ -102,6 +102,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf + alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -113,10 +114,11 @@ ifneq ($(lhapdf),) endif else alfas_functions=alfas_functions + alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif # Helper function to check MG5 version define CHECK_MG5AMC_VERSION python -c 'import re; from distutils.version import StrictVersion; print StrictVersion("$(MG5AMC_VERSION)") >= StrictVersion("$(1)") if re.match("^[\d\.]+$$","$(MG5AMC_VERSION)") else True;' -endef \ No newline at end of file +endef diff --git a/epochX/cudacpp/susy_gg_tt.mad/Source/alfas_functions.f b/epochX/cudacpp/susy_gg_tt.mad/Source/alfas_functions.f index bb69a6384e..84aeff369c 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Source/alfas_functions.f +++ b/epochX/cudacpp/susy_gg_tt.mad/Source/alfas_functions.f @@ -188,6 +188,10 @@ SUBROUTINE NEWTON1(T,A_IN,A_OUT,NLOOP,NF) A_OUT=A_IN/(1D0+A_IN*B0(NF)*T) IF (NLOOP .EQ. 1) RETURN + if (1D0+A_IN*B0(NF)*T.le.0d0)THEN + A_OUT = 9d98 + RETURN + ENDIF A_OUT=A_IN/(1D0+B0(NF)*A_IN*T+C1(NF)*A_IN*LOG(1D0+A_IN*B0(NF)*T)) IF (A_OUT .LT. 
0D0) AS=0.3D0 30 AS=A_OUT diff --git a/epochX/cudacpp/susy_gg_tt.mad/Source/cuts.inc b/epochX/cudacpp/susy_gg_tt.mad/Source/cuts.inc index 23d099e5f7..a8ccc7420d 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Source/cuts.inc +++ b/epochX/cudacpp/susy_gg_tt.mad/Source/cuts.inc @@ -37,7 +37,7 @@ C REAL*8 misset,missetmax,ptheavy REAL*8 ptllmin,ptllmax integer maxjetflavor - REAl*8 dsqrt_shat + REAl*8 dsqrt_shat,dsqrt_shatmax COMMON /to_min_max_cuts/ & PTJmax,PTBmax,PTAmax,PTLmax, @@ -60,7 +60,7 @@ C & ht2max,ht3max,ht4max, & htjmin,htjmax,ihtmin,ihtmax, & misset,missetmax,ptheavy, - & ptllmin,ptllmax,dsqrt_shat, + & ptllmin,ptllmax,dsqrt_shat,dsqrt_shatmax, & maxjetflavor C diff --git a/epochX/cudacpp/susy_gg_tt.mad/Source/make_opts b/epochX/cudacpp/susy_gg_tt.mad/Source/make_opts index e4b87ee6ad..f10336e42e 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Source/make_opts +++ b/epochX/cudacpp/susy_gg_tt.mad/Source/make_opts @@ -103,6 +103,7 @@ endif ifneq ($(lhapdf),) CXXFLAGS += $(shell $(lhapdf) --cppflags) alfas_functions=alfas_functions_lhapdf +alfas_to_clean=alfas_functions.o llhapdf+= $(shell $(lhapdf) --cflags --libs) -lLHAPDF # check if we need to activate c++11 (for lhapdf6.2) ifeq ($(origin CXX),default) @@ -114,6 +115,7 @@ endif endif else alfas_functions=alfas_functions +alfas_to_clean=alfas_functions_lhapdf.o llhapdf= endif diff --git a/epochX/cudacpp/susy_gg_tt.mad/Source/makefile b/epochX/cudacpp/susy_gg_tt.mad/Source/makefile index 291ca907ee..87a9e61723 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Source/makefile +++ b/epochX/cudacpp/susy_gg_tt.mad/Source/makefile @@ -37,10 +37,12 @@ all: $(LIBRARIES) $(LIBDIR)libdhelas.$(libext) $(LIBDIR)libpdf.$(libext) $(LIBDI $(LIBDIR)libdsample.$(libext): $(DSAMPLE) $(call CREATELIB, $@, $^) $(LIBDIR)libgeneric.$(libext): $(GENERIC) + rm -f $@ 2>/dev/null $(call CREATELIB, $@, $^) + rm -f $(alfas_to_clean) 2>/dev/null $(LIBDIR)libdhelas.$(libext): DHELAS cd DHELAS; make; cd .. -$(LIBDIR)libpdf.$(libext): PDF make_opts +$(LIBDIR)libpdf.$(libext): PDF $(alfas_functions).o cd PDF; make; cd .. ifneq (,$(filter edff chff, $(pdlabel1) $(pdlabel2))) $(LIBDIR)libgammaUPC.$(libext): PDF/gammaUPC @@ -73,6 +75,7 @@ $(BINDIR)gensudgrid: $(GENSUDGRID) $(LIBDIR)libpdf.$(libext) $(LIBDIR)libgammaUP # Dependencies dsample.o: DiscreteSampler.o dsample.f genps.inc StringCast.o vector.inc +pawgraph.o: vector.inc DiscreteSampler.o: StringCast.o invarients.o: invarients.f genps.inc gen_ximprove.o: gen_ximprove.f run_config.inc run_card.inc diff --git a/epochX/cudacpp/susy_gg_tt.mad/Source/run_card.inc b/epochX/cudacpp/susy_gg_tt.mad/Source/run_card.inc index 1a1bc782bd..8bd5f73840 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/Source/run_card.inc +++ b/epochX/cudacpp/susy_gg_tt.mad/Source/run_card.inc @@ -88,6 +88,8 @@ DSQRT_SHAT = 0.000000000000000D+00 + DSQRT_SHATMAX = -1 + LIMHEL = 0.000000000000000D+00 PTJ = 2.000000000000000D+01 diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/Bridge.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/Bridge.h index 87aa648dd2..a45024704a 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. 
+// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -255,11 +255,15 @@ namespace mg5amcCpu throw std::logic_error( "Bridge constructor: FIXME! cannot choose gputhreads" ); // this should never happen! m_gpublocks = m_nevt / m_gputhreads; } +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters @@ -290,8 +294,10 @@ throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -347,7 +353,9 @@ if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> ) @@ -400,7 +408,9 @@ } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.
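Looking back at the cxtype_ref operator+= and operator-= additions in the mgOnGpuCxtypes.h hunk earlier: a minimal standalone sketch (illustrative class name, not the plugin's cxtype_ref) of why a reference proxy with in-place accumulation is convenient when the real and imaginary parts live in separate arrays, as in the split jamp storage above:

#include <cassert>
#include <complex>
// Illustrative proxy: references separately stored Re and Im parts, so that
// "jamp += amp" works even though the two parts are not adjacent in memory
class cxref
{
public:
  cxref( double& r, double& i ) : m_r( &r ), m_i( &i ) {}
  cxref& operator+=( const std::complex<double>& c )
  {
    *m_r += c.real();
    *m_i += c.imag();
    return *this;
  }
  cxref& operator-=( const std::complex<double>& c )
  {
    *m_r -= c.real();
    *m_i -= c.imag();
    return *this;
  }
  operator std::complex<double>() const { return { *m_r, *m_i }; }
private:
  double* const m_r; // const pointer to non-const real part
  double* const m_i; // const pointer to non-const imaginary part
};
int main()
{
  double re[2] = { 0., 0. }, im[2] = { 0., 0. }; // split storage
  cxref( re[1], im[1] ) += std::complex<double>( 1., -2. );
  assert( re[1] == 1. && im[1] == -2. );
  return 0;
}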
 #ifndef MG5AMC_GPUABSTRACTION_H
 #define MG5AMC_GPUABSTRACTION_H 1

+#include "mgOnGpuConfig.h"
+
 #include 

 //--------------------------------------------------------------------------

 #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)

+#ifndef MGONGPU_HAS_NO_BLAS
+#include "cublas_v2.h"
+#endif
+
 #define gpuError_t cudaError_t
 #define gpuPeekAtLastError cudaPeekAtLastError
 #define gpuGetErrorString cudaGetErrorString
@@ -21,24 +27,61 @@
 #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) )
 #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) )
 #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
 #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
+#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice
 #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) )
 #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
 #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
+#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) )
+
 #define gpuSetDevice cudaSetDevice
 #define gpuDeviceSynchronize cudaDeviceSynchronize
 #define gpuDeviceReset cudaDeviceReset

 #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
-#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ )
+
+#define gpuStream_t cudaStream_t
+#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) )
+#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) )
+
+#define gpuBlasStatus_t cublasStatus_t
+#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS
+#ifndef MGONGPU_HAS_NO_BLAS
+#define gpuBlasHandle_t cublasHandle_t
+#else
+#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds
+#endif
+#define gpuBlasCreate cublasCreate
+#define gpuBlasDestroy cublasDestroy
+#define gpuBlasSetStream cublasSetStream
+
+#define gpuBlasSaxpy cublasSaxpy
+#define gpuBlasSdot cublasSdot
+#define gpuBlasSgemv cublasSgemv
+#define gpuBlasSgemm cublasSgemm
+#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched
+#define gpuBlasDaxpy cublasDaxpy
+#define gpuBlasDdot cublasDdot
+#define gpuBlasDgemv cublasDgemv
+#define gpuBlasDgemm cublasDgemm
+#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched
+#define GPUBLAS_OP_N CUBLAS_OP_N
+#define GPUBLAS_OP_T CUBLAS_OP_T

 //--------------------------------------------------------------------------

 #elif defined __HIPCC__

+#ifndef MGONGPU_HAS_NO_BLAS
+#include "hipblas/hipblas.h"
+#endif
+
 #define gpuError_t hipError_t
 #define gpuPeekAtLastError hipPeekAtLastError
 #define gpuGetErrorString hipGetErrorString
@@ -48,22 +91,69 @@
 #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )
 #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) )
 #define gpuMemcpyHostToDevice hipMemcpyHostToDevice
 #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice
 #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) )
 #define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
 #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
+#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) )
+
 #define gpuSetDevice hipSetDevice
 #define gpuDeviceSynchronize hipDeviceSynchronize
 #define gpuDeviceReset hipDeviceReset

 #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
-#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... ) kernel<<<blocks, threads, 0, stream>>>( __VA_ARGS__ )
+
+#define gpuStream_t hipStream_t
+#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) )
+#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) )
+
+#define gpuBlasStatus_t hipblasStatus_t
+#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
+#ifndef MGONGPU_HAS_NO_BLAS
+#define gpuBlasHandle_t hipblasHandle_t
+#else
+#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds
+#endif
+#define gpuBlasCreate hipblasCreate
+#define gpuBlasDestroy hipblasDestroy
+#define gpuBlasSetStream hipblasSetStream
+
+#define gpuBlasSaxpy hipblasSaxpy
+#define gpuBlasSdot hipblasSdot
+#define gpuBlasSgemv hipblasSgemv
+#define gpuBlasSgemm hipblasSgemm
+#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched
+#define gpuBlasDaxpy hipblasDaxpy
+#define gpuBlasDdot hipblasDdot
+#define gpuBlasDgemv hipblasDgemv
+#define gpuBlasDgemm hipblasDgemm
+#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched
+#define GPUBLAS_OP_N HIPBLAS_OP_N
+#define GPUBLAS_OP_T HIPBLAS_OP_T
+
+#endif

 //--------------------------------------------------------------------------

+#ifdef MGONGPU_FPTYPE2_FLOAT
+#define gpuBlasTaxpy gpuBlasSaxpy
+#define gpuBlasTdot gpuBlasSdot
+#define gpuBlasTgemv gpuBlasSgemv
+#define gpuBlasTgemm gpuBlasSgemm
+#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched
+#else
+#define gpuBlasTaxpy gpuBlasDaxpy
+#define gpuBlasTdot gpuBlasDdot
+#define gpuBlasTgemv gpuBlasDgemv
+#define gpuBlasTgemm gpuBlasDgemm
+#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched
 #endif

 #endif // MG5AMC_GPUABSTRACTION_H
diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/GpuRuntime.h
index 860c7fde16..086aa6a616 100644
--- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/GpuRuntime.h
+++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/GpuRuntime.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin.
-// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin.
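// A minimal sketch of how the vendor-neutral gpuBlas* macros above can be used; colorGemvSketch
// is an illustrative helper, not part of the plugin, and it assumes a CUDA or HIP build with
// BLAS enabled (checkGpuBlas is defined in GpuRuntime.h below). The same call compiles against
// cuBLAS under __CUDACC__ and against hipBLAS under __HIPCC__:
inline void colorGemvSketch( gpuBlasHandle_t handle, int n, const double* A, const double* x, double* y )
{
  const double one = 1, zero = 0;
  // y = A * x for an n x n column-major matrix (standard cublasDgemv/hipblasDgemv signature)
  checkGpuBlas( gpuBlasDgemv( handle, GPUBLAS_OP_N, n, n, &one, A, n, x, 1, &zero, y, 1 ) );
}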
#ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MGVersion.txt b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MGVersion.txt index 084e244cea..1ac53bb4bd 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MGVersion.txt +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MGVersion.txt @@ -1 +1 @@ -3.6.0 \ No newline at end of file +3.6.3 \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.cc index f463977c1a..a68ae314eb 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,28 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() + , m_pHelWfs() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_helBlasHandles() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +353,82 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); + // Create the "one-helicity" wavefunction buffer that will be used for helicity filtering + m_pHelWfs.reset( new DeviceBufferSimple( CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? 
+ std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { +#ifndef MGONGPU_HAS_NO_BLAS + if( m_helBlasHandles[ihel] ) gpuBlasDestroy( m_helBlasHandles[ihel] ); +#endif + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +445,61 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. 
Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity + // Attach a different stream to each cuBLAS/hipBLAS handle + if( m_blasColorSum ) + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + checkGpuBlas( gpuBlasCreate( &m_helBlasHandles[ighel] ) ); + checkGpuBlas( gpuBlasSetStream( m_helBlasHandles[ighel], m_helStreams[ighel] ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_helBlasHandles[ighel], CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelWfs.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +507,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* ghelBlasHandles = ( m_blasColorSum ? m_helBlasHandles : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* ghelBlasHandles = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +527,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? 
 m_hstChannelIds.data() : nullptr );
     MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() );
 #endif
-    checkGpu( gpuPeekAtLastError() );
-    checkGpu( gpuDeviceSynchronize() );
+    checkGpu( gpuPeekAtLastError() ); // is this needed?
+    checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places...
   }

   //--------------------------------------------------------------------------
diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.h
index 7acff4b308..c901874333 100644
--- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.h
@@ -1,16 +1,19 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin.

 #ifndef MATRIXELEMENTKERNELS_H
 #define MATRIXELEMENTKERNELS_H 1

 #include "mgOnGpuConfig.h"

+#include "CPPProcess.h"
+#include "GpuAbstraction.h"
 #include "MemoryBuffers.h"

 #include 
+#include 

 #ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
@@ -134,7 +137,7 @@
     // Does this host system support the SIMD used in the matrix element calculation?
     // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!]
-    static bool hostSupportsSIMD( const bool verbose = true );
+    static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false

   private:

@@ -191,12 +194,24 @@
     // The buffer for the event-by-event couplings that depends on alphas QCD
     DeviceBufferCouplings m_couplings;

+    // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime)
+    std::unique_ptr<DeviceBufferSimple> m_pHelMEs;
+
+    // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime)
+    std::unique_ptr<DeviceBufferSimple> m_pHelJamps;
+
+    // The super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime)
+    std::unique_ptr<DeviceBufferSimple> m_pHelWfs;
+
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // The buffer for the event-by-event numerators of multichannel factors
-    DeviceBufferNumerators m_numerators;
+    // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime)
+    std::unique_ptr<DeviceBufferSimple> m_pHelNumerators;

-    // The buffer for the event-by-event denominators of multichannel factors
-    DeviceBufferDenominators m_denominators;
+    // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime)
+    std::unique_ptr<DeviceBufferSimple> m_pHelDenominators;
+
+    // The super-buffer of ncolor jamp2 buffers
+    DeviceBufferSimple m_colJamp2s;
 #endif

 #ifdef MGONGPU_CHANNELID_DEBUG
@@ -205,6 +220,23 @@
     PinnedHostBufferChannelIds m_hstChannelIds;
 #endif

+#ifndef MGONGPU_HAS_NO_BLAS
+    // Decide at runtime whether to use BLAS for color sums
+    bool m_blasColorSum;
+
+    // Decide at runtime whether TF32TENSOR math should be used in cuBLAS
+    bool m_blasTf32Tensor;
+
+    // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers
+    std::unique_ptr<DeviceBufferSimple2> m_pHelBlasTmp;
+
+    // The array of cuBLAS/hipBLAS handles (one for each good helicity)
+    gpuBlasHandle_t m_helBlasHandles[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used)
+#endif
+
+    // The array of GPU streams (one for each good helicity)
+    gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used)
+
     // The number of blocks in the GPU grid
     size_t m_gpublocks;
diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryAccessAmplitudes.h
index 0d92f69c43..a49f041e05 100644
--- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryAccessAmplitudes.h
+++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryAccessAmplitudes.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin.

 #ifndef MemoryAccessAmplitudes_H
 #define MemoryAccessAmplitudes_H 1

@@ -10,10 +10,6 @@
 #include "mgOnGpuCxtypes.h"

-#include "MemoryAccessHelpers.h"
-
-#define MGONGPU_TRIVIAL_AMPLITUDES 1
-
 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725)
 #ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
@@ -23,120 +19,11 @@
 {
   //----------------------------------------------------------------------------

-#ifndef MGONGPU_TRIVIAL_AMPLITUDES
-
-  // A class describing the internal layout of memory buffers for amplitudes
-  // This implementation uses an AOSOA[npagA][nx2][neppA] where nevt=npagA*neppA
-  // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name]
-  class MemoryAccessAmplitudesBase //_AOSOAv1
-  {
-  public:
-
-    // Number of Events Per Page in the amplitude AOSOA memory buffer layout
-    static constexpr int neppA = 1; // AOS (just a test...)
- - private: - - friend class MemoryAccessHelper; - friend class KernelAccessHelper; - friend class KernelAccessHelper; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) - { - const int ipagA = ievt / neppA; // #event "A-page" - const int ieppA = ievt % neppA; // #event in the current event A-page - constexpr int ix2 = 0; - return &( buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA] ); // AOSOA[ipagA][ix2][ieppA] - } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... args" to "const int ix2" and rename "Field" as "Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int ix2 ) - { - constexpr int ipagA = 0; - constexpr int ieppA = 0; - return buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA]; // AOSOA[ipagA][ix2][ieppA] - } - }; - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessAmplitudes : public MemoryAccessAmplitudesBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2 = MemoryAccessHelper::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2Const = - MemoryAccessHelper::template decodeRecordConst; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt, 
const int ix2 ) <===] - static constexpr auto ieventAccessIx2 = - MemoryAccessHelper::template ieventAccessField; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===] - static constexpr auto ieventAccessIx2Const = - MemoryAccessHelper::template ieventAccessFieldConst; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations + // A class providing trivial access to amplitude memory buffers template class KernelAccessAmplitudes { public: - -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2 = - KernelAccessHelper::template kernelAccessField; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2Const = - KernelAccessHelper::template kernelAccessFieldConst; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { @@ -148,8 +35,6 @@ namespace mg5amcCpu { return reinterpret_cast( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES }; //---------------------------------------------------------------------------- diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryAccessCouplings.h index 3802fa57c0..26345d4b43 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryAccessCouplings.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. 
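// The AOSOA[npagA][nx2][neppA] layout deleted above flattens to
// buffer[ipagA*nx2*neppA + ix2*neppA + ieppA]. A worked example of that index
// arithmetic with illustrative sizes (this helper is not part of the plugin):
#include <cassert>
inline int aosoaFlatIndexSketch()
{
  const int neppA = 4, nx2 = 2;   // 4 events per page, real/imaginary components
  const int ievt = 6, ix2 = 1;    // imaginary part of event 6
  const int ipagA = ievt / neppA; // page 1
  const int ieppA = ievt % neppA; // slot 2 within the page
  const int flat = ipagA * nx2 * neppA + ix2 * neppA + ieppA; // 8 + 4 + 2 = 14
  assert( flat == 14 );
  return flat;
}
// With neppA=1 (the AOS special case above) the layout degenerates to one record per event,
// which is why a trivial reinterpret_cast access suffices and these helpers could be removed.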
#ifndef MemoryAccessCouplings_H #define MemoryAccessCouplings_H 1 @@ -235,7 +235,7 @@ namespace mg5amcCpu /* fptype_sv& real = kernelAccessIx2( buffer, 0 ); fptype_sv& imag = kernelAccessIx2( buffer, 1 ); - printf( "C_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv_ref( real, imag ); */ return cxtype_sv_ref( kernelAccessIx2( buffer, 0 ), @@ -250,7 +250,7 @@ namespace mg5amcCpu /* const fptype_sv& real = kernelAccessIx2Const( buffer, 0 ); const fptype_sv& imag = kernelAccessIx2Const( buffer, 1 ); - printf( "C_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv( real, imag ); */ return cxtype_sv( kernelAccessIx2Const( buffer, 0 ), diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryAccessWavefunctions.h index 9f4c620bc7..bbffc1fb36 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessWavefunctions_H #define MemoryAccessWavefunctions_H 1 @@ -10,9 +10,7 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 +#include "CPPProcess.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL @@ -23,147 +21,44 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // A class describing the internal layout of memory buffers for wavefunctions - // This implementation uses an AOSOA[npagW][nw6][nx2][neppW] where nevt=npagW*neppW - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessWavefunctionsBase //_AOSOAv1 +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessWavefunctions { public: - - // Number of Events Per Page in the wavefunction AOSOA memory buffer layout - static constexpr int neppW = 1; // AOS (just a test...) 
- - private: - - friend class MemoryAccessHelper; - friend class KernelAccessHelper; - friend class KernelAccessHelper; - - // The number of components of a (fermion or vector) wavefunction - static constexpr int nw6 = mgOnGpu::nw6; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) + static __host__ __device__ inline cxtype_sv* + kernelAccess( fptype* buffer ) { - const int ipagW = ievt / neppW; // #event "W-page" - const int ieppW = ievt % neppW; // #event in the current event W-page - constexpr int iw6 = 0; - constexpr int ix2 = 0; - return &( buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW] ); // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... 
args" to "const int iw6, const int ix2" and rename "Field" as "Iw6Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int iw6, - const int ix2 ) + static __host__ __device__ inline const cxtype_sv* + kernelAccessConst( const fptype* buffer ) { - constexpr int ipagW = 0; - constexpr int ieppW = 0; - return buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW]; // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } }; +#endif //---------------------------------------------------------------------------- - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessWavefunctions : public MemoryAccessWavefunctionsBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2 = MemoryAccessHelper::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2Const = - MemoryAccessHelper::template decodeRecordConst; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIw6Ix2( fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2 = - MemoryAccessHelper::template ieventAccessField; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIw6Ix2Const( const fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2Const = - MemoryAccessHelper::template ieventAccessFieldConst; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations - template - class KernelAccessWavefunctions + class HostAccessWavefunctions { public: - -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes 
(input) - // [Signature (non-const) ===> fptype& kernelAccessIw6Ix2( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2 = - KernelAccessHelper::template kernelAccessField; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIw6Ix2Const( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2Const = - KernelAccessHelper::template kernelAccessFieldConst; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { return reinterpret_cast( buffer ); } - static __host__ __device__ inline const cxtype_sv* kernelAccessConst( const fptype* buffer ) { return reinterpret_cast( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS }; //---------------------------------------------------------------------------- - typedef KernelAccessWavefunctions HostAccessWavefunctions; - typedef KernelAccessWavefunctions DeviceAccessWavefunctions; - - //---------------------------------------------------------------------------- - } // end namespace mg5amcGpu/mg5amcCpu #endif // MemoryAccessWavefunctions_H diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryBuffers.h index 5bd3053393..0ddc356e1a 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. 
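// The new DeviceAccessWavefunctions above indexes one contiguous record of nw6*nx2 fptype
// slots per event, so event ievt starts at buffer + ievt*nw6*nx2. A worked example with
// illustrative values (this helper is not part of the plugin):
inline int wavefunctionOffsetSketch()
{
  const int nw6 = 6, nx2 = 2; // 6 wavefunction components, real/imaginary parts
  const int ievt = 100;       // in the kernel this is blockDim.x * blockIdx.x + threadIdx.x
  return ievt * nw6 * nx2;    // = 1200: first fptype slot of event 100's record
}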
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_MSSM_SLHA2_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef 
PinnedHostBuffer PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index 5c62f1bfad..d79ea62148 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_MSSM_SLHA2.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,20 +98,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_MSSM_SLHA2_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_MSSM_SLHA2_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 2; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,57 +166,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR 
channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef 
MGONGPUCPP_GPUIMPL - using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -226,297 +279,139 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 5; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
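For orientation, the comments above contrast the two wavefunction storage schemes: trivial-access local buffers in C++, one global SoA super-buffer in CUDA after kernel splitting. A minimal sketch of the indexing difference follows; the [nwf][nw6][2][nevt] order is an assumption read off the wf[nwf*nw6*2*nevt] buffer comment in the calculate_jamps signature, and the helper name is hypothetical (illustration only, not part of the patch).

#include <cstddef>
// Hypothetical index helper: in C++, wavefunctions stay local temporaries indexed
// trivially as w_sv[iwf][iw6]; in CUDA they live in one global SoA buffer shared by
// all events, so consecutive threads (consecutive ievt) touch consecutive addresses
// (coalesced access across the split per-diagram kernels).
inline std::size_t
wfsIndex( int iwf, int iw6, int reim, int ievt, int nw6, int nevt ) // assumed [nwf][nw6][2][nevt] order
{
  return ( ( (std::size_t)iwf * nw6 + iw6 ) * 2 + reim ) * nevt + ievt;
}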
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast<fptype*>( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast<fptype*>( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g. 
<<warning #186-D: pointless comparison of unsigned integer with zero>> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif - // *** DIAGRAM 1 OF 
3 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[4] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[1], -1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 2 OF 3 *** - - // Wavefunction(s) for diagram number 2 - FFV1_1( w_fp[2], w_fp[0], COUPs[1], -1.0, cIPD[0], cIPD[1], w_fp[4] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[3], w_fp[4], w_fp[1], COUPs[1], -1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[0] -= amp_sv[0]; - // *** DIAGRAM 3 OF 3 *** - - // Wavefunction(s) for diagram number 3 - FFV1_2( w_fp[3], w_fp[0], COUPs[1], -1.0, cIPD[0], cIPD[1], w_fp[4] ); - - // Amplitude(s) for diagram number 3 - FFV1_0( w_fp[4], w_fp[2], w_fp[1], COUPs[1], -1.0, &amp_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); - if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); -#endif - jamp_sv[1] -= amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_ttx()?) 
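For reference, the deleted color-algebra block that follows accumulates, per event, the real quadratic form deltaME = sum_ij jamp_i* (cf[icol][jcol]/denom[icol]) jamp_j. A minimal standalone sketch of that sum for this process (ncolor=2), using the denom and cf constants quoted below, with std::complex standing in for cxtype (illustration only, not part of the patch):

#include <complex>
double colorSumSketch( const std::complex<double> jamp[2] ) // one event, one helicity
{
  const double denom[2] = { 3, 3 };                   // color denominators (as below)
  const double cf[2][2] = { { 16, -2 }, { -2, 16 } }; // color matrix (as below)
  double deltaME = 0;
  for( int icol = 0; icol < 2; icol++ )
  {
    std::complex<double> ztemp = 0;
    for( int jcol = 0; jcol < 2; jcol++ ) ztemp += cf[icol][jcol] * jamp[jcol];
    deltaME += std::real( ztemp * std::conj( jamp[icol] ) ) / denom[icol]; // real because cf is real and symmetric
  }
  return deltaME;
}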
- - // The color denominators (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3 }; // 1-D array[2] - - // The color matrix (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 16, -2 }, - { -2, 16 } }; // 2-D array[2][2] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions... 
icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - 
// NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 3 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -555,7 +450,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -588,6 +487,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -628,6 +531,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_MSSM_SLHA2::ZERO ); m_masses.push_back( Parameters_MSSM_SLHA2::mdl_MT ); m_masses.push_back( Parameters_MSSM_SLHA2::mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -730,26 +637,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt / 
neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -757,25 +664,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) (note that nevt is always >= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0)
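The helicity filtering that sigmaKin_getGoodHel implements (in both the GPU and the C++ paths) follows one rule: a helicity combination is flagged "good" if at least one of the first maxtry events receives a non-zero |M|^2 contribution from it. A hypothetical sketch of that rule, assuming per-helicity trial MEs have already been computed; names and layout are illustrative only, not the actual implementation:

// Illustration only: flag helicity ihel as good if any trial event has a non-zero ME.
void getGoodHelSketch( const double* trialMEs, // [ncomb*maxtry] |M|^2 per helicity and trial event (assumed layout)
                       bool* isGoodHel,        // [ncomb] output flags
                       int ncomb,
                       int maxtry )
{
  for( int ihel = 0; ihel < ncomb; ihel++ )
  {
    isGoodHel[ihel] = false;
    for( int ievt = 0; ievt < maxtry; ievt++ )
      if( trialMEs[ihel * maxtry + ievt] != 0 )
      {
        isGoodHel[ihel] = true;
        break;
      }
  }
}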
+ atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection 
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -913,20 +1024,14 @@ namespace mg5amcCpu // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) constexpr int nprocesses = 1; static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); - constexpr int process_id = 1; // code generation source: madevent + cudacpp exporter + constexpr int process_id = 1; // code generation source: standalone_cudacpp static_assert( process_id == 1, "Assume process_id == 1" ); } // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -938,17 +1043,20 @@ #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) 
); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -974,93 +1082,63 @@ #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for the current event (CUDA) or for the whole SIMD event page (C++) - // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes jamp's for each helicity + // In multichannel mode, also compute the running sums over helicities of numerators, denominators and squared jamp2s for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // (1b) Then, in multichannel mode, also compute the running sums over helicities of squared jamp2s within each helicity stream for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } - } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 
(fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) 
+#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? &( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1102,7 +1180,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1125,7 +1203,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1134,25 +1212,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1162,8 +1246,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1179,11 +1265,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1285,14 +1372,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h index 24c27005b8..5acfd9f387 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development 
team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_MSSM_SLHA2.h" #include <vector> @@ -75,17 +76,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 16; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 3; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 2; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 5; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 4; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 3; //static const int ncomb = 16; // CPPProcess::ncomb @@ -122,23 +123,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -152,34 +156,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators 
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f index bc9bcfeb9b..ce175a75a8 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig.f @@ -376,7 +376,7 @@ SUBROUTINE DSIG_VEC(ALL_P,ALL_WGT,ALL_XBK,ALL_Q2FACT,ALL_CM_RAP DOUBLE PRECISION FUNCTION DSIG(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f index db3c284caa..bd3d520785 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/auto_dsig1.f @@ -1,7 +1,7 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -137,14 +137,14 @@ DOUBLE PRECISION FUNCTION DSIG1(PP,WGT,IMODE) ENDDO QSCALE=QSCALE/2D0 ELSE - QSCALE=DSQRT(Q2FACT(IB(1))) + QSCALE=DSQRT(Q2FACT(1)) ENDIF G1=PDG2PDF(LPP(IB(1)),0, IB(1),XBK(IB(1)), QSCALE) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) IF (DSQRT(Q2FACT(IB(2))).NE.0D0) THEN - QSCALE=DSQRT(Q2FACT(IB(2))) + QSCALE=DSQRT(Q2FACT(2)) ENDIF G2=PDG2PDF(LPP(IB(2)),0, IB(2),XBK(IB(2)), QSCALE) ENDIF @@ -219,7 +219,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, $ ICONF_VEC, IMIRROR_VEC, VECSIZE_USED) C **************************************************** C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -290,6 +290,10 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, INTEGER I_EE C +C STUFF FOR UPC +C + DOUBLE PRECISION PHOTONPDFSQUARE +C C EXTERNAL FUNCTIONS C LOGICAL PASSCUTS @@ -373,12 +377,12 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, IF (ABS(LPP(IB(1))).GE.1) THEN C LP=SIGN(1,LPP(IB(1))) G1(IVEC)=PDG2PDF(LPP(IB(1)),0, IB(1),ALL_XBK(IB(1),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(1), IVEC))) + $ ,DSQRT(ALL_Q2FACT(1, IVEC))) ENDIF IF (ABS(LPP(IB(2))).GE.1) THEN C LP=SIGN(1,LPP(IB(2))) G2(IVEC)=PDG2PDF(LPP(IB(2)),0, IB(2),ALL_XBK(IB(2),IVEC) - $ ,DSQRT(ALL_Q2FACT(IB(2), IVEC))) + $ ,DSQRT(ALL_Q2FACT(2, IVEC))) ENDIF ENDDO ! IWARP LOOP ENDDO ! CURRWARP LOOP @@ -484,6 +488,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT, C Generate events only if IMODE is 0. IF(IMODE.EQ.0.AND.DABS(ALL_OUT(IVEC)).GT.0D0)THEN C Call UNWGT to unweight and store events + ICONFIG = CHANNELS(IVEC) CALL UNWGT(ALL_PP(0,1,IVEC), ALL_OUT(IVEC)*ALL_WGT(IVEC),1, $ SELECTED_HEL(IVEC), SELECTED_COL(IVEC), IVEC) ENDIF diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.cc b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.cc new file mode 100644 index 0000000000..b96d73fb5c --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.cc @@ -0,0 +1,383 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
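For orientation, a short annotation (mine, not text from the generated file): the three implementations in the new color_sum.cc below, the SIMD loop color_sum_cpu, the per-event kernel color_sum_kernel and the batched-GEMM path color_sum_blas, all evaluate the same per-helicity color sum

  deltaME = sum over icol,jcol of conj(jamp[icol]) * ( colorMatrix[icol][jcol] / colorDenom[icol] ) * jamp[jcol]

with ncolor=2, colorMatrix = {{16,-2},{-2,16}} and colorDenom = {3,3} for this process; they differ only in the data layout of jamp and in where the reduction runs.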
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3 }; // 1-D array[2] + + // The color matrix (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 16, -2 }, + { -2, 16 } }; // 2-D array[2][2] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template<typename T> + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix<fptype2> normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the upper-diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment.
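To make the constexpr folding concrete, the following annotation (mine, not generated code; the values follow from colorDenom and colorMatrix above) spells out the reduced arithmetic for this ncolor=2 process:
+      // Illustration for ncolor=2: the triangular matrix folds at compile time to
+      //   cf2.value[0][0] = 16/3, cf2.value[0][1] = 2*(-2)/3 = -4/3, cf2.value[1][1] = 16/3
+      // so the loops below compute, per event,
+      //   deltaMEs = 16/3 * ( |jamp0|^2 + |jamp1|^2 )
+      //            - 4/3 * ( Re(jamp0)*Re(jamp1) + Im(jamp0)*Im(jamp1) )
+      // i.e. purely real multiply-adds over the upper triangle only.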
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 
s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = 
allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer + fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer + // Convert jamps from double to float + gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps ); + // Real and imaginary components + const fptype2* allJampsReal = allJampsFpt2; + const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt; +#else + static_assert( std::is_same<fptype, fptype2>::value ); // sanity check + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer + fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer + fptype2* allMEsFpt2 = allMEs; + // Real and imaginary components + const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical) + const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical) +#endif + // Real and imaginary components + fptype2* allZtempReal = allZtempBoth; + fptype2* allZtempImag = allZtempBoth + ncolor * nevt; + + // Note, new striding for cuBLAS from DeviceAccessJamp: + // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1" + // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1" + + // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag + // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp + fptype2 alpha1 = 1; + fptype2 beta1 = 0; + const int ncolorM = ncolor; + const int nevtN = nevt; + const int ncolorK = ncolor; + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN + checkGpuBlas( gpuBlasTgemm( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose ColMat + GPUBLAS_OP_T, // transpose JampsV (new1) + ncolorM, nevtN, ncolorK, + &alpha1, + devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK + allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1) + &beta1, + allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN + + // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt] + // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME + // Use cublasSgemmStridedBatched to perform these batched dot products in one call + fptype2 alpha2 = 1; + fptype2 beta2 = 1; + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e.
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/diagram_boilerplate.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/diagrams.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/diagrams.h new file mode 100644 index 0000000000..e0cad8ec2b --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/diagrams.h @@ -0,0 +1,109 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. 
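As a reading aid (an annotation I derived from the J_ACCESS updates in the kernels below, not text from the generated file): each diagramN kernel computes one Feynman amplitude and accumulates it into the two color amplitudes, so that after all three calls

  jamp[0] = +i*amp(diagram 1) - amp(diagram 2)
  jamp[1] = -i*amp(diagram 1) - amp(diagram 3)

while in multichannel mode the numerator picks up |amp|^2 only from the diagram matching channelId and the denominator sums |amp|^2 over all three diagrams.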
+ +/* clang-format off */ + + //-------------------------------------------------------------------------- + + __global__ void + diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel + const fptype* momenta, // input: momenta[npar*4*nevtORneppV] + const int ihel ) // input: helicity (0 to ncomb) + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" +#ifdef MGONGPUCPP_GPUIMPL + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events +#else + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events +#endif + // *** DIAGRAM 1 OF 3 *** + // Wavefunction(s) for diagram number 1 + vxxxxx<M_ACCESS, W_ACCESS>( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); + vxxxxx<M_ACCESS, W_ACCESS>( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); + oxxxxx<M_ACCESS, W_ACCESS>( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + ixxxxx<M_ACCESS, W_ACCESS>( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + VVV1P0_1<W_ACCESS, CD_ACCESS>( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[4] ); + // Amplitude(s) for diagram number 1 + FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[3], w_fp[2], w_fp[4], COUPs[1], -1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 1 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0]; + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 2 OF 3 *** + // Wavefunction(s) for diagram number 2 + FFV1_1<W_ACCESS, CD_ACCESS>( w_fp[2], w_fp[0], COUPs[1], -1.0, cIPD[0], cIPD[1], w_fp[4] ); + // Amplitude(s) for diagram number 2 + FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[3], w_fp[4], w_fp[1], COUPs[1], -1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 2 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + + __global__ void + diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV] + fptype* jamps, // output jamps[ncolor*2*nevtORneppV] + const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE) +#ifdef MGONGPUCPP_GPUIMPL + const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events +#else + const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page +#endif + fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel + fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel + { + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check +#include "diagram_boilerplate.h" + // *** DIAGRAM 3 OF 3 *** + // Wavefunction(s) for diagram number 3 + FFV1_2<W_ACCESS, CD_ACCESS>( w_fp[3], w_fp[0], COUPs[1], -1.0, cIPD[0], cIPD[1], w_fp[4] ); + // Amplitude(s) for diagram number 3 + FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[4], w_fp[2], w_fp[1], COUPs[1], -1.0, &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId == 3 ) numerators_sv += cxabs2( amp_sv[0] ); + if( channelId != 0 ) denominators_sv += cxabs2( amp_sv[0] ); +#endif + J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0]; + } + + //-------------------------------------------------------------------------- + +/* clang-format on */ diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f index ec5722702a..c08048ad0e 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/driver.f @@ -489,7 +489,6 @@ subroutine open_file_local(lun,filename,fopened) fopened=.false. tempname=filename fine=index(tempname,' ') -c fine2=index(path,' ')-1 ! AV remove valgrind "Conditional jump or move depends on uninitialised value(s)" if(fine.eq.0) fine=len(tempname) open(unit=lun,file=tempname,status='old',ERR=20) fopened=.true. diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f index c9610a83ed..d79945f299 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/matrix1.f @@ -1,7 +1,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, $ ICOL) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -307,7 +307,7 @@ SUBROUTINE SMATRIX1(P, RHEL, RCOL, CHANNEL, IVEC, ANS, IHEL, REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C -C Generated by MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +C Generated by MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 C By the MadGraph5_aMC@NLO Development Team C Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch C @@ -350,7 +350,8 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C INTEGER I,J,M,N COMPLEX*16 ZTEMP, TMP_JAMP(0) - REAL*8 CF(NCOLOR,NCOLOR) + INTEGER CF(NCOLOR*(NCOLOR+1)/2) + INTEGER DENOM, CF_INDEX COMPLEX*16 AMP(NGRAPHS), JAMP(NCOLOR,NAMPSO) COMPLEX*16 W(6,NWAVEFUNCS) C Needed for v4 models @@ -393,21 +394,24 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) C C COLOR DATA C - DATA (CF(I, 1),I= 1, 2) /5.333333333333333D+00, - $ -6.666666666666666D-01/ + DATA DENOM/3/ + DATA (CF(I),I= 1, 2) /16,-4/ C 1 T(1,2,3,4) - DATA (CF(I, 2),I= 1, 2) /-6.666666666666666D-01 - $ ,5.333333333333333D+00/ + DATA (CF(I),I= 3, 3) /16/ C 1 T(2,1,3,4) C ---------- C BEGIN CODE C ---------- IF (FIRST) THEN FIRST=.FALSE. - IF(ZERO.NE.0D0) FK_ZERO = SIGN(MAX(ABS(ZERO), ABS(ZERO - $ *SMALL_WIDTH_TREATMENT)), ZERO) - IF(MDL_WT.NE.0D0) FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT - $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + FK_ZERO = 0D0 + IF(MDL_WT.NE.0D0) THEN + FK_MDL_WT = SIGN(MAX(ABS(MDL_WT), ABS(MDL_MT + $ *SMALL_WIDTH_TREATMENT)), MDL_WT) + ELSE + FK_MDL_WT = 0D0 + ENDIF + IF(INIT_MODE) THEN ZEROAMP_1(:,:) = .TRUE. @@ -446,10 +450,12 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) MATRIX1 = 0.D0 DO M = 1, NAMPSO + CF_INDEX = 0 DO I = 1, NCOLOR ZTEMP = (0.D0,0.D0) - DO J = 1, NCOLOR - ZTEMP = ZTEMP + CF(J,I)*JAMP(J,M) + DO J = I, NCOLOR + CF_INDEX = CF_INDEX + 1 + ZTEMP = ZTEMP + CF(CF_INDEX)*JAMP(J,M) ENDDO DO N = 1, NAMPSO @@ -458,6 +464,7 @@ REAL*8 FUNCTION MATRIX1(P,NHEL,IC, IHEL,AMP2, JAMP2, IVEC) ENDDO ENDDO ENDDO + MATRIX1 = MATRIX1/DENOM IF(SDE_STRAT.EQ.1)THEN AMP2(1)=AMP2(1)+AMP(1)*DCONJG(AMP(1)) diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/addmothers.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/addmothers.f index 9a31ed201d..6f32477b9e 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/addmothers.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/addmothers.f @@ -21,7 +21,7 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, integer icol ! color selected integer isym(nexternal,99), jsym - integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,nc,ic + integer i,j,k,ida(2),ns,nres,ires,icl,ito2,idenpart,ic integer mo_color,da_color(2),itmp integer ito(-nexternal+3:nexternal),iseed,maxcolor,maxorg integer icolalt(2,-nexternal+2:2*nexternal-3) @@ -113,14 +113,15 @@ subroutine addmothers(ip,jpart,pb,isym,jsym,rscale,aqcd,aqed,buff, endif lconfig = vec_igraph1(ivec) endif - + is_LC=.true. + maxcolor=0 c c Choose a color flow which is certain to work with the propagator c structure of the chosen diagram and use that as an alternative c if (icol.eq.0) then do i=1,nexternal - icolalt(1,i)=0 + icolalt(1,i)=0 icolalt(2,i)=0 enddo else diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cluster.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cluster.f index b8995283ed..907894ea89 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cluster.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cluster.f @@ -556,6 +556,8 @@ logical function cluster(p, ivec) jwin = 0 cluster=.false. clustered=.false. + iwin =0 + jwin =0 do i=0,3 pcmsp(i)=0 enddo @@ -665,8 +667,11 @@ logical function cluster(p, ivec) c initialize graph storage igraphs(0)=0 nleft=nexternal -c cluster - if (iwin.eq.0.or.jwin.eq.0) stop 21 + if(iwin.eq.0.or.jwin.eq.0)then + cluster=.false. 
+ return + endif +c cluster do n=1,nexternal-2 c combine winner imocl(n)=imap(iwin,2)+imap(jwin,2) diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/color_sum.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/color_sum.h new file mode 100644 index 0000000000..71b5b1eaad --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/color_sum.h @@ -0,0 +1,105 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +#ifndef COLOR_SUM_H +#define COLOR_SUM_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "CPPProcess.h" +#include "GpuAbstraction.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp + { + public: + static __device__ inline cxtype_ref + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype_ref( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + static __device__ inline const cxtype + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int ncolor = CPPProcess::ncolor; // the number of leading colors + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last) + //return cxtype( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old" + // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last) + // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS + return cxtype( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1" + // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last) + //return cxtype( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2" + } + }; +#else + class HostAccessJamp + { + public: + static inline cxtype_sv& + kernelAccessIcol( cxtype_sv* buffer, const int icol ) + { + return buffer[icol]; + } + static inline cxtype_sv& + kernelAccessIcol( fptype* buffer, const int icol ) + { + return reinterpret_cast<cxtype_sv*>( buffer )[icol]; + } + }; +#endif + +
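A concrete index example for the "new1" striding may help here (my annotation, assuming ncolor=2 and nevt=4 purely for illustration):
+  // Example with ncolor=2, nevt=4 (annotation, not generated code):
+  //   real part of jamp(icol=1,ievt=2) -> buffer[0*2*4 + 1*4 + 2] = buffer[6]
+  //   imag part of jamp(icol=1,ievt=2) -> buffer[1*2*4 + 1*4 + 2] = buffer[14]
+  // i.e. the ncolor*nevt real parts come first (with ievt fastest), then the
+  // ncolor*nevt imaginary parts: exactly the two ncolor-by-nevt matrices that
+  // the GEMM calls in color_sum_blas address without any reordering.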
//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix(); +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads +#endif + + //-------------------------------------------------------------------------- +} + +#endif // COLOR_SUM_H diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk index 20d8ded718..e7360b29e2 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. 
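(A usage note, mine rather than part of the makefile: the HASBLAS machinery added further down auto-detects the cuBLAS or hipBLAS headers and can be overridden on the command line, e.g. make HASBLAS=hasNoBlas to force the kernel-only color sum; the hasNoBlas choice defines -DMGONGPU_HAS_NO_BLAS, which the color_sum code above checks.)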
#=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -114,7 +114,7 @@ export CXXFLAGS override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) # Set HIP_HOME from the path to hipcc, if it exists -override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) +override HIP_HOME = $(shell hipconfig --rocmpath) # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965) ifeq ($(CUDA_HOME),) @@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda) else ifeq ($(BACKEND),hip) + # example architecture values MI200:gfx90a, MI300X:gfx942 + MADGRAPH_HIP_ARCHITECTURE ?= gfx942 # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists) GPUCC = $(HIP_HOME)/bin/hipcc XCOMPILERFLAG = @@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip) ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY # AMD HIP architecture flags - GPUARCHFLAGS = --offload-arch=gfx90a + GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE} GPUFLAGS += $(GPUARCHFLAGS) # Other AMD-specific flags @@ -477,6 +479,34 @@ endif #------------------------------------------------------------------------------- +#=== Configure defaults and check if user-defined choices exist for HASBLAS + +# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS + +ifeq ($(HASBLAS),) + ifeq ($(GPUCC),) # CPU-only build + override HASBLAS = hasNoBlas + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),) + # cuBLAS headers do not exist?? + override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),) + # hipBLAS headers do not exist??
+ override HASBLAS = hasNoBlas + else + override HASBLAS = hasBlas + endif + else + override HASBLAS = hasNoBlas + endif +endif + +#------------------------------------------------------------------------------- + #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS @@ -597,6 +627,30 @@ endif #$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) #$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS + +$(info HASBLAS=$(HASBLAS)) +override BLASCXXFLAGS= +override BLASLIBFLAGS= + +# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas") +ifeq ($(HASBLAS),hasNoBlas) + override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS +else ifeq ($(HASBLAS),hasBlas) + ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas + endif +else + $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported) +endif +CXXFLAGS += $(BLASCXXFLAGS) +GPUFLAGS += $(BLASCXXFLAGS) + +#$(info BLASCXXFLAGS=$(BLASCXXFLAGS)) +#$(info BLASLIBFLAGS=$(BLASLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure Position-Independent Code @@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX) -gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o +gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o endif @@ -799,7 +853,7 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) - $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS) # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) # $(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs @@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cuts.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cuts.f index 7898714201..bd50ab1357 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cuts.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cuts.f @@ -307,12 +307,18 @@ LOGICAL FUNCTION PASSCUTS(P, VECSIZE_USED) c c Limit S_hat c - if (dsqrt_shat.ne.0d0)then - if (nincoming.eq.2.and.sumdot(p(0,1),p(0,2),1d0) .lt. dsqrt_shat**2) then - passcuts=.false. - return - endif - endif + if(nincoming.eq.2) then + if (dsqrt_shat.ne.0d0.or.dsqrt_shatmax.ne.-1d0)then + xvar = sumdot(p(0,1),p(0,2),1d0) + if (xvar .lt. 
dsqrt_shat**2)then + passcuts=.false. + return + else if (dsqrt_shatmax.ne.-1d0 .and. xvar .gt. dsqrt_shatmax**2)then + passcuts = .false. + return + endif + endif + endif C $B$ DESACTIVATE_CUT $E$ !This is a tag for MadWeight if(debug) write (*,*) '=============================' diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/diagram_boilerplate.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/diagram_boilerplate.h new file mode 100644 index 0000000000..96a34fb1bf --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/diagram_boilerplate.h @@ -0,0 +1,103 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin. + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-local-typedefs" // for CI_ACCESS and CD_ACCESS + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + + //------------- + // GPU only + //------------- + + //using namespace mg5amcGpu; + using W_ACCESS = DeviceAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = DeviceAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( channelIds ); +#endif + + // Wavefunctions + // Buffer wfs for one helicity and nevt events is a DeviceBufferSimple with ( nwf * nevt * nw6 * nx2 ) fptypes + // The striding between the nwf wavefunction buffers is ( nevt * nw6 * nx2 ) fptypes + // Internally diagramXXX methods pass a w_fp[iwf] to ixx/FFV methods (as argument 'fptype wavefunctions[]') + // Internally ixx/FFV methods call 'cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions )' and then use fi[iw6] + // This means that the fi pointer must point to a [RIRIRIRIRIRI] contiguous buffer of size nw6*nx2=12 + // The striding between events is nw6*nx2=12 and this is what W_ACCESS::kernelAccess must respect + // (En passant, note that this means that events cannot be contiguous in the present code, memory is not coalesced) + const int nevt = gridDim.x * blockDim.x; + fptype* w_fp[nwf]; + for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = wfs + iwf * nevt * nw6 * mgOnGpu::nx2; + + // Couplings + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic push +#pragma nv_diag_suppress 186 // e.g. 
<<warning #186-D: pointless comparison of unsigned integer with zero>> + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic pop +#endif + const fptype* COUPs[nxcoup]; + for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; + +#else + + //------------- + // C++ only + //------------- + + //using namespace mg5amcCpu; + using W_ACCESS = HostAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = HostAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current SIMD event page (C++) + unsigned int channelId = *channelIds; +#endif + + // Wavefunctions + // Reinterpret wfs as "cxtype_sv w_sv[nwf][nw6]" and build "fptype* w_fp[nwf]" where "w_fp[iwf] = (fptype*)( w_sv[iwf] )" + fptype (*w_fp)[nw6 * neppV * mgOnGpu::nx2] = (fptype (*)[nw6 * neppV * mgOnGpu::nx2])(wfs); + +#endif + + //------------- + // GPU or C++ + //------------- + + // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) + cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram + fptype* amp_fp; // proof of concept for using fptype* in the interface + amp_fp = reinterpret_cast<fptype*>( amp_sv ); + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) + fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); +#else + assert( channelIds == nullptr ); + assert( numerators == nullptr ); + assert( denominators == nullptr ); +#endif /* clang-format on */ + +#pragma GCC diagnostic pop diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/genps.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/genps.f index 1c32e93f5d..8503bdbec8 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/genps.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/genps.f @@ -1373,6 +1373,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) double precision smin,smax,spole,swidth,s,jac double precision x logical pass + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' c c Local c @@ -1384,6 +1388,10 @@ subroutine gen_s(x,smin,smax,spole,swidth,s,jac,pass) c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1d0)then + smax = min(smax, dsqrt_shatmax**2) + endif + pass=.true. if (jac .eq. 0 .and. .not.
warned0) then print*,'Input jacobian 0 in genps' @@ -1628,7 +1636,10 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) DOUBLE PRECISION ETA,ETAMIN,ETAMAX logical warned data warned/.false./ - + include 'maxparticles.inc' + include '../../Source/vector.inc' + include 'run.inc' + include 'cuts.inc' C------------ C BEGIN CODE C------------ @@ -1645,7 +1656,11 @@ SUBROUTINE GENCMS(S,X1,X2,X,SMIN,SJACOBI) C IF THERE IS NO S CHANNEL POLE USE BELOW: TAUMIN = 0d0 !SMIN/S !keep scale fix - TAUMAX = 1D0 + if (dsqrt_shatmax.ne.-1d0)then + TAUMAX=dsqrt_shatmax**2/S + else + TAUMAX = 1D0 + endif TAU = (TAUMAX-TAUMIN)*X(1)+TAUMIN SJACOBI= sjacobi*(TAUMAX-TAUMIN) @@ -1915,7 +1930,7 @@ double precision function get_channel_cut(p, config) if(sde_strat.eq.2)then t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) - get_channel_cut = get_channel_cut / ((t-Mass)*(t+Mass)+stot*1d-10)**2 + get_channel_cut = get_channel_cut / (t-Mass**2+stot*1d-10)**2 endif c write(*,*) i, "t, Mass, fact", t, Mass, ((t-Mass)*(t+Mass))**2,get_channel_cut t = t/stot @@ -1930,9 +1945,9 @@ double precision function get_channel_cut(p, config) t = dot(ptemp(0,-i), ptemp(0,-i)) Mass = prmass(-i, config) Width = prwidth(-i, config) - tmp = (t-Mass)*(t+Mass) + tmp = (t-Mass**2) tmp2 = Mass*Width - get_channel_cut = get_channel_cut* (tmp**2 - tmp2**2)/(tmp**2 + tmp2**2)**2 + get_channel_cut = get_channel_cut/(tmp**2 + tmp2**2) endif c write(*,*) i, "s, Mass, Width, fact", t, Mass, Width, (((t-Mass)*(t+Mass) )**2 + Width**2*Mass**2), get_channel_cut endif diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/myamp.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/myamp.f index 9e5f8d44dd..5360566ef4 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/myamp.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/myamp.f @@ -231,6 +231,7 @@ subroutine set_peaks double precision x1,x2,xk(nexternal) double precision dr,mtot,etot,xqfact double precision spmass + double precision stot ! technically the min of dsqrt_shatmax**2 and the physical stot integer i, iconfig, l1, l2, j, nt, nbw, iproc, k integer iden_part(-nexternal+1:nexternal) @@ -285,8 +286,8 @@ subroutine set_peaks integer lbw(0:nexternal) !Use of B.W. common /to_BW/ lbw - double precision stot,m1,m2 - common/to_stot/stot,m1,m2 + double precision real_stot,m1,m2 + common/to_stot/real_stot,m1,m2 include 'coupl.inc' !
needs VECSIZE_MEMMAX (defined in vector.inc) include 'cuts.inc' @@ -309,6 +310,12 @@ subroutine set_peaks c----- c Begin Code c----- + if (dsqrt_shatmax.ne.-1)then + stot = min(real_stot, dsqrt_shatmax**2) + else + stot = real_stot + endif + iconfig = this_config c needs to be initialise to avoid segfault do i = -nexternal,-1 diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/reweight.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/reweight.f index 0a0bafa7c1..189ba41449 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/reweight.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/reweight.f @@ -1186,7 +1186,7 @@ logical function setclscales(p, keepq2bck, ivec) if(nexternal.gt.3) pt2ijcl(nexternal-3)=q2fact(2) else if(.not.fixed_fac_scale1) q2fact(1)=scalefact**2*pt2ijcl(nexternal-2) - if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*q2fact(1) + if(.not.fixed_fac_scale2) q2fact(2)=scalefact**2*pt2ijcl(nexternal-2) endif elseif(jcentral(1).eq.0)then if(.not.fixed_fac_scale1) q2fact(1) = scalefact**2*pt2ijcl(jfirst(1)) diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/runTest.cc b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/runTest.cc index 4eec5db13c..216a90a302 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -22,6 +22,8 @@ #endif #include "epoch_process_id.h" +#include + #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/symmetry.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/symmetry.f index 309540a0a2..2afd9e9f75 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/symmetry.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/symmetry.f @@ -232,7 +232,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, c write(*,*) 'mapping',ic,mapconfig(i),icode if (icode .eq. 
0) then c Create format string based on number of digits - write(formstr,'(a,i1,a)') '(I',nconf,'$)' + write(formstr,'(a,i1,a)') '(I',nconf,',$)' write(*,formstr) mapconfig(i) c Write symmetry factors write(formstr2,'(a,i2,a)') '(2i',nsym,')' @@ -242,10 +242,10 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode if(nconf+ncode+1.lt.10) then write(formstr,'(a,i1,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' else write(formstr,'(a,i2,a,i1,a)') '(F',nconf+ncode+1, - $ '.',ncode,'$)' + $ '.',ncode,',$)' endif write(*,formstr) dconfig c Write symmetry factors @@ -260,7 +260,7 @@ subroutine write_bash(mapconfig,use_config, prwidth, jcomp,iforest, dconfig=mapconfig(i)+icode*1d0/10**ncode write(27,formstr2) dconfig,use_config(i) endif - write(*,'(a$)') ' ' + write(*,'(a,$)') ' ' 100 call bw_increment_array(iarray,imax,ibase,done) enddo else diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/unwgt.f b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/unwgt.f index f602511c94..d1247f1849 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/unwgt.f +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/unwgt.f @@ -497,6 +497,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer ip, np, ic, nc integer ida(2),ito(-nexternal+3:nexternal),ns,nres,ires,icloop integer iseed + double precision beam_mass double precision pboost(0:3) double precision beta, get_betaz double precision ebi(0:3), ebo(0:3) @@ -506,7 +507,7 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) integer idup(nexternal,maxproc,maxsproc) integer mothup(2,nexternal) integer icolup(2,nexternal,maxflow,maxsproc) - + double precision eta integer nsym integer ievent @@ -638,21 +639,20 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) if (nincoming.eq.2) then if (xbk(1) .gt. 0d0 .and. xbk(1) .le. 1d0 .and. $ xbk(2) .gt. 0d0 .and. xbk(2) .le. 1d0) then - if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0).and.xbk(2).ne.1d0) then - ! construct the beam momenta in each frame and compute the related (z)boost - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4).and.ebeam(1).gt.10d0*m1)then - local_mass = 0d0 - else - local_mass = m1 - endif + if(lpp(2).ne.0.and.(xbk(1).eq.1d0.or.pmass(1).eq.0d0)) then + if((abs(lpp(1)).gt.2.and.abs(lpp(1)).ne.9).or.xbk(1).eq.1d0)then + beam_mass = pmass(1) + else + beam_mass = m1 + endif ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(1) ebo(1) = 0 ebo(2) = 0 - ebo(3) = DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(1).eq.1d0) then pb(0,isym(1,jsym)) = ebo(0) @@ -668,20 +668,19 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo else - if (pmass(1).eq.0d0.and.(abs(lpp(1)).eq.3.or.abs(lpp(1)).eq.4.and.ebeam(2).gt.10d0*m2))then - local_mass = 0d0 - else - local_mass = m2 - endif - ebi(0) = p(0,1)/xbk(1) ! this assumes that particle 1 is massless or mass equal to beam + if((abs(lpp(2)).gt.2.and.abs(lpp(2)).ne.9).or.xbk(2).eq.1d0)then + beam_mass = pmass(2) + else + beam_mass = m2 + endif ebi(0) = p(0,2)/xbk(2) ! 
this assumes that particle 2 is massless or mass equal to beam ebi(1) = 0 ebi(2) = 0 - ebi(3) = -1d0*DSQRT(ebi(0)**2-local_mass**2) + ebi(3) = -1d0*DSQRT(ebi(0)**2-beam_mass**2) ebo(0) = ebeam(2) ebo(1) = 0 ebo(2) = 0 - ebo(3) = -1d0*DSQRT(ebo(0)**2-local_mass**2) + ebo(3) = -1d0*DSQRT(ebo(0)**2-beam_mass**2) beta = get_betaz(ebi, ebo) if (xbk(2).eq.1d0) then pb(0,isym(2,jsym)) = ebo(0) @@ -701,6 +700,21 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) write(*,*) 'Warning bad x1 or x2 in write_leshouche', $ xbk(1),xbk(2) endif + do j=1,nexternal + call zboost_with_beta(p(0,j),beta,pb(0,isym(j,jsym))) + pb(4,isym(j,jsym))=pmass(j) + enddo + + ! check for numerical_accuracy + if (pb(0,1).gt.ebeam(1).or.pb(0,2).gt.ebeam(2))then + ! go back to old method --more accurate when boosting with xbk close to one-- + eta = sqrt(xbk(1)*ebeam(1)/(xbk(2)*ebeam(2))) + pboost(0)=p(0,1)*(eta + 1d0/eta) + pboost(3)=p(0,1)*(eta - 1d0/eta) + do j=1,nexternal + call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) + enddo + endif else do j=1,nexternal call boostx(p(0,j),pboost,pb(0,isym(j,jsym))) @@ -709,6 +723,8 @@ SUBROUTINE write_leshouche(p,wgt,numproc,do_write_events, ihel, icol, ivec) enddo endif + + if (IMIRROR.eq.2.and.pmass(1).ne.pmass(2)) then c Note that in this context isym(1,jsym) should never be "2" since the mass differ pb(4,isym(1,jsym))=pmass(2) diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/Gridpack/gridrun b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/Gridpack/gridrun index 8c8f7d3940..01d4ab53f5 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/Gridpack/gridrun +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/Gridpack/gridrun @@ -91,7 +91,7 @@ import internal.madevent_interface as cmd_interface try: - cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2]) + cmd_line = cmd_interface.GridPackCmd(me_dir=root_path, nb_event=args[0], seed=args[1], gran=args[2], nprocs=args[3], maxevts=args[4]) except KeyboardInterrupt: print('Quit on KeyboardInterrupt') diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/Gridpack/run.sh b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/Gridpack/run.sh index 20adf572c2..2d149f96be 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/Gridpack/run.sh +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/Gridpack/run.sh @@ -14,6 +14,18 @@ # USAGE : run [num_events] [iseed] ## ############################################################################# +function usage() { + local retcode="${1:-1}" # default return code is 1 + echo "Usage:" + echo " run.sh [options] [num events] [seed]" + echo " run.sh [options] [num events] [seed] [granularity]" + echo "Options:" + echo " -h, --help print this message and exit" + echo " -p, --parallel [num procs] number of processes to run in parallel" + echo " -m, --maxevts [num events] maximum number of unweighted events per job" + exit $retcode +} + if [[ -d ./madevent ]]; then DIR='./madevent' else @@ -32,23 +44,46 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib # For Mac OS X export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:${PWD}/madevent/lib:${PWD}/HELAS/lib +pos_args=() +nprocs=1 +maxevts=2500 -if [[ ($1 != "") && ("$2" != "") && ("$3" == "") ]]; then - num_events=$1 - seed=$2 - gran=1 -elif [[ ($1 != "") && ("$2" != "") && ("$3" != "") ]]; then - num_events=$1 - seed=$2 - gran=$3 -else - echo "Warning: input is not correct. 
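The write_leshouche changes above replace the ad-hoc local_mass logic with a single beam_mass choice, boost all external momenta with zboost_with_beta, and then add a numerical-accuracy guard: if a boosted energy exceeds the corresponding beam energy, the event is re-boosted with the old eta-based method, which behaves better when a Bjorken x is close to one. A rough Python sketch of that guard (the helper names and the sign convention of the boost are illustrative assumptions, not taken from the patch):

```python
import math

def zboost(p, beta):
    # longitudinal boost of p = (E, px, py, pz) into a frame moving with velocity beta
    gamma = 1.0 / math.sqrt(1.0 - beta * beta)
    e, px, py, pz = p
    return (gamma * (e - beta * pz), px, py, gamma * (pz - beta * e))

def lab_momenta(p, pb, xbk, ebeam):
    # pb: momenta already boosted with the beam beta (zboost_with_beta in the patch)
    if pb[0][0] > ebeam[0] or pb[1][0] > ebeam[1]:
        # numerical accuracy lost: redo the boost the old way
        eta = math.sqrt(xbk[0] * ebeam[0] / (xbk[1] * ebeam[1]))
        beta = (eta - 1.0 / eta) / (eta + 1.0 / eta)  # velocity of the eta-built boost vector
        return [zboost(pi, -beta) for pi in p]
    return pb
```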
script requires two arguments: NB_EVENT SEED" -fi +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + usage 0 ;; + -p|--parallel) + nprocs="$2" && shift && shift ;; + -m|--maxevts) + maxevts="$2" && shift && shift ;; + -*) + echo "Error: Unknown option $1" && usage ;; + *) + pos_args+=("$1") && shift ;; + esac +done + +case `echo "${pos_args[@]}" | wc -w | tr -d " "` in + "2") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=1 + ;; + "3") + num_events=${pos_args[0]} + seed=${pos_args[1]} + gran=${pos_args[2]} + ;; + *) + echo "Error: number of arguments is not correct" + usage + ;; +esac -echo "Now generating $num_events events with random seed $seed and granularity $gran" +echo "Now generating $num_events events with random seed $seed and granularity $gran using $nprocs processes" ############ RUN THE PYTHON CODE ##################### -${DIR}/bin/gridrun $num_events $seed $gran +${DIR}/bin/gridrun $num_events $seed $gran $nprocs $maxevts ######################################################## ########### POSTPROCESSING ##################### diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/banner.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/banner.py index 42d82818d0..2efb5954a6 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/banner.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/banner.py @@ -353,7 +353,7 @@ def modify_init_cross(self, cross, allow_zero=False): assert "init" in self cross = dict(cross) - for key in cross.keys(): + for key in list(cross.keys()): if isinstance(key, str) and key.isdigit() and int(key) not in cross: cross[int(key)] = cross[key] @@ -1991,6 +1991,11 @@ def default_setup(self): self.add_param("PartonLevel:FSRinResonances", True, hidden=True, always_write_to_card=False, comment="Do not allow shower to run from decay product of unstable particle") self.add_param("ProcessLevel:resonanceDecays", True, hidden=True, always_write_to_card=False, comment="Do not allow unstable particle to decay.") + # Parameters only needed for main164 type of run (not pythia8/MG5 interface) + self.add_param("Main:HepMC", True, hidden=True, always_write_to_card=False, + comment="""Specify the type of output to be used by the main164 run. """) + self.add_param("HepMC:output", 'hepmc.gz', hidden=True, always_write_to_card=False, + comment="Specify the HepMC output file to be used by the main164 run.") # Add parameters controlling the subruns execution flow. # These parameters should not be part of PY8SubRun daughter. self.add_default_subruns('parameters') @@ -2087,8 +2092,10 @@ def MadGraphSet(self, name, value, **opts): force = False if name.lower() not in self or (force or name.lower() not in self.user_set): self.__setitem__(name, value, change_userdefine=False, **opts) - self.system_set.add(name.lower()) - + self.system_set.add(name.lower()) + else: + raise Exception("The parameter %s is already set to %s. You can not change it." 
% (name, self[name])) + def defaultSet(self, name, value, **opts): self.__setitem__(name, value, change_userdefine=False, **opts) @@ -2144,9 +2151,19 @@ def pythia8_formatting(value, formatv=None): else: return ','.join([PY8Card.pythia8_formatting(arg) for arg in value]) + #change of name convention between MG5 old interface and main164 from Pythia8 + interface_to_164 = {'HEPMCoutput:file': 'HepMC:output', + 'SysCalc:fullCutVariation': '!SysCalc:fullCutVariation (not supported with 164)', + 'SysCalc:qCutList': '!SysCalc:qCutList (not supported with 164)', + 'SysCalc:qWeed': '!SysCalc:qWeed (not supported with 164)', + 'SysCalc:tmsList': '!SysCalc:tmsList (not supported with 164)', + 'HEPMCoutput:scaling' : '!HEPMCoutput :scaling (not supported with 164)', + 'LHEFInputs:nSubruns' : 'Main:numberOfSubruns'} + def write(self, output_file, template, read_subrun=False, - print_only_visible=False, direct_pythia_input=False, add_missing=True): + print_only_visible=False, direct_pythia_input=False, add_missing=True, + use_mg5amc_py8_interface=False): """ Write the card to output_file using a specific template. > 'print_only_visible' specifies whether or not the hidden parameters should be written out if they are in the hidden_params_to_always_write @@ -2155,7 +2172,12 @@ def write(self, output_file, template, read_subrun=False, in the self.visible_params_to_always_write list and are not user_set or system_set are commented. > If 'add_missing' is False then parameters that should be written_out but are absent - from the template will not be written out.""" + from the template will not be written out. + > use_mg5amc_py8_interface is a flag to indicate that the MG5aMC-PY8 interface is used or not + if not used some parameters need to be translated from the old convention to the new one + """ + + self.use_mg5amc_py8_interface = use_mg5amc_py8_interface # First list the visible parameters visible_param = [p for p in self if p.lower() not in self.hidden_param @@ -2297,7 +2319,16 @@ def group_params(params): else: # Just copy parameters which don't need to be specified if param.lower() not in self.params_to_never_write: - output.write(line) + + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param.strip()] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + output.write('%s=%s\n'%(param_entry,new_value)) + else: + output.write(line) else: output.write('! The following parameter was forced to be commented out by MG5aMC.\n') output.write('! 
%s'%line) @@ -2313,6 +2344,7 @@ def group_params(params): if ((not direct_pythia_input) or (param.lower() in self.visible_params_to_always_write) or (param.lower() in self.user_set) or + (param.lower() in self.hidden_params_to_always_write) or (param.lower() in self.system_set)): template = '%s=%s' else: @@ -2321,6 +2353,19 @@ def group_params(params): # then they shouldn't be passed to Pythia template = '!%s=%s' + if not use_mg5amc_py8_interface and direct_pythia_input and \ + param in self.interface_to_164: + param_entry = self.interface_to_164[param] + # special case for HepMC needs two flags + if 'HepMC:output' == param_entry: + output.write(' %s=%s\n'%('Main:HepMC', 'on')) + if 'Main:InternalAnalysis'.lower() in self.user_set and \ + self['Main:InternalAnalysis'].lower() == 'on': + output.write('InternalAnalysis:output = ./djrs.dat\n') + + #elif param in self.interface_to_164.values() and not direct_pythia_input: + # misc.sprint(use_mg5amc_py8_interface, direct_pythia_input,param) + # raise Exception('The parameter %s is not supported in the MG5aMC-PY8 interface. Please use the new interface.'%param_entry output.write(template%(param_entry, value_entry.replace(value,new_value))) @@ -2365,6 +2410,8 @@ def group_params(params): comment = '\n'.join('! %s'%c for c in self.comments[param.lower()].split('\n')) output.write(comment+'\n') + if not use_mg5amc_py8_interface and param in self.interface_to_164: + continue output.write('%s=%s\n'%(param,PY8Card.pythia8_formatting(self[param]))) # Don't close the file if we were reading a subrun, but simply write @@ -3318,7 +3365,6 @@ def retro_compatible_custom_fct(lines, mode=None): for i,line in enumerate(lines[:]): if search and re.search(include_pat, line): name = re.findall(include_pat, line)[0] - misc.sprint('DETECTED INCLUDE', name) if 'vector.inc' in name: search = False if 'run.inc' in name: @@ -3326,7 +3372,6 @@ def retro_compatible_custom_fct(lines, mode=None): search = False sol.append(line) if re.search(function_pat, line): - misc.sprint("DETECTED FCT") search = True return sol @@ -4050,8 +4095,8 @@ def post_set_fixed_fac_scale(card, value, change_userdefine, raiseerror, **opt): if 'fixed_fac_scale2' in card.user_set: card.user_set.remove('fixed_fac_scale2') - # #card['pdlabel1'] = value - # #card['pdlabel2'] = value + dict.__setitem__(card, 'fixed_fac_scale1', card['fixed_fac_scale']) + dict.__setitem__(card, 'fixed_fac_scale2', card['fixed_fac_scale']) @staticmethod def post_set(card, value, change_userdefine, raiseerror, name='unknown', **opt): @@ -4201,6 +4246,7 @@ def default_setup(self): self.add_param("bwcutoff", 15.0) self.add_param("cut_decays", False, cut='d') self.add_param('dsqrt_shat',0., cut=True) + self.add_param('dsqrt_shatmax', -1, cut=True) self.add_param("nhel", 0, include=False) self.add_param("limhel", 1e-8, hidden=True, comment="threshold to determine if an helicity contributes when not MC over helicity.") #pt cut @@ -4451,11 +4497,11 @@ def check_validity(self): time.sleep(5) if self['drjj'] != 0: if 'drjj' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjj\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjj\' to 0') self['drjj'] = 0 if self['drjl'] != 0: if 'drjl' in self.user_set: - logger.warning('Since icckw>0, changing the value of \'drjl\' to 0') + logger.warning('Since ickkw>0, changing the value of \'drjl\' to 0') self['drjl'] = 0 if not self['auto_ptj_mjj']: if self['mmjj'] > self['xqcut']: @@ -4753,7 +4799,6 @@ def create_default_for_process(self, 
proc_characteristic, history, proc_def): self['fixed_fac_scale1'] = True self['nhel'] = 1 for i in beam_id_split[1]: - exit if abs(i) == 11: self['lpp1'] = -math.copysign(3,i) self['lpp2'] = math.copysign(3,i) @@ -5577,6 +5622,9 @@ def default_setup(self): #technical self.add_param('folding', [1,1,1], include=False) + + #bias + self.add_param('flavour_bias',[5,1], hidden=True, comment="Example: '5,100' means that the probability to generate an event with a bottom (or anti-bottom) quark is increased by a factor 100, but the weight of those events is reduced by a factor 100. Requires that the 'event_norm' is set to 'bias'.") #merging self.add_param('ickkw', 0, allowed=[-1,0,3,4], comment=" - 0: No merging\n - 3: FxFx Merging : http://amcatnlo.cern.ch/FxFx_merging.htm\n - 4: UNLOPS merging (No interface within MG5aMC)\n - -1: NNLL+NLO jet-veto computation. See arxiv:1412.8408 [hep-ph]") @@ -5790,6 +5838,17 @@ def check_validity(self): if self['mcatnlo_delta'] and not self['parton_shower'].lower() == 'pythia8': raise InvalidRunCard("MC@NLO-DELTA only possible with matching to Pythia8") + # check that the flavour_bias is consistent + if len(self['flavour_bias']) != 2: + raise InvalidRunCard("'flavour_bias' should contain exactly two numbers: the abs(PDG) of the flavour to enhance, and the enhancement multiplication factor.") + for i in self['flavour_bias']: + if i < 0: + raise InvalidRunCard("flavour and multiplication factor should be positive in the flavour_bias parameter") + if self['flavour_bias'][1] != 1 and self['event_norm'] != 'bias': + logger.warning('Non-trivial flavour enhancement factor: setting event normalisation to "bias"') + self['event_norm']='bias' + + # check that ebeam is bigger than the proton mass. for i in [1,2]: # do not for proton mass if not proton PDF (or when scan initialization) diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/check_param_card.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/check_param_card.py index bc785b5de6..a34705f6bc 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/check_param_card.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/check_param_card.py @@ -1092,11 +1092,11 @@ def write_summary(self, path, order=None, lastline=False, nbcol=20): to_print = self.cross[-1:] for info in to_print: name = info['run_name'] - bench = info['bench'] + bench = [float(x) for x in info['bench']] data = [] for k in keys: if k in info: - data.append(info[k]) + data.append(float(info[k])) else: data.append(0.) 
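The new flavour_bias run-card parameter above comes with a consistency check: it must hold exactly two non-negative numbers, abs(PDG) and an enhancement factor, and any non-trivial factor forces event_norm to 'bias' so that the correspondingly reduced event weights stay meaningful. A compact Python restatement of that check (standalone function, names illustrative):

```python
def check_flavour_bias(flavour_bias, event_norm):
    if len(flavour_bias) != 2:
        raise ValueError("'flavour_bias' needs two numbers: abs(PDG) and the enhancement factor")
    if any(x < 0 for x in flavour_bias):
        raise ValueError("flavour and multiplication factor must be positive")
    if flavour_bias[1] != 1 and event_norm != 'bias':
        return 'bias'  # caller resets event_norm, with a warning
    return event_norm
```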
ff.write(formatting % tuple([name] + bench + data)) diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/cluster.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/cluster.py index 95ef45b5f3..66069293d2 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/cluster.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/cluster.py @@ -602,6 +602,7 @@ def __init__(self, *args, **opt): self.submitted = six.moves.queue.Queue() # one entry by job submitted self.stoprequest = threading.Event() #flag to ensure everything to close self.demons = [] + self.gpus_list = [] self.nb_done =0 if 'nb_core' in opt: self.nb_core = opt['nb_core'] @@ -623,23 +624,46 @@ def __init__(self, *args, **opt): self.done_pid_queue = six.moves.queue.Queue() self.fail_msg = None + mg5_gpu_env_str = 'MG5_GPU_VISIBLE_DEVICES' + gpu_variables = [['NVIDIA_VISIBLE_DEVICES', 'CUDA_VISIBLE_DEVICES'], + ['ROCR_VISIBLE_DEVICES', 'HIP_VISIBLE_DEVICES'],] + if mg5_gpu_env_str in os.environ: + new_var = os.environ[mg5_gpu_env_str].split(',') + if len(new_var) == 2: + gpu_variables.insert(0, new_var) + else: + logger.error('Invalid format for %s=%s, it should be a comma-separated list of two elements' % (mg5_gpu_env_str, os.environ[mg5_gpu_env_str])) + for get_var,set_var in gpu_variables: + if get_var in os.environ: + self.gpus_list = os.environ.get(get_var).split(',') + self.gpu_set_var = set_var + self.gpus_count = len(self.gpus_list) + logger.info('Found %s GPUs: %s' % (self.gpus_count, self.gpus_list)) def start_demon(self): import threading - t = threading.Thread(target=self.worker) + env2 = None + if len(self.gpus_list): + env2 = os.environ.copy() + this_gpu_idx = len(self.demons) % self.gpus_count + env2[self.gpu_set_var] = self.gpus_list[this_gpu_idx] + t = threading.Thread(target=self.worker, kwargs={'env2': env2}) + else: + t = threading.Thread(target=self.worker) t.daemon = True t.start() self.demons.append(t) - def worker(self): + def worker(self, env2=None): import six.moves.queue import six.moves._thread while not self.stoprequest.isSet(): try: args = self.queue.get(timeout=10) tag, exe, arg, opt = args + opt['env'] = env2 try: # check for executable case if isinstance(exe,str): diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/common_run_interface.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/common_run_interface.py index 9ff7390cf5..69291df0d4 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/common_run_interface.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/common_run_interface.py @@ -750,8 +750,8 @@ def __init__(self, me_dir, options, *args, **opts): else: self.ninitial = self.proc_characteristics['ninitial'] - def make_make_all_html_results(self, folder_names = [], jobs=[]): - return sum_html.make_all_html_results(self, folder_names, jobs) + def make_make_all_html_results(self, folder_names = [], jobs=[], get_attr=None): + return sum_html.make_all_html_results(self, folder_names, jobs, get_attr) def write_RunWeb(self, me_dir): @@ -1463,11 +1463,15 @@ def create_plot(self, mode='parton', event_path=None, output=None, tag=None): self.run_name, '%s_pts.dat' % tag) for observable_name, data_path in [('djr',djr_path), ('pt',pt_path)]: - if not self.generate_Pythia8_HwU_plots( + try: + if not self.generate_Pythia8_HwU_plots( PY8_plots_root_path, merging_scale_name, observable_name,data_path): - return False - + return False + except Exception as error: + if os.path.exists(data_path): + logger.info('plot information present in %s' % data_path) + return True if mode == 'Pythia8': plot_files = 
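The cluster.py changes above teach the thread-based MultiCore runner about GPUs: the visible-device list is read from MG5_GPU_VISIBLE_DEVICES (or from the usual NVIDIA/CUDA and ROCR/HIP variable pairs), and each worker thread gets a private environment copy pinning it to one device, round-robin. A Python sketch of that assignment, where worker_loop stands in for the existing queue-consuming loop:

```python
import os
import threading

def worker_loop(env2=None):
    pass  # placeholder for the existing job-consuming loop; env2 is handed to subprocess calls

def start_workers(nb_core, gpus_list, set_var):
    workers = []
    for i in range(nb_core):
        env2 = None
        if gpus_list:
            env2 = os.environ.copy()
            env2[set_var] = gpus_list[i % len(gpus_list)]  # pin this worker to one GPU
        t = threading.Thread(target=worker_loop, kwargs={'env2': env2}, daemon=True)
        t.start()
        workers.append(t)
    return workers
```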
glob.glob(pjoin(PY8_plots_root_path,'*.gnuplot')) if not misc.which('gnuplot'): @@ -1964,12 +1968,16 @@ def do_systematics(self, line): self.cluster.wait(os.path.dirname(output), update_status, update_first=update_status) except Exception: self.cluster.remove() + for i in range(nb_submit): + os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) old_run_mode = self.options['run_mode'] self.options['run_mode'] =0 + out =False try: out = self.do_systematics(line) finally: self.options['run_mode'] = old_run_mode + return out #collect the data all_cross = [] for i in range(nb_submit): @@ -1995,18 +2003,21 @@ def do_systematics(self, line): self.run_card['event_norm'] in ['unity']: all_cross= [cross/nb_event for cross in all_cross] - sys_obj = systematics.call_systematics([input, None] + opts, - log=lambda x: logger.info(str(x)), - result=result_file, - running=False - ) + + sys_obj = systematics.call_systematics([input, None] + opts, + log=lambda x: logger.info(str(x)), + result=result_file, + running=False + ) + sys_obj.print_cross_sections(all_cross, nb_event, result_file) - + #concatenate the output file subprocess.call(['cat']+\ ['./tmp_%s_%s' % (i, os.path.basename(output)) for i in range(nb_submit)], stdout=open(output,'w'), cwd=os.path.dirname(output)) + for i in range(nb_submit): os.remove('%s/tmp_%s_%s' %(os.path.dirname(output),i,os.path.basename(output))) # os.remove('%s/log_sys_%s.txt' % (os.path.dirname(output),i)) @@ -3831,7 +3842,7 @@ def store_scan_result(self): """return the information that need to be kept for the scan summary. Auto-width are automatically added.""" - return {'cross': self.results.current['cross']} + return {'cross': self.results.current['cross'], 'error': self.results.current['error']} def add_error_log_in_html(self, errortype=None): @@ -5135,10 +5146,10 @@ def init_run(self, cards): self.special_shortcut.update( {'ebeam':([float],['run_card ebeam1 %(0)s', 'run_card ebeam2 %(0)s']), 'lpp': ([int],['run_card lpp1 %(0)s', 'run_card lpp2 %(0)s' ]), - 'lhc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lhc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'lep': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), 'ilc': ([int],['run_card lpp1 0', 'run_card lpp2 0', 'run_card ebeam1 %(0)s/2', 'run_card ebeam2 %(0)s/2']), - 'lcc': ([int],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), + 'lcc': ([float],['run_card lpp1 1', 'run_card lpp2 1', 'run_card ebeam1 %(0)s*1000/2', 'run_card ebeam2 %(0)s*1000/2']), 'fixed_scale': ([float],['run_card fixed_fac_scale T', 'run_card fixed_ren_scale T', 'run_card scale %(0)s', 'run_card dsqrt_q2fact1 %(0)s' ,'run_card dsqrt_q2fact2 %(0)s']), 'no_parton_cut':([],['run_card nocut T']), 'cm_velocity':([float], [lambda self :self.set_CM_velocity]), @@ -6740,7 +6751,15 @@ def postcmd(self, stop, line): return ending_question - + def help_update(self): + logger.info(""" syntax: update dependent: Change the mass/width of particles which are not free parameter for the model. + update missing: add to the current param_card missing blocks/parameters. + update to_slha1: pass SLHA2 card to SLHA1 convention. (beta) + update to_slha2: pass SLHA1 card to SLHA2 convention. 
(beta) + update to_full [run_card] + update XXX [where XXX correspond to a hidden block of the run_card]: + supported block are %s + """, ', '.join(self.update_block)) def do_update(self, line, timer=0): @@ -6756,6 +6775,8 @@ def do_update(self, line, timer=0): logger.warning('miss an argument (dependent or missing). Please retry') return + args[0] = args[0].lower() + if args[0] == 'dependent': if not self.mother_interface: logger.warning('Failed to update dependent parameter. This might create trouble for external program (like MadSpin/shower/...)') @@ -6805,10 +6826,11 @@ def do_update(self, line, timer=0): self.modified_card.add('run') # delay writting of the run_card logger.info('add optional block %s to the run_card', args[0]) else: - self.help_update() + self.do_help('update') logger.warning('unvalid options for update command. Please retry') + def update_to_full(self, line): """ trigger via update to_full LINE""" diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/extended_cmd.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/extended_cmd.py index 789976beee..c321fd88e5 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/extended_cmd.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/extended_cmd.py @@ -1317,6 +1317,8 @@ def nice_error_handling(self, error, line): debug_file = open(self.debug_output, 'a') traceback.print_exc(file=debug_file) + if __debug__: + traceback.print_exc() if hasattr(error, 'filename'): debug_file.write("Related File: %s\n" % error.filename) # Create a nice error output @@ -1928,7 +1930,8 @@ def do_display(self, line, output=sys.stdout): for i, name in enumerate(split): try: __import__('.'.join(split[:i+1])) - exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1]))) + tmp = {} + exec('%s=sys.modules[\'%s\']' % (split[i], '.'.join(split[:i+1])), globals(),tmp) except ImportError: try: var = eval(args[1]) @@ -1939,7 +1942,7 @@ def do_display(self, line, output=sys.stdout): outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) else: - var = eval(args[1]) + var = eval(args[1], globals(), tmp) outstr += 'EXTERNAL:\n' outstr += misc.nice_representation(var, nb_space=4) diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/file_writers.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/file_writers.py index 526756129f..74ba0d195c 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/file_writers.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/file_writers.py @@ -140,10 +140,6 @@ def preprocess_template(self, input_lines, context={}): else: raise self.FileWriterError("%s not string" % repr(input_lines)) - # Setup the contextual environment - for contextual_variable, value in context.items(): - exec('%s=%s'%(str(contextual_variable),repr(value))) - res = [] # The variable below tracks the conditional statements structure if_stack = [] @@ -166,7 +162,7 @@ def preprocess_template(self, input_lines, context={}): # Treat an if statement elif preproc_command.group('command')=='if': try: - if_stack.append(eval(preproc_command.group('body'))==True) + if_stack.append(eval(preproc_command.group('body'), globals(), context)==True) except Exception as e: raise self.FilePreProcessingError('Could not evaluate'+\ "python expression '%s' given the context %s provided."%\ diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/files.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/files.py index 551b71ddb6..3061b007e7 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/files.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/files.py 
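The files.py hunk just below makes cp() tolerate copying a file onto itself (shutil raises for identical source and destination) and logs both paths on failure. A Python sketch of the intended behaviour; printf-style logging arguments are used here because logger.warning("fail to cp", path1, path2, why) as written in the patch would not format them:

```python
import logging
import shutil

logger = logging.getLogger(__name__)

def cp(path1, path2):
    try:
        shutil.copy(path1, path2)
    except shutil.Error as why:
        # raised e.g. when path1 and path2 are the same file: treat as a no-op
        logger.debug('no cp since identical: %s', why)
    except IOError as why:
        if 'same file' in str(why):
            return
        logger.warning('fail to cp %s %s: %s', path1, path2, why)
```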
@@ -147,9 +147,14 @@ def cp(path1, path2, log=True, error=False): path2 = format_path(path2) try: shutil.copy(path1, path2) + except shutil.Error as why: + logger.debug('no cp since identical: %s', why) + return except IOError as why: import madgraph.various.misc as misc try: + if 'same file' in str(why): + return if os.path.exists(path2): path2 = os.path.join(path2, os.path.split(path1)[1]) misc.copytree(path1, path2) @@ -157,12 +162,10 @@ def cp(path1, path2, log=True, error=False): if error: raise if log: - logger.warning(why) + logger.warning("fail to cp", path1, path2, why) else: - misc.sprint("fail to cp", why) - except shutil.Error: - # idetical file - pass + misc.sprint("fail to cp",path1,path2, why) + def rm(path, log=True): """removes path, that can be a single element or a list""" diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_cardhtml-pl b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_cardhtml-pl index 1810c6c082..6e0e06533d 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_cardhtml-pl +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_cardhtml-pl @@ -137,7 +137,7 @@ until($listpos>$#incard){ print PAGE " Model: $model \n"; print PAGE " \n \n
\n"; print PAGE " \n"; - print PAGE "\"\" \n"; + print PAGE "\"\" \n"; print PAGE "
\n"; print PAGE " \n \n \n"; print PAGE " \n"; diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_crossxhtml.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_crossxhtml.py index 681bf9d09b..3114a4350c 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_crossxhtml.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_crossxhtml.py @@ -133,7 +133,7 @@ class AllResults(dict): web = False - _run_entries = ['cross', 'error','nb_event_pythia','run_mode','run_statistics', + _run_entries = ['cross', 'error','axsec','nb_event_pythia','run_mode','run_statistics', 'nb_event','cross_pythia','error_pythia', 'nb_event_pythia8','cross_pythia8','error_pythia8', 'shower_dir'] diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_jpeg-pl b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_jpeg-pl index 87d03da394..31b7e9fe55 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_jpeg-pl +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_jpeg-pl @@ -1,16 +1,16 @@ #!/usr/bin/perl -w #--------------------------------------------------------------------- -# Run GS to create jpeg files defined as $gs +# Run GS to create PNG files defined as $gs #--------------------------------------------------------------------- -system("/bin/bash -c \"rm -f matrix*.jpg\" "); +system("/bin/bash -c \"rm -f matrix*.png\" "); $imatrix = ""; if (! -e "matrix.ps") {$imatrix = 1;} -$max_jpg = 2; -if ($imatrix eq "") {$max_jpg = 5;} -# add 1 to max_jpg, to get max_jpg pages -$max_jpg += 1; +$max_png = 2; +if ($imatrix eq "") {$max_png = 5;} +# add 1 to max_png, to get max_png pages +$max_png += 1; open(PAGE,"> diagrams.html") || die "Error creating diagrams.html"; print PAGE "\ \n"; print PAGE "\ \n"; @@ -21,22 +21,22 @@ while ( -e "matrix$imatrix.ps"){ open(IN, "< matrix$imatrix.ps") || die "No file matrix$imatrix.ps"; open(OUT, "> matrix-1.ps") || die "Could not open file matrix-1.ps"; while () { - if ($_ =~ m/^%%Page: $max_jpg $max_jpg/) {last;} + if ($_ =~ m/^%%Page: $max_png $max_png/) {last;} else {print OUT $_, "\n";} } close(OUT); close(IN); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=matrix$imatrix\%00d.jpg \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-r150 \-sOutputFile\=matrix$imatrix\%00d.png \-q \-dNOPAUSE \-dBATCH matrix-1.ps > /dev/null\""; system "rm -f matrix-1.ps"; -# Determine how many jpg files we have +# Determine how many png files we have $pages=1; - while(-e "matrix$imatrix$pages.jpg"){ + while(-e "matrix$imatrix$pages.png"){ $pages++; }#end of while #reduce it by one - if ($pages > $max_jpg){ + if ($pages > $max_png){ $pages -= 1; } # Find name of process @@ -45,24 +45,24 @@ while ( -e "matrix$imatrix.ps"){ if ($proc =~ /Process: (.+?)(\s\w+=\d+)*$/) { $proc = $1; } print PAGE "

To save bandwidth not all diagrams were converted to jpeg."; + if (-e "matrix$imatrix$max_png.png" ) { + print PAGE "

To save bandwidth not all diagrams were converted to PNG."; print PAGE "

To view all diagrams click on "; print PAGE "\ postscript. \<\/A\> \ \n"; # # Delete files which aren't included in diagrams.html # - system ("/bin/bash -c \"rm -f matrix$max_jpg.jpg\" "); + system ("/bin/bash -c \"rm -f matrix$max_png.png\" "); } # -# Now create jpeg file for card +# Now create PNG file for card # - if (! -e "../../HTML/card.jpg") { + if (! -e "../../HTML/card.png") { system ("/bin/bash -c \"head -352 matrix$imatrix.ps >& junk.ps\" "); open(JUNK,">> junk.ps") || die "Error opening junk.ps"; @@ -72,7 +72,7 @@ while ( -e "matrix$imatrix.ps"){ system ("/bin/bash -c \"cat matrix$imatrix.ps | sed 1,352d >> junk.ps\" "); - system "/bin/bash -c \"nice gs \-sDEVICE\=jpeg \-sOutputFile\=card.jpg \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.jpg ../../HTML/card.jpg > /dev/null\" "; + system "/bin/bash -c \"nice gs \-sDEVICE\=pngmono \-sOutputFile\=card.png \-q \-dNOPAUSE \-dBATCH \-g180x150 ./junk.ps; rm -f junk.ps; cp -p card.png ../../HTML/card.png > /dev/null\" "; } if ($imatrix eq "") {$imatrix = 0;} $imatrix = $imatrix + 1; @@ -82,3 +82,4 @@ print PAGE "\n"; print PAGE "\<\/BODY\> \n"; print PAGE "\<\/HTML\> \n"; close(PAGE); + diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_ximprove.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_ximprove.py index 415ecc9de0..d5d7fc8faf 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_ximprove.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/gen_ximprove.py @@ -30,6 +30,7 @@ import stat import sys import six +import time from six.moves import range from six.moves import zip @@ -304,6 +305,7 @@ def get_helicity(self, to_submit=True, clean=True): logger.debug('(%s) nb_hel: %s zero amp: %s bad_amps_hel: %s/%s', split_file[-1], len(good_hels),len(bad_amps),len(bad_amps_perhel), len(good_hels)*nb_amp ) if len(good_hels) == 1: files.cp(matrix_file, matrix_file.replace('orig','optim')) + files.cp(matrix_file.replace('.f','.o'), matrix_file.replace('orig','optim').replace('.f','.o')) continue # avoid optimization if onlye one helicity gauge = self.cmd.proc_characteristics['gauge'] @@ -1059,6 +1061,7 @@ def __init__(self, cmd, opt=None): # parameter for the gridpack run self.nreq = 2000 self.iseed = 4321 + self.maxevts = 2500 # placeholder for information self.results = 0 #updated in launch/update_html @@ -1200,6 +1203,10 @@ def reset_multijob(self): def write_multijob(self, Channel, nb_split): """ """ if nb_split <=1: + try: + os.remove(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat')) + except OSError: + pass return f = open(pjoin(self.me_dir, 'SubProcesses', Channel.get('name'), 'multijob.dat'), 'w') f.write('%i\n' % nb_split) @@ -1828,17 +1835,17 @@ class gen_ximprove_gridpack(gen_ximprove_v4): max_request_event = 1e12 # split jobs if a channel if it needs more than that max_event_in_iter = 4000 min_event_in_iter = 500 - combining_job = sys.maxsize gen_events_security = 1.00 - def __new__(cls, *args, **opts): + def __new__(cls, cmd, opts): cls.force_class = 'gridpack' - return super(gen_ximprove_gridpack, cls).__new__(cls, *args, **opts) + return super(gen_ximprove_gridpack, cls).__new__(cls, cmd, opts) - def __init__(self, *args, **opts): + def __init__(self, cmd, opts): self.ngran = -1 + self.nprocs = 1 self.gscalefact = {} self.readonly = False if 'ngran' in opts: @@ -1846,9 +1853,18 @@ def __init__(self, *args, **opts): # del opts['ngran'] if 'readonly' in opts: self.readonly = opts['readonly'] - super(gen_ximprove_gridpack,self).__init__(*args, **opts) + if 
'nprocs' in opts: + self.nprocs = int(opts['nprocs']) + if 'maxevts' in opts and self.nprocs > 1: + self.max_request_event = int(opts['maxevts']) + super(gen_ximprove_gridpack,self).__init__(cmd, opts) if self.ngran == -1: self.ngran = 1 + + if self.nprocs > 1: + self.combining_job = 0 + else: + self.combining_job = sys.maxsize def find_job_for_event(self): """return the list of channel that need to be improved""" @@ -1876,8 +1892,8 @@ def find_job_for_event(self): continue # no event to generate events self.gscalefact[tag] = max(1, 1/(goal_lum * C.get('axsec')/ self.ngran)) #need to generate events - logger.debug('request events for ', C.get('name'), 'cross=', - C.get('axsec'), 'needed events = ', goal_lum * C.get('axsec')) + logger.debug('request events for %s cross=%d needed events = %d', + C.get('name'), C.get('axsec'), goal_lum * C.get('axsec')) to_refine.append(C) logger.info('need to improve %s channels' % len(to_refine)) @@ -1897,8 +1913,13 @@ def get_job_for_event(self): for C in to_refine: #1. Compute the number of points are needed to reach target needed_event = max(goal_lum*C.get('axsec'), self.ngran) - nb_split = 1 - + nb_split = int(max(1,((needed_event-1)// self.max_request_event) +1)) + if not self.split_channels: + nb_split = 1 + if nb_split > self.max_splitting: + nb_split = self.max_splitting + nb_split=max(1, nb_split) + #2. estimate how many points we need in each iteration if C.get('nunwgt') > 0: nevents = needed_event / nb_split * (C.get('nevents') / C.get('nunwgt')) @@ -1908,13 +1929,16 @@ def get_job_for_event(self): nevents = self.max_event_in_iter if nevents < self.min_event_in_iter: + nb_split = int(nb_split * nevents / self.min_event_in_iter) + 1 # sr dangerous? nevents = self.min_event_in_iter # # forbid too low/too large value nevents = max(self.min_event_in_iter, min(self.max_event_in_iter, nevents)) logger.debug("%s : need %s event. Need %s split job of %s points", C.name, needed_event, nb_split, nevents) - + # write the multi-job information + self.write_multijob(C, nb_split) + #create the info dict assume no splitting for the default info = {'name': self.cmd.results.current['run_name'], 'script_name': 'unknown', @@ -1925,7 +1949,7 @@ def get_job_for_event(self): 'nevents': nevents, #int(nevents*self.gen_events_security)+1, 'maxiter': self.max_iter, 'miniter': self.min_iter, - 'precision': -1*int(needed_event)/C.get('axsec'), + 'precision': -goal_lum/nb_split, # -1*int(needed_event)/C.get('axsec'), 'requested_event': needed_event, 'nhel': self.run_card['nhel'], 'channel': C.name.replace('G',''), @@ -1938,27 +1962,59 @@ def get_job_for_event(self): basedir = pjoin(os.path.dirname(__file__), '..','..','SubProcesses', info['P_dir'], info['directory']) info['base_directory'] = basedir - jobs.append(info) - + if nb_split == 1: + jobs.append(info) + else: + for i in range(nb_split): + new_info = dict(info) + new_info['offset'] = i+1 + new_info['directory'] += self.alphabet[i % 26] + str((i+1)//26) + new_info['base_directory'] = info['directory'] + jobs.append(new_info) write_dir = '.' 
if self.readonly else None self.create_ajob(pjoin(self.me_dir, 'SubProcesses', 'refine.sh'), jobs, write_dir) + if self.nprocs > 1: + nprocs_cluster = cluster.MultiCore(nb_core=self.nprocs) + gridpack_start = time.time() + def gridpack_wait_monitoring(Idle, Running, Done): + if Idle+Running+Done == 0: + return + logger.info("Gridpack event generation: %s Idle, %s Running, %s Done [%s]" + % (Idle, Running, Done, misc.format_time(time.time()-gridpack_start))) + done = [] for j in jobs: - if j['P_dir'] in done: - continue - done.append(j['P_dir']) + if self.nprocs == 1: + if j['P_dir'] in done: + continue + done.append(j['P_dir']) + # Give a little status. Sometimes these jobs run very long, and having hours without any + # console output can be a bit frightening and make users think we are looping. + if len(done)%5==0: + logger.info(f"Working on job {len(done)} of {len(jobs)}") + # set the working directory path. pwd = pjoin(os.getcwd(),j['P_dir']) if self.readonly else pjoin(self.me_dir, 'SubProcesses', j['P_dir']) - exe = pjoin(pwd, 'ajob1') + exe = pjoin(pwd, j['script_name']) st = os.stat(exe) os.chmod(exe, st.st_mode | stat.S_IEXEC) # run the code\ - cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + if self.nprocs == 1: + cluster.onecore.launch_and_wait(exe, cwd=pwd, packet_member=j['packet']) + else: + nprocs_cluster.cluster_submit(exe, cwd=pwd, packet_member=j['packet']) write_dir = '.' if self.readonly else pjoin(self.me_dir, 'SubProcesses') + if self.nprocs > 1: + nprocs_cluster.wait(self.me_dir, gridpack_wait_monitoring) + + if self.readonly: + combine_runs.CombineRuns(write_dir) + else: + combine_runs.CombineRuns(self.me_dir) self.check_events(goal_lum, to_refine, jobs, write_dir) def check_events(self, goal_lum, to_refine, jobs, Sdir): diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/hel_recycle.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/hel_recycle.py index 1471de4bcb..978ba6575e 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/hel_recycle.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/hel_recycle.py @@ -550,7 +550,7 @@ def get_jamp_lines(self, line): def get_amp2_lines(self, line): if line.startswith(' DO I = 1, NCOLOR'): self.in_amp2 = False - elif not line.isspace(): + elif not line.isspace() and 'DENOM' not in line: self.template_dict['amp2_lines'] += f'{line[0:6]} {self.add_indices(line[6:])}' def prepare_bools(self): diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/histograms.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/histograms.py index 51ae2914fc..6fbdd98100 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/histograms.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/histograms.py @@ -1149,11 +1149,8 @@ def parse_one_histo_from_stream(self, stream, all_weight_header, boundaries = [0.0,0.0] for j, weight in \ enumerate(HwU.histo_bin_weight_re.finditer(line_bin)): - if (j == len(weight_header)): - continue - if j == len(all_weight_header): - raise HwU.ParseError("There is more bin weights"+\ - " specified than expected (%i)"%len(weight_header)) + #if (j == len(weight_header)): + # continue if selected_central_weight == all_weight_header[j]: bin_weights['central'] = float(weight.group('weight')) if all_weight_header[j] == 'boundary_xmin': @@ -1858,6 +1855,8 @@ def parse_histos_from_PY8_XML_stream(self, stream, run_id=None, # If merging cut is negative, then pick only the one of the central scale # If not specified, then take them all but use the PDF and scale weight # of the central merging_scale for the 
variation. + if not all_weights: + raise MadGraph5Error('No weights were found in the HwU XML source.') if merging_scale is None or merging_scale < 0.0: merging_scale_chosen = all_weights[2]['MERGING'] else: @@ -2480,14 +2479,14 @@ def get_main_central_plot_lines(HwU_name, block_position, color_index, # return [template_no_stat%rep_dic]+\ # ([template%rep_dic] if show_mc_uncertainties else []) - # The use of sqrt(-1) is just a trick to prevent the line to display + # The use of 1/0 is just a trick to prevent the line to display res = [] - rep_dic['data'] = '($3 < 0 ? sqrt(-1) : $3)' + rep_dic['data'] = '($3 < 0 ? 1/0 : $3)' res.append(template_no_stat%rep_dic) rep_dic['title'] = " title ''" if show_mc_uncertainties: res.append(template%rep_dic) - rep_dic['data'] = '($3 >= 0 ? sqrt(-1) : abs($3))' + rep_dic['data'] = '($3 >= 0 ? 1/0 : abs($3))' rep_dic['ls'] = ' ls %d'%(100+color_index) res.append(template_no_stat%rep_dic) if show_mc_uncertainties: @@ -2739,13 +2738,13 @@ def ratio_no_correlations(wgtsA, wgtsB): """#-- rendering subhistograms '%(subhistogram_type)s' %(unset label)s %(set_format_y)s +%(set_yscale)s set yrange [%(ymin).4e:%(ymax).4e] set origin %(origin_x).4e, %(origin_y).4e set size %(size_x).4e, %(size_y).4e set mytics %(mytics)d %(set_ytics)s %(set_format_x)s -%(set_yscale)s %(set_ylabel)s %(set_histo_label)s plot \\""" @@ -2878,7 +2877,7 @@ def ratio_no_correlations(wgtsA, wgtsB): # We decide to show uncertainties in the main plot only if they # are part of a monocolor band. Otherwise, they will only be - # shown in the first subplot. Notice that plotting 'sqrt(-1)' + # shown in the first subplot. Notice that plotting '1/0' # is just a trick so as to have only the key printed with no # line @@ -2890,7 +2889,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, scale variation'%title, band='scale' in use_band) else: uncertainty_plot_lines[-1]['scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+10,'%s, scale variation'%title)] # And now PDF_variation if available if not PDF_var_pos is None and len(PDF_var_pos)>0: if 'pdf' in use_band: @@ -2899,7 +2898,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, PDF variation'%title, band='pdf' in use_band) else: uncertainty_plot_lines[-1]['pdf'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+20,'%s, PDF variation'%title)] # And now merging variation if available if not merging_var_pos is None and len(merging_var_pos)>0: if 'merging_scale' in use_band: @@ -2908,7 +2907,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, merging scale variation'%title, band='merging_scale' in use_band) else: uncertainty_plot_lines[-1]['merging_scale'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+30,'%s, merging scale variation'%title)] # And now alpsfact variation if available if not alpsfact_var_pos is None and len(alpsfact_var_pos)>0: if 'alpsfact' in use_band: @@ -2917,7 +2916,7 @@ def ratio_no_correlations(wgtsA, wgtsB): '%s, alpsfact variation'%title, band='alpsfact' in use_band) else: uncertainty_plot_lines[-1]['alpsfact'] = \ - ["sqrt(-1) ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] + ["1/0 ls %d title '%s'"%(color_index+40,'%s, alpsfact variation'%title)] # plot_lines.append( # "'%s' index %d using (($1+$2)/2):3 ls %d title '%s'"\ diff --git 
a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/launch_plugin.py index 0924927785..262d39a736 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/launch_plugin.py @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Aug 2023) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2024) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, A. Valassi, Z. Wettersten (2024-2025) for the MG5aMC CUDACPP plugin. import logging import os @@ -33,7 +33,7 @@ def compile(self, *args, **opts): if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') common_run_interface.CommonRunCmd.update_make_opts_full(path, - {'FPTYPE': self.run_card['floating_type'] }) + {'override FPTYPE': self.run_card['floating_type'] }) misc.sprint('FPTYPE checked') cudacpp_supported_backends = [ 'fortran', 'cuda', 'hip', 'cpp', 'cppnone', 'cppsse4', 'cppavx2', 'cpp512y', 'cpp512z', 'cppauto' ] if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): @@ -76,7 +76,7 @@ def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + common_run_interface.CommonRunCmd.update_make_opts_full({'override FPTYPE': new_value}) else: raise Exception Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') @@ -133,7 +133,8 @@ def default_setup(self): super().default_setup() # change default value: self['cudacpp_backend'] = 'cuda' - self['vector_size'] = 16384 # already setup in default class (just change value) + self['vector_size'] = 32 # ZW: default to 32, might want to change to 64 to utilise AMD GPUs better as well # 16384 # already setup in default class (just change value) + self['nb_warp'] = 512 # number of warps per kernel call, for now setting to 16 384 / vector_size MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/lhe_parser.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/lhe_parser.py index f6e47956cd..81d17f7cb1 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/lhe_parser.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/lhe_parser.py @@ -1035,12 +1035,12 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): from_init = True if not from_init: - if group in grouped_cross: - grouped_cross[group] += self.allcross[i] - grouped_error[group] += self.error[i]**2 + if int(group) in grouped_cross: + grouped_cross[int(group)] += self.allcross[i] + grouped_error[int(group)] += self.error[i]**2 else: - grouped_cross[group] = self.allcross[i] - grouped_error[group] = self.error[i]**2 + grouped_cross[int(group)] = self.allcross[i] + grouped_error[int(group)] = self.error[i]**2 else: ban = banner_mod.Banner(ff.banner) for line in ban['init'].split('\n'): @@ -1048,11 +1048,11 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): if len(splitline)==4: cross, error, _, group = splitline if int(group) in grouped_cross: - grouped_cross[group] += float(cross) - grouped_error[group] += float(error)**2 + grouped_cross[int(group)] += float(cross) + grouped_error[int(group)] += float(error)**2 else: - 
grouped_cross[group] = float(cross) - grouped_error[group] = float(error)**2 + grouped_cross[int(group)] = float(cross) + grouped_error[int(group)] = float(error)**2 nb_group = len(grouped_cross) # compute the information for the first line @@ -1086,6 +1086,8 @@ def define_init_banner(self, wgt, lha_strategy, proc_charac=None): self.seek(0) if init_information["idbmup2"] in [0,9]: event = next(self) + if len(event) == 0: + event = Event(str(event)) init_information["idbmup2"] = event[1].pdg self.seek(0) @@ -2166,10 +2168,13 @@ def check(self): abspz += abs(particle.pz) # check mass fourmass = FourMomentum(particle).mass - - if particle.mass and (abs(particle.mass) - fourmass)/ abs(particle.mass) > threshold: - raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) - + if particle.mass: + expected = (particle.E - math.sqrt(particle.E**2 -particle.mass**2))/particle.E + if expected > 1e-8: + mass_threshold = particle.E**2 - (particle.E-threshold)**2 + if (abs(particle.mass) - fourmass)/ mass_threshold > 5: + raise Exception( "Do not have correct mass lhe: %s momentum: %s (error at %s" % (particle.mass, fourmass, (abs(particle.mass) - fourmass)/ abs(particle.mass))) + if E/absE > threshold: logger.critical(self) @@ -2953,8 +2958,8 @@ def pt(self): @property def pseudorapidity(self): - norm = math.sqrt(self.px**2 + self.py**2+self.pz**2) - return 0.5* math.log((norm - self.pz) / (norm + self.pz)) + norm = math.sqrt(self.px**2 + self.py**2 + self.pz**2) + return 0.5* math.log((norm + self.pz) / (norm - self.pz)) @property def rapidity(self): diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/madevent_interface.py index 85e5bcf5e3..4d5597c722 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/madevent_interface.py @@ -1171,10 +1171,10 @@ def check_survey(self, args, cmd='survey'): for opt,value in self._survey_options.items(): if arg.startswith('--%s=' % opt): exec('self.opts[\'%s\'] = %s(arg.split(\'=\')[-1])' % \ - (opt, value[0])) + (opt, value[0]), globals(), {'self':self, 'arg':arg}) arg = "" if arg != "": raise Exception - except Exception: + except Exception as error: self.help_survey() raise self.InvalidCmd('invalid %s argument'% arg) @@ -2827,10 +2827,10 @@ def print_results_in_shell(self, data): logger.info(" Nb of events after matching/merging : %d" % int(data['nb_event_pythia'])) if self.run_card['use_syst'] in self.true and \ (int(self.run_card['ickkw'])==1 or self.run_card['ktdurham']>0.0 - or self.run_card['ptlund']>0.0): + or self.run_card['ptlund']>0.0) and data['cross_pythia'] == -1: logger.info(" Notice that because Systematics computation is turned on, the merging did not veto events but modified their weights instead.\n"+\ " The resulting hepmc/stdhep file should therefore be use with those weights.") - else: + elif data['cross_pythia'] == -1: logger.info(" Nb of events after merging : %s" % data['nb_event_pythia']) logger.info(" " ) @@ -3055,6 +3055,7 @@ def do_multi_run(self, line): crossoversig = 0 inv_sq_err = 0 nb_event = 0 + madspin = False for i in range(nb_run): self.nb_refine = 0 self.exec_cmd('generate_events %s_%s -f' % (main_name, i), postcmd=False) @@ -3067,6 +3068,8 @@ def do_multi_run(self, line): inv_sq_err+=1.0/error**2 self.results[main_name][-1]['cross'] = crossoversig/inv_sq_err 
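Among the lhe_parser.py fixes above, the pseudorapidity property had its sign flipped: with norm = |p|, the correct definition is eta = 0.5*ln((norm + pz)/(norm - pz)), whereas the old code returned its negative. As a quick check:

```python
import math

def pseudorapidity(px, py, pz):
    norm = math.sqrt(px**2 + py**2 + pz**2)
    return 0.5 * math.log((norm + pz) / (norm - pz))

# a particle moving forward (pz > 0) must have positive pseudorapidity
assert pseudorapidity(1.0, 0.0, 10.0) > 0
```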
self.results[main_name][-1]['error'] = math.sqrt(1.0/inv_sq_err) + if 'decayed' in self.run_name: + madspin = True self.results.def_current(main_name) self.run_name = main_name self.update_status("Merging LHE files", level='parton') @@ -3074,9 +3077,12 @@ def do_multi_run(self, line): os.mkdir(pjoin(self.me_dir,'Events', self.run_name)) except Exception: pass - os.system('%(bin)s/merge.pl %(event)s/%(name)s_*/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' + + os.system('%(bin)s/merge.pl %(event)s/%(name)s_*%(madspin)s/unweighted_events.lhe.gz %(event)s/%(name)s/unweighted_events.lhe.gz %(event)s/%(name)s_banner.txt' % {'bin': self.dirbin, 'event': pjoin(self.me_dir,'Events'), - 'name': self.run_name}) + 'name': self.run_name, + 'madspin': '_decayed_*' if madspin else '' + }) eradir = self.options['exrootanalysis_path'] if eradir and misc.is_executable(pjoin(eradir,'ExRootLHEFConverter')): @@ -3656,9 +3662,11 @@ def do_refine(self, line): else: self.refine_mode = "new" - cross, error = self.make_make_all_html_results() + cross, error, across = self.make_make_all_html_results(get_attr=('xsec','xerru','axsec')) + self.results.add_detail('cross', cross) self.results.add_detail('error', error) + self.results.add_detail('axsec', across) self.results.add_detail('run_statistics', dict(self.results.get_detail('run_statistics'))) @@ -3757,6 +3765,8 @@ def split(a, n): k, m = divmod(len(a), n) return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + Gdirs = self.remove_empty_events(Gdirs) + partials_info = [] if len(Gdirs) >= max_G: start_unweight= time.perf_counter() @@ -3786,7 +3796,7 @@ def split(a, n): for i, local_G in enumerate(split(Gdirs, nb_chunk)): line = [pjoin(self.me_dir, "Events", self.run_name, "partials%d.lhe.gz" % i)] line.append(pjoin(self.me_dir, 'Events', self.run_name, '%s_%s_banner.txt' % (self.run_name, tag))) - line.append(str(self.results.current['cross'])) + line.append(str(self.results.current.get('axsec'))) line += local_G partials_info.append(self.do_combine_events_partial(' '.join(line), preprocess_only=True)) mycluster.submit(sys.executable, @@ -4223,7 +4233,7 @@ def mg5amc_py8_interface_consistency_warning(options): return None - def setup_Pythia8RunAndCard(self, PY8_Card, run_type): + def setup_Pythia8RunAndCard(self, PY8_Card, run_type, use_mg5amc_py8_interface): """ Setup the Pythia8 Run environment and card. In particular all the process and run specific parameters of the card are automatically set here. This function returns the path where HEPMC events will be output, if any.""" @@ -4338,10 +4348,10 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.systemSet('Beams:setProductionScalesFromLHEF',True) # Automatically set qWeed to xqcut if not defined by the user. 
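In the do_pythia8 hunks that follow, the default shower driver becomes Pythia8's bundled main164 example (invoked with -c for the command file), and the legacy MG5aMC_PY8_interface is only used on request (--old_interface) or as a fallback when main164 cannot be found. A Python sketch of the executable lookup:

```python
import os

def pick_pythia8_driver(options, use_mg5amc_py8_interface):
    if use_mg5amc_py8_interface:
        return os.path.join(options['mg5amc_py8_interface_path'], 'MG5aMC_PY8_interface')
    for parts in (('share', 'Pythia8', 'examples', 'main164'), ('examples', 'main164')):
        candidate = os.path.join(options['pythia8_path'], *parts)
        if os.path.exists(candidate):
            return candidate
    return None  # caller falls back to the old interface with a warning
```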
- if PY8_Card['SysCalc:qWeed']==-1.0: + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qWeed']==-1.0: PY8_Card.MadGraphSet('SysCalc:qWeed',self.run_card['xqcut'], force=True) - if PY8_Card['SysCalc:qCutList']=='auto': + if use_mg5amc_py8_interface and PY8_Card['SysCalc:qCutList']=='auto': if self.run_card['use_syst']: if self.run_card['sys_matchscale']=='auto': qcut = PY8_Card['JetMatching:qCut'] @@ -4368,7 +4378,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): # Specific MLM settings # PY8 should not implement the MLM veto since the driver should do it # if merging scale variation is turned on - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('JetMatching:doVeto',False) @@ -4444,7 +4454,7 @@ def setup_Pythia8RunAndCard(self, PY8_Card, run_type): PY8_Card.MadGraphSet('SpaceShower:pTmaxMatch',1) PY8_Card.MadGraphSet('SpaceShower:rapidityOrder',False) # PY8 should not implement the CKKW veto since the driver should do it. - if self.run_card['use_syst']: + if use_mg5amc_py8_interface and self.run_card['use_syst']: # We do no force it here, but it is clear that the user should know what # he's doing if he were to force it to True. PY8_Card.MadGraphSet('Merging:applyVeto',False) @@ -4516,6 +4526,12 @@ def do_pythia8(self, line): else: no_default = False + if '--old_interface' in args: + use_mg5amc_py8_interface = True + args.remove('--old_interface') + else: + use_mg5amc_py8_interface = False + if not self.run_name: self.check_pythia8(args) self.configure_directory(html_opening =False) @@ -4545,20 +4561,27 @@ def do_pythia8(self, line): #"Please use 'event_norm = average' in the run_card to avoid this problem.") - - if not self.options['mg5amc_py8_interface_path'] or not \ - os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface')): - raise self.InvalidCmd( -"""The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. -Please install this tool with the following MG5_aMC command: - MG5_aMC> install mg5amc_py8_interface_path""") + if use_mg5amc_py8_interface: + if not self.options['mg5amc_py8_interface_path'] or not \ + os.path.exists(pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface')): + raise self.InvalidCmd( + """The MG5aMC_PY8_interface tool cannot be found, so that MadEvent cannot steer Pythia8 shower. + Please install this tool with the following MG5_aMC command: + MG5_aMC> install mg5amc_py8_interface_path""") + else: + pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], + 'MG5aMC_PY8_interface') + warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) + if warnings: + logger.warning(warnings) else: - pythia_main = pjoin(self.options['mg5amc_py8_interface_path'], - 'MG5aMC_PY8_interface') - warnings = MadEventCmd.mg5amc_py8_interface_consistency_warning(self.options) - if warnings: - logger.warning(warnings) + pythia_main = pjoin(self.options['pythia8_path'], 'share', 'Pythia8', 'examples', 'main164') + if not os.path.exists(pythia_main): + pythia_main = pjoin(self.options['pythia8_path'], 'examples', 'main164') + if not os.path.exists(pythia_main): + logger.warning('main164 not found (or not compiled). 
Will try the old interface instead.') + return self.do_pythia8(line + ' --old_interface') self.results.add_detail('run_mode', 'madevent') @@ -4583,14 +4606,19 @@ def do_pythia8(self, line): run_type = 'CKKW' # Edit the card and run environment according to the run specification - HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type) + HepMC_event_output = self.setup_Pythia8RunAndCard(PY8_Card, run_type, use_mg5amc_py8_interface=use_mg5amc_py8_interface) + + if not use_mg5amc_py8_interface and self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + PY8_Card['Main:numberOfEvents']= self.run_card['nevents'] + # Now write the card. pythia_cmd_card = pjoin(self.me_dir, 'Events', self.run_name , '%s_pythia8.cmd' % tag) cmd_card = StringIO.StringIO() PY8_Card.write(cmd_card,pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Now setup the preamble to make sure that everything will use the locally # installed tools (if present) even if the user did not add it to its @@ -4632,7 +4660,7 @@ def do_pythia8(self, line): " command '/usr/bin/env %s' exists and returns a valid path."%shell) exe_cmd = "#!%s\n%s"%(shell_exe,' '.join( - [preamble+pythia_main, + [preamble+pythia_main, '' if use_mg5amc_py8_interface else '-c', os.path.basename(pythia_cmd_card)])) wrapper.write(exe_cmd) @@ -4699,6 +4727,7 @@ def do_pythia8(self, line): n_cores = max(min(min_n_core,n_cores),1) if self.options['run_mode']==0 or (self.options['run_mode']==2 and self.options['nb_core']==1): + # No need for parallelization anymore self.cluster = None logger.info('Follow Pythia8 shower by running the '+ @@ -4744,20 +4773,22 @@ def do_pythia8(self, line): ParallelPY8Card.subruns[0].systemSet('Beams:LHEF','events.lhe.gz') ParallelPY8Card.write(pjoin(parallelization_dir,'PY8Card.dat'), pjoin(self.me_dir,'Cards','pythia8_card_default.dat'), - direct_pythia_input=True) + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) # Write the wrapper wrapper_path = pjoin(parallelization_dir,'run_PY8.sh') wrapper = open(wrapper_path,'w') if self.options['cluster_temp_path'] is None: exe_cmd = \ -"""#!%s -./%s PY8Card.dat >& PY8_log.txt -""" +"""#!%%s +./%%s %s PY8Card.dat >& PY8_log.txt +""" % ('' if use_mg5amc_py8_interface else '-c') + else: exe_cmd = \ -"""#!%s +"""#!%%s ln -s ./events_$1.lhe.gz ./events.lhe.gz -./%s PY8Card_$1.dat >& PY8_log.txt +./%%s %s PY8Card_$1.dat >& PY8_log.txt mkdir split_$1 if [ -f ./events.hepmc ]; then @@ -4776,7 +4807,7 @@ def do_pythia8(self, line): mv ./PY8_log.txt ./split_$1/ fi tar -czf split_$1.tar.gz split_$1 -""" +""" % ('' if use_mg5amc_py8_interface else '-c') exe_cmd = exe_cmd%(shell_exe,os.path.basename(pythia_main)) wrapper.write(exe_cmd) wrapper.close() @@ -4812,19 +4843,27 @@ def do_pythia8(self, line): pjoin(parallelization_dir,split_files[-1])) logger.info('Submitting Pythia8 jobs...') + for i, split_file in enumerate(split_files): # We must write a PY8Card tailored for each split so as to correct the normalization # HEPMCoutput:scaling of each weight since the lhe showered will not longer contain the # same original number of events - split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat')) + split_PY8_Card = banner_mod.PY8Card(pjoin(parallelization_dir,'PY8Card.dat'), setter='user') + assert split_PY8_Card['JetMatching:nJetMax'] == PY8_Card['JetMatching:nJetMax'] + + + # Make sure to 
use the number of split_events determined during the splitting. - split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i]) + split_PY8_Card.systemSet('Main:numberOfEvents',partition_for_PY8[i], force=True) + assert split_PY8_Card['Main:numberOfEvents'] == partition_for_PY8[i] split_PY8_Card.systemSet('HEPMCoutput:scaling',split_PY8_Card['HEPMCoutput:scaling']* - (float(partition_for_PY8[i]))) + (float(partition_for_PY8[i])), force=True) # Add_missing set to False so as to be sure not to add any additional parameter w.r.t # the ones in the original PY8 param_card copied. split_PY8_Card.write(pjoin(parallelization_dir,'PY8Card_%d.dat'%i), - pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False) + pjoin(parallelization_dir,'PY8Card.dat'), add_missing=False, + direct_pythia_input=True, + use_mg5amc_py8_interface=use_mg5amc_py8_interface) in_files = [pjoin(parallelization_dir,os.path.basename(pythia_main)), pjoin(parallelization_dir,'PY8Card_%d.dat'%i), pjoin(parallelization_dir,split_file)] @@ -5073,7 +5112,7 @@ def wait_monitoring(Idle, Running, Done): # works both for fixed number of generated events and fixed accepted events self.results.add_detail('error_pythia', error_m) - if self.run_card['use_syst']: + if self.run_card['use_syst'] and use_mg5amc_py8_interface: self.results.add_detail('cross_pythia', -1) self.results.add_detail('error_pythia', 0) @@ -6132,7 +6171,102 @@ def get_Gdir(self, Pdir=None, symfact=None): mfactors[pjoin(P, "G%s" % tag)] = mfactor self.Gdirs = (Gdirs, mfactors) return self.get_Gdir(Pdir, symfact=symfact) + + ############################################################################ + def remove_empty_events(self, Gdir): + """return Gdir stripped of the ones providing empty events.lhe files.""" + + reasons = collections.defaultdict(list) + Gdirs = Gdir[:] + for G in Gdirs[:]: + try: + size = os.path.getsize(pjoin(G, 'events.lhe')) + except Exception as error: + size = 0 + if size <10: + Gdirs.remove(G) + try: + log = misc.BackRead(pjoin(G, 'log.txt')) + except Exception as error: + log = misc.BackRead(pjoin(G, 'run1_app.log')) + found = -1 + for line in log: + if 'Deleting file events.lhe' in line: + found = 0 + elif "Impossible BW configuration" in line: + reasons['bwconfig'].append(G) + break + elif found < -150: + reasons['not found'].append(G) + Gdirs.append(G) + break + elif found < 0: + found -= 1 + elif 'Loosen cuts or increase max_events' in line: + reasons['cuts'].append(G) + break + elif 'all returned zero' in line: + reasons['zero'].append(G) + break + elif found > 5: + reasons['unknown'].append(G) + break + else: + found += 1 + + if len(reasons): + logger.debug('Reasons for empty events.lhe:') + if len(reasons['unknown']): + logger.debug(' - unknown: %s' % len(reasons['unknown'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['unknown'][:10]])) + if len(reasons['not found']): + logger.debug(' - not found in log: %s' % len(reasons['not found'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['not found'][:10]])) + if len(reasons['zero']): + logger.debug(' - zero amplitudes: %s' % len(reasons['zero'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit( os.sep)[-2:]) for G in reasons['zero'][:10]])) + if len(reasons['bwconfig']): + critical_bwconfig = set() + for G in reasons['bwconfig']: + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_bwconfig.add(os.sep.join(base.rsplit(os.sep)[-2:])) +
for G in critical_bwconfig: + logger.warning('Gdirectory %s has no events.lhe file. (no BW config found)' % G) + + logger.debug(' - impossible BW configuration: %s' % len(reasons['bwconfig'])) + logger.debug(' - channel with no possible BW configuration: %s' % len(critical_bwconfig)) + + if len(reasons['cuts']): + critical_nb_cuts = collections.defaultdict(int) + for G in reasons['cuts']: + if '.' in os.path.basename(G): + base = G.rsplit('.',1)[0] + if any(G2.startswith(base) for G2 in Gdirs): + continue + else: + critical_nb_cuts[os.sep.join(base.rsplit(os.sep)[-2:])] += 1 + else: + critical_nb_cuts[''] += 1 + logger.warning('Gdirectory %s has no events.lhe file. (no points passed cuts found)' % G) + for G, nb in critical_nb_cuts.items(): + if not G: + continue + else: + logger.warning('%s channel %s.XXX has no events.lhe file. (no points passed cuts). No %s with events detected' % (nb, G, G)) + logger.debug(' - no points passed cuts: %s' % len(reasons['cuts'])) + logger.log(10, ' DETAIL:' + ','.join(['/'.join(G.rsplit(os.sep)[-2:]) for G in reasons['cuts'][:10]])) + logger.debug(' - without any BW handling (critical): %s' % critical_nb_cuts['']) + logger.debug(' - with BW but all zero (critical): %s' % sum([nb for v, nb in critical_nb_cuts.items() if v!=''], 0)) + #logger.debug(' - cuts (with BW conflict where other channel contributes): %s' % (len(reasons['cuts'])- critical_nb_cuts)) + + + return Gdirs + + ############################################################################ def set_run_name(self, name, tag=None, level='parton', reload_card=False, allow_new_tag=True): @@ -6749,7 +6883,7 @@ def get_subP_ids(path): class GridPackCmd(MadEventCmd): """The command for the gridpack --Those are not suppose to be use interactively--""" - def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **stdin): + def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, nprocs=1, maxevts=2500, *completekey, **stdin): """Initialize the command and directly run""" # Initialize properly @@ -6759,6 +6893,8 @@ def __init__(self, me_dir = None, nb_event=0, seed=0, gran=-1, *completekey, **s self.random = seed self.random_orig = self.random self.granularity = gran + self.nprocs = nprocs + self.maxevts = maxevts self.options['automatic_html_opening'] = False #write the grid_card.dat on disk @@ -6874,7 +7010,7 @@ def launch(self, nb_event, seed): #misc.call([pjoin(self.me_dir,'bin','refine4grid'), # str(nb_event), '0', 'Madevent','1','GridRun_%s' % seed], # cwd=self.me_dir) - self.refine4grid(nb_event) + self.gridpack_cross = self.refine4grid(nb_event) # 3) Combine the events/pythia/... self.exec_cmd('combine_events') @@ -6902,6 +7038,8 @@ def refine4grid(self, nb_event): precision = nb_event + across= self.make_make_all_html_results(get_attr='axsec') + self.opts = dict([(key,value[1]) for (key,value) in \ self._survey_options.items()]) @@ -6915,8 +7053,9 @@ def refine4grid(self, nb_event): self.update_status('Refine results to %s' % precision, level=None) logger.info("Using random number seed offset = %s" % self.random) - refine_opt = {'err_goal': nb_event, 'split_channels': False, - 'ngran':self.granularity, 'readonly': self.readonly} + refine_opt = {'err_goal': nb_event, 'split_channels': True, + 'ngran':self.granularity, 'readonly': self.readonly, + 'nprocs': self.nprocs, 'maxevts': self.maxevts} x_improve = gen_ximprove.gen_ximprove_gridpack(self, refine_opt) x_improve.launch() # create the ajob for the refinment and run those!
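The remove_empty_events helper above scans each channel log in reverse (via misc.BackRead) to classify why an events.lhe file came out empty before dropping the channel. As a reading aid, here is a simplified stand-in with the same line-by-line, reverse-order iteration contract; this is a sketch that assumes the log fits in memory, whereas the real helper avoids loading the whole file at once:

import os

def back_read(path):
    # yield the lines of a text file from last to first (in-memory sketch)
    with open(path) as f:
        for line in reversed(f.readlines()):
            yield line

# usage mirroring the classification loop above (G is a channel directory):
# for line in back_read(os.path.join(G, 'log.txt')):
#     if 'Impossible BW configuration' in line:
#         ...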
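The refine4grid hunk above uses the new get_attr keyword (implemented in the sum_html.py hunk further below) to pull the additional axsec attribute out of the collected results: get_attr=None keeps the legacy (xsec, xerru) pair, a string returns a single attribute, and a tuple returns a list of attributes. A minimal self-contained model of that contract, where PresultsStub is a hypothetical stand-in for the real results object (the real function also renders the HTML pages):

class PresultsStub:
    xsec, xerru, axsec = 1.23, 0.04, 1.30

def make_all_html_results_model(get_attr=None):
    # mirrors the return logic added to sum_html.make_all_html_results
    if not get_attr:
        return PresultsStub.xsec, PresultsStub.xerru  # legacy two-value return
    if isinstance(get_attr, tuple):
        return [getattr(PresultsStub, attr) for attr in get_attr]
    return getattr(PresultsStub, get_attr)

assert make_all_html_results_model() == (1.23, 0.04)
assert make_all_html_results_model(get_attr='axsec') == 1.30
assert make_all_html_results_model(get_attr=('xsec', 'xerru', 'axsec')) == [1.23, 0.04, 1.30]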
self.gscalefact = x_improve.gscalefact #store jacobian associate to the gridpack @@ -6926,7 +7065,7 @@ def refine4grid(self, nb_event): #print 'run combine!!!' #combine_runs.CombineRuns(self.me_dir) - return + return across #update html output Presults = sum_html.collect_result(self) cross, error = Presults.xsec, Presults.xerru @@ -7051,10 +7190,13 @@ def do_combine_events(self, line): sum_axsec += result.get('axsec')*gscalefact[Gdir] if len(AllEvent) >= 80: #perform a partial unweighting - if self.results.current['cross'] == 0 and self.run_card['gridpack']: - nb_event= self.nb_event + if not self.results.current.get('axsec'): + if self.run_card['gridpack'] and self.gridpack_cross: + nb_event = min(abs(1.05*self.nb_event*sum_axsec/self.gridpack_cross),self.nb_event) + else: + nb_event= self.nb_event else: - nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current['cross']),self.run_card['nevents']) + nb_event = min(abs(1.01*self.nb_event*sum_axsec/self.results.current.get('axsec')),self.run_card['nevents'], self.nb_event, self.gridpack_cross, sum_axsec) AllEvent.unweight(pjoin(outdir, self.run_name, "partials%s.lhe.gz" % partials), get_wgt, log_level=5, trunc_error=1e-2, event_target=nb_event) AllEvent = lhe_parser.MultiEventFile() @@ -7068,6 +7210,7 @@ def do_combine_events(self, line): for data in partials_info: AllEvent.add(*data) + sum_xsec += data[1] if not hasattr(self,'proc_characteristic'): self.proc_characteristic = self.get_characteristics() diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/restore_data b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/restore_data index 6205bb9567..407ed7aa91 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/restore_data +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/restore_data @@ -48,8 +48,17 @@ for i in `cat subproc.mg` ; do cd ../ done +# check if we are on a Mac, otherwise assume Linux +if [[ "$OSTYPE" == "darwin"* ]]; then + # no nproc on Mac, so use sysctl instead + # use -S1024 because there is a limit on the length of the command + xargs_opts="-P $(sysctl -n hw.ncpu) -S1024" +else + xargs_opts="-P $(nproc --all)" +fi + find . -mindepth 2 -maxdepth 2 -type d -name 'G*' -print0 \ - | xargs --null -P "$(nproc --all)" -I{} bash -c " + | xargs --null ${xargs_opts} -I{} bash -c " cd {} for j in $1_results.dat ; do if [[ -e \$j ]] ; then diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/sum_html.py b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/sum_html.py index 9dd5826f71..fb8dd3a74a 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/internal/sum_html.py +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/internal/sum_html.py @@ -770,7 +770,7 @@ def collect_result(cmd, folder_names=[], jobs=None, main_dir=None): return all -def make_all_html_results(cmd, folder_names = [], jobs=[]): +def make_all_html_results(cmd, folder_names = [], jobs=[], get_attr=None): """ folder_names and jobs have been added for the amcatnlo runs """ run = cmd.results.current['run_name'] if not os.path.exists(pjoin(cmd.me_dir, 'HTML', run)): @@ -794,7 +794,12 @@ def make_all_html_results(cmd, folder_names = [], jobs=[]): fsock.write('%s
' % Presults.get_html(run, unit, cmd.me_dir)) fsock.write('%s
' % P_text) - return Presults.xsec, Presults.xerru + if not get_attr: + return Presults.xsec, Presults.xerru + else: + if isinstance(get_attr, tuple): + return [getattr(Presults, _) for _ in get_attr] + return getattr(Presults, get_attr) diff --git a/epochX/cudacpp/susy_gg_tt.mad/bin/madevent b/epochX/cudacpp/susy_gg_tt.mad/bin/madevent index dff9711b73..9c5363e682 100755 --- a/epochX/cudacpp/susy_gg_tt.mad/bin/madevent +++ b/epochX/cudacpp/susy_gg_tt.mad/bin/madevent @@ -178,6 +178,17 @@ force_run = False if (args and args[0] == 'treatcards'): force_run=True + +# check that madgraph is not in PYTHONPATH +try: + import madgraph +except ImportError: + pass +else: + logger.getLogger('madgraph').error('Looks like you do have madgraph in your PYTHONPATH (or you run this executable from the main MG5aMC directory). This executable will likely not work in such case.') + + + # Call the cmd interface main loop try: if '-h' in args or '--help' in args: diff --git a/epochX/cudacpp/susy_gg_tt.mad/src/HelAmps_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_tt.mad/src/HelAmps_MSSM_SLHA2.h index 9ed58e24f1..22a10cc1e3 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/src/HelAmps_MSSM_SLHA2.h +++ b/epochX/cudacpp/susy_gg_tt.mad/src/HelAmps_MSSM_SLHA2.h @@ -2,13 +2,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Sep 2010) for the MG5aMC backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -860,7 +860,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ INLINE void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -873,7 +873,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ INLINE void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -885,7 +885,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ INLINE void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -898,7 +898,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ INLINE void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -911,7 +911,7 @@ namespace mg5amcCpu //========================================================================== // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] - template + template __device__ void VVV1P0_1( const fptype allV2[], const fptype allV3[], @@ -924,7 +924,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); const cxtype cI = cxmake( 0., 1. ); const fptype_sv P2[4] = { +cxreal( V2[0] ), +cxreal( V2[1] ), +cximag( V2[1] ), +cximag( V2[0] ) }; @@ -949,7 +949,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] - template + template __device__ void FFV1_0( const fptype allF1[], const fptype allF2[], @@ -962,7 +962,7 @@ namespace mg5amcCpu const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); const cxtype cI = cxmake( 0., 1. 
); const cxtype_sv TMP5 = ( F1[2] * ( F2[4] * ( V3[2] + V3[5] ) + F2[5] * ( V3[3] + cI * V3[4] ) ) + ( F1[3] * ( F2[4] * ( V3[3] - cI * V3[4] ) + F2[5] * ( V3[2] - V3[5] ) ) + ( F1[4] * ( F2[2] * ( V3[2] - V3[5] ) - F2[3] * ( V3[3] + cI * V3[4] ) ) + F1[5] * ( F2[2] * ( -V3[3] + cI * V3[4] ) + F2[3] * ( V3[2] + V3[5] ) ) ) ) ); @@ -974,7 +974,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] - template + template __device__ void FFV1_1( const fptype allF2[], const fptype allV3[], @@ -987,7 +987,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F1 = W_ACCESS::kernelAccess( allF1 ); const cxtype cI = cxmake( 0., 1. ); F1[0] = +F2[0] + V3[0]; @@ -1006,7 +1006,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] - template + template __device__ void FFV1_2( const fptype allF1[], const fptype allV3[], @@ -1019,7 +1019,7 @@ namespace mg5amcCpu mgDebug( 0, __FUNCTION__ ); const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); - const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP ); cxtype_sv* F2 = W_ACCESS::kernelAccess( allF2 ); const cxtype cI = cxmake( 0., 1. ); F2[0] = +F1[0] + V3[0]; diff --git a/epochX/cudacpp/susy_gg_tt.mad/src/Parameters_MSSM_SLHA2.cc b/epochX/cudacpp/susy_gg_tt.mad/src/Parameters_MSSM_SLHA2.cc index aa00d6a9e4..0a62a7059c 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/src/Parameters_MSSM_SLHA2.cc +++ b/epochX/cudacpp/susy_gg_tt.mad/src/Parameters_MSSM_SLHA2.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== diff --git a/epochX/cudacpp/susy_gg_tt.mad/src/Parameters_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_tt.mad/src/Parameters_MSSM_SLHA2.h index 3e29f2ccbe..2b51d933c5 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/src/Parameters_MSSM_SLHA2.h +++ b/epochX/cudacpp/susy_gg_tt.mad/src/Parameters_MSSM_SLHA2.h @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. 
// Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -878,7 +878,7 @@ namespace mg5amcCpu #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #endif // Compute the output couplings (e.g. gc10 and gc11) from the input gs - template + template __device__ inline void G2COUP( const fptype gs[], fptype couplings[], @@ -888,10 +888,10 @@ namespace mg5amcCpu using namespace Parameters_MSSM_SLHA2_dependentCouplings; const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr ); - fptype* GC_6s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_6 ); - fptype* GC_51s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_51 ); - cxtype_sv_ref GC_6s_sv = C_ACCESS::kernelAccess( GC_6s ); - cxtype_sv_ref GC_51s_sv = C_ACCESS::kernelAccess( GC_51s ); + fptype* GC_6s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_6 ); + fptype* GC_51s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_51 ); + cxtype_sv_ref GC_6s_sv = CD_ACCESS::kernelAccess( GC_6s ); + cxtype_sv_ref GC_51s_sv = CD_ACCESS::kernelAccess( GC_51s ); GC_6s_sv = couplings_sv.GC_6; GC_51s_sv = couplings_sv.GC_51; mgDebug( 1, __FUNCTION__ ); diff --git a/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuConfig.h index 7c6a082392..ca859a602e 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -74,6 +74,7 @@ #define MGONGPU_FPTYPE2_DOUBLE 1 // default //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster #endif + // Choose whether to inline all HelAmps functions // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) // By default, do not inline, but allow this macro to be set from outside with e.g. 
-DMGONGPU_INLINE_HELAMPS @@ -108,10 +109,23 @@ #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif +// Choose if cuBLAS and hipBLAS are supported for generating random numbers +// For both CUDA and HIP, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS +// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?) +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#elif defined __HIPCC__ +//#undef MGONGPU_HAS_NO_BLAS // default +////#define MGONGPU_HAS_NO_BLAS 1 +#else +#define MGONGPU_HAS_NO_BLAS 1 +#endif + // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #undef MGONGPU_NSIGHT_DEBUG // default in CUDA -//#define MGONGPU_NSIGHT_DEBUG 1 +//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED! #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif /* clang-format on */ @@ -232,19 +246,19 @@ using mgOnGpu::fptype2; #endif /* clang-format off */ -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!] // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuCxtypes.h index 92d74fd6db..e98e925f2a 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. 
+// Further modified by: J. Teig, A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -717,12 +717,24 @@ namespace mg5amcCpu : m_preal( &r ), m_pimag( &i ) {} // copy (create from) const refs cxtype_ref& operator=( const cxtype_ref& ) = delete; //__host__ __device__ cxtype_ref& operator=( cxtype_ref&& c ) {...} // REMOVED! Should copy refs or copy values? No longer needed in cxternary - __host__ __device__ cxtype_ref& operator=( const cxtype& c ) + __host__ __device__ cxtype_ref& operator=( const cxtype& c ) // copy (assign) const values { *m_preal = cxreal( c ); *m_pimag = cximag( c ); return *this; - } // copy (assign) non-const values + } + __host__ __device__ cxtype_ref& operator+=( const cxtype& c ) + { + *m_preal += cxreal( c ); + *m_pimag += cximag( c ); + return *this; + } + __host__ __device__ cxtype_ref& operator-=( const cxtype& c ) + { + *m_preal -= cxreal( c ); + *m_pimag -= cximag( c ); + return *this; + } __host__ __device__ operator cxtype() const { return cxmake( *m_preal, *m_pimag ); } private: fptype* const m_preal; // const pointer to non-const fptype R diff --git a/epochX/cudacpp/susy_gg_tt.mad/test/cudacpp_test.mk b/epochX/cudacpp/susy_gg_tt.mad/test/cudacpp_test.mk index f703a1ae7c..1f9f8bbc46 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/test/cudacpp_test.mk +++ b/epochX/cudacpp/susy_gg_tt.mad/test/cudacpp_test.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) @@ -19,7 +19,7 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt index 420090461f..0cf1202c7e 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt +++ b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt @@ -15,7 +15,7 @@ Loading plugin MG5aMC_PLUGIN.CUDACPP_OUTPUT * * * * * * * * * * * * -* VERSION 3.6.0 2024-09-30 * +* VERSION 3.6.3 2025-06-12 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * @@ -49,12 +49,15 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt.mg +import /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F import model MSSM_SLHA2 +INFO: load particles +INFO: load vertices +DEBUG: model prefixing takes 0.9217190742492676  INFO: Restrict model MSSM_SLHA2 with file models/MSSM_SLHA2/restrict_default.dat . INFO: Detect SLHA2 format. keeping restricted parameter in the param_card DEBUG: Simplifying conditional expressions  @@ -550,45 +553,45 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.118 s +1 processes with 3 diagrams generated in 0.097 s Total: 1 processes with 3 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_susy_gg_tt Output will be done with PLUGIN: CUDACPP_OUTPUT -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 171]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 176]  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 175]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 180]  +INFO: Creating subdirectories in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 218]  -DEBUG: type(subproc_group)= [output.py at line 219]  -DEBUG: type(fortran_model)= [output.py at line 220]  -DEBUG: type(me)= me=0 [output.py at line 221]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 222]  -INFO: Creating files in directory 
/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.cc -INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/. +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 222]  +DEBUG: type(subproc_group)= [output.py at line 223]  +DEBUG: type(fortran_model)= [output.py at line 224]  +DEBUG: type(me)= me=0 [output.py at line 225]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'color_sum.h', 'diagram_boilerplate.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'MemoryAccessChannelIds.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'valgrind.h', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 226]  +INFO: Creating files in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/. Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.136 s +ALOHA: aloha creates 2 routines in 0.131 s VVV1 FFV1 FFV1 FFV1 -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h -INFO: Created file HelAmps_MSSM_SLHA2.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. 
+FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h +INFO: Created file HelAmps_MSSM_SLHA2.h in directory /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h -FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.cc +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h +FileWriter for /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.cc INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in directory -INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. +INFO: /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. and /data/avalassi/GPU2025/test-madgraph4gpu/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. quit -real 0m1.291s -user 0m1.202s +real 0m2.336s +user 0m2.218s sys 0m0.072s Code generation completed in 2 seconds diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/Bridge.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/Bridge.h index 87aa648dd2..a45024704a 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2021-2025) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -255,11 +255,15 @@ namespace mg5amcCpu throw std::logic_error( "Bridge constructor: FIXME! cannot choose gputhreads" ); // this should never happen! m_gpublocks = m_nevt / m_gputhreads; } +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devChannelIds, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; +#endif m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters @@ -290,8 +294,10 @@ namespace mg5amcCpu throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); m_gpublocks = gpublocks; m_gputhreads = gputhreads; +#ifdef MGONGPUCPP_VERBOSE std::cout << "WARNING! 
Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; +#endif m_pmek->setGrid( m_gpublocks, m_gputhreads ); } #endif @@ -347,7 +353,9 @@ namespace mg5amcCpu if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); copyHostFromDevice( m_hstMEs, m_devMEs ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif copyHostFromDevice( m_hstSelHel, m_devSelHel ); copyHostFromDevice( m_hstSelCol, m_devSelCol ); if constexpr( std::is_same_v ) @@ -400,7 +408,9 @@ namespace mg5amcCpu } if( goodHelOnly ) return; m_pmek->computeMatrixElements( useChannelIds ); +#ifdef MGONGPUCPP_VERBOSE flagAbnormalMEs( m_hstMEs.data(), m_nevt ); +#endif if constexpr( std::is_same_v ) { memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/GpuAbstraction.h index 1afb14d668..8a37d1f947 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/GpuAbstraction.h @@ -1,17 +1,23 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 +#include "mgOnGpuConfig.h" + #include //-------------------------------------------------------------------------- #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#ifndef MGONGPU_HAS_NO_BLAS +#include "cublas_v2.h" +#endif + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -21,24 +27,61 @@ #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( cudaMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( cudaGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice cudaSetDevice #define gpuDeviceSynchronize cudaDeviceSynchronize #define gpuDeviceReset cudaDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t cudaStream_t +#define gpuStreamCreate( pStream ) checkGpu( cudaStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( cudaStreamDestroy( stream ) ) + +#define gpuBlasStatus_t cublasStatus_t +#define GPUBLAS_STATUS_SUCCESS CUBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t cublasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate cublasCreate +#define gpuBlasDestroy cublasDestroy +#define gpuBlasSetStream cublasSetStream + +#define gpuBlasSaxpy cublasSaxpy +#define gpuBlasSdot cublasSdot +#define gpuBlasSgemv cublasSgemv +#define gpuBlasSgemm cublasSgemm +#define gpuBlasSgemmStridedBatched cublasSgemmStridedBatched +#define gpuBlasDaxpy cublasDaxpy +#define gpuBlasDdot cublasDdot +#define gpuBlasDgemv cublasDgemv +#define gpuBlasDgemm cublasDgemm +#define gpuBlasDgemmStridedBatched cublasDgemmStridedBatched +#define GPUBLAS_OP_N CUBLAS_OP_N +#define GPUBLAS_OP_T CUBLAS_OP_T //-------------------------------------------------------------------------- #elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_BLAS +#include "hipblas/hipblas.h" +#endif + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString @@ -48,22 +91,69 @@ #define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) #define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemset( data, value, bytes ) checkGpu( hipMemset( data, value, bytes ) ) #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) #define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) +#define gpuGetSymbolAddress( devPtr, symbol ) checkGpu( hipGetSymbolAddress( devPtr, symbol ) ) + #define gpuSetDevice hipSetDevice #define gpuDeviceSynchronize hipDeviceSynchronize #define gpuDeviceReset hipDeviceReset #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) -#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_> +#define gpuLaunchKernelStream( kernel, blocks, threads, stream, ... 
) kernel<<>>( __VA_ARGS__ ) + +#define gpuStream_t hipStream_t +#define gpuStreamCreate( pStream ) checkGpu( hipStreamCreate( pStream ) ) +#define gpuStreamDestroy( stream ) checkGpu( hipStreamDestroy( stream ) ) + +#define gpuBlasStatus_t hipblasStatus_t +#define GPUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#ifndef MGONGPU_HAS_NO_BLAS +#define gpuBlasHandle_t hipblasHandle_t +#else +#define gpuBlasHandle_t void // hack to keep the same API also in noBLAS builds +#endif +#define gpuBlasCreate hipblasCreate +#define gpuBlasDestroy hipblasDestroy +#define gpuBlasSetStream hipblasSetStream + +#define gpuBlasSaxpy hipblasSaxpy +#define gpuBlasSdot hipblasSdot +#define gpuBlasSgemv hipblasSgemv +#define gpuBlasSgemm hipblasSgemm +#define gpuBlasSgemmStridedBatched hipblasSgemmStridedBatched +#define gpuBlasDaxpy hipblasDaxpy +#define gpuBlasDdot hipblasDdot +#define gpuBlasDgemv hipblasDgemv +#define gpuBlasDgemm hipblasDgemm +#define gpuBlasDgemmStridedBatched hipblasDgemmStridedBatched +#define GPUBLAS_OP_N HIPBLAS_OP_N +#define GPUBLAS_OP_T HIPBLAS_OP_T + +#endif //-------------------------------------------------------------------------- +#ifdef MGONGPU_FPTYPE2_FLOAT +#define gpuBlasTaxpy gpuBlasSaxpy +#define gpuBlasTdot gpuBlasSdot +#define gpuBlasTgemv gpuBlasSgemv +#define gpuBlasTgemm gpuBlasSgemm +#define gpuBlasTgemmStridedBatched gpuBlasSgemmStridedBatched +#else +#define gpuBlasTaxpy gpuBlasDaxpy +#define gpuBlasTdot gpuBlasDdot +#define gpuBlasTgemv gpuBlasDgemv +#define gpuBlasTgemm gpuBlasDgemm +#define gpuBlasTgemmStridedBatched gpuBlasDgemmStridedBatched #endif #endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/GpuRuntime.h index 860c7fde16..086aa6a616 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/GpuRuntime.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/GpuRuntime.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 @@ -30,6 +30,22 @@ inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS +#define checkGpuBlas( code ){ assertGpuBlas( code, __FILE__, __LINE__ ); } +inline void assertGpuBlas( gpuBlasStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != GPUBLAS_STATUS_SUCCESS ) + { + printf( "ERROR! 
assertGpuBlas: '%d' in %s:%d\n", code, file, line ); + if( abort ) assert( code == GPUBLAS_STATUS_SUCCESS ); + } +} +#endif +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -50,7 +66,7 @@ namespace mg5amcGpu // Set up CUDA application // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) + static void setUp( const bool debug = false ) // ZW: changed debug default to false { // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! @@ -71,7 +87,7 @@ namespace mg5amcGpu // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) + static void tearDown( const bool debug = false ) // ZW: changed debug default to false { if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; checkGpu( gpuDeviceReset() ); diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc index f463977c1a..a68ae314eb 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. 
#include "MatrixElementKernels.h" @@ -60,7 +60,9 @@ namespace mg5amcCpu #ifdef MGONGPU_CHANNELID_DEBUG MatrixElementKernelBase::dumpNevtProcessedByChannel(); #endif +#ifdef MGONGPUCPP_VERBOSE MatrixElementKernelBase::dumpSignallingFPEs(); +#endif } //-------------------------------------------------------------------------- @@ -164,7 +166,7 @@ namespace mg5amcCpu , m_denominators( nevt ) #endif { - //std::cout << "DEBUG: MatrixElementKernelHost ctor " << this << std::endl; + //std::cout << "DEBUG: MatrixElementKernelHost::ctor " << this << std::endl; if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: channelIds must be a device array" ); @@ -191,14 +193,14 @@ namespace mg5amcCpu MatrixElementKernelHost::~MatrixElementKernelHost() { + //std::cout << "DEBUG: MatrixElementKernelBase::dtor " << this << std::endl; } //-------------------------------------------------------------------------- int MatrixElementKernelHost::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - HostBufferHelicityMask hstIsGoodHel( ncomb ); + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -206,7 +208,7 @@ namespace mg5amcCpu #else sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); #endif - // ... 0d2. Copy back good helicity list to static memory on the host + // ... 0d2. Copy good helicity list to static memory on the host // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] return sigmaKin_setGoodHel( hstIsGoodHel.data() ); } @@ -218,10 +220,10 @@ namespace mg5amcCpu computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ); #else assert( useChannelIds == false ); - sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelHost::computeMatrixElements " << this << " " << ( useChannelIds ? 
"T" : "F" ) << " " << nevt() << std::endl; @@ -312,16 +314,28 @@ namespace mg5amcGpu : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) , NumberOfEvents( gpublocks * gputhreads ) , m_couplings( this->nevt() ) + , m_pHelMEs() + , m_pHelJamps() + , m_pHelWfs() #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( this->nevt() ) - , m_denominators( this->nevt() ) + , m_pHelNumerators() + , m_pHelDenominators() + , m_colJamp2s( CPPProcess::ncolor * this->nevt() ) #endif #ifdef MGONGPU_CHANNELID_DEBUG , m_hstChannelIds( this->nevt() ) #endif +#ifndef MGONGPU_HAS_NO_BLAS + , m_blasColorSum( false ) + , m_blasTf32Tensor( false ) + , m_pHelBlasTmp() + , m_helBlasHandles() +#endif + , m_helStreams() , m_gpublocks( gpublocks ) , m_gputhreads( gputhreads ) { + //std::cout << "DEBUG: MatrixElementKernelDevice::ctor " << this << std::endl; if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); if( !m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: channelIds must be a device array" ); // FIXME?! @@ -339,12 +353,82 @@ namespace mg5amcGpu sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; throw std::runtime_error( sstr.str() ); } + // Create the "one-helicity" jamp buffer that will be used for helicity filtering + m_pHelJamps.reset( new DeviceBufferSimple( CPPProcess::ncolor * mgOnGpu::nx2 * this->nevt() ) ); + // Create the "one-helicity" wavefunction buffer that will be used for helicity filtering + m_pHelWfs.reset( new DeviceBufferSimple( CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * this->nevt() ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Create the "one-helicity" numerator and denominator buffers that will be used for helicity filtering + m_pHelNumerators.reset( new DeviceBufferSimple( this->nevt() ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( this->nevt() ) ); +#endif + // Decide at runtime whether to use BLAS for color sums + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + static bool first = true; + if( first ) + { + first = false; + // Analyse environment variable CUDACPP_RUNTIME_BLASCOLORSUM + const char* blasEnv = getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" ); + if( blasEnv && std::string( blasEnv ) != "" ) + { +#ifndef MGONGPU_HAS_NO_BLAS + m_blasColorSum = true; // fixme? eventually set default=true and decode "Y" and "N" choices? 
+ std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty: enable BLAS" << std::endl; +#else + throw std::runtime_error( "Env variable CUDACPP_RUNTIME_BLASCOLORSUM is set and non-empty, but BLAS was disabled at build time" ); +#endif + } + else + { +#ifndef MGONGPU_HAS_NO_BLAS + std::cout << "INFO: Env variable CUDACPP_RUNTIME_BLASCOLORSUM is empty or not set: disable BLAS" << std::endl; +#else + std::cout << "INFO: BLAS was disabled at build time" << std::endl; +#endif + } +#ifndef MGONGPU_HAS_NO_BLAS +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + // Analyse environment variable CUDACPP_RUNTIME_CUBLASTF32TENSOR + const char* blasEnv2 = getenv( "CUDACPP_RUNTIME_CUBLASTF32TENSOR" ); + if( blasEnv2 && std::string( blasEnv2 ) != "" ) + { + if( m_blasColorSum ) + { +#ifdef MGONGPU_FPTYPE2_FLOAT + m_blasTf32Tensor = true; + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty: enable CUBLAS_TF32_TENSOR_OP_MATH" << std::endl; +#else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but color sums use FP64" << std::endl; +#endif + } + else + std::cout << "WARNING! Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is set and non-empty, but BLAS was disabled at runtime" << std::endl; + } +#ifdef MGONGPU_FPTYPE2_FLOAT + else + { + if( m_blasColorSum ) + std::cout << "INFO: Env variable CUDACPP_RUNTIME_CUBLASTF32TENSOR is empty or not set: keep cuBLAS math defaults" << std::endl; + } +#endif +#endif +#endif + } } //-------------------------------------------------------------------------- MatrixElementKernelDevice::~MatrixElementKernelDevice() { + //std::cout << "DEBUG: MatrixElementKernelDevice::dtor " << this << std::endl; + for( int ihel = 0; ihel < CPPProcess::ncomb; ihel++ ) + { +#ifndef MGONGPU_HAS_NO_BLAS + if( m_helBlasHandles[ihel] ) gpuBlasDestroy( m_helBlasHandles[ihel] ); +#endif + if( m_helStreams[ihel] ) gpuStreamDestroy( m_helStreams[ihel] ); // do not destroy if nullptr + } } //-------------------------------------------------------------------------- @@ -361,21 +445,61 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations - PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); - DeviceBufferHelicityMask devIsGoodHel( ncomb ); - // ... 0d1. Compute good helicity mask on the device + PinnedHostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + // ... 0d1. 
Compute good helicity mask (a host variable) on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + const int nevt = m_gpublocks * m_gputhreads; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_pHelJamps->data(), m_pHelWfs->data(), hstIsGoodHel.data(), nevt ); #endif - checkGpu( gpuPeekAtLastError() ); - // ... 0d2. Copy back good helicity mask to the host - copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); - // ... 0d3. Copy back good helicity list to constant memory on the device - return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + // ... 0d3. Set good helicity list in host static memory + int nGoodHel = sigmaKin_setGoodHel( hstIsGoodHel.data() ); + assert( nGoodHel > 0 ); // SANITY CHECK: there should be at least one good helicity + // Create one GPU stream for each good helicity + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + gpuStreamCreate( &m_helStreams[ighel] ); +#ifndef MGONGPU_HAS_NO_BLAS + // Create one cuBLAS/hipBLAS handle for each good helicity + // Attach a different stream to each cuBLAS/hipBLAS handle + if( m_blasColorSum ) + for( int ighel = 0; ighel < nGoodHel; ighel++ ) + { + checkGpuBlas( gpuBlasCreate( &m_helBlasHandles[ighel] ) ); + checkGpuBlas( gpuBlasSetStream( m_helBlasHandles[ighel], m_helStreams[ighel] ) ); +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) + if( m_blasTf32Tensor ) + checkGpuBlas( cublasSetMathMode( m_helBlasHandles[ighel], CUBLAS_TF32_TENSOR_OP_MATH ) ); // enable TF32 tensor cores +#endif + } +#endif + // ... Create the "many-helicity" super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + m_pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelJamps.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); + // ... Create the "many-helicity" super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + // ... (calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelWfs.reset( new DeviceBufferSimple( nGoodHel * CPPProcess::nwf * CPPProcess::nw6 * mgOnGpu::nx2 * nevt ) ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // ... Create the "many-helicity" super-buffers of nGoodHel numerator and denominator buffers (dynamically allocated) + // ... 
(calling reset here deletes the previously created "one-helicity" buffers used for helicity filtering) + m_pHelNumerators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); + m_pHelDenominators.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); +#endif +#ifndef MGONGPU_HAS_NO_BLAS + // Create the "many-helicity" super-buffers of real/imag ncolor*nevt temporary buffers for cuBLAS/hipBLAS intermediate results in color_sum_blas +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * ( 2 * CPPProcess::ncolor * mgOnGpu::nx2 + 1 ) * nevt ) ); +#else + // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer per good helicity + if( m_blasColorSum ) m_pHelBlasTmp.reset( new DeviceBufferSimple2( nGoodHel * CPPProcess::ncolor * mgOnGpu::nx2 * nevt ) ); +#endif +#endif + // Return the number of good helicities + return nGoodHel; } //-------------------------------------------------------------------------- @@ -383,17 +507,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const bool useChannelIds ) { gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); -#ifndef MGONGPU_NSIGHT_DEBUG - constexpr unsigned int sharedMemSize = 0; +#ifndef MGONGPU_HAS_NO_BLAS + fptype2* ghelAllBlasTmp = ( m_blasColorSum ? m_pHelBlasTmp->data() : nullptr ); + gpuBlasHandle_t* ghelBlasHandles = ( m_blasColorSum ? m_helBlasHandles : nullptr ); #else - constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); + fptype2* ghelAllBlasTmp = nullptr; + gpuBlasHandle_t* ghelBlasHandles = nullptr; #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), pChannelIds, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_colJamp2s.data(), m_pHelNumerators->data(), m_pHelDenominators->data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #else assert( useChannelIds == false ); - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), m_pHelMEs->data(), m_pHelJamps->data(), m_pHelWfs->data(), ghelAllBlasTmp, ghelBlasHandles, m_helStreams, m_gpublocks, m_gputhreads ); #endif #ifdef MGONGPU_CHANNELID_DEBUG //std::cout << "DEBUG: MatrixElementKernelDevice::computeMatrixElements " << this << " " << ( useChannelIds ? "T" : "F" ) << " " << nevt() << std::endl; @@ -401,8 +527,8 @@ namespace mg5amcGpu const unsigned int* pHstChannelIds = ( useChannelIds ? 
m_hstChannelIds.data() : nullptr ); MatrixElementKernelBase::updateNevtProcessedByChannel( pHstChannelIds, nevt() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); // is this needed? + checkGpu( gpuDeviceSynchronize() ); // probably not needed? but it avoids errors in sigmaKin above from appearing later on in random places... } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.h index 7acff4b308..c901874333 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.h @@ -1,16 +1,19 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi, Z. Wettersten (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include +#include #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -134,7 +137,7 @@ namespace mg5amcCpu // Does this host system support the SIMD used in the matrix element calculation? // [NB: this is private, SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] - static bool hostSupportsSIMD( const bool verbose = true ); + static bool hostSupportsSIMD( const bool verbose = false ); // ZW: default verbose false private: @@ -191,12 +194,24 @@ namespace mg5amcCpu // The buffer for the event-by-event couplings that depends on alphas QCD DeviceBufferCouplings m_couplings; + // The super-buffer of nGoodHel ME buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelMEs; + + // The super-buffer of nGoodHel jamp buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelJamps; + + // The super-buffer of nGoodHel wavefunction buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelWfs; + #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - DeviceBufferNumerators m_numerators; + // The super-buffer of nGoodHel numerator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelNumerators; - // The buffer for the event-by-event denominators of multichannel factors - DeviceBufferDenominators m_denominators; + // The super-buffer of nGoodHel denominator buffers (dynamically allocated because nGoodHel is determined at runtime) + std::unique_ptr<DeviceBufferSimple> m_pHelDenominators; + + // The super-buffer of ncolor jamp2 buffers + DeviceBufferSimple m_colJamp2s; #endif #ifdef MGONGPU_CHANNELID_DEBUG @@ -205,6 +220,23 @@ namespace mg5amcCpu PinnedHostBufferChannelIds m_hstChannelIds; #endif +#ifndef MGONGPU_HAS_NO_BLAS + // Decide at runtime whether to use BLAS for color sums + bool m_blasColorSum; + + // Decide at runtime whether TF32TENSOR math should be used in cuBLAS + bool m_blasTf32Tensor; + + // The super-buffer of nGoodHel cuBLAS/hipBLAS temporary buffers + std::unique_ptr<DeviceBufferSimple2> m_pHelBlasTmp;
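// [Illustration only, not part of this patch] The unique_ptr members above implement
// a "super-buffer" pattern: one device allocation holds nGoodHel contiguous
// per-helicity slices and is only created once nGoodHel is known at runtime, e.g.
// (with illustrative local names nGoodHel, nevt and ighel):
//   std::unique_ptr<DeviceBufferSimple> pHelMEs;                // empty until helicity filtering
//   pHelMEs.reset( new DeviceBufferSimple( nGoodHel * nevt ) ); // one slice per good helicity
//   fptype* sliceMEs = pHelMEs->data() + ighel * nevt;          // slice for helicity ighel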
+ + // The array of cuBLAS/hipBLAS handles (one for each good helicity) + gpuBlasHandle_t m_helBlasHandles[CPPProcess::ncomb]; // reserve ncomb handles (but only nGoodHel <= ncomb will be used) +#endif + + // The array of GPU streams (one for each good helicity) + gpuStream_t m_helStreams[CPPProcess::ncomb]; // reserve ncomb streams (but only nGoodHel <= ncomb will be used) + // The number of blocks in the GPU grid size_t m_gpublocks; diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessAmplitudes.h index 0d92f69c43..a49f041e05 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessAmplitudes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessAmplitudes_H #define MemoryAccessAmplitudes_H 1 @@ -10,10 +10,6 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_AMPLITUDES 1 - // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -23,120 +19,11 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // A class describing the internal layout of memory buffers for amplitudes - // This implementation uses an AOSOA[npagA][nx2][neppA] where nevt=npagA*neppA - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessAmplitudesBase //_AOSOAv1 - { - public: - - // Number of Events Per Page in the amplitude AOSOA memory buffer layout - static constexpr int neppA = 1; // AOS (just a test...)
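// [Context: a standalone sketch, not repository code] The index arithmetic implemented
// by this deleted AOSOA[npagA][nx2][neppA] class, as a self-contained function:
inline int aosoaIndexSketch( int ievt, int ix2, int neppA, int nx2 )
{
  const int ipagA = ievt / neppA; // event "A-page"
  const int ieppA = ievt % neppA; // event within the A-page
  return ipagA * nx2 * neppA + ix2 * neppA + ieppA; // AOSOA[ipagA][ix2][ieppA]
}
// With neppA = 1 as hardcoded above this degenerates to plain AOS, i.e. ievt * nx2 + ix2.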
- - private: - - friend class MemoryAccessHelper; - friend class KernelAccessHelper; - friend class KernelAccessHelper; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) - { - const int ipagA = ievt / neppA; // #event "A-page" - const int ieppA = ievt % neppA; // #event in the current event A-page - constexpr int ix2 = 0; - return &( buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA] ); // AOSOA[ipagA][ix2][ieppA] - } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... args" to "const int ix2" and rename "Field" as "Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int ix2 ) - { - constexpr int ipagA = 0; - constexpr int ieppA = 0; - return buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA]; // AOSOA[ipagA][ix2][ieppA] - } - }; - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessAmplitudes : public MemoryAccessAmplitudesBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2 = MemoryAccessHelper::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2Const = - MemoryAccessHelper::template decodeRecordConst; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt, 
const int ix2 ) <===] - static constexpr auto ieventAccessIx2 = - MemoryAccessHelper::template ieventAccessField; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===] - static constexpr auto ieventAccessIx2Const = - MemoryAccessHelper::template ieventAccessFieldConst; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations + // A class providing trivial access to amplitude memory buffers template class KernelAccessAmplitudes { public: - -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2 = - KernelAccessHelper::template kernelAccessField; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2Const = - KernelAccessHelper::template kernelAccessFieldConst; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { @@ -148,8 +35,6 @@ namespace mg5amcCpu { return reinterpret_cast( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES }; //---------------------------------------------------------------------------- diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessCouplings.h index 3802fa57c0..26345d4b43 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessCouplings.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. 
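The hunks below only relabel two debug printfs from C_ACCESS to CD_ACCESS, matching the alias actually used for the dependent couplings. For context, the accessors being relabelled compose a complex value out of separately stored real and imaginary fields; a minimal standalone sketch of that technique (illustrative stand-in types, not the plugin's cxtype_sv_ref machinery):

struct CxRefSketch { double& re; double& im; };   // stand-in for cxtype_sv_ref
inline CxRefSketch complexAccessSketch( double* buffer )
{
  return CxRefSketch{ buffer[0], buffer[1] };     // cf. kernelAccessIx2( buffer, 0 ) and ( buffer, 1 )
}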
#ifndef MemoryAccessCouplings_H #define MemoryAccessCouplings_H 1 @@ -235,7 +235,7 @@ namespace mg5amcCpu /* fptype_sv& real = kernelAccessIx2( buffer, 0 ); fptype_sv& imag = kernelAccessIx2( buffer, 1 ); - printf( "C_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv_ref( real, imag ); */ return cxtype_sv_ref( kernelAccessIx2( buffer, 0 ), @@ -250,7 +250,7 @@ namespace mg5amcCpu /* const fptype_sv& real = kernelAccessIx2Const( buffer, 0 ); const fptype_sv& imag = kernelAccessIx2Const( buffer, 1 ); - printf( "C_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + printf( "CD_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); return cxtype_sv( real, imag ); */ return cxtype_sv( kernelAccessIx2Const( buffer, 0 ), diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessWavefunctions.h index 9f4c620bc7..bbffc1fb36 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessWavefunctions.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessWavefunctions_H #define MemoryAccessWavefunctions_H 1 @@ -10,9 +10,7 @@ #include "mgOnGpuCxtypes.h" -#include "MemoryAccessHelpers.h" - -#define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 +#include "CPPProcess.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL @@ -23,147 +21,44 @@ namespace mg5amcCpu { //---------------------------------------------------------------------------- -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // A class describing the internal layout of memory buffers for wavefunctions - // This implementation uses an AOSOA[npagW][nw6][nx2][neppW] where nevt=npagW*neppW - // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] - class MemoryAccessWavefunctionsBase //_AOSOAv1 +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessWavefunctions { public: - - // Number of Events Per Page in the wavefunction AOSOA memory buffer layout - static constexpr int neppW = 1; // AOS (just a test...) 
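// [Context: a standalone sketch, not repository code] The analogous index arithmetic for
// this deleted wavefunction AOSOA[npagW][nw6][nx2][neppW] layout, which adds the nw6
// component axis on top of the amplitude case sketched earlier:
inline int aosoaWfIndexSketch( int ievt, int iw6, int ix2, int neppW, int nw6, int nx2 )
{
  const int ipagW = ievt / neppW; // event "W-page"
  const int ieppW = ievt % neppW; // event within the W-page
  return ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW; // AOSOA[ipagW][iw6][ix2][ieppW]
}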
- - private: - - friend class MemoryAccessHelper; - friend class KernelAccessHelper; - friend class KernelAccessHelper; - - // The number of components of a (fermion or vector) wavefunction - static constexpr int nw6 = mgOnGpu::nw6; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) + static __host__ __device__ inline cxtype_sv* + kernelAccess( fptype* buffer ) { - const int ipagW = ievt / neppW; // #event "W-page" - const int ieppW = ievt % neppW; // #event in the current event W-page - constexpr int iw6 = 0; - constexpr int ix2 = 0; - return &( buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW] ); // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... 
args" to "const int iw6, const int ix2" and rename "Field" as "Iw6Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int iw6, - const int ix2 ) + static __host__ __device__ inline const cxtype_sv* + kernelAccessConst( const fptype* buffer ) { - constexpr int ipagW = 0; - constexpr int ieppW = 0; - return buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW]; // AOSOA[ipagW][iw6][ix2][ieppW] + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return reinterpret_cast( buffer + ievt * CPPProcess::nw6 * mgOnGpu::nx2 ); } }; +#endif //---------------------------------------------------------------------------- - // A class providing access to memory buffers for a given event, based on explicit event numbers - // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations - class MemoryAccessWavefunctions : public MemoryAccessWavefunctionsBase - { - public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2 = MemoryAccessHelper::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2Const = - MemoryAccessHelper::template decodeRecordConst; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIw6Ix2( fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2 = - MemoryAccessHelper::template ieventAccessField; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIw6Ix2Const( const fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2Const = - MemoryAccessHelper::template ieventAccessFieldConst; - }; - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - //---------------------------------------------------------------------------- - - // A class providing access to memory buffers for a given event, based on implicit kernel rules - // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations - template - class KernelAccessWavefunctions + class HostAccessWavefunctions { public: - -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes 
(input) - // [Signature (non-const) ===> fptype& kernelAccessIw6Ix2( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2 = - KernelAccessHelper::template kernelAccessField; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIw6Ix2Const( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2Const = - KernelAccessHelper::template kernelAccessFieldConst; - -#else - static __host__ __device__ inline cxtype_sv* kernelAccess( fptype* buffer ) { return reinterpret_cast( buffer ); } - static __host__ __device__ inline const cxtype_sv* kernelAccessConst( const fptype* buffer ) { return reinterpret_cast( buffer ); } - -#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS }; //---------------------------------------------------------------------------- - typedef KernelAccessWavefunctions HostAccessWavefunctions; - typedef KernelAccessWavefunctions DeviceAccessWavefunctions; - - //---------------------------------------------------------------------------- - } // end namespace mg5amcGpu/mg5amcCpu #endif // MemoryAccessWavefunctions_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryBuffers.h index 5bd3053393..0ddc356e1a 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. 
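One small but consequential change in the hunks below: the virtual destructor of BufferBase moves from protected to public, which (among other things) makes it legal to own a buffer polymorphically and delete it through a base-class pointer. A hedged sketch of the pattern this enables (illustrative types, not the plugin classes):

#include <memory>
struct BufferBaseSketch { virtual ~BufferBaseSketch() {} };        // public virtual dtor
struct DeviceBufferSketch : BufferBaseSketch { /* owns device memory */ };
std::unique_ptr<BufferBaseSketch> pBuf( new DeviceBufferSketch );  // deletion via the base pointer is now well defined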
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -34,6 +34,7 @@ namespace mg5amcCpu static constexpr size_t nparf = CPPProcess::nparf; static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_MSSM_SLHA2_dependentCouplings::ndcoup; + static constexpr size_t ncolor = CPPProcess::ncolor; } //-------------------------------------------------------------------------- @@ -69,8 +70,8 @@ namespace mg5amcCpu protected: BufferBase( const size_t size, const bool onDevice ) : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} - virtual ~BufferBase() {} public: + virtual ~BufferBase() {} T* data() { return m_data; } const T* data() const { return m_data; } T& operator[]( const size_t index ) { return m_data[index]; } @@ -167,8 +168,14 @@ namespace mg5amcCpu public: HostBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , HostBufferBase( sizePerEvent * nevt ) {} - virtual ~HostBuffer() {} + , HostBufferBase( sizePerEvent * nevt ) + { + //std::cout << "HostBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~HostBuffer() + { + //std::cout << "HostBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif @@ -194,19 +201,33 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template - class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + class DeviceBuffer : public DeviceBufferBase, virtual protected NumberOfEvents { public: DeviceBuffer( const size_t nevt ) : NumberOfEvents( nevt ) - , DeviceBufferBase( sizePerEvent * nevt ) {} - virtual ~DeviceBuffer() {} + , DeviceBufferBase( sizePerEvent * nevt ) + { + //std::cout << "DeviceBuffer::ctor " << this << " " << nevt << std::endl; + } + virtual ~DeviceBuffer() + { + //std::cout << "DeviceBuffer::dtor " << this << std::endl; + } virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } }; #endif //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // A class encapsulating a simple CUDA device buffer managed on an ad-hoc basis + typedef DeviceBuffer DeviceBufferSimple; + typedef DeviceBuffer DeviceBufferSimple2; +#endif + + //-------------------------------------------------------------------------- + // A base class encapsulating a memory buffer for momenta random numbers typedef BufferBase BufferRndNumMomenta; @@ -277,12 +298,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventNumerators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for numerators typedef PinnedHostBuffer PinnedHostBufferNumerators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for numerators typedef DeviceBuffer DeviceBufferNumerators; #endif #endif @@ -297,12 +318,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventDenominators = 1; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for denominators typedef 
PinnedHostBuffer<fptype, sizePerEventDenominators> PinnedHostBufferDenominators; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for denominators typedef DeviceBuffer<fptype, sizePerEventDenominators> DeviceBufferDenominators; #endif #endif @@ -316,12 +337,12 @@ namespace mg5amcCpu constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; #ifndef MGONGPUCPP_GPUIMPL - // A class encapsulating a C++ host buffer for gs + // A class encapsulating a C++ host buffer for couplings typedef HostBuffer<fptype, sizePerEventCouplings> HostBufferCouplings; #else - // A class encapsulating a CUDA pinned host buffer for gs + // A class encapsulating a CUDA pinned host buffer for couplings typedef PinnedHostBuffer<fptype, sizePerEventCouplings> PinnedHostBufferCouplings; - // A class encapsulating a CUDA device buffer for gs + // A class encapsulating a CUDA device buffer for couplings typedef DeviceBuffer<fptype, sizePerEventCouplings> DeviceBufferCouplings; #endif @@ -505,6 +526,16 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifdef MGONGPUCPP_GPUIMPL + // The size (number of elements) per event in a memory buffer for jamps + constexpr size_t sizePerEventJamps = MemoryBuffers::ncolor * MemoryBuffers::nx2; + + // A class encapsulating a CUDA device buffer for jamps + typedef DeviceBuffer<fptype, sizePerEventJamps> DeviceBufferJamps; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL template<class Tdst, class Tsrc> void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc index 6867c6d67d..d79ea62148 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc @@ -1,13 +1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v.
3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "GpuRuntime.h" #include "HelAmps_MSSM_SLHA2.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessChannelIds.h" @@ -25,6 +26,7 @@ #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessWavefunctions.h" +#include "color_sum.h" #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessDenominators.h" @@ -96,20 +98,16 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { - constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nwf = CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors using Parameters_MSSM_SLHA2_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_MSSM_SLHA2_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) - // The number of colors - constexpr int ncolor = 2; - - // The number of SIMD vectors of events processed by calculate_wavefunction + // The number of SIMD vectors of events processed by calculate_jamps #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT constexpr int nParity = 2; #else @@ -168,57 +166,112 @@ namespace mg5amcCpu // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; - __device__ __constant__ int cGoodHel[ncomb]; + __device__ __constant__ int dcNGoodHel; + __device__ __constant__ int dcGoodHel[ncomb]; #else static short cHel[ncomb][npar]; +#endif static int cNGoodHel; static int cGoodHel[ncomb]; + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + class DeviceAccessJamp2 + { + public: + static __device__ inline fptype& + kernelAccessIcol( fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + static __device__ inline const fptype& + kernelAccessIcolConst( const fptype* buffer, const int icol ) + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + return buffer[icol * nevt + ievt]; + } + }; #endif //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess - // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) - // In CUDA, this device function computes the ME for a single event - // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) - // *** NB: calculate_wavefunction accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 *** - __device__ INLINE void /* clang-format off */ - calculate_wavefunctions( int ihel, - const fptype* allmomenta, // input: momenta[nevt*npar*4] - const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] - fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPUCPP_GPUIMPL + __device__ INLINE unsigned int + gpu_channelId( const unsigned int* allChannelIds ) + { + unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int channelId, // input: multichannel SCALAR channelId (1 to #diagrams, 0 to disable SDE) for this event or SIMD vector - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events + // SCALAR 
channelId for the current event (CUDA) + if( allChannelIds != nullptr ) + { + const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) + const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) + // NB: channelIds_sv is a scalar in CUDA + channelId = channelIds_sv; + assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) + } #endif - fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL - , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + return channelId; + } +#endif + + //-------------------------------------------------------------------------- + +#include "diagrams.h" + + //-------------------------------------------------------------------------- + + // Evaluate QCD partial amplitudes jamps for this given helicity from Feynman diagrams + // Also compute running sums over helicities adding jamp2, numerator, denominator + // (NB: this function no longer handles matrix elements as the color sum has now been moved to a separate function/kernel) + // In CUDA, this function processes a single event + // ** NB1: NEW Nov2024! In CUDA this is now a kernel function (it used to be a device function) + // ** NB2: NEW Nov2024! in CUDA this now takes a channelId array as input (it used to take a scalar channelId as input) + // In C++, this function processes a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + // *** NB: in C++, calculate_jamps accepts a SCALAR channelId because it is GUARANTEED that all events in a SIMD vector have the same channelId #898 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allJamps, // output: jamp[ncolor*2*nevt] for this helicity + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE (#899/#911) + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + gpuStream_t gpustream, // input: cuda stream for this helicity + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + INLINE void + calculate_jamps( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + cxtype_sv* jamp_sv, // output: jamp_sv[ncolor] (f/d) or [2*ncolor] (m) for SIMD event page(s) ievt00 and helicity ihel +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: SCALAR channelId (1 to #diagrams, 0 to disable SDE) for SIMD event page(s) ievt00 + fptype* allNumerators, // input/output: multichannel numerators[nevt], add helicity ihel + fptype* allDenominators, // input/output: multichannel denominators[nevt], add helicity ihel +#endif + const int ievt00 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif - ) //ALWAYS_INLINE // attributes are not permitted in a function definition { #ifdef 
MGONGPUCPP_GPUIMPL - using namespace mg5amcGpu; - using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events - using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events #endif #else - using namespace mg5amcCpu; using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events - using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events - using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event - using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -226,294 +279,139 @@ namespace mg5amcCpu using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events #endif #endif /* clang-format on */ - mgDebug( 0, __FUNCTION__ ); - //bool debug = true; -#ifndef MGONGPUCPP_GPUIMPL - //debug = ( ievt00 >= 64 && ievt00 < 80 && ihel == 3 ); // example: debug #831 - //if( debug ) printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); -#endif - //if( debug ) printf( "calculate_wavefunctions: ihel=%d\n", ihel ); - - // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here - // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... - static const int nwf = 5; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // ---------------------------- + // --- WAVEFUNCTION BUFFERS --- + // ---------------------------- +#ifndef MGONGPUCPP_GPUIMPL // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] - // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need - // ** NB: to have large memory structurs for wavefunctions/amplitudes in all events (no kernel splitting yet)! 
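// [Context: a hedged sketch, not code from this patch] With the new kernel splitting, the
// CUDA path replaces such per-event local wavefunction buffers by per-event slices of the
// global allWfs buffer, mirroring the DeviceAccessWavefunctions::kernelAccess added earlier
// in this patch ("allWfs", "nw6" and "nx2" as in the surrounding code):
__device__ inline fptype* wfSliceSketch( fptype* allWfs, int nw6, int nx2 )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one event per GPU thread
  return allWfs + ievt * nw6 * nx2;                       // wavefunction slice for this event
}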
- //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + // ** NB: wavefunctions only need TRIVIAL ACCESS in C++ code cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) - cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram - - // Proof of concept for using fptype* in the interface - fptype* w_fp[nwf]; - for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast( w_sv[iwf] ); - fptype* amp_fp; - amp_fp = reinterpret_cast( amp_sv ); - - // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) - // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] - cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + fptype* wfs = reinterpret_cast( w_sv ); +#else + // Global-memory variables for a subset of Feynman diagrams in the given CUDA event (ievt) + // ** NB: wavefunctions need non-trivial access in CUDA code because of kernel splitting + fptype* wfs = allWfs; +#endif // === Calculate wavefunctions and amplitudes for all diagrams in all processes === // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - // Mixed fptypes #537: float for color algebra and double elsewhere - // Delay color algebra and ME updates (only on even pages) - cxtype_sv jamp_sv_previous[ncolor] = {}; - fptype* MEs_previous = 0; -#endif + + // ***************************** + // *** START LOOP ON IPARITY *** + // ***************************** for( int iParity = 0; iParity < nParity; ++iParity ) - { // START LOOP ON IPARITY + { #ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif - //constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings (BUG #823) - constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823) - const fptype* allCOUPs[nxcoup]; -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic push -#pragma nv_diag_suppress 186 // e.g. 
<<warning #186-D: pointless comparison of unsigned integer with zero>> -#endif - for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events + + // ----------------- + // --- COUPLINGS --- + // ----------------- #ifdef MGONGPUCPP_GPUIMPL -#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#pragma nv_diagnostic pop -#endif - // CUDA kernels take input/output buffers with momenta/MEs for all events - const fptype* momenta = allmomenta; - const fptype* COUPs[nxcoup]; - for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; - fptype* MEs = allMEs; -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = allNumerators; - fptype* denominators = allDenominators; -#endif + // CUDA diagram kernels take input/output buffers with couplings "fptype* couplings" for all events + const fptype* couplings = allcouplings; #else - // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) - const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + // C++ diagram kernels take input/output buffers with couplings "fptype** COUPs" for a single event or SIMD vector + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; const fptype* COUPs[nxcoup]; + // Dependent couplings, vary event-by-event for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) - COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event - //for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) // BUG #823 - for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // FIX #823 - COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events - fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); - fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); -#endif -#endif - - // Reset color flows (reset jamp_sv) at the beginning of a new event or event page - for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) - fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); - fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); + // Dependent couplings, vary event-by-event + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); + // Independent couplings, fixed for all events + for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; #endif
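The "proof of concept for using fptype* in the interface" above, and the new "fptype* wfs = reinterpret_cast<fptype*>( w_sv )" line, both rest on one layout assumption: a complex value is exactly two contiguous fptype values, so an array of complex wavefunctions can be handed to kernels as a flat fptype array (for the SIMD cxtype_sv the argument is analogous, with one real vector followed by one imaginary vector). A scalar sketch of that assumption, with std::complex standing in for the plugin's cxtype:

#include <complex>
#include <cstdio>
typedef double fptype;
typedef std::complex<double> cxtype; // stand-in for the plugin's complex type
int main()
{
  constexpr int nwf = 5, nw6 = 6; // five wavefunctions of six components each
  cxtype w_sv[nwf][nw6] = {};
  w_sv[0][0] = cxtype( 1., 2. );
  static_assert( sizeof( cxtype ) == 2 * sizeof( fptype ), "one complex must be two fptypes" );
  const fptype* wfs = reinterpret_cast<const fptype*>( w_sv ); // the same cast as above
  std::printf( "re=%f im=%f\n", wfs[0], wfs[1] ); // prints re=1.000000 im=2.000000
  return 0;
}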
3 *** - - // Wavefunction(s) for diagram number 1 - vxxxxx( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); - - vxxxxx( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); - - oxxxxx( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); - - ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - - VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[4] ); - - // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[1], -1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) + // --------------- + // --- MOMENTA --- + // --------------- +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with momenta for all events + const fptype* momenta = allmomenta; +#else + // C++ diagram kernels take input/output buffers with momenta for a single event or SIMD vector + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); #endif - jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; - jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; - - // *** DIAGRAM 2 OF 3 *** - - // Wavefunction(s) for diagram number 2 - FFV1_1( w_fp[2], w_fp[0], COUPs[1], -1.0, cIPD[0], cIPD[1], w_fp[4] ); - - // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[3], w_fp[4], w_fp[1], COUPs[1], -1.0, &amp_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) + // ------------- + // --- JAMPS --- + // ------------- + // (Note: no need to 'reset color flows' i.e. zero allJamps, this is done in sigmaKin and sigmaKin_getGoodHel) +#ifdef MGONGPUCPP_GPUIMPL + // In CUDA, write jamps to the output global-memory allJamps [for all events] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = allJamps; +#else + // In C++, write jamps to the output array [for one specific event or SIMD vector] passed as argument + // (write directly to J_ACCESS::kernelAccessIcol( allJamps, icol ) instead of writing to jamp_sv[icol]) + fptype* jamps = reinterpret_cast<fptype*>( iParity == 0 ? jamp_sv : &( jamp_sv[ncolor] ) ); #endif - jamp_sv[0] -= amp_sv[0]; - // *** DIAGRAM 3 OF 3 *** - - // Wavefunction(s) for diagram number 3 - FFV1_2( w_fp[3], w_fp[0], COUPs[1], -1.0, cIPD[0], cIPD[1], w_fp[4] ); - - // Amplitude(s) for diagram number 3 - FFV1_0( w_fp[4], w_fp[2], w_fp[1], COUPs[1], -1.0, &amp_fp[0] ); + // ------------------ + // --- CHANNELIDS --- + // ------------------ #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) -#endif - jamp_sv[1] -= amp_sv[0]; - - // *** COLOR CHOICE BELOW *** - // Store the leading color flows for choice of color - if( jamp2_sv ) // disable color choice if nullptr - for( int icol = 0; icol < ncolor; icol++ ) - jamp2_sv[ncolor * iParity + icol] += cxabs2( jamp_sv[icol] ); // may underflow #831 - - // *** COLOR MATRIX BELOW *** - // (This method used to be called CPPProcess::matrix_1_gg_ttx()?)
- - // The color denominators (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 denom[ncolor] = { 3, 3 }; // 1-D array[2] - - // The color matrix (initialize all array elements, with ncolor=2) - // [NB do keep 'static' for these constexpr arrays, see issue #283] - static constexpr fptype2 cf[ncolor][ncolor] = { - { 16, -2 }, - { -2, 16 } }; // 2-D array[2][2] - -#ifndef MGONGPUCPP_GPUIMPL - // Pre-compute a constexpr triangular color matrix properly normalized #475 - struct TriangularNormalizedColorMatrix - { - // See https://stackoverflow.com/a/34465458 - __host__ __device__ constexpr TriangularNormalizedColorMatrix() - : value() - { - for( int icol = 0; icol < ncolor; icol++ ) - { - // Diagonal terms - value[icol][icol] = cf[icol][icol] / denom[icol]; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; - } - } - fptype2 value[ncolor][ncolor]; - }; - static constexpr auto cf2 = TriangularNormalizedColorMatrix(); -#endif - -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - if( iParity == 0 ) // NB: first page is 0! skip even pages, compute on odd pages - { - // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV - for( int icol = 0; icol < ncolor; icol++ ) - jamp_sv_previous[icol] = jamp_sv[icol]; - MEs_previous = MEs; - continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages - } - fptype_sv deltaMEs_previous = { 0 }; -#endif - - // Sum and square the color flows to get the matrix element - // (compute |M|^2 by squaring |M|, taking into account colours) - fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes - - // Use the property that M is a real matrix (see #475): - // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB - // In addition, on C++ use the property that M is symmetric (see #475), - // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: - // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. - // Strangely, CUDA is slower instead, so keep the old implementation for the moment. -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv jampR_sv[ncolor] = { 0 }; - fptype2_sv jampI_sv[ncolor] = { 0 }; - for( int icol = 0; icol < ncolor; icol++ ) - { - jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); - jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); - } -#endif - for( int icol = 0; icol < ncolor; icol++ ) - { - //if( debug ) printf( "calculate_wavefunctions...
icol=%d\n", icol ); -#ifndef MGONGPUCPP_GPUIMPL - // === C++ START === - // Diagonal terms -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRi_sv = jampR_sv[icol]; - fptype2_sv& jampIi_sv = jampI_sv[icol]; +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with channelIDs for all events + const unsigned int* channelIds = allChannelIds; #else - fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); - fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); + // C++ diagram kernels take input/output buffers with a single SCALAR channelID for all events in a given SIMD vector + const unsigned int* channelIds = &channelId; #endif - fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; - fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; - // Off-diagonal terms - for( int jcol = icol + 1; jcol < ncolor; jcol++ ) - { -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype2_sv& jampRj_sv = jampR_sv[jcol]; - fptype2_sv& jampIj_sv = jampI_sv[jcol]; #else - fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); - fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + const unsigned int* channelIds = nullptr; #endif - ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; - ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; - } - fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - deltaMEs_previous += fpvsplit0( deltaMEs2 ); - deltaMEs += fpvsplit1( deltaMEs2 ); + + // ------------------------------- + // --- NUMERATORS/DENOMINATORS --- + // ------------------------------- +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#ifdef MGONGPUCPP_GPUIMPL + // CUDA diagram kernels take input/output buffers with numerators/denominators for all events + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; #else - deltaMEs += deltaMEs2; + // C++ diagram kernels take input/output buffers with numerators/denominators for a single event or SIMD vector + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); #endif - // === C++ END === #else - // === CUDA START === - fptype2_sv ztempR_sv = { 0 }; - fptype2_sv ztempI_sv = { 0 }; - for( int jcol = 0; jcol < ncolor; jcol++ ) - { - fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] ); - fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); - ztempR_sv += cf[icol][jcol] * jampRj_sv; - ztempI_sv += cf[icol][jcol] * jampIj_sv; - } - deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; - // === CUDA END === + // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL + // In that case, however, the boilerplate code asserts that all three pointers all nullptr as a sanity check + fptype* numerators = nullptr; + fptype* denominators = nullptr; #endif - } - // *** STORE THE RESULTS *** + // ------------------------ + // --- FEYNMAN DIAGRAMS --- + // ------------------------ - 
// NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv += deltaMEs; // fix #435 -#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); - MEs_sv_previous += deltaMEs_previous; -#endif - /* + // *** DIAGRAMS 1 TO 3 *** #ifdef MGONGPUCPP_GPUIMPL - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); -#else -#ifdef MGONGPU_CPPSIMD - if( cNGoodHel > 0 ) - for( int ieppV = 0; ieppV < neppV; ieppV++ ) - printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); + gpuLaunchKernelStream( diagram1, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators, momenta, ihel ); + gpuLaunchKernelStream( diagram2, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); + gpuLaunchKernelStream( diagram3, gpublocks, gputhreads, gpustream, wfs, jamps, channelIds, couplings, numerators, denominators ); #else - if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); -#endif + diagram1( wfs, jamps, channelIds, COUPs, numerators, denominators, momenta, ihel ); + diagram2( wfs, jamps, channelIds, COUPs, numerators, denominators ); + diagram3( wfs, jamps, channelIds, COUPs, numerators, denominators ); #endif - */ - } // END LOOP ON IPARITY - mgDebug( 1, __FUNCTION__ ); + } + // ***************************** + // *** END LOOP ON IPARITY *** + // ***************************** + return; } @@ -552,7 +450,11 @@ namespace mg5amcCpu #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif - fpeEnable(); // enable SIGFPE traps for Floating Point Exceptions + + // Enable SIGFPE traps for Floating Point Exceptions +#ifdef MGONGPUCPP_DEBUG + fpeEnable(); +#endif } //-------------------------------------------------------------------------- @@ -585,6 +487,10 @@ namespace mg5amcCpu m_masses.push_back( m_pars->ZERO ); m_masses.push_back( m_pars->mdl_MT ); m_masses.push_back( m_pars->mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[nIPD] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; @@ -625,6 +531,10 @@ namespace mg5amcCpu m_masses.push_back( Parameters_MSSM_SLHA2::ZERO ); m_masses.push_back( Parameters_MSSM_SLHA2::mdl_MT ); m_masses.push_back( Parameters_MSSM_SLHA2::mdl_MT ); +#ifdef MGONGPUCPP_GPUIMPL + // Create the normalized color matrix in device memory + createNormalizedColorMatrix(); +#endif } #endif @@ -727,26 +637,26 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; - using C_ACCESS = DeviceAccessCouplings; - G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings, bsmIndepParam ); + using CD_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, CD_ACCESS>( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; - using C_ACCESS = HostAccessCouplings; + using CD_ACCESS = HostAccessCouplings; for( int ipagV = 0; ipagV < nevt /
neppV; ++ipagV ) { const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP<G_ACCESS, C_ACCESS>( gs, couplings, bsmIndepParam ); + G2COUP<G_ACCESS, CD_ACCESS>( gs, couplings, bsmIndepParam ); } #endif } //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void +#ifdef MGONGPUCPP_GPUIMPL + void /* clang-format off */ sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities @@ -754,25 +664,40 @@ namespace mg5amcCpu fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + fptype* allJamps, // tmp: jamp[ncolor*2*nevt] _for one helicity_ (reused in the getGoodHel helicity loop) + fptype* allWfs, // tmp: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - host array + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { /* clang-format on */ + const int maxtry0 = 16; + fptype hstMEs[maxtry0]; + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0) +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + add_and_select_hel( int* allselhel, // output: helicity selection[nevt] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* allMEs, // output: allMEs[nevt], sum of |M|^2 over the good helicities + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // ME sum over the good helicities + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + allMEs[ievt] += ghelAllMEs[ighel * nevt + ievt]; + ghelAllMEs[ighel * nevt + ievt] = allMEs[ievt]; // reuse the buffer to store the running sum for helicity selection + } + // Event-by-event random choice of helicity #403 + //printf( "select_hel: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < dcNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( ghelAllMEs[ighel * nevt + ievt] / allMEs[ievt] ) ) + { + const int ihelF = dcGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "select_hel: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } + return; + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + update_jamp2s( const fptype_sv* allJamps, // input: jamp[ncolor*2*nevt] for this helicity + fptype* colAllJamp2s ) // output: allJamp2s[ncolor][nevt] super-buffer, sum over col/hel (nullptr to disable) + { + using J_ACCESS = DeviceAccessJamp; + using J2_ACCESS = DeviceAccessJamp2; + for( int icol = 0; icol < ncolor; icol++ ) + // NB: atomicAdd is needed after moving to cuda streams with one helicity per stream!
+ atomicAdd( &J2_ACCESS::kernelAccessIcol( colAllJamp2s, icol ), cxabs2( J_ACCESS::kernelAccessIcolConst( allJamps, icol ) ) ); + } +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + __global__ void + select_col( int* allselcol, // output: color selection[nevt] + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable SDE enhancement (fix #899/#911) + const fptype_sv* allJamp2s, // input: jamp2[ncolor][nevt] for color choice (nullptr if disabled) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( allChannelIds ); + // Event-by-event random choice of color #402 + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) + { + if( channelId > mgOnGpu::nchannels ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); + assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 + } + // Determine the jamp2 for this event (TEMPORARY? could do this with a dedicated memory accessor instead...) + fptype_sv jamp2_sv[ncolor] = { 0 }; + assert( allJamp2s != nullptr ); // sanity check + using J2_ACCESS = DeviceAccessJamp2; + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[icolC] = J2_ACCESS::kernelAccessIcolConst( allJamp2s, icolC ); + // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) + // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! + const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) + if( iconfig <= 0 ) + { + printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); + assert( iconfig > 0 ); // SANITY CHECK #917 + } + else if( iconfig > (int)mgOnGpu::nconfigSDE ) + { + printf( "INTERNAL ERROR! 
Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d > nconfig=%d)\n", channelId, iconfig, mgOnGpu::nconfigSDE ); + assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 + } + fptype targetamp[ncolor] = { 0 }; + // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) + if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "select_col: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + //printf( "select_col: ievt=%d icol=%d\n", ievt, icolC+1 ); + break; + } + } + } + else + { + allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + } + return; + } +#endif +#endif + + //-------------------------------------------------------------------------- // Evaluate |M|^2, part independent of incoming flavour - __global__ void /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) +#endif + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads +#else + void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL - , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: multichannel denominators[nevt], running_sum_over_helicities #endif - ) /* clang-format on */ + const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + ) +#endif /* clang-format on */ { mgDebugInitialise(); @@ -917,13 +1031,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL - // Remember: in CUDA this is a kernel for one event, in c++ this processes n events - const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - using CID_ACCESS = DeviceAccessChannelIds; // non-trivial access: buffer includes all events -#endif -#else +#ifndef MGONGPUCPP_GPUIMPL //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events @@ -935,17 +1043,20 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - #include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // === PART 0 - INITIALISATION (before calculate_jamps) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] = 0; + const int nevt = gpublocks * gputhreads; + gpuMemset( allMEs, 0, nevt * sizeof( fptype ) ); + gpuMemset( ghelAllJamps, 0, cNGoodHel * ncolor * mgOnGpu::nx2 * nevt * sizeof( fptype ) ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - allNumerators[ievt] = 0; - allDenominators[ievt] = 0; + gpuMemset( colAllJamp2s, 0, ncolor * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllNumerators, 0, cNGoodHel * nevt * sizeof( fptype ) ); + gpuMemset( ghelAllDenominators, 0, cNGoodHel * nevt * sizeof( fptype ) ); #endif + gpuMemset( ghelAllMEs, 0, cNGoodHel * nevt * sizeof( fptype ) ); #else const int npagV = nevt / neppV; for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -971,93 +1082,63 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // SCALAR channelId for
the current event (CUDA) or for the whole SIMD event page (C++) - The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a SIMD event page - unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr - if( allChannelIds != nullptr ) - { - const unsigned int* channelIds = allChannelIds; // fix #899 (distinguish channelIds and allChannelIds) - const uint_sv channelIds_sv = CID_ACCESS::kernelAccessConst( channelIds ); // fix #895 (compute this only once for all diagrams) - // NB: channelIds_sv is a scalar in CUDA - channelId = channelIds_sv; - assert( channelId > 0 ); // SANITY CHECK: scalar channelId must be > 0 if multichannel is enabled (allChannelIds != nullptr) - } -#endif - // Running sum of partial amplitudes squared for event by event color selection (#402) - // (for the single event processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + // Use CUDA/HIP streams to process different helicities in parallel (one good helicity per stream) + // (1a) First, within each helicity stream, compute the QCD partial amplitudes (jamps) for each helicity + // In multichannel mode, also compute the numerators and denominators for each helicity for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + fptype* hAllWfs = ghelAllWfs + ighel * nwf * nevt * nw6 * mgOnGpu::nx2; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); + fptype* hAllNumerators = ghelAllNumerators + ighel * nevt; + fptype* hAllDenominators = ghelAllDenominators + ighel * nevt; + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, allChannelIds, hAllNumerators, hAllDenominators, ghelStreams[ighel], gpublocks, gputhreads ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); + calculate_jamps( ihel, allmomenta, allcouplings, hAllJamps, hAllWfs, ghelStreams[ighel], gpublocks, gputhreads ); #endif - MEs_ighel[ighel] = allMEs[ievt]; } - // Event-by-event random choice of helicity #403 - //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // (1b) Then, in multichannel mode, also compute the running sums over helicities of the squared jamps (jamp2s) within each helicity stream for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) - { - const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] - allselhel[ievt] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); - break; - } - } -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // Event-by-event random choice of color #402 - if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) - { - if( channelId > mgOnGpu::nchannels ) - { - printf( "INTERNAL ERROR!
Cannot choose an event-by-event random color for channelId=%d which is greater than nchannels=%d\n", channelId, mgOnGpu::nchannels ); - assert( channelId <= mgOnGpu::nchannels ); // SANITY CHECK #919 #910 - } - // NB (see #877): in the array channel2iconfig, the input index uses C indexing (channelId -1), the output index uses F indexing (iconfig) - // NB (see #917): mgOnGpu::channel2iconfig returns an int (which may be -1), not an unsigned int! - const int iconfig = mgOnGpu::channel2iconfig[channelId - 1]; // map N_diagrams to N_config <= N_diagrams configs (fix LHE color mismatch #856: see also #826, #852, #853) - if( iconfig <= 0 ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d which has no associated SDE iconfig\n", channelId ); - assert( iconfig > 0 ); // SANITY CHECK #917 - } - else if( iconfig > (int)mgOnGpu::nconfigSDE ) - { - printf( "INTERNAL ERROR! Cannot choose an event-by-event random color for channelId=%d (invalid SDE iconfig=%d\n > nconfig=%d)", channelId, iconfig, mgOnGpu::nconfigSDE ); - assert( iconfig <= (int)mgOnGpu::nconfigSDE ); // SANITY CHECK #917 - } - fptype targetamp[ncolor] = { 0 }; - // NB (see #877): explicitly use 'icolC' rather than 'icol' to indicate that icolC uses C indexing in [0, N_colors-1] - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - // NB (see #877): in the array icolamp, the input index uses C indexing (iconfig -1) - if( mgOnGpu::icolamp[iconfig - 1][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - //printf( "sigmaKin: ievt=%d icol=%d\n", ievt, icolC+1 ); - break; - } - } + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; + gpuLaunchKernelStream( update_jamp2s, gpublocks, gputhreads, ghelStreams[ighel], hAllJamps, colAllJamp2s ); } +#endif + // (2) Then, within each helicity stream, compute the ME for that helicity from the color sum of QCD partial amplitudes jamps + if( !ghelBlasHandles ) + assert( ghelAllBlasTmp == nullptr ); // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set else + assert( ghelAllBlasTmp != nullptr ); // note: this should never happen for HASBLAS=hasNoBlas (a sanity check is in color_sum_gpu) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { - allselcol[ievt] = 0; // no color selected in Fortran range [1,ncolor] if channelId == 0 (see #931) + fptype* hAllMEs = ghelAllMEs + ighel * nevt; + fptype* hAllJamps = ghelAllJamps + ighel * nevt * ncolor * mgOnGpu::nx2; +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) ); // reset the tmp buffer (bug fix: reset MEs=0) +#else + fptype2* hAllBlasTmp = ( ghelAllBlasTmp != nullptr ? ghelAllBlasTmp + ighel * nevt * ncolor * mgOnGpu::nx2 : nullptr ); + if( hAllBlasTmp ) + gpuMemset( hAllBlasTmp, 0, nevt * ncolor * mgOnGpu::nx2 * sizeof( fptype2 ) ); // reset the tmp buffer (just in case...) +#endif +#ifndef MGONGPU_HAS_NO_BLAS + gpuBlasHandle_t* pBlasHandle = ( ghelBlasHandles ? 
&( ghelBlasHandles[ighel] ) : nullptr ); +#else /* clang-format off */ + assert( ghelBlasHandles == nullptr ); // sanity check + gpuBlasHandle_t* pBlasHandle = nullptr; // this is a void* (hack to keep the same API in noBLAS builds) +#endif /* clang-format on */ + color_sum_gpu( hAllMEs, hAllJamps, hAllBlasTmp, ghelStreams[ighel], pBlasHandle, gpublocks, gputhreads ); } + checkGpu( gpuDeviceSynchronize() ); // do not start helicity/color selection until the loop over helicities has completed + // (3) Wait for all helicity streams to complete, then finally compute the ME sum over all helicities and choose one helicity and one color + // Event-by-event random choice of helicity #403 and ME sum over helicities (defer this after the helicity loop to avoid breaking streams parallelism) + gpuLaunchKernel( add_and_select_hel, gpublocks, gputhreads, allselhel, allrndhel, ghelAllMEs, allMEs, gpublocks * gputhreads ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + gpuLaunchKernel( select_col, gpublocks, gputhreads, allselcol, allrndcol, allChannelIds, colAllJamp2s, gpublocks * gputhreads ); #endif // *** END OF PART 1a - CUDA (one event per GPU thread) *** @@ -1099,7 +1180,7 @@ namespace mg5amcCpu #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // SCALAR channelId for the whole SIMD neppV2 event page (C++), i.e. one or two neppV event page(s) // The cudacpp implementation ASSUMES (and checks! #898) that all channelIds are the same in a neppV2 SIMD event page - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! #924 unsigned int channelId = 0; // disable multichannel single-diagram enhancement unless allChannelIds != nullptr if( allChannelIds != nullptr ) { @@ -1122,7 +1203,7 @@ namespace mg5amcCpu // Second neppV page of channels (iParity=1 => ievt0 = ievt00 + 1 * neppV) const unsigned int* channelIds2 = CID_ACCESS::ieventAccessRecordConst( allChannelIds, ievt00 + neppV ); // fix bug #899/#911 uint_v channelIds2_v = CID_ACCESS::kernelAccessConst( channelIds2 ); // fix #895 (compute this only once for all diagrams) - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 for( int i = 0; i < neppV; ++i ) { assert( channelId == channelIds2_v[i] ); // SANITY CHECKS #898 #924: all events in the 2nd SIMD vector have the same channelId as that of the 1st SIMD vector @@ -1131,25 +1212,31 @@ namespace mg5amcCpu } #endif // Running sum of partial amplitudes squared for event by event color selection (#402) - // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) - fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; - fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps) + fptype_sv jamp2_sv[nParity * ncolor] = {}; + fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page) #endif for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { const int ihel = cGoodHel[ighel]; + cxtype_sv jamp_sv[nParity * ncolor] = {}; // fixed nasty bug (omitting 'nParity' caused memory corruptions after calling calculate_jamps) #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // **NB! in "mixed" precision, using SIMD, calculate_wavefunctions computes MEs for TWO neppV pages with a single channelId! #924 - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); + // **NB! in "mixed" precision, using SIMD, calculate_jamps computes MEs for TWO neppV pages with a single channelId! 
#924 + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, channelId, allNumerators, allDenominators, ievt00 ); #else - calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); + calculate_jamps( ihel, allmomenta, allcouplings, jamp_sv, ievt00 ); #endif + color_sum_cpu( allMEs, jamp_sv, ievt00 ); MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); #endif + using J_ACCESS = HostAccessJamp; + for( int iParity = 0; iParity < nParity; ++iParity ) + for( int icol = 0; icol < ncolor; icol++ ) + jamp2_sv[ncolor * iParity + icol] += cxabs2( J_ACCESS::kernelAccessIcol( &( jamp_sv[ncolor * iParity] ), icol ) ); // may underflow #831 } // Event-by-event random choice of helicity #403 for( int ieppV = 0; ieppV < neppV; ++ieppV ) @@ -1159,8 +1246,10 @@ namespace mg5amcCpu for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { #if defined MGONGPU_CPPSIMD + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel][ieppV] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); #else + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt, ighel, MEs_ighel[ighel] ); const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); #endif if( okhel ) @@ -1176,11 +1265,12 @@ namespace mg5amcCpu //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); for( int ighel = 0; ighel < cNGoodHel; ighel++ ) { + //printf( "sigmaKin: ievt=%4d ighel=%d MEs_ighel=%f\n", ievt2, ighel, MEs_ighel2[ighel][ieppV] ); if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) { const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] allselhel[ievt2] = ihelF; - //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt2, ihelF ); break; } } @@ -1282,14 +1372,15 @@ namespace mg5amcCpu #endif // CUDA or C++ - // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // PART 2 - FINALISATION (after calculate_jamps) // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] #ifdef MGONGPUCPP_GPUIMPL - allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( allChannelIds != nullptr ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // fix segfault #892 (not 'channelIds[0] != 0') + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, ghelAllNumerators, ghelAllDenominators, allChannelIds, helcolDenominators[0] ); +#else + gpuLaunchKernel( normalise_output, gpublocks, gputhreads, allMEs, helcolDenominators[0] ); #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.h index 24c27005b8..5acfd9f387 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.h @@ -1,13 
+1,13 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30 +// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -19,6 +19,7 @@ #include "mgOnGpuVectors.h" +#include "GpuAbstraction.h" #include "Parameters_MSSM_SLHA2.h" #include <vector> @@ -75,17 +76,17 @@ namespace mg5amcCpu static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- static constexpr int ncomb = 16; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) static constexpr int ndiagrams = 3; // #Feynman diagrams: e.g. 3 for e+ e- -> mu+ mu- + static constexpr int ncolor = 2; // the number of leading colors: e.g. 1 for e+ e- -> mu+ mu- // Hardcoded parameters for this process (constant class variables) // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] - // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] - //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // [NB: I was unable to get the right value of nwf in CPPProcess.h directly, so I added it with a hack after generating CPPProcess.cc (#644)] + static const int nwf = 5; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) // Other variables of this instance (???) //static const int ninitial = CPPProcess::npari; //static const int nexternal = 4; // CPPProcess::npar (nexternal was nioparticles) - //static const int nwavefuncs = 6; // (?!?!
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 3; //static const int ncomb = 16; // CPPProcess::ncomb @@ -122,23 +123,26 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif - bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) + fptype* allJamps, // output: jamp[ncolor*2*nevt] + fptype* allWfs, // output: wf[nwf*nw6*2*nevt] + bool* isGoodHel, // output: isGoodHel[ncomb] - device array (GPU device implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #else - __global__ void + void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + fptype* allNumerators, // output: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) @@ -152,34 +156,46 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) +#endif fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities + int* allselhel, // output: helicity selection[nevt] #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899/#911) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + int* allselcol, // output: color selection[nevt] + fptype* colAllJamp2s, // tmp: allJamp2s super-buffer for ncolor individual colors, running sum over colors and helicities + fptype* ghelAllNumerators, // tmp: allNumerators
super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllDenominators, // tmp: allDenominators super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) #endif - int* allselhel, // output: helicity selection[nevt] - int* allselcol // output: helicity selection[nevt] - ); + fptype* ghelAllMEs, // tmp: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllJamps, // tmp: allJamps super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype* ghelAllWfs, // tmp: allWfs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + fptype2* ghelAllBlasTmp, // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities (index is ighel) + gpuBlasHandle_t* ghelBlasHandles, // input: cuBLAS/hipBLAS handles (index is ighel: only the first nGoodHel <= ncomb are non-null) + gpuStream_t* ghelStreams, // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null) + const int gpublocks, // input: cuda gpublocks + const int gputhreads ); // input: cuda gputhreads #else - __global__ void + void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] const fptype* allrndhel, // input: random numbers[nevt] for helicity selection - const fptype* allrndcol, // input: random numbers[nevt] for color selection - fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* allChannelIds, // input: multichannel channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) - fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities - fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities + const fptype* allrndcol, // input: random numbers[nevt] for color selection + const unsigned int* allChannelIds, // input: channelIds[nevt] (1 to #diagrams); nullptr to disable single-diagram enhancement (fix #899) #endif + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities int* allselhel, // output: helicity selection[nevt] +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL int* allselcol, // output: color selection[nevt] + fptype* allNumerators, // tmp: numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // tmp: denominators[nevt], running_sum_over_helicities +#endif const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif /* clang-format on */ diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/color_sum.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/color_sum.cc new file mode 100644 index 0000000000..b96d73fb5c --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/color_sum.cc @@ -0,0 +1,383 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin.
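Before the implementation below, this is the computation that color_sum.cc encapsulates, reduced to its simplest scalar form: deltaME = sum_ij conj(jamp_i) (cf_ij / denom_i) jamp_j, using the 2x2 color matrix and denominators defined below for this process. The jamp inputs in this sketch are made-up numbers, and the loop deliberately ignores SIMD, GPU streams, BLAS and mixed precision:

#include <complex>
#include <cstdio>
int main()
{
  constexpr int ncolor = 2;
  const double denom[ncolor] = { 3, 3 };
  const double cf[ncolor][ncolor] = { { 16, -2 }, { -2, 16 } };
  const std::complex<double> jamp[ncolor] = { { 0.3, -0.1 }, { -0.2, 0.4 } }; // made-up inputs
  double deltaME = 0;
  for( int icol = 0; icol < ncolor; icol++ )
  {
    std::complex<double> ztemp = 0;
    for( int jcol = 0; jcol < ncolor; jcol++ ) ztemp += cf[icol][jcol] * jamp[jcol];
    deltaME += ( ztemp * std::conj( jamp[icol] ) ).real() / denom[icol]; // imaginary parts cancel (cf is symmetric)
  }
  std::printf( "deltaME = %f\n", deltaME ); // one helicity's contribution to |M|^2
  return 0;
}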
+ +#include "color_sum.h" + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessMatrixElements.h" + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + constexpr int ncolor = CPPProcess::ncolor; // the number of leading colors + + //-------------------------------------------------------------------------- + + // *** COLOR MATRIX BELOW *** + + // The color denominators (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorDenom[ncolor] = { 3, 3 }; // 1-D array[2] + + // The color matrix (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 colorMatrix[ncolor][ncolor] = { + { 16, -2 }, + { -2, 16 } }; // 2-D array[2][2] + +#ifdef MGONGPUCPP_GPUIMPL + // The normalized color matrix (divide each column by denom) + template + struct NormalizedColorMatrix + { + constexpr __host__ __device__ NormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + for( int jcol = 0; jcol < ncolor; jcol++ ) + value[icol * ncolor + jcol] = colorMatrix[icol][jcol] / colorDenom[icol]; + } + T value[ncolor * ncolor]; + }; + // The fptype2 version is the default used by kernels (supporting mixed floating point mode also in blas) + static __device__ fptype2 s_pNormalizedColorMatrix2[ncolor * ncolor]; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void createNormalizedColorMatrix() + { + static bool first = true; + if( first ) + { + first = false; + constexpr NormalizedColorMatrix normalizedColorMatrix2; + gpuMemcpyToSymbol( s_pNormalizedColorMatrix2, normalizedColorMatrix2.value, ncolor * ncolor * sizeof( fptype2 ) ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifndef MGONGPUCPP_GPUIMPL + void + color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity + const int ievt0 ) // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) + { + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = colorMatrix[icol][icol] / colorDenom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * colorMatrix[icol][jcol] / colorDenom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/colorDenom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
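Restating the #475 comment above in formulas: write the vector of color amplitudes as J = A + iB with A, B real, and let M be the normalized color matrix, M_{ij} = cf_{ij} / denom_i, which is real and (the denominators being equal here) symmetric. Then

$$ J^\dagger M J = (A - iB)^T M (A + iB) = A^T M A + B^T M B + i \, ( A^T M B - B^T M A ) = A^T M A + B^T M B $$

since the cross terms cancel for M = M^T. The triangular cf2 additionally uses $A^T M A = \sum_i M_{ii} A_i^2 + \sum_{i<j} 2 M_{ij} A_i A_j$, which is why it stores M_{ii} on the diagonal and 2 M_{ij} above it.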
+ fptype_sv deltaMEs = { 0 }; +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv deltaMEs_next = { 0 }; + // Mixed mode: merge two neppV vectors into one neppV2 vector + fptype2_sv jampR_sv[ncolor]; + fptype2_sv jampI_sv[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( allJamp_sv[icol] ), cxreal( allJamp_sv[ncolor + icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( allJamp_sv[icol] ), cximag( allJamp_sv[ncolor + icol] ) ); + } +#else + const cxtype_sv* jamp_sv = allJamp_sv; +#endif + // Loop over icol + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Loop over jcol + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { + // Off-diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); // may underflow #831 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs += fpvsplit0( deltaMEs2 ); + deltaMEs_next += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + } + // *** STORE THE RESULTS *** + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype* MEs_next = E_ACCESS::ieventAccessRecord( allMEs, ievt0 + neppV ); + fptype_sv& MEs_sv_next = E_ACCESS::kernelAccess( MEs_next ); + MEs_sv_next += deltaMEs_next; +#endif + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + __global__ void + color_sum_kernel( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + using J_ACCESS = DeviceAccessJamp; + fptype jampR[ncolor]; + fptype jampI[ncolor]; + for( int icol = 0; icol < ncolor; icol++ ) + { + cxtype jamp = J_ACCESS::kernelAccessIcolConst( allJamps, icol ); + jampR[icol] = jamp.real(); + jampI[icol] = jamp.imag(); + } + // Loop over icol + fptype deltaMEs = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + fptype2 ztempR = { 0 }; + fptype2 ztempI = { 0 }; + // Loop over jcol + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2 jampRj = jampR[jcol]; + fptype2 jampIj = jampI[jcol]; + ztempR += 
s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampRj; // use fptype2 version of color matrix + ztempI += s_pNormalizedColorMatrix2[icol * ncolor + jcol] * jampIj; // use fptype2 version of color matrix + } + deltaMEs += ztempR * jampR[icol]; + deltaMEs += ztempI * jampI[icol]; + } + // *** STORE THE RESULTS *** + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + // NB: color_sum ADDS |M|^2 for one helicity to the running sum of |M|^2 over helicities for the given event(s) + E_ACCESS::kernelAccess( allMEs ) += deltaMEs; // fix #435 + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertD2F_Jamps( fptype2* allJampsFpt2, // output: jamp[ncolor*2*nevt] for one specific helicity + const fptype* allJamps ) // input: jamp[ncolor*2*nevt] for one specific helicity + { + const int nevt = gridDim.x * blockDim.x; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + // NB! From a functional point of view, any striding will be ok here as long as ncolor*2*nevt elements are all correctly copied! + // NB! Just in case this may be better for performance reasons, however, the same striding as in compute_jamps and cuBLAS is used here + for( int ix2 = 0; ix2 < mgOnGpu::nx2; ix2++ ) + for( int icol = 0; icol < ncolor; icol++ ) + allJampsFpt2[ix2 * ncolor * nevt + icol * nevt + ievt] = allJamps[ix2 * ncolor * nevt + icol * nevt + ievt]; // "new1" striding + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +#ifndef MGONGPU_HAS_NO_BLAS +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + __global__ void + convertF2D_MEs( fptype* allMEs, // output: allMEs[nevt] for one specific helicity + const fptype2* allMEsFpt2 ) // input: allMEs[nevt] for one specific helicity + { + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; + allMEs[ievt] = allMEsFpt2[ievt]; + } +#endif +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifndef MGONGPU_HAS_NO_BLAS + void + color_sum_blas( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#else + gpuStream_t /*stream*/, // input: cuda stream (nullptr indicates the default stream - only used for FPTYPE=m) +#endif + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { + const int nevt = gpublocks * gputhreads; + + // Get the address associated with the normalized color matrix in device memory + static fptype2* devNormColMat = nullptr; + if( !devNormColMat ) gpuGetSymbolAddress( (void**)&devNormColMat, s_pNormalizedColorMatrix2 ); + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed precision mode: need two fptype2[ncolor*2*nevt] buffers and one fptype2[nevt] buffer per helicity + fptype2* allZtempBoth = 
allBlasTmp; // start of first fptype2[ncolor*2*nevt] buffer
+ fptype2* allJampsFpt2 = allBlasTmp + ncolor * mgOnGpu::nx2 * nevt; // start of second fptype2[ncolor*2*nevt] buffer
+ fptype2* allMEsFpt2 = allBlasTmp + 2 * ncolor * mgOnGpu::nx2 * nevt; // start of fptype2[nevt] buffer
+ // Convert jamps from double to float
+ gpuLaunchKernelStream( convertD2F_Jamps, gpublocks, gputhreads, stream, allJampsFpt2, allJamps );
+ // Real and imaginary components
+ const fptype2* allJampsReal = allJampsFpt2;
+ const fptype2* allJampsImag = allJampsFpt2 + ncolor * nevt;
+#else
+ static_assert( std::is_same<fptype, fptype2>::value ); // sanity check
+ // Standard single/double precision mode: need one fptype2[ncolor*2*nevt] buffer
+ fptype2* allZtempBoth = allBlasTmp; // start of fptype2[ncolor*2*nevt] buffer
+ fptype2* allMEsFpt2 = allMEs;
+ // Real and imaginary components
+ const fptype2* allJampsReal = allJamps; // this is not a cast (the two types are identical)
+ const fptype2* allJampsImag = allJamps + ncolor * nevt; // this is not a cast (the two types are identical)
+#endif
+ // Real and imaginary components
+ fptype2* allZtempReal = allZtempBoth;
+ fptype2* allZtempImag = allZtempBoth + ncolor * nevt;
+
+ // Note, new striding for cuBLAS from DeviceAccessJamp:
+ // - allJamps(icol,ievt).real is allJamps[0 * ncolor * nevt + icol * nevt + ievt] // "new1"
+ // - allJamps(icol,ievt).imag is allJamps[1 * ncolor * nevt + icol * nevt + ievt] // "new1"
+
+ // Step 1: Compute Ztemp[ncolor][nevt] = ColorMatrix[ncolor][ncolor] * JampsVector[ncolor][nevt] for both real and imag
+ // In this case alpha=1 and beta=0: the operation is Ztemp = alpha * ColorMatrix * JampsVector + beta * Ztemp
+ fptype2 alpha1 = 1;
+ fptype2 beta1 = 0;
+ const int ncolorM = ncolor;
+ const int nevtN = nevt;
+ const int ncolorK = ncolor;
+ checkGpuBlas( gpuBlasTgemm( *pBlasHandle,
+ GPUBLAS_OP_N, // do not transpose ColMat
+ GPUBLAS_OP_T, // transpose JampsV (new1)
+ ncolorM, nevtN, ncolorK,
+ &alpha1,
+ devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK
+ allJampsReal, nevtN, // JampsV is nevtN x ncolorK (new1)
+ &beta1,
+ allZtempReal, ncolorM ) ); // Ztemp is ncolorM x nevtN
+ checkGpuBlas( gpuBlasTgemm( *pBlasHandle,
+ GPUBLAS_OP_N, // do not transpose ColMat
+ GPUBLAS_OP_T, // transpose JampsV (new1)
+ ncolorM, nevtN, ncolorK,
+ &alpha1,
+ devNormColMat, ncolorM, // ColMat is ncolorM x ncolorK
+ allJampsImag, nevtN, // JampsV is nevtN x ncolorK (new1)
+ &beta1,
+ allZtempImag, ncolorM ) ); // Ztemp is ncolorM x nevtN
+
+ // Step 2: For each ievt, compute the dot product of JampsVector[ncolor][ievt] dot tmp[ncolor][ievt]
+ // In this case alpha=1 and beta=1: the operation is ME = alpha * ( Tmp dot JampsVector ) + beta * ME
+ // Use cublasSgemmStridedBatched to perform these batched dot products in one call
+ fptype2 alpha2 = 1;
+ fptype2 beta2 = 1;
+ checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle,
+ GPUBLAS_OP_N, // do not transpose JampsV (new1)
+ GPUBLAS_OP_N, // do not transpose Tmp
+ 1, 1, ncolor, // result is 1x1 (dot product)
+ &alpha2,
+ allJampsReal, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1)
+ allZtempReal, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column
+ &beta2,
+ allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e.
for each ievt) + nevt ) ); // there are nevt "batches" + checkGpuBlas( gpuBlasTgemmStridedBatched( *pBlasHandle, + GPUBLAS_OP_N, // do not transpose JampsV (new1) + GPUBLAS_OP_N, // do not transpose Tmp + 1, 1, ncolor, // result is 1x1 (dot product) + &alpha2, + allJampsImag, nevt, 1, // allJamps is nevt x ncolor, stride 1 for each ievt column (new1) + allZtempImag, ncolor, ncolor, // allZtemp is ncolor x nevt, with stride ncolor for each ievt column + &beta2, + allMEsFpt2, 1, 1, // output is a 1x1 result for each "batch" (i.e. for each ievt) + nevt ) ); // there are nevt "batches" + +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Convert MEs from float to double + gpuLaunchKernelStream( convertF2D_MEs, gpublocks, gputhreads, stream, allMEs, allMEsFpt2 ); +#endif + } +#endif /* clang-format on */ +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL + void + color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity + const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity + fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity + gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream) + gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle + const int gpublocks, // input: cuda gpublocks + const int gputhreads ) // input: cuda gputhreads + { +#ifdef MGONGPU_HAS_NO_BLAS + assert( allBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas + assert( pBlasHandle == nullptr ); // sanity check for HASBLAS=hasNoBlas +#endif + if( !pBlasHandle ) // HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set + { + assert( allBlasTmp == nullptr ); + gpuLaunchKernelStream( color_sum_kernel, gpublocks, gputhreads, stream, allMEs, allJamps ); + } +#ifndef MGONGPU_HAS_NO_BLAS + else + { + assert( allBlasTmp != nullptr ); + color_sum_blas( allMEs, allJamps, allBlasTmp, stream, pBlasHandle, gpublocks, gputhreads ); + } +#endif + } +#endif + + //-------------------------------------------------------------------------- + +} // end namespace diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/color_sum.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/color_sum.h new file mode 120000 index 0000000000..24b0157011 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/color_sum.h @@ -0,0 +1 @@ +../color_sum.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/diagram_boilerplate.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/diagram_boilerplate.h new file mode 120000 index 0000000000..e657b15c20 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/diagram_boilerplate.h @@ -0,0 +1 @@ +../diagram_boilerplate.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/diagrams.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/diagrams.h new file mode 100644 index 0000000000..faf1602413 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/diagrams.h @@ -0,0 +1,106 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. 
+// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin.
+
+/* clang-format off */
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram1( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators, // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ const fptype* momenta, // input: momenta[npar*4*nevtORneppV]
+ const int ihel ) // input: helicity (0 to ncomb)
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+#ifdef MGONGPUCPP_GPUIMPL
+ using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events
+#else
+ using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events
+#endif
+ // *** DIAGRAM 1 OF 3 ***
+ // Wavefunction(s) for diagram number 1
+ vxxxxx<M_ACCESS, W_ACCESS>( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 );
+ vxxxxx<M_ACCESS, W_ACCESS>( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 );
+ oxxxxx<M_ACCESS, W_ACCESS>( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 );
+ ixxxxx<M_ACCESS, W_ACCESS>( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 );
+ VVV1P0_1<W_ACCESS, CD_ACCESS>( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[4] );
+ // Amplitude(s) for diagram number 1
+ FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[3], w_fp[2], w_fp[4], COUPs[1], -1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) += cxtype( 0, 1 ) * amp_sv[0];
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) -= cxtype( 0, 1 ) * amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram2( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 2 OF 3 ***
+ // Wavefunction(s) for diagram number 2
+ FFV1_1<W_ACCESS, CD_ACCESS>( w_fp[2], w_fp[0], COUPs[1], -1.0, cIPD[0], cIPD[1], w_fp[4] );
+ // Amplitude(s) for diagram number 2
+ FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[3], w_fp[4], w_fp[1], COUPs[1], -1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 0 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+ __global__ void
+ diagram3( fptype* wfs, // input/output wavefunctions[nwf*2*nw6*nevtORneppV]
+ fptype* jamps, // output jamps[ncolor*2*nevtORneppV]
+ const unsigned int* channelIds, // input: channelIds[nevt] for GPU or SCALAR channelId[0] for C++ (1 to #diagrams, 0 to disable SDE)
+#ifdef MGONGPUCPP_GPUIMPL
+ const fptype* couplings, // input: dependent couplings[nevt*ndcoup*2] for all events
+#else
+ const fptype** COUPs, // input: dependent and independent COUPs[nxcoup] for this event page
+#endif
+ fptype* numerators, // input/output: multichannel numerators[nevtORneppV], add helicity ihel
+ fptype* denominators ) // input/output: multichannel denominators[nevtORneppV], add helicity ihel
+ {
+ // A uniform interface for diagramXXX including channelIDs, numerators and denominators is used also #ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+ // In that case, however, the boilerplate code asserts that all three pointers are nullptr as a sanity check
+#include "diagram_boilerplate.h"
+ // *** DIAGRAM 3 OF 3 ***
+ // Wavefunction(s) for diagram number 3
+ FFV1_2<W_ACCESS, CD_ACCESS>( w_fp[3], w_fp[0], COUPs[1], -1.0, cIPD[0], cIPD[1], w_fp[4] );
+ // Amplitude(s) for diagram number 3
+ FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[4], w_fp[2], w_fp[1], COUPs[1], -1.0, &amp_fp[0] );
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473)
+#endif
+ J_ACCESS::kernelAccessIcol( jamps, 1 ) -= amp_sv[0];
+ }
+
+ //--------------------------------------------------------------------------
+
+/* clang-format on */
diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/color_sum.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/color_sum.h
new file mode 100644
index 0000000000..71b5b1eaad
--- /dev/null
+++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/color_sum.h
@@ -0,0 +1,105 @@
+// Copyright (C) 2020-2025 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin.
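
For reference, the two BLAS steps in color_sum_blas above (one GEMM per real/imaginary component, then one strided-batched GEMM of 1x1 dot products per event) compute nothing more than the following plain loop per helicity. This is an editorial sketch for clarity, assuming the "new1" striding described above; normCf stands for the row-normalized color matrix (colorMatrix[icol][jcol]/colorDenom[icol]) copied to device memory.

  // Host-side reference loop equivalent to Step 1 + Step 2 of color_sum_blas (sketch)
  void colorSumReference( double* allMEs,         // in/out: allMEs[nevt], running sum over helicities
                          const double* allJamps, // input: jamps[2*ncolor*nevt] in "new1" striding
                          const double* normCf,   // input: normalized color matrix [ncolor*ncolor]
                          int ncolor,
                          int nevt )
  {
    const double* jampR = allJamps;                 // real parts: [icol * nevt + ievt]
    const double* jampI = allJamps + ncolor * nevt; // imaginary parts: same striding
    for( int ievt = 0; ievt < nevt; ievt++ )
    {
      double deltaME = 0;
      for( int icol = 0; icol < ncolor; icol++ )
      {
        // Step 1 (GEMM): ztemp[icol][ievt] = sum_jcol normCf[icol][jcol] * jamp[jcol][ievt]
        double ztempR = 0, ztempI = 0;
        for( int jcol = 0; jcol < ncolor; jcol++ )
        {
          ztempR += normCf[icol * ncolor + jcol] * jampR[jcol * nevt + ievt];
          ztempI += normCf[icol * ncolor + jcol] * jampI[jcol * nevt + ievt];
        }
        // Step 2 (batched 1x1 GEMM): dot product of the jamp and ztemp columns for this event
        deltaME += jampR[icol * nevt + ievt] * ztempR + jampI[icol * nevt + ievt] * ztempI;
      }
      allMEs[ievt] += deltaME; // ADD this helicity's |M|^2 to the running sum (beta2=1 above)
    }
  }
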
+
+#ifndef COLOR_SUM_H
+#define COLOR_SUM_H 1
+
+#include "mgOnGpuConfig.h"
+
+#include "mgOnGpuVectors.h"
+
+#include "CPPProcess.h"
+#include "GpuAbstraction.h"
+
+#ifdef MGONGPUCPP_GPUIMPL
+namespace mg5amcGpu
+#else
+namespace mg5amcCpu
+#endif
+{
+ //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+ class DeviceAccessJamp
+ {
+ public:
+ static __device__ inline cxtype_ref
+ kernelAccessIcol( fptype* buffer, const int icol )
+ {
+ const int ncolor = CPPProcess::ncolor; // the number of leading colors
+ const int nevt = gridDim.x * blockDim.x;
+ const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+ // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last)
+ //return cxtype_ref( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old"
+ // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last)
+ // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS
+ return cxtype_ref( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1"
+ // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last)
+ //return cxtype_ref( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2"
+ }
+ static __device__ inline const cxtype
+ kernelAccessIcolConst( const fptype* buffer, const int icol )
+ {
+ const int ncolor = CPPProcess::ncolor; // the number of leading colors
+ const int nevt = gridDim.x * blockDim.x;
+ const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
+ // Original "old" striding for CUDA kernels: ncolor separate 2*nevt matrices for each color (ievt last)
+ //return cxtype( buffer[icol * 2 * nevt + ievt], buffer[icol * 2 * nevt + nevt + ievt] ); // "old"
+ // New "new1" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (ievt last)
+ // The "new1" striding is now used for both HASBLAS=hasBlas and hasNoBlas builds and for both CUDA kernels and cuBLAS
+ return cxtype( buffer[0 * ncolor * nevt + icol * nevt + ievt], buffer[1 * ncolor * nevt + icol * nevt + ievt] ); // "new1"
+ // New "new2" striding for cuBLAS: two separate ncolor*nevt matrices for each of real and imag (icol last)
+ //return cxtype( buffer[0 * nevt * ncolor + ievt * ncolor + icol], buffer[1 * nevt * ncolor + ievt * ncolor + icol] ); // "new2"
+ }
+ };
+#else
+ class HostAccessJamp
+ {
+ public:
+ static inline cxtype_sv&
+ kernelAccessIcol( cxtype_sv* buffer, const int icol )
+ {
+ return buffer[icol];
+ }
+ static inline cxtype_sv&
+ kernelAccessIcol( fptype* buffer, const int icol )
+ {
+ return reinterpret_cast<cxtype_sv*>( buffer )[icol];
+ }
+ };
+#endif
+
+ //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+ void createNormalizedColorMatrix();
+#endif
+
+ //--------------------------------------------------------------------------
+
+#ifndef MGONGPUCPP_GPUIMPL
+ void
+ color_sum_cpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity
+ const cxtype_sv* allJamp_sv, // input: jamp_sv[ncolor] (float/double) or jamp_sv[2*ncolor] (mixed) for one specific helicity
+ const int ievt0 ); // input: first event number in current C++ event page (for CUDA, ievt depends on threadid)
+#endif
+
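
The "new1" striding hardcoded in DeviceAccessJamp above is easy to check on the host: the real parts of all jamps form one contiguous ncolor*nevt block with ievt fastest, and the imaginary parts a second identical block, which is exactly what lets color_sum_blas hand each block to cuBLAS/hipBLAS as a dense matrix. A standalone illustration with toy sizes (not plugin code):

  #include <cstdio>

  int main()
  {
    const int ncolor = 2, nevt = 4; // toy sizes
    for( int icol = 0; icol < ncolor; icol++ )
      for( int ievt = 0; ievt < nevt; ievt++ )
      {
        const int iReal = 0 * ncolor * nevt + icol * nevt + ievt; // "new1" real part offset
        const int iImag = 1 * ncolor * nevt + icol * nevt + ievt; // "new1" imaginary part offset
        printf( "icol=%d ievt=%d -> re@%2d im@%2d\n", icol, ievt, iReal, iImag );
      }
    return 0;
  }
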
+ //--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+ void
+ color_sum_gpu( fptype* allMEs, // output: allMEs[nevt], add |M|^2 for this specific helicity
+ const fptype* allJamps, // input: jamp[ncolor*2*nevt] for one specific helicity
+ fptype2* allBlasTmp, // tmp: blasTmp[ncolor*2*nevt] or blasTmp[(2*ncolor*2+1)*nevt] for one specific helicity
+ gpuStream_t stream, // input: cuda stream (nullptr indicates the default stream)
+ gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+ const int gpublocks, // input: cuda gpublocks
+ const int gputhreads ); // input: cuda gputhreads
+#endif
+
+ //--------------------------------------------------------------------------
+}
+
+#endif // COLOR_SUM_H
diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk
index 20d8ded718..e7360b29e2 100644
--- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk
@@ -1,7 +1,7 @@
-# Copyright (C) 2020-2024 CERN and UCLouvain.
+# Copyright (C) 2020-2025 CERN and UCLouvain.
 # Licensed under the GNU Lesser General Public License (version 3 or later).
 # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin.
-# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+# Further modified by: S. Hageboeck, D. Massaro, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.
 #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html)
 #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts
@@ -114,7 +114,7 @@ export CXXFLAGS
 override CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null))
 # Set HIP_HOME from the path to hipcc, if it exists
-override HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null))
+override HIP_HOME = $(shell hipconfig --rocmpath)
 # Configure CUDA_INC (for CURAND and NVTX) and NVTX if a CUDA installation exists (see #965)
 ifeq ($(CUDA_HOME),)
@@ -229,6 +229,8 @@ ifeq ($(BACKEND),cuda)
 else ifeq ($(BACKEND),hip)
+  # example architecture values MI200:gfx90a, MI300X:gfx942
+  MADGRAPH_HIP_ARCHITECTURE ?= gfx942
   # Set GPUCC as $(HIP_HOME)/bin/hipcc (it was already checked above that this exists)
   GPUCC = $(HIP_HOME)/bin/hipcc
   XCOMPILERFLAG =
@@ -243,7 +245,7 @@ else ifeq ($(BACKEND),hip)
   ###GPUFLAGS += -ggdb # FOR DEBUGGING ONLY
   # AMD HIP architecture flags
-  GPUARCHFLAGS = --offload-arch=gfx90a
+  GPUARCHFLAGS = --offload-arch=${MADGRAPH_HIP_ARCHITECTURE}
   GPUFLAGS += $(GPUARCHFLAGS)
   # Other AMD-specific flags
@@ -477,6 +479,34 @@ endif
 #-------------------------------------------------------------------------------
+#=== Configure defaults and check if user-defined choices exist for HASBLAS
+
+# Set the default HASBLAS (cuBLAS/hipBLAS) choice and check prior choices for HASBLAS
+
+ifeq ($(HASBLAS),)
+  ifeq ($(GPUCC),) # CPU-only build
+    override HASBLAS = hasNoBlas
+  else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+    ifeq ($(wildcard $(CUDA_HOME)/include/cublas_v2.h),)
+      # cuBLAS headers do not exist??
+      override HASBLAS = hasNoBlas
+    else
+      override HASBLAS = hasBlas
+    endif
+  else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build
+    ifeq ($(wildcard $(HIP_HOME)/include/hipblas/hipblas.h),)
+      # hipBLAS headers do not exist??
+      override HASBLAS = hasNoBlas
+    else
+      override HASBLAS = hasBlas
+    endif
+  else
+    override HASBLAS = hasNoBlas
+  endif
+endif
+
+#-------------------------------------------------------------------------------
+
 #=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD
 # Set the build flags appropriate to OMPFLAGS
@@ -597,6 +627,30 @@ endif
 #$(info RNDCXXFLAGS=$(RNDCXXFLAGS))
 #$(info RNDLIBFLAGS=$(RNDLIBFLAGS))
+#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASBLAS
+
+$(info HASBLAS=$(HASBLAS))
+override BLASCXXFLAGS=
+override BLASLIBFLAGS=
+
+# Set the BLASCXXFLAGS and BLASLIBFLAGS build flags appropriate to each HASBLAS choice (example: "make HASBLAS=hasNoBlas")
+ifeq ($(HASBLAS),hasNoBlas)
+  override BLASCXXFLAGS += -DMGONGPU_HAS_NO_BLAS
+else ifeq ($(HASBLAS),hasBlas)
+  ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+    override BLASLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcublas
+  else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build
+    override BLASLIBFLAGS = -L$(HIP_HOME)/lib/ -lhipblas
+  endif
+else
+  $(error Unknown HASBLAS='$(HASBLAS)': only 'hasBlas' and 'hasNoBlas' are supported)
+endif
+CXXFLAGS += $(BLASCXXFLAGS)
+GPUFLAGS += $(BLASCXXFLAGS)
+
+#$(info BLASCXXFLAGS=$(BLASCXXFLAGS))
+#$(info BLASLIBFLAGS=$(BLASLIBFLAGS))
+
 #-------------------------------------------------------------------------------
 #=== Configure Position-Independent Code
@@ -780,12 +834,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 ifneq ($(GPUCC),)
 MG5AMC_GPULIB = mg5amc_$(processid_short)_$(GPUSUFFIX)
-gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o
+gpu_objects_lib=$(BUILDDIR)/CPPProcess_$(GPUSUFFIX).o $(BUILDDIR)/color_sum_$(GPUSUFFIX).o $(BUILDDIR)/MatrixElementKernels_$(GPUSUFFIX).o $(BUILDDIR)/BridgeKernels_$(GPUSUFFIX).o $(BUILDDIR)/CrossSectionKernels_$(GPUSUFFIX).o
 gpu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/RamboSamplingKernels_$(GPUSUFFIX).o
 endif
@@ -799,7 +853,7 @@ ifneq ($(GPUCC),)
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(BUILDDIR)/fbridge_$(GPUSUFFIX).o
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: gpu_objects_lib += $(BUILDDIR)/fbridge_$(GPUSUFFIX).o
 $(LIBDIR)/lib$(MG5AMC_GPULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib)
-	$(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB)
+	$(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) $(BLASLIBFLAGS)
 # Bypass std::filesystem completely to ease portability on LUMI #803
 #ifneq ($(findstring hipcc,$(GPUCC)),)
 #	$(GPUCC) --shared -o $@ $(gpu_objects_lib) $(GPULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs
@@ -832,6 +886,7 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531
$(gpu_checkmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(gpu_checkmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_checkmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_checkmain): $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(GPUCC) -o $@ $(BUILDDIR)/check_sa_$(GPUSUFFIX).o $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_$(GPUSUFFIX).o $(BUILDDIR)/HiprandRandomNumberKernel_$(GPUSUFFIX).o $(RNDLIBFLAGS) endif @@ -876,9 +931,10 @@ ifeq ($(UNAME_S),Darwin) $(gpu_fcheckmain): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(gpu_fcheckmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_fcheckmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_fcheckmain): $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBDIR)/lib$(MG5AMC_GPULIB).so $(gpu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) -lstdc++ -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa_fortran.o $(BUILDDIR)/fsampler_$(GPUSUFFIX).o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_GPULIB) $(gpu_objects_exe) endif @@ -977,9 +1033,10 @@ $(cxx_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_ob else # link only runTest_$(GPUSUFFIX).o (new: in the past, this was linking both runTest_cpp.o and runTest_$(GPUSUFFIX).o) ###$(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSASAN) $(gpu_testmain): LIBFLAGS += $(GPULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(gpu_testmain): LIBFLAGS += $(BLASLIBFLAGS) $(gpu_testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(gpu_objects_lib) $(gpu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(shell cd -L $(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../..; pwd)/lib -lamdhip64 + $(FC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lstdc++ -lpthread -L$(HIP_HOME)/lib -lamdhip64 else $(GPUCC) -o $@ $(gpu_objects_lib) $(gpu_objects_exe) -ldl $(LIBFLAGS) -lcuda endif diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/diagram_boilerplate.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/diagram_boilerplate.h new file mode 100644 index 0000000000..96a34fb1bf --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/diagram_boilerplate.h @@ -0,0 +1,103 @@ +// Copyright (C) 2020-2025 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024-2025) for the MG5aMC CUDACPP plugin. 
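
The boilerplate that follows documents the GPU wavefunction super-buffer layout in detail: nwf wavefunction blocks of nevt*nw6*nx2 fptypes each, with one contiguous [RIRIRIRIRIRI] run of nw6*nx2 = 12 values per event. A quick standalone check of that pointer arithmetic (toy sizes, editorial illustration only):

  #include <cassert>

  int main()
  {
    const int nwf = 5, nevt = 8, nw6 = 6, nx2 = 2; // toy nwf/nevt; nw6 and nx2 as in the comments below
    double wfs[5 * 8 * 6 * 2];
    double* w_fp[5];
    for( int i = 0; i < nwf; i++ ) w_fp[i] = wfs + i * nevt * nw6 * nx2; // as in the boilerplate
    // The iw6-th complex component of wavefunction iwf for event ievt:
    const int iwf = 3, ievt = 2, iw6 = 4;
    double* re = &w_fp[iwf][ievt * nw6 * nx2 + iw6 * nx2]; // stride between events is nw6*nx2 = 12
    double* im = re + 1;                                   // imaginary part follows the real part
    assert( im - wfs == iwf * nevt * nw6 * nx2 + ievt * nw6 * nx2 + iw6 * nx2 + 1 );
    return 0;
  }
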
+ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-local-typedefs" // for CI_ACCESS and CD_ACCESS + +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + + //------------- + // GPU only + //------------- + + //using namespace mg5amcGpu; + using W_ACCESS = DeviceAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events + using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event + using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event + using J_ACCESS = DeviceAccessJamp; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events + // SCALAR channelId for the current event (CUDA) + unsigned int channelId = gpu_channelId( channelIds ); +#endif + + // Wavefunctions + // Buffer wfs for one helicity and nevt events is a DeviceBufferSimple with ( nwf * nevt * nw6 * nx2 ) fptypes + // The striding between the nwf wavefunction buffers is ( nevt * nw6 * nx2 ) fptypes + // Internally diagramXXX methods pass a w_fp[iwf] to ixx/FFV methods (as argument 'fptype wavefunctions[]') + // Internally ixx/FFV methods call 'cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions )' and then use fi[iw6] + // This means that the fi pointer must point to a [RIRIRIRIRIRI] contiguous buffer of size nw6*nx2=12 + // The striding between events is nw6*nx2=12 and this is what W_ACCESS::kernelAccess must respect + // (En passant, note that this means that events cannot be contiguous in the present code, memory is not coalesced) + const int nevt = gridDim.x * blockDim.x; + fptype* w_fp[nwf]; + for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = wfs + iwf * nevt * nw6 * mgOnGpu::nx2; + + // Couplings + constexpr size_t nxcoup = ndcoup + nIPC; // both dependent and independent couplings (FIX #823: nIPC instead of nicoup) + const fptype* allCOUPs[nxcoup]; +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +#pragma nv_diagnostic push +#pragma nv_diag_suppress 186 // e.g. 
<<warning #186-D: pointless comparison of unsigned integer with zero>>
+#endif
+ // Dependent couplings, vary event-by-event
+ for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ )
+ allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( couplings, idcoup );
+ // Independent couplings, fixed for all events
+ for( size_t iicoup = 0; iicoup < nIPC; iicoup++ ) // (FIX #823: nIPC instead of nicoup)
+ allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup );
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
+#pragma nv_diagnostic pop
+#endif
+ const fptype* COUPs[nxcoup];
+ for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup];
+
+#else
+
+ //-------------
+ // C++ only
+ //-------------
+
+ //using namespace mg5amcCpu;
+ using W_ACCESS = HostAccessWavefunctions; // non-trivial access (with kernel splitting): buffer includes all events
+ using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (local variable for one event): buffer for one event
+ using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events
+ using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event
+ using J_ACCESS = HostAccessJamp;
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events
+ using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events
+ // SCALAR channelId for the current SIMD event page (C++)
+ unsigned int channelId = *channelIds;
+#endif
+
+ // Wavefunctions
+ // Reinterpret wfs as "cxtype_sv w_sv[nwf][nw6]" and build "fptype* w_fp[nwf]" where "w_fp[iwf] = (fptype*)( w_sv[iwf] )"
+ fptype (*w_fp)[nw6 * neppV * mgOnGpu::nx2] = (fptype (*)[nw6 * neppV * mgOnGpu::nx2])(wfs);
+
+#endif
+
+ //-------------
+ // GPU or C++
+ //-------------
+
+ // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV)
+ cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram
+ fptype* amp_fp; // proof of concept for using fptype* in the interface
+ amp_fp = reinterpret_cast<fptype*>( amp_sv );
+
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+ // Numerators and denominators for the current event (CUDA) or SIMD event page (C++)
+ fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators );
+ fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators );
+#else
+ assert( channelIds == nullptr );
+ assert( numerators == nullptr );
+ assert( denominators == nullptr );
+#endif /* clang-format on */
+
+#pragma GCC diagnostic pop
diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/runTest.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/runTest.cc
index 4eec5db13c..216a90a302 100644
--- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/runTest.cc
+++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/runTest.cc
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.
 //----------------------------------------------------------------------------
 // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests
 //----------------------------------------------------------------------------
@@ -22,6 +22,8 @@
 #endif
 #include "epoch_process_id.h"
+#include
+
 #ifdef MGONGPUCPP_GPUIMPL
 using namespace mg5amcGpu;
 #else
diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/HelAmps_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_tt.sa/src/HelAmps_MSSM_SLHA2.h
index 9ed58e24f1..22a10cc1e3 100644
--- a/epochX/cudacpp/susy_gg_tt.sa/src/HelAmps_MSSM_SLHA2.h
+++ b/epochX/cudacpp/susy_gg_tt.sa/src/HelAmps_MSSM_SLHA2.h
@@ -2,13 +2,13 @@
 // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors.
 // Created by: J. Alwall (Sep 2010) for the MG5aMC backend.
 //==========================================================================
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
@@ -860,7 +860,7 @@ namespace mg5amcCpu
 //==========================================================================
 // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6]
- template<class W_ACCESS, class C_ACCESS>
+ template<class W_ACCESS, class CD_ACCESS>
 __device__ INLINE void
 VVV1P0_1( const fptype allV2[],
 const fptype allV3[],
@@ -873,7 +873,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
 // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6]
- template<class W_ACCESS, class A_ACCESS, class C_ACCESS>
+ template<class W_ACCESS, class A_ACCESS, class CD_ACCESS>
 __device__ INLINE void
 FFV1_0( const fptype allF1[],
 const fptype allF2[],
@@ -885,7 +885,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
 // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6]
- template<class W_ACCESS, class C_ACCESS>
+ template<class W_ACCESS, class CD_ACCESS>
 __device__ INLINE void
 FFV1_1( const fptype allF2[],
 const fptype allV3[],
@@ -898,7 +898,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
 // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6]
- template<class W_ACCESS, class C_ACCESS>
+ template<class W_ACCESS, class CD_ACCESS>
 __device__ INLINE void
 FFV1_2( const fptype allF1[],
 const fptype allV3[],
@@ -911,7 +911,7 @@ namespace mg5amcCpu
 //==========================================================================
 // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6]
- template<class W_ACCESS, class C_ACCESS>
+ template<class W_ACCESS, class CD_ACCESS>
 __device__ void
 VVV1P0_1( const fptype allV2[],
 const fptype allV3[],
@@ -924,7 +924,7 @@ namespace mg5amcCpu
 mgDebug( 0, __FUNCTION__ );
 const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 );
 const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 );
- const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP );
+ const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP );
 cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 );
 const cxtype cI = cxmake( 0., 1. );
 const fptype_sv P2[4] = { +cxreal( V2[0] ), +cxreal( V2[1] ), +cximag( V2[1] ), +cximag( V2[0] ) };
@@ -949,7 +949,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
 // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6]
- template<class W_ACCESS, class A_ACCESS, class C_ACCESS>
+ template<class W_ACCESS, class A_ACCESS, class CD_ACCESS>
 __device__ void
 FFV1_0( const fptype allF1[],
 const fptype allF2[],
@@ -962,7 +962,7 @@ namespace mg5amcCpu
 const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 );
 const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 );
 const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 );
- const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP );
+ const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP );
 cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes );
 const cxtype cI = cxmake( 0., 1. );
 const cxtype_sv TMP5 = ( F1[2] * ( F2[4] * ( V3[2] + V3[5] ) + F2[5] * ( V3[3] + cI * V3[4] ) ) + ( F1[3] * ( F2[4] * ( V3[3] - cI * V3[4] ) + F2[5] * ( V3[2] - V3[5] ) ) + ( F1[4] * ( F2[2] * ( V3[2] - V3[5] ) - F2[3] * ( V3[3] + cI * V3[4] ) ) + F1[5] * ( F2[2] * ( -V3[3] + cI * V3[4] ) + F2[3] * ( V3[2] + V3[5] ) ) ) ) );
@@ -974,7 +974,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
 // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6]
- template<class W_ACCESS, class C_ACCESS>
+ template<class W_ACCESS, class CD_ACCESS>
 __device__ void
 FFV1_1( const fptype allF2[],
 const fptype allV3[],
@@ -987,7 +987,7 @@ namespace mg5amcCpu
 mgDebug( 0, __FUNCTION__ );
 const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 );
 const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 );
- const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP );
+ const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP );
 cxtype_sv* F1 = W_ACCESS::kernelAccess( allF1 );
 const cxtype cI = cxmake( 0., 1. );
 F1[0] = +F2[0] + V3[0];
@@ -1006,7 +1006,7 @@ namespace mg5amcCpu
 //--------------------------------------------------------------------------
 // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6]
- template<class W_ACCESS, class C_ACCESS>
+ template<class W_ACCESS, class CD_ACCESS>
 __device__ void
 FFV1_2( const fptype allF1[],
 const fptype allV3[],
@@ -1019,7 +1019,7 @@ namespace mg5amcCpu
 mgDebug( 0, __FUNCTION__ );
 const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 );
 const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 );
- const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP );
+ const cxtype_sv COUP = CD_ACCESS::kernelAccessConst( allCOUP );
 cxtype_sv* F2 = W_ACCESS::kernelAccess( allF2 );
 const cxtype cI = cxmake( 0., 1. );
 F2[0] = +F1[0] + V3[0];
diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.cc b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.cc
index aa00d6a9e4..0a62a7059c 100644
--- a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.cc
+++ b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.cc
@@ -1,13 +1,13 @@
 // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors.
 // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend.
 //==========================================================================
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h
index 3e29f2ccbe..2b51d933c5 100644
--- a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h
+++ b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h
@@ -1,13 +1,13 @@
 // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors.
 // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend.
 //==========================================================================
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 // This file has been automatically generated for CUDA/C++ standalone by
-// MadGraph5_aMC@NLO v. 3.6.0, 2024-09-30
+// MadGraph5_aMC@NLO v. 3.6.3, 2025-06-12
 // By the MadGraph5_aMC@NLO Development Team
 // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch
 //==========================================================================
@@ -878,7 +878,7 @@ namespace mg5amcCpu
 #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <>
 #endif
 // Compute the output couplings (e.g. gc10 and gc11) from the input gs
- template<class G_ACCESS, class C_ACCESS>
+ template<class G_ACCESS, class CD_ACCESS>
 __device__ inline void
 G2COUP( const fptype gs[],
 fptype couplings[],
@@ -888,10 +888,10 @@
 using namespace Parameters_MSSM_SLHA2_dependentCouplings;
 const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs );
 DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr );
- fptype* GC_6s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_6 );
- fptype* GC_51s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_51 );
- cxtype_sv_ref GC_6s_sv = C_ACCESS::kernelAccess( GC_6s );
- cxtype_sv_ref GC_51s_sv = C_ACCESS::kernelAccess( GC_51s );
+ fptype* GC_6s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_6 );
+ fptype* GC_51s = CD_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_51 );
+ cxtype_sv_ref GC_6s_sv = CD_ACCESS::kernelAccess( GC_6s );
+ cxtype_sv_ref GC_51s_sv = CD_ACCESS::kernelAccess( GC_51s );
 GC_6s_sv = couplings_sv.GC_6;
 GC_51s_sv = couplings_sv.GC_51;
 mgDebug( 1, __FUNCTION__ );
diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuConfig.h
index d3c4ca5695..8a2804aa04 100644
--- a/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuConfig.h
@@ -1,7 +1,7 @@
-// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Copyright (C) 2020-2025 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2025) for the MG5aMC CUDACPP plugin.
 #ifndef MGONGPUCONFIG_H
 #define MGONGPUCONFIG_H 1
@@ -74,6 +74,7 @@
 #define MGONGPU_FPTYPE2_DOUBLE 1 // default
 //#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster
 #endif
+
 // Choose whether to inline all HelAmps functions
 // This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229)
 // By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS
@@ -108,10 +109,23 @@
 #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float)
 #endif
+// Choose if cuBLAS and hipBLAS are supported for computing color sums
+// For both CUDA and HIP, by default, enable BLAS, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_BLAS
+// (there may exist CUDA/HIP installations, e.g. using the HPC package, which do not include cuBLAS/hipBLAS?)
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
+//#undef MGONGPU_HAS_NO_BLAS // default
+////#define MGONGPU_HAS_NO_BLAS 1
+#elif defined __HIPCC__
+//#undef MGONGPU_HAS_NO_BLAS // default
+////#define MGONGPU_HAS_NO_BLAS 1
+#else
+#define MGONGPU_HAS_NO_BLAS 1
+#endif
+
 // CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
 #ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
 #undef MGONGPU_NSIGHT_DEBUG // default in CUDA
-//#define MGONGPU_NSIGHT_DEBUG 1
+//#define MGONGPU_NSIGHT_DEBUG 1 // CURRENTLY NO LONGER SUPPORTED!
 #else
 #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++
 #endif /* clang-format on */
@@ -232,19 +246,19 @@ using mgOnGpu::fptype2;
 #endif /* clang-format off */
-// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
+// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation [NB: CURRENTLY NO LONGER SUPPORTED!]
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) -#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; -#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } -#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } -#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } -#else +//#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) +//#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +//#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +//#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +//#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +//#else #define mgDebugDeclare() /*noop*/ -#define mgDebugInitialise() { /*noop*/ } -#define mgDebug( code, text ) { /*noop*/ } -#define mgDebugFinalise() { /*noop*/ } -#endif /* clang-format on */ +#define mgDebugInitialise() /*noop*/ +#define mgDebug( code, text ) /*noop*/ +#define mgDebugFinalise() /*noop*/ +//#endif /* clang-format on */ // Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuCxtypes.h index 92d74fd6db..e98e925f2a 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ -// Copyright (C) 2020-2024 CERN and UCLouvain. +// Copyright (C) 2020-2025 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -717,12 +717,24 @@ namespace mg5amcCpu : m_preal( &r ), m_pimag( &i ) {} // copy (create from) const refs cxtype_ref& operator=( const cxtype_ref& ) = delete; //__host__ __device__ cxtype_ref& operator=( cxtype_ref&& c ) {...} // REMOVED! Should copy refs or copy values? 
No longer needed in cxternary - __host__ __device__ cxtype_ref& operator=( const cxtype& c ) + __host__ __device__ cxtype_ref& operator=( const cxtype& c ) // copy (assign) const values { *m_preal = cxreal( c ); *m_pimag = cximag( c ); return *this; - } // copy (assign) non-const values + } + __host__ __device__ cxtype_ref& operator+=( const cxtype& c ) + { + *m_preal += cxreal( c ); + *m_pimag += cximag( c ); + return *this; + } + __host__ __device__ cxtype_ref& operator-=( const cxtype& c ) + { + *m_preal -= cxreal( c ); + *m_pimag -= cximag( c ); + return *this; + } __host__ __device__ operator cxtype() const { return cxmake( *m_preal, *m_pimag ); } private: fptype* const m_preal; // const pointer to non-const fptype R diff --git a/epochX/cudacpp/susy_gg_tt.sa/test/cudacpp_test.mk b/epochX/cudacpp/susy_gg_tt.sa/test/cudacpp_test.mk index f703a1ae7c..1f9f8bbc46 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/test/cudacpp_test.mk +++ b/epochX/cudacpp/susy_gg_tt.sa/test/cudacpp_test.mk @@ -1,7 +1,7 @@ -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. -# Further modified by: A. Valassi (2020-2024) for the CUDACPP plugin. +# Further modified by: S. Roiser, A. Valassi (2020-2025) for the CUDACPP plugin. THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) @@ -19,7 +19,7 @@ CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 all: googletest/$(INSTALLDIR)/lib64/libgtest.a googletest/CMakeLists.txt: - git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + git clone https://github.com/google/googletest.git -b v1.17.0 googletest googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt mkdir -p googletest/$(BUILDDIR) diff --git a/epochX/cudacpp/tmad/allTees.sh b/epochX/cudacpp/tmad/allTees.sh index eb39e2b302..89d91e7096 100755 --- a/epochX/cudacpp/tmad/allTees.sh +++ b/epochX/cudacpp/tmad/allTees.sh @@ -1,23 +1,41 @@ #!/bin/bash -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: A. Valassi (May 2022) for the MG5aMC CUDACPP plugin. -# Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. scrdir=$(cd $(dirname $0); pwd) host=$(hostname) if [ "${host/juwels}" != "${host}" ]; then ${scrdir}/juwelspatch.sh; fi # workaround for #498 +# Usage +function usage() +{ + echo "Usage (1): $0 [-short|-ggttggg] [-bsmonly|-nobsm] [-makeclean] [+10x] [-hip]" + echo "Run tests and check all logs" + echo "" + echo "Usage (2): $0 -checkonly" + echo "Check existing logs without running any tests" + exit 1 +} + +# Parse command line arguments +checkonly=0 short=0 bsm= flts=-dmf # "d m f" (alternative: -d_f i.e. "d f") makeclean= rmrdat= -add10x="+10x" +add10x= hip= - -while [ "$1" != "" ]; do +if [ "$1" == "-checkonly" ]; then + # Check existing logs without running any tests? 
diff --git a/epochX/cudacpp/tmad/allTees.sh b/epochX/cudacpp/tmad/allTees.sh
index eb39e2b302..89d91e7096 100755
--- a/epochX/cudacpp/tmad/allTees.sh
+++ b/epochX/cudacpp/tmad/allTees.sh
@@ -1,23 +1,41 @@
 #!/bin/bash
-# Copyright (C) 2020-2024 CERN and UCLouvain.
+# Copyright (C) 2020-2025 CERN and UCLouvain.
 # Licensed under the GNU Lesser General Public License (version 3 or later).
 # Created by: A. Valassi (May 2022) for the MG5aMC CUDACPP plugin.
-# Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin.
+# Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin.
 
 scrdir=$(cd $(dirname $0); pwd)
 
 host=$(hostname)
 if [ "${host/juwels}" != "${host}" ]; then ${scrdir}/juwelspatch.sh; fi # workaround for #498
 
+# Usage
+function usage()
+{
+  echo "Usage (1): $0 [-short|-ggttggg] [-bsmonly|-nobsm] [-makeclean] [+10x] [-hip]"
+  echo "Run tests and check all logs"
+  echo ""
+  echo "Usage (2): $0 -checkonly"
+  echo "Check existing logs without running any tests"
+  exit 1
+}
+
+# Parse command line arguments
+checkonly=0
 short=0
 bsm=
 flts=-dmf # "d m f" (alternative: -d_f i.e. "d f")
 makeclean=
 rmrdat=
-add10x="+10x"
+add10x=
 hip=
-
-while [ "$1" != "" ]; do
+if [ "$1" == "-checkonly" ]; then
+  # Check existing logs without running any tests?
+  checkonly=1
+  shift
+  if [ "$1" != "" ]; then usage; fi
+fi
+while [ "${checkonly}" == "0" ] && [ "$1" != "" ]; do
   if [ "$1" == "-short" ]; then
     short=1 # all (possibly including bsm) but ggttggg
     shift
@@ -27,8 +45,8 @@ while [ "$1" != "" ]; do
   elif [ "$1" == "-makeclean" ]; then
     makeclean=$1
     shift
-  elif [ "$1" == "-no10x" ]; then
-    add10x=""
+  elif [ "$1" == "+10x" ]; then
+    add10x=$1
     shift
   elif [ "$1" == "-bsmonly" ] && [ "$bsm" != "-nobsm" ]; then
     bsm=$1
@@ -40,42 +58,72 @@
     hip=$1
     shift
   else
-    echo "Usage: $0 [-short|-ggttggg] [-bsmonly|-nobsm] [-makeclean] [-no10x] [-hip]"
-    exit 1
+    usage
   fi
 done
 
-started="STARTED AT $(date)"
-
-if [ "${bsm}" != "-bsmonly" ]; then
-  if [ "$short" == "1" ]; then
-    ${scrdir}/teeMadX.sh -eemumu -ggtt -ggttg -ggttgg -gqttq $flts $makeclean $rmrdat $add10x $hip
-  elif [ "$short" == "-1" ]; then
-    ${scrdir}/teeMadX.sh -ggttggg $flts $makeclean $rmrdat $add10x $hip
-  else
-    ${scrdir}/teeMadX.sh -eemumu -ggtt -ggttg -ggttgg -gqttq -ggttggg $flts $makeclean $rmrdat $add10x $hip
+# Run all tests
+if [ "${checkonly}" == "0" ]; then
+  started="STARTED AT $(date)"
+  # SM tests
+  if [ "${bsm}" != "-bsmonly" ]; then
+    if [ "$short" == "1" ]; then
+      ${scrdir}/teeMadX.sh -eemumu -ggtt -ggttg -ggttgg -gqttq $flts $makeclean $rmrdat $add10x $hip
+    elif [ "$short" == "-1" ]; then
+      ${scrdir}/teeMadX.sh -ggttggg $flts $makeclean $rmrdat $add10x $hip
+    else
+      ${scrdir}/teeMadX.sh -eemumu -ggtt -ggttg -ggttgg -gqttq -ggttggg $flts $makeclean $rmrdat $add10x $hip
+    fi
   fi
-fi
-status=$?
-ended1="(SM tests)\nENDED(1) AT $(date) [Status=$status]"
-
-if [ "${bsm}" != "-nobsm" ]; then
-  if [ "$short" != "-1" ]; then
-    ${scrdir}/teeMadX.sh -heftggbb -susyggtt -susyggt1t1 -smeftggtttt $flts $makeclean $rmrdat $add10x $hip
+  status=$?
+  ended1="(SM tests)\nENDED(1) AT $(date) [Status=$status]"
+  # BSM tests
+  if [ "${bsm}" != "-nobsm" ]; then
+    if [ "$short" != "-1" ]; then
+      ${scrdir}/teeMadX.sh -heftggbb -susyggtt -susyggt1t1 -smeftggtttt $flts $makeclean $rmrdat $add10x $hip
+    fi
  fi
+  status=$?
+  ended2="(BSM tests)\nENDED(2) AT $(date) [Status=$status]"
+  # Timing information
+  echo
+  printf "\n%80s\n" |tr " " "#"
+  echo
+  echo -e "$started"
+  echo -e "$ended1"
+  echo -e "$ended2"
+  echo
 fi
-status=$?
-ended2="(BSM tests)\nENDED(1) AT $(date) [Status=$status]"
 
 # Print out the number of "OK!"s in each log (expect 24)
+for f in ${scrdir}/logs_*_mad/log_*; do echo $(cat $f | grep OK | wc -l) $f; done # expect 24
+
+# Print out any errors or aborts in the logs
 echo
-printf "\n%80s\n" |tr " " "#"
+txt=$(egrep -i '(error|abort)' tmad/logs* -r | sed 's/:0:rocdevice.cpp.*Aborting.*/rocdevice.cpp: Aborting/' | sed "s/Gpu.*Assert/Assert/")
+if [ "${txt}" == "" ]; then
+  echo "No errors or aborts found in logs"
+else
+  echo "${txt}"
+fi
+
+# Print out any asserts in the logs
 echo
-echo -e "$started"
-echo -e "$ended1"
-echo -e "$ended2"
+txt=$(grep assert tmad/logs* -r | sed "s/Gpu.*Assert/Assert/")
+if [ "${txt}" == "" ]; then
+  echo "No asserts found in logs"
+else
+  echo "${txt}"
+fi
+
+# Print out any segfaults in the logs
 echo
-for f in ${scrdir}/logs_*_mad/log_*; do echo $(cat $f | grep OK | wc -l) $f; done # expect 24
+txt=$(grep -i segmentation tmad/logs* -r | sed "s/Gpu.*Assert/Assert/")
+if [ "${txt}" == "" ]; then
+  echo "No segmentation fault found in logs"
+else
+  echo "${txt}"
+fi
 
 # Print out the MEK channelid debugging output
 echo
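In short, the refactored allTees.sh now has two modes: "./allTees.sh -checkonly" skips the teeMadX.sh runs entirely and only re-scans the existing logs_*_mad logs (counting the expected 24 "OK!" lines per log and grepping for errors, aborts, asserts and segmentation faults), while a normal run executes the SM and BSM test suites first and then performs the same log checks. Note also the flag reversal: tenfold statistics are now opt-in via "+10x", replacing the removed opt-out flag "-no10x". (The "ENDED(2)" label in the BSM timing message above corrects a copy-paste "ENDED(1)" in the patch as submitted.)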
-ended2="(BSM tests)\nENDED(1) AT $(date) [Status=$status]" # Print out the number of "OK!"s in each log (expect 24) +for f in ${scrdir}/logs_*_mad/log_*; do echo $(cat $f | grep OK | wc -l) $f; done # expect 24 + +# Print out any errors or aborts in the logs echo -printf "\n%80s\n" |tr " " "#" +txt=$(egrep -i '(error|abort)' tmad/logs* -r | sed 's/:0:rocdevice.cpp.*Aborting.*/rocdevice.cpp: Aborting/' | sed "s/Gpu.*Assert/Assert/") +if [ "${txt}" == "" ]; then + echo "No errors or aborts found in logs" +else + echo "${txt}" +fi + +# Print out any asserts in the logs echo -echo -e "$started" -echo -e "$ended1" -echo -e "$ended2" +txt=$(grep assert tmad/logs* -r | sed "s/Gpu.*Assert/Assert/") +if [ "${txt}" == "" ]; then + echo "No asserts found in logs" +else + echo "${txt}" +fi + +# Print out any segfaults in the logs echo -for f in ${scrdir}/logs_*_mad/log_*; do echo $(cat $f | grep OK | wc -l) $f; done # expect 24 +txt=$(grep -i segmentation tmad/logs* -r | sed "s/Gpu.*Assert/Assert/") +if [ "${txt}" == "" ]; then + echo "No segmentation fault found in logs" +else + echo "${txt}" +fi # Print out the MEK channelid debugging output echo diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index c9c9460105..d835a5038f 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone - +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory 
'/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:04:14 +DATE: 2025-09-24_09:40:20 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173985E-002] fbridge_mode=0 [UNWEIGHT] Wrote 3837 events (found 8192 events) - [COUNTERS] PROGRAM TOTAL : 0.7444s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7368s - [COUNTERS] Fortran MEs ( 1 ) : 0.0075s for 8192 events => throughput is 1.09E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.7611s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7533s + [COUNTERS] Fortran MEs ( 1 ) : 0.0079s for 8192 events => throughput is 1.04E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173985E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2176s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2101s - [COUNTERS] Fortran MEs ( 1 ) : 0.0075s for 8192 events => throughput is 1.09E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2226s + [COUNTERS] Fortran Overhead ( 0 ) : 
0.2147s + [COUNTERS] Fortran MEs ( 1 ) : 0.0079s for 8192 events => throughput is 1.04E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,9 +116,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173944E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2197s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2123s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0071s for 8192 events => throughput is 1.15E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2229s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2149s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0077s for 8192 events => throughput is 1.06E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -123,14 +130,14 @@ OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789448173944E-002 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.158620e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.104818e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.163690e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.106592e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,9 +161,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173944E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2221s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2173s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0045s for 8192 events => throughput is 1.81E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2226s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2171s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0052s for 8192 events => throughput is 1.58E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -168,14 +175,14 @@ OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789448173944E-002 OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.887925e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.682136e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.991506e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.752662e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,9 +206,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173971E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2160s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2124s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.50E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2201s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2165s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.47E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -213,14 +220,14 @@ OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789448173971E-002 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.590914e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.307123e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.667984e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.575803e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,9 +251,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173971E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2167s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2131s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.46E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2187s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2153s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0032s for 8192 events => throughput is 2.58E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -258,14 +265,14 @@ OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789448173971E-002 OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.636316e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.703640e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.730901e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.854606e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,9 +296,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173971E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2188s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2145s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0041s for 8192 events => throughput is 2.01E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2193s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2142s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0048s for 8192 events => throughput is 1.70E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -303,14 +310,14 @@ OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789448173971E-002 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.085135e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.807316e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.218811e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.926326e+06 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -334,10 +341,10 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173971E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.6526s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6492s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.88E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s + [COUNTERS] PROGRAM TOTAL : 0.6534s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6489s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.11E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0038s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -348,44 +355,44 @@ OK! xsec from fortran (9.2432789448173985E-002) and cuda (9.2432789448173971E-00 OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.299210e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.643554e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.632885e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.396194e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.507229e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.031975e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.868548e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.648185e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.543060e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.033996e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.911449e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.933419e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.533062e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.053853e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.164979e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.238409e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index 13ceac3a87..739cb36246 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -make USEBUILDDIR=1 BACKEND=cuda +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone -make USEBUILDDIR=1 BACKEND=cppsse4 +make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:04:39 +DATE: 2025-09-24_09:40:47 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173985E-002] fbridge_mode=0 [UNWEIGHT] Wrote 3837 events (found 8192 events) - [COUNTERS] PROGRAM TOTAL : 0.7443s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7370s - [COUNTERS] Fortran MEs ( 1 ) : 0.0073s for 8192 events => throughput is 1.12E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.7601s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7524s + [COUNTERS] Fortran MEs ( 1 ) : 0.0077s for 8192 events => throughput is 1.06E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173985E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2183s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2108s - [COUNTERS] Fortran MEs ( 1 ) : 0.0075s for 8192 events => throughput is 1.09E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2228s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2150s + [COUNTERS] Fortran MEs ( 1 ) : 0.0079s for 8192 events => throughput is 1.04E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -107,30 +114,30 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09243 [9.2432777382586498E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09243 [9.2432777448196335E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2266s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2197s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0067s for 8192 events => throughput is 1.21E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + [COUNTERS] PROGRAM TOTAL : 0.2224s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2147s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0074s for 8192 events => throughput is 1.10E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** 
(2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432777382586498E-002) differ by less than 4E-4 (1.305336294610271e-07) +OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432777448196335E-002) differ by less than 4E-4 (1.298238181401956e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.221258e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.105792e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.225429e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.149836e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -152,30 +159,30 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09243 [9.2432774839452045E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09243 [9.2432774879426222E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2220s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2190s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.89E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2190s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2156s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0032s for 8192 events => throughput is 2.56E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432774839452045E-002) differ by less than 4E-4 (1.5804696607002455e-07) +OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432774879426222E-002) differ by less than 4E-4 (1.5761449856377396e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.137547e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.737799e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.221144e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.881039e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -197,30 +204,30 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09243 [9.2432774915924193E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09243 [9.2432774837279630E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2228s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2200s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0026s for 8192 events => throughput is 3.13E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2197s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2177s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0018s for 8192 events => throughput is 4.49E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432774915924193E-002) differ by less than 4E-4 (1.5721963908532643e-07) +OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432774837279630E-002) differ by less than 4E-4 (1.5807046871429975e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.328121e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.997328e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.556846e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.092481e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -242,30 +249,30 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09243 [9.2432774915924193E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09243 [9.2432774837279630E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2241s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2212s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0027s for 8192 events => throughput is 3.09E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2195s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2175s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0018s for 8192 events => throughput is 4.62E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432774915924193E-002) differ by less than 4E-4 (1.5721963908532643e-07) +OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432774837279630E-002) differ by less than 4E-4 (1.5807046871429975e-07) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.452418e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.992748e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.604389e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.419746e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -287,30 +294,30 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09243 [9.2432778556608516E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09243 [9.2432778581375011E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2173s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2144s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0027s for 8192 events => throughput is 3.08E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2209s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2182s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0025s for 8192 events => throughput is 3.30E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432778556608516E-002) differ by less than 4E-4 (1.1783227071848756e-07) +OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432778581375011E-002) differ by less than 4E-4 (1.1756433015985834e-07) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.402847e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.799518e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.641263e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.047783e+06 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09243 [9.2432780016531851E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09243 [9.2432779972212775E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.6500s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6467s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.92E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s + [COUNTERS] PROGRAM TOTAL : 0.6551s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6505s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.21E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0039s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.2432789448173985E-002) and cuda (9.2432780016531851E-002) differ by less than 4E-4 (1.0203783951112655e-07) +OK! xsec from fortran (9.2432789448173985E-002) and cuda (9.2432779972212775E-002) differ by less than 4E-4 (1.0251731308308365e-07) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.451436e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.269152e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.688055e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.461078e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.014252e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.890598e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.229387e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.106832e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.787718e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.912802e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.220221e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.096461e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.380548e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.776097e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.826286e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.098447e+08 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index 093bec81e5..ee7fc520cb 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -make USEBUILDDIR=1 BACKEND=cuda +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone - make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:04:27 +DATE: 2025-09-24_09:40:33 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173985E-002] fbridge_mode=0 [UNWEIGHT] Wrote 3837 events (found 8192 events) - [COUNTERS] PROGRAM TOTAL : 0.7605s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7527s - [COUNTERS] Fortran MEs ( 1 ) : 0.0078s for 8192 events => throughput is 1.06E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.7587s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7510s + [COUNTERS] Fortran MEs ( 1 ) : 0.0077s for 8192 events => throughput is 1.06E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789448173985E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2221s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2144s - [COUNTERS] Fortran MEs ( 1 ) : 0.0077s for 8192 events => throughput is 1.06E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2225s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2147s + [COUNTERS] Fortran MEs ( 1 ) : 0.0078s for 8192 events => throughput is 1.04E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,9 +116,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789444986618E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2212s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2136s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0073s for 8192 events => throughput is 1.12E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2242s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2159s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0079s for 8192 events => throughput is 1.04E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -123,14 +130,14 @@ OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789444986618E-002 OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.133245e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.069043e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.115304e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.096393e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,9 +161,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789444986618E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2168s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2123s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0042s for 8192 events => throughput is 1.94E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2223s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2170s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0050s for 8192 events => throughput is 1.64E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -168,14 +175,14 @@ OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789444986618E-002 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.993139e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.684928e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.058944e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.852659e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,9 +206,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789444494415E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2174s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2138s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0034s for 8192 events => throughput is 2.42E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2193s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2154s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0036s for 8192 events => throughput is 2.29E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -213,14 +220,14 @@ OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789444494415E-002 OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.549665e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.593852e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.708708e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.824521e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,9 +251,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789444494415E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2179s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2143s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.49E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2171s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2137s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.71E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -258,14 +265,14 @@ OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789444494415E-002 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.606715e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.864576e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.748967e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.168112e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,9 +296,9 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09243 [9.2432789444494415E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.2165s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2123s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0039s for 8192 events => throughput is 2.10E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2189s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2140s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0046s for 8192 events => throughput is 1.80E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -303,14 +310,14 @@ OK! xsec from fortran (9.2432789448173985E-002) and cpp (9.2432789444494415E-002 OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.203720e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.878395e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.284212e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.905957e+06 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 2 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09243 [9.2432789437826970E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09243 [9.2432789437826984E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1589 events (found 1593 events) - [COUNTERS] PROGRAM TOTAL : 0.6505s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6470s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.77E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s + [COUNTERS] PROGRAM TOTAL : 0.6576s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6531s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 1.09E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0038s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.2432789448173985E-002) and cuda (9.2432789437826970E-002) differ by less than 2E-4 (1.1194101201539297e-10) +OK! xsec from fortran (9.2432789448173985E-002) and cuda (9.2432789437826984E-002) differ by less than 2E-4 (1.1194078997078805e-10) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.269035e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.635045e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.550305e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.290600e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.523745e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.038238e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.857337e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.623999e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.551254e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.057820e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.897534e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.893156e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.503798e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.068143e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.184430e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.257736e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index 794f102690..3f2f6b7530 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -1,4 +1,7 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx make USEBUILDDIR=1 BACKEND=cuda @@ -6,36 +9,40 @@ make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 - make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
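The "OK! xsec from fortran (...) and cpp (...) differ by less than <tol>" lines in these logs print a relative difference between the reference Fortran cross section and the cudacpp one, with a backend-dependent tolerance (3E-14 for double precision, 2E-4 for mixed, 4E-4 for float in the logs above and below). The printed deltas are exact multiples of 2**-53 (e.g. 3.3306690738754696e-16 = 3 * 2**-53), which is consistent with a double-precision computation of the form abs(1 - xsec_new / xsec_ref). The sketch below is a hedged reconstruction of that check with hypothetical names, not the repository's actual comparison script:

  # Hedged reconstruction of the xsec cross-check printed in these logs.
  # 'rel_diff' and 'check_xsec' are hypothetical names.
  def rel_diff(xsec_ref: float, xsec_new: float) -> float:
      # Double-precision relative difference; for the gg_ttx values below,
      # fortran 47.138611968034176 vs cpp 47.138611968034162 gives
      # 3.3306690738754696e-16, i.e. exactly 3 * 2**-53, as in the log.
      return abs(1.0 - xsec_new / xsec_ref)

  def check_xsec(xsec_ref: float, xsec_new: float, tolerance: float) -> None:
      delta = rel_diff(xsec_ref, xsec_new)
      assert delta < tolerance, f"xsec mismatch: {delta} >= {tolerance}"
      print(f"OK! xsec differ by less than {tolerance} ({delta})")

  check_xsec(47.138611968034176, 47.138611968034162, 3e-14)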
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:04:53 +DATE: 2025-09-24_09:40:59 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,11 +63,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=0 + [XSECTION] Cross section = 47.14 [47.138611968034176] fbridge_mode=0 [UNWEIGHT] Wrote 2613 events (found 5374 events) - [COUNTERS] PROGRAM TOTAL : 0.8494s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8073s - [COUNTERS] Fortran MEs ( 1 ) : 0.0421s for 8192 events => throughput is 1.94E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8577s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8141s + [COUNTERS] Fortran MEs ( 1 ) : 0.0436s for 8192 events => throughput is 1.88E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,11 +88,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=0 + [XSECTION] Cross section = 47.14 [47.138611968034176] fbridge_mode=0 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4510s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4085s - [COUNTERS] Fortran MEs ( 1 ) : 0.0425s for 8192 events => throughput is 1.93E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4569s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4133s + [COUNTERS] Fortran MEs ( 1 ) : 0.0435s for 8192 events => throughput is 1.88E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,28 +116,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4555s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4098s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0453s for 8192 events => throughput is 1.81E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4659s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4181s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0474s for 8192 events => throughput is 1.73E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138611968034162) differ by less than 3E-14 (0.0) +OK! 
xsec from fortran (47.138611968034176) and cpp (47.138611968034162) differ by less than 3E-14 (3.3306690738754696e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.856020e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.799101e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.865986e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.804128e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,28 +161,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4352s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4103s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0245s for 8192 events => throughput is 3.34E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4444s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4166s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0274s for 8192 events => throughput is 2.99E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138611968034162) differ by less than 3E-14 (0.0) +OK! xsec from fortran (47.138611968034176) and cpp (47.138611968034162) differ by less than 3E-14 (3.3306690738754696e-16) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.314758e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.072714e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.321531e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.083395e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -197,30 +204,30 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=1 + [XSECTION] Cross section = 47.14 [47.138611968034169] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4235s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4077s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0154s for 8192 events => throughput is 5.30E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4338s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4171s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0163s for 8192 events => throughput is 5.03E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138611968034162) differ by less than 3E-14 (0.0) +OK! xsec from fortran (47.138611968034176) and cpp (47.138611968034169) differ by less than 3E-14 (1.1102230246251565e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.263509e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.875847e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.327379e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.001397e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -242,30 +249,30 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=1 + [XSECTION] Cross section = 47.14 [47.138611968034169] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4237s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4087s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0147s for 8192 events => throughput is 5.58E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4322s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4161s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0157s for 8192 events => throughput is 5.21E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138611968034162) differ by less than 3E-14 (0.0) +OK! xsec from fortran (47.138611968034176) and cpp (47.138611968034169) differ by less than 3E-14 (1.1102230246251565e-16) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.648502e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.170619e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.831851e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.464737e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,28 +296,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138611968034169] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4297s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4071s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0222s for 8192 events => throughput is 3.69E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4392s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4144s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0244s for 8192 events => throughput is 3.36E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138611968034169) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (47.138611968034176) and cpp (47.138611968034169) differ by less than 3E-14 (1.1102230246251565e-16) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.526689e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.424710e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.574003e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.469326e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611968034176] fbridge_mode=1 + [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.8534s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8496s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.68E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0008s + [COUNTERS] PROGRAM TOTAL : 0.8664s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8607s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0013s for 8192 events => throughput is 6.31E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0044s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cuda (47.138611968034176) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (47.138611968034176) and cuda (47.138611968034162) differ by less than 3E-14 (3.3306690738754696e-16) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.103830e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.022933e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.448285e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.070706e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.875229e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.914638e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.627647e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.036305e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.886865e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.920677e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.006782e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.071087e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.862106e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.917421e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.715892e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.621738e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index 2bf2a37cc7..0353138ae2 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -1,4 +1,7 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx make USEBUILDDIR=1 BACKEND=cuda @@ -6,36 +9,40 @@ make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 -make USEBUILDDIR=1 BACKEND=cppavx2 +make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
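Each "[COUNTERS] ... => throughput is ..." line in these logs is the plain ratio of processed events to the time spent in that code section; the printed figure is derived from the unrounded timing, so recomputing it from the rounded seconds shown in the log can differ in the last digit. A minimal recomputation, with a hypothetical helper name:

  def throughput(n_events: int, seconds: float) -> float:
      # Events per second, as printed in the [COUNTERS] lines.
      return n_events / seconds

  # Fortran MEs in the gg_ttx double-precision log: 8192 events in 0.0436s.
  print(f"{throughput(8192, 0.0436):.2E} events/s")  # 1.88E+05, matching the log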
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:05:22 +DATE: 2025-09-24_09:41:29 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,11 +63,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=0 + [XSECTION] Cross section = 47.14 [47.138611968034176] fbridge_mode=0 [UNWEIGHT] Wrote 2613 events (found 5374 events) - [COUNTERS] PROGRAM TOTAL : 0.8450s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8027s - [COUNTERS] Fortran MEs ( 1 ) : 0.0423s for 8192 events => throughput is 1.94E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8551s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8118s + [COUNTERS] Fortran MEs ( 1 ) : 0.0433s for 8192 events => throughput is 1.89E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,11 +88,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=0 + [XSECTION] Cross section = 47.14 [47.138611968034176] fbridge_mode=0 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4516s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4094s - [COUNTERS] Fortran MEs ( 1 ) : 0.0422s for 8192 events => throughput is 1.94E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4590s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4153s + [COUNTERS] Fortran MEs ( 1 ) : 0.0437s for 8192 events => throughput is 1.87E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,28 +116,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138606099989779] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4548s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4118s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0426s for 8192 events => throughput is 1.92E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4597s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4149s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0444s for 8192 events => throughput is 1.84E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138606099989779) differ by less than 4E-4 (1.2448487851646206e-07) +OK! 
xsec from fortran (47.138611968034176) and cpp (47.138606099989779) differ by less than 4E-4 (1.2448487873850667e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.973574e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.883634e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.981282e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.889115e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,28 +161,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138602111070696] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4326s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4154s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0170s for 8192 events => throughput is 4.81E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + [COUNTERS] PROGRAM TOTAL : 0.4365s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4171s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0191s for 8192 events => throughput is 4.29E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138602111070696) differ by less than 4E-4 (2.091059336795098e-07) +OK! xsec from fortran (47.138611968034176) and cpp (47.138602111070696) differ by less than 4E-4 (2.091059339015544e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.659841e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.331462e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.743814e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.375376e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -197,30 +204,30 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138602499179925] fbridge_mode=1 + [XSECTION] Cross section = 47.14 [47.138602536968548] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4174s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4080s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0091s for 8192 events => throughput is 8.97E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4247s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4155s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0089s for 8192 events => throughput is 9.23E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138602499179925) differ by less than 4E-4 (2.008725722424387e-07) +OK! xsec from fortran (47.138611968034176) and cpp (47.138602536968548) differ by less than 4E-4 (2.0007092349505484e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.079796e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.572016e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.235810e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.493727e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -242,30 +249,30 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138602499179925] fbridge_mode=1 + [XSECTION] Cross section = 47.14 [47.138602536968548] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4181s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4092s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0086s for 8192 events => throughput is 9.50E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4242s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4155s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0085s for 8192 events => throughput is 9.69E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138602499179925) differ by less than 4E-4 (2.008725722424387e-07) +OK! xsec from fortran (47.138611968034176) and cpp (47.138602536968548) differ by less than 4E-4 (2.0007092349505484e-07) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.970038e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.743951e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.765544e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.946551e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -287,30 +294,30 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138606840950104] fbridge_mode=1 + [XSECTION] Cross section = 47.14 [47.138606859855095] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4258s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4131s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0124s for 8192 events => throughput is 6.60E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4274s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4154s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0117s for 8192 events => throughput is 6.98E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138606840950104) differ by less than 4E-4 (1.0876612277499476e-07) +OK! xsec from fortran (47.138611968034176) and cpp (47.138606859855095) differ by less than 4E-4 (1.083650720268281e-07) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.636236e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.335193e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.862568e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.440519e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138612402172164] fbridge_mode=1 + [XSECTION] Cross section = 47.14 [47.138612410631097] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.8671s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8634s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.65E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s + [COUNTERS] PROGRAM TOTAL : 0.8632s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8580s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0010s for 8192 events => throughput is 8.31E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0042s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cuda (47.138612402172164) differ by less than 4E-4 (9.209817353195149e-09) +OK! xsec from fortran (47.138611968034176) and cuda (47.138612410631097) differ by less than 4E-4 (9.389264921111362e-09) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.093880e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.444767e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.450343e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.714328e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.021092e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.591418e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.359313e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.094164e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.014796e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.593199e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.375647e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.068726e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.628808e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.570569e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.004427e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.377359e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index 2ae843d323..92ff8109b4 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx - +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx make USEBUILDDIR=1 BACKEND=cuda + make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
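The CHECK/GCHECK executions in these logs take arguments of the form "-p <blocks> <threads> <iterations>", and the event counts quoted in the section titles are consistent with blocks times threads per iteration: 256 x 32 = 8192, while the three GCHECK(MAX*) configurations (16384 x 32, 4096 x 128, 65536 x 8) all give 524288. This is a hedged reading of the numbers in the logs, not a statement of the tool's documented interface:

  # Grid configurations from the GCHECK sections in these logs; the quoted
  # event counts match blocks * threads (hedged reading of '-p b t i').
  configs = {
      "GCHECK(8192)": (256, 32, 1),
      "GCHECK(MAX)": (16384, 32, 1),
      "GCHECK(MAX128THR)": (4096, 128, 1),
      "GCHECK(MAX8THR)": (65536, 8, 1),
  }
  for name, (blocks, threads, _iterations) in configs.items():
      print(f"{name}: {blocks * threads} events per iteration")
  # 8192, then 524288 for each of the three MAX configurations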
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:05:08 +DATE: 2025-09-24_09:41:14 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,11 +63,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=0 + [XSECTION] Cross section = 47.14 [47.138611968034176] fbridge_mode=0 [UNWEIGHT] Wrote 2613 events (found 5374 events) - [COUNTERS] PROGRAM TOTAL : 0.8439s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8015s - [COUNTERS] Fortran MEs ( 1 ) : 0.0424s for 8192 events => throughput is 1.93E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8555s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8122s + [COUNTERS] Fortran MEs ( 1 ) : 0.0433s for 8192 events => throughput is 1.89E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,11 +88,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611968034162] fbridge_mode=0 + [XSECTION] Cross section = 47.14 [47.138611968034176] fbridge_mode=0 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4498s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4080s - [COUNTERS] Fortran MEs ( 1 ) : 0.0418s for 8192 events => throughput is 1.96E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4550s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4116s + [COUNTERS] Fortran MEs ( 1 ) : 0.0434s for 8192 events => throughput is 1.89E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,28 +116,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138613306947967] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4576s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4121s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0451s for 8192 events => throughput is 1.82E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4663s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4186s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0473s for 8192 events => throughput is 1.73E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138613306947967) differ by less than 2E-4 (2.8403759566586473e-08) +OK! 
xsec from fortran (47.138611968034176) and cpp (47.138613306947967) differ by less than 2E-4 (2.8403759344541868e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.815647e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.782893e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.845071e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.785685e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,28 +161,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138613306947953] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4358s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4106s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0248s for 8192 events => throughput is 3.31E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4477s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4206s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0267s for 8192 events => throughput is 3.07E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138613306947953) differ by less than 2E-4 (2.8403759344541868e-08) +OK! xsec from fortran (47.138611968034176) and cpp (47.138613306947953) differ by less than 2E-4 (2.8403759122497263e-08) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.291111e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.088238e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.339005e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.130119e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,28 +206,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138613350418019] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4251s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4094s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0154s for 8192 events => throughput is 5.31E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4334s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4169s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0161s for 8192 events => throughput is 5.09E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138613350418019) differ by less than 2E-4 (2.932593479165746e-08) +OK! xsec from fortran (47.138611968034176) and cpp (47.138613350418019) differ by less than 2E-4 (2.932593434756825e-08) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.315398e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.041293e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.422217e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.062752e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,28 +251,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.14 [47.138613350418019] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4227s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4081s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0143s for 8192 events => throughput is 5.73E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4313s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4156s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0153s for 8192 events => throughput is 5.35E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (47.138611968034162) and cpp (47.138613350418019) differ by less than 2E-4 (2.932593479165746e-08) +OK! xsec from fortran (47.138611968034176) and cpp (47.138613350418019) differ by less than 2E-4 (2.932593434756825e-08) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.854463e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.522361e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.901611e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.428731e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -287,30 +294,30 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138613350418019] fbridge_mode=1 + [XSECTION] Cross section = 47.14 [47.138613350418026] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.4322s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4099s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0219s for 8192 events => throughput is 3.74E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4399s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4158s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0237s for 8192 events => throughput is 3.45E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cpp (47.138613350418019) differ by less than 2E-4 (2.932593479165746e-08) +OK! xsec from fortran (47.138611968034176) and cpp (47.138613350418026) differ by less than 2E-4 (2.9325934569612855e-08) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.724588e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.523885e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.694617e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.520339e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.14 [47.138611963547788] fbridge_mode=1 + [XSECTION] Cross section = 47.14 [47.138613301020499] fbridge_mode=1 [UNWEIGHT] Wrote 1618 events (found 1623 events) - [COUNTERS] PROGRAM TOTAL : 0.8506s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8468s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.66E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0008s + [COUNTERS] PROGRAM TOTAL : 0.8657s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8600s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0012s for 8192 events => throughput is 6.56E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0044s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.138611968034162) and cuda (47.138611963547788) differ by less than 2E-4 (9.517409083059647e-11) +OK! xsec from fortran (47.138611968034176) and cuda (47.138613301020499) differ by less than 2E-4 (2.8278013930460588e-08) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.987528e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.074070e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.325954e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.013331e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.868584e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.920469e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.589038e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.036622e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.871326e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.893637e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.949192e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.072188e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.873573e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.913523e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.717025e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.621290e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index 0c7ed732ed..4e7ca0cd21 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cuda +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 - make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
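
[Editor's note] The "differ by less than 2E-4" lines in the logs above quote, in parentheses, the relative difference between the Fortran and cudacpp cross sections. Below is a minimal Python sketch of that check, assuming the denominator is the Fortran reference value (the helper name xsec_rel_diff is hypothetical, not code from the actual tmad scripts); the numbers are copied from the (2-sse4) gg_ttx comparison above and reproduce the quoted ~2.84e-08.

  # Hypothetical sketch of the xsec tolerance check, inferred from the logs:
  # rel = |xsec_new - xsec_ref| / |xsec_ref|, compared to a per-test tolerance.
  def xsec_rel_diff(xsec_ref, xsec_new):
      return abs(xsec_new - xsec_ref) / abs(xsec_ref)

  xsec_fortran = 47.138611968034176  # fortran value from the log above
  xsec_cpp = 47.138613306947953      # cpp (sse4) value from the log above
  rel = xsec_rel_diff(xsec_fortran, xsec_cpp)
  assert rel < 2e-4                  # the double-precision tolerance quoted above
  print(rel)                         # ~2.84e-08, matching the log
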
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:05:36 +DATE: 2025-09-24_09:41:44 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,11 +63,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471485809748567E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=0 [UNWEIGHT] Wrote 387 events (found 1591 events) - [COUNTERS] PROGRAM TOTAL : 0.7416s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4124s - [COUNTERS] Fortran MEs ( 1 ) : 0.3292s for 8192 events => throughput is 2.49E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7572s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4146s + [COUNTERS] Fortran MEs ( 1 ) : 0.3426s for 8192 events => throughput is 2.39E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,11 +88,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471485809748567E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=0 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.7177s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3873s - [COUNTERS] Fortran MEs ( 1 ) : 0.3304s for 8192 events => throughput is 2.48E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7338s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3903s + [COUNTERS] Fortran MEs ( 1 ) : 0.3435s for 8192 events => throughput is 2.38E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,28 +116,28 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.7353s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3872s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3470s for 8192 events => throughput is 2.36E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0011s + [COUNTERS] PROGRAM TOTAL : 0.7520s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3938s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3570s 
for 8192 events => throughput is 2.29E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0012s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471485809748553E-002) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471485809748553E-002) differ by less than 3E-14 (0.0) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.455924e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.356249e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.454100e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.346054e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -152,30 +159,30 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471485809748567E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.5656s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3857s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1793s for 8192 events => throughput is 4.57E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0007s + [COUNTERS] PROGRAM TOTAL : 0.5864s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3914s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1942s for 8192 events => throughput is 4.22E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0008s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471485809748567E-002) differ by less than 3E-14 (0.0) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471485809748553E-002) differ by less than 3E-14 (0.0) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.669927e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.278626e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.620836e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.269078e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -197,30 +204,30 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471485809748595E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471485809748540E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4792s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3884s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0903s for 8192 events => throughput is 9.07E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4900s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3924s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0970s for 8192 events => throughput is 8.45E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471485809748595E-002) differ by less than 3E-14 (4.440892098500626e-16) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471485809748540E-002) differ by less than 3E-14 (2.220446049250313e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.331277e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.703454e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.327490e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.541417e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -242,30 +249,30 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471485809748595E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471485809748540E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4693s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3876s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0812s for 8192 events => throughput is 1.01E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4796s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3912s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0878s for 8192 events => throughput is 9.33E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471485809748595E-002) differ by less than 3E-14 (4.440892098500626e-16) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471485809748540E-002) differ by less than 3E-14 (2.220446049250313e-16) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.048553e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.482223e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.042752e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.443069e+04 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -287,30 +294,30 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471485809748581E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471485809748567E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.5035s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3879s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1150s for 8192 events => throughput is 7.13E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5127s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3945s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1175s for 8192 events => throughput is 6.97E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0007s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471485809748581E-002) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471485809748567E-002) differ by less than 3E-14 (2.220446049250313e-16) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.198283e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.098352e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.275587e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.113802e+04 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -334,58 +341,58 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.8395s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8270s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0095s for 8192 events => throughput is 8.61E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0030s + [COUNTERS] PROGRAM TOTAL : 0.8451s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8302s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0068s for 8192 events => throughput is 1.20E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0081s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cuda (7.8471485809748553E-002) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (7.8471485809748553E-002) and cuda (7.8471485809748553E-002) differ by less than 3E-14 (0.0) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.111479e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.216712e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.523607e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.331258e+06 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 8192 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.454522e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.110985e+06 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 8192 32 1 *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.167720e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.131706e+06 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 2048 128 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.412863e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.111080e+06 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 2048 128 1 *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.174227e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.157398e+06 ) sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 32768 8 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.441638e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.112521e+06 ) sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 32768 8 1 *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK 
-EvtsPerSec[MECalcOnly] (3a) = ( 3.653840e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.025597e+06 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index fbc0c57cb4..640f442f94 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg - +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
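
[Editor's note] The [COUNTERS] blocks above decompose PROGRAM TOTAL into the Fortran Overhead, CudaCpp MEs and CudaCpp HEL timers, and the printed ME throughput is the event count divided by the ME timer alone. A short sketch of that arithmetic, assuming the decomposition inferred from the logs (the numbers are copied from the (2-none) gg_ttxg run above):

  # Assumed [COUNTERS] arithmetic, inferred from the log lines above:
  nevents = 8192
  t_overhead = 0.3938  # Fortran Overhead ( 0 )
  t_mes = 0.3570       # CudaCpp MEs ( 2 )
  t_hel = 0.0012       # CudaCpp HEL ( 3 )
  total = t_overhead + t_mes + t_hel  # 0.7520s, the PROGRAM TOTAL above
  throughput = nevents / t_mes        # ~2.29E+04 events/s, as printed above
  print(total, throughput)
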
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:06:11 +DATE: 2025-09-24_09:42:22 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,11 +63,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471485809748567E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=0 [UNWEIGHT] Wrote 387 events (found 1591 events) - [COUNTERS] PROGRAM TOTAL : 0.7420s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4102s - [COUNTERS] Fortran MEs ( 1 ) : 0.3318s for 8192 events => throughput is 2.47E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7606s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4162s + [COUNTERS] Fortran MEs ( 1 ) : 0.3444s for 8192 events => throughput is 2.38E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,11 +88,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471485809748567E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=0 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.7176s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3860s - [COUNTERS] Fortran MEs ( 1 ) : 0.3316s for 8192 events => throughput is 2.47E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7336s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3898s + [COUNTERS] Fortran MEs ( 1 ) : 0.3438s for 8192 events => throughput is 2.38E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -107,30 +114,30 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471473453718410E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471473940337211E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.7234s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3899s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3325s for 8192 events => throughput is 2.46E+04 events/s - [COUNTERS] CudaCpp HEL 
( 3 ) : 0.0009s + [COUNTERS] PROGRAM TOTAL : 0.7380s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3930s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3440s for 8192 events => throughput is 2.38E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0010s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471473453718410E-002) differ by less than 4E-4 (1.574588530672827e-07) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471473940337211E-002) differ by less than 4E-4 (1.5125763475065668e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.535876e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.432966e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.542086e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.427997e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -152,30 +159,30 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471459294758378E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471459242542743E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4904s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3886s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1014s for 8192 events => throughput is 8.08E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4989s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3918s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1065s for 8192 events => throughput is 7.69E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471459294758378E-002) differ by less than 4E-4 (3.37893311330717e-07) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471459242542743E-002) differ by less than 4E-4 (3.385587202808793e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.182689e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.892225e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.204950e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.896609e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -197,30 +204,30 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471459718665412E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471459599782634E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4358s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3891s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0464s for 8192 events => throughput is 1.77E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4470s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3963s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0503s for 8192 events => throughput is 1.63E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471459718665412E-002) differ by less than 4E-4 (3.324912595248364e-07) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471459599782634E-002) differ by less than 4E-4 (3.340062399992405e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.782969e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.654755e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.783579e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.675872e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -242,30 +249,30 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471459718665412E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471459599782634E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4301s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3871s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0426s for 8192 events => throughput is 1.92E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4423s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3934s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0484s for 8192 events => throughput is 1.69E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471459718665412E-002) differ by less than 4E-4 (3.324912595248364e-07) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471459599782634E-002) differ by less than 4E-4 (3.340062399992405e-07) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.968891e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.853168e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.969858e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.860470e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -287,30 +294,30 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471471932611128E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471471449789984E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4447s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3892s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0551s for 8192 events => throughput is 1.49E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s + [COUNTERS] PROGRAM TOTAL : 0.4496s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3920s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0571s for 8192 events => throughput is 1.43E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471471932611128E-002) differ by less than 4E-4 (1.768430569759616e-07) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471471449789984E-002) differ by less than 4E-4 (1.8299587956072116e-07) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.481854e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.461497e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.468460e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.463643e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471475012321185E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471471527735093E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.8373s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8327s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0035s for 8192 events => throughput is 2.36E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0011s + [COUNTERS] PROGRAM TOTAL : 0.8445s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8332s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0038s for 8192 events => throughput is 2.14E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0074s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cuda (7.8471475012321185E-002) differ by less than 4E-4 (1.375968260441951e-07) +OK! xsec from fortran (7.8471485809748553E-002) and cuda (7.8471471527735093E-002) differ by less than 4E-4 (1.8200258744549558e-07) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.717098e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.317074e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.890243e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.450306e+06 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 8192 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.313606e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.688503e+06 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 8192 32 1 *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.232701e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.686499e+06 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 2048 128 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.300307e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.685793e+06 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 2048 128 1 *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.230438e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.646028e+06 ) sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 32768 8 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.193713e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.689493e+06 ) sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 32768 8 1 *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK 
-EvtsPerSec[MECalcOnly] (3a) = ( 1.247962e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.483427e+06 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index 2422d3068f..93f6dedb27 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg - +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg make USEBUILDDIR=1 BACKEND=cuda + make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
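
[Editor's note] In the CHECK/GCHECK headers of the gg_ttxg logs above, the '-p <blocks> <threads> <iterations>' arguments fix the number of events per launch as the product of the three values; this revision halves the GCHECK(MAX*) grids (e.g. from '-p 16384 32 1' to '-p 8192 32 1', i.e. from 524288 to 262144 events per launch). A tiny sketch of that bookkeeping, assuming the product rule inferred from the logs (events_per_launch is a hypothetical helper):

  # Assumed mapping from '-p blocks threads iterations' to events per launch:
  def events_per_launch(blocks, threads, iterations=1):
      return blocks * threads * iterations

  assert events_per_launch(256, 32) == 8192      # CHECK(8192) / GCHECK(8192)
  assert events_per_launch(8192, 32) == 262144   # GCHECK(MAX), new grid
  assert events_per_launch(2048, 128) == 262144  # GCHECK(MAX128THR), new grid
  assert events_per_launch(32768, 8) == 262144   # GCHECK(MAX8THR), new grid
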
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:05:53 +DATE: 2025-09-24_09:42:03 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,11 +63,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471485809748567E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=0 [UNWEIGHT] Wrote 387 events (found 1591 events) - [COUNTERS] PROGRAM TOTAL : 0.7391s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4096s - [COUNTERS] Fortran MEs ( 1 ) : 0.3295s for 8192 events => throughput is 2.49E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7572s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4143s + [COUNTERS] Fortran MEs ( 1 ) : 0.3429s for 8192 events => throughput is 2.39E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,11 +88,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471485809748567E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.07847 [7.8471485809748553E-002] fbridge_mode=0 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.7165s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3855s - [COUNTERS] Fortran MEs ( 1 ) : 0.3310s for 8192 events => throughput is 2.47E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7324s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3899s + [COUNTERS] Fortran MEs ( 1 ) : 0.3424s for 8192 events => throughput is 2.39E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -107,30 +114,30 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471486590207584E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471486590207598E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.7396s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3874s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3510s for 8192 events => throughput is 2.33E+04 events/s - [COUNTERS] CudaCpp HEL 
( 3 ) : 0.0011s + [COUNTERS] PROGRAM TOTAL : 0.7532s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3918s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3602s for 8192 events => throughput is 2.27E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0012s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471486590207584E-002) differ by less than 2E-4 (9.945765766516956e-09) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471486590207598E-002) differ by less than 2E-4 (9.945766210606166e-09) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.409349e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.326368e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.415956e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.324676e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -152,30 +159,30 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471486540430027E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471486557993325E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.5676s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3876s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1792s for 8192 events => throughput is 4.57E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0007s + [COUNTERS] PROGRAM TOTAL : 0.5817s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3911s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1899s for 8192 events => throughput is 4.31E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0008s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471486540430027E-002) differ by less than 2E-4 (9.311426296676473e-09) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471486557993325E-002) differ by less than 2E-4 (9.535244149816435e-09) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.653483e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.378332e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.691370e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.388241e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -197,30 +204,30 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471486395956899E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471486463614223E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4809s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3907s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0896s for 8192 events => throughput is 9.14E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4866s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3913s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0947s for 8192 events => throughput is 8.65E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471486395956899E-002) differ by less than 2E-4 (7.470335683379403e-09) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471486463614223E-002) differ by less than 2E-4 (8.332525780474498e-09) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.402724e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.875956e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.391101e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.709061e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -242,30 +249,30 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471486395956899E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471486463614223E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.4660s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3858s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0796s for 8192 events => throughput is 1.03E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4783s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3908s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0870s for 8192 events => throughput is 9.42E+04 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471486395956899E-002) differ by less than 2E-4 (7.470335683379403e-09) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471486463614223E-002) differ by less than 2E-4 (8.332525780474498e-09) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.055172e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.543350e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.066925e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.670798e+04 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,28 +296,28 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07847 [7.8471486537749241E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.5026s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3850s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1170s for 8192 events => throughput is 7.00E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s + [COUNTERS] PROGRAM TOTAL : 0.5086s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3920s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1159s for 8192 events => throughput is 7.07E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0007s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cpp (7.8471486537749241E-002) differ by less than 2E-4 (9.277263846030337e-09) +OK! xsec from fortran (7.8471485809748553E-002) and cpp (7.8471486537749241E-002) differ by less than 2E-4 (9.277264068074942e-09) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.005425e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.165999e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.056979e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.185185e+04 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 16 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07847 [7.8471485791426987E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07847 [7.8471486423885309E-002] fbridge_mode=1 [UNWEIGHT] Wrote 376 events (found 1358 events) - [COUNTERS] PROGRAM TOTAL : 0.8432s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8306s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0095s for 8192 events => throughput is 8.66E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0031s + [COUNTERS] PROGRAM TOTAL : 0.8467s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8318s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0068s for 8192 events => throughput is 1.20E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0081s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.8471485809748567E-002) and cuda (7.8471485791426987E-002) differ by less than 2E-4 (2.334807902570901e-10) +OK! xsec from fortran (7.8471485809748553E-002) and cuda (7.8471486423885309E-002) differ by less than 2E-4 (7.826240988606514e-09) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.128450e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.212267e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.439893e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.293617e+06 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 8192 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.421024e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.111513e+06 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 8192 32 1 *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.153444e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.131049e+06 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 2048 128 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.432988e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.112576e+06 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 2048 128 1 *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.169695e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.159084e+06 ) sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 32768 8 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.432146e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.112106e+06 ) sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 32768 8 1 *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK 
-EvtsPerSec[MECalcOnly] (3a) = ( 3.638179e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.027373e+06 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index 5517ab4292..5c8e19e910 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:06:26 +DATE: 2025-09-24_09:42:39 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=0 [UNWEIGHT] Wrote 7 events (found 223 events) - [COUNTERS] PROGRAM TOTAL : 4.6353s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3093s - [COUNTERS] Fortran MEs ( 1 ) : 4.3260s for 8192 events => throughput is 1.89E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.8674s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3029s + [COUNTERS] Fortran MEs ( 1 ) : 4.5645s for 8192 events => throughput is 1.79E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=0 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 4.5825s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2904s - [COUNTERS] Fortran MEs ( 1 ) : 4.2921s for 8192 events => throughput is 1.91E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.8604s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2983s + [COUNTERS] Fortran MEs ( 1 ) : 4.5621s for 8192 events => throughput is 1.80E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,10 +116,10 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240192] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 4.7512s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2946s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.4476s for 8192 events => throughput is 1.84E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0090s + [COUNTERS] PROGRAM TOTAL : 4.9836s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2961s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.6779s for 8192 events => throughput is 1.75E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0096s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -123,14 +130,14 @@ OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786561240192) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.894558e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.790589e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.891638e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.786472e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -152,30 +159,30 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144786561240192] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144786561240180] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 2.6638s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2927s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.3665s for 8192 events => throughput is 3.46E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0045s + [COUNTERS] PROGRAM TOTAL : 2.8513s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2953s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.5508s for 8192 events => throughput is 3.21E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0052s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786561240192) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786561240180) differ by less than 3E-14 (5.551115123125783e-16) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.547129e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.252169e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.542201e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.243534e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,10 +206,10 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 1.3332s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2951s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0356s for 8192 events => throughput is 7.91E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0025s + [COUNTERS] PROGRAM TOTAL : 1.4460s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2978s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1455s for 8192 events => throughput is 7.15E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0027s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -213,14 +220,14 @@ OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786561240197) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.118919e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.371211e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.114943e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.386101e+03 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,10 +251,10 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 1.2181s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2951s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9207s for 8192 events => throughput is 8.90E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0022s + [COUNTERS] PROGRAM TOTAL : 1.3562s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2953s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0584s for 8192 events => throughput is 7.74E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0025s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -258,14 +265,14 @@ OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786561240197) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.276674e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.283097e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.241984e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.316461e+03 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,10 +296,10 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 1.4646s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2912s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1708s for 8192 events => throughput is 7.00E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0026s + [COUNTERS] PROGRAM TOTAL : 1.5549s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2975s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.2543s for 8192 events => throughput is 6.53E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0031s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -303,14 +310,14 @@ OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786561240197) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.083404e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.637926e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.099846e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.577309e+03 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144786561240192] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 0.8110s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7374s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0388s for 8192 events => throughput is 2.11E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0348s + [COUNTERS] PROGRAM TOTAL : 0.8768s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7472s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0731s for 8192 events => throughput is 1.12E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0565s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (0.33144786561240197) and cuda (0.33144786561240192) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (0.33144786561240197) and cuda (0.33144786561240197) differ by less than 3E-14 (0.0) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.149005e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.128754e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.350783e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.152335e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.129093e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.031773e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 512 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.172100e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.039843e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.126645e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.032424e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.170032e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.039735e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.144552e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.036401e+05 ) 
sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.426547e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.128114e+05 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index 78567e12c9..dc699c27b6 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg - +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cppnone + make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:08:49 +DATE: 2025-09-24_09:44:27 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=0 [UNWEIGHT] Wrote 7 events (found 223 events) - [COUNTERS] PROGRAM TOTAL : 4.5864s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2940s - [COUNTERS] Fortran MEs ( 1 ) : 4.2923s for 8192 events => throughput is 1.91E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.8605s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2988s + [COUNTERS] Fortran MEs ( 1 ) : 4.5616s for 8192 events => throughput is 1.80E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=0 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 4.5924s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2886s - [COUNTERS] Fortran MEs ( 1 ) : 4.3038s for 8192 events => throughput is 1.90E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.8552s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2967s + [COUNTERS] Fortran MEs ( 1 ) : 4.5585s for 8192 events => throughput is 1.80E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -107,30 +114,30 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144941544531159] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144941829360230] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 4.6210s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2941s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.3184s for 8192 events => throughput is 1.90E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) 
: 0.0085s + [COUNTERS] PROGRAM TOTAL : 4.9162s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2951s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.6117s for 8192 events => throughput is 1.78E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0093s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144786561240197) and cpp (0.33144941544531159) differ by less than 4E-4 (4.675947774535061e-06) +OK! xsec from fortran (0.33144786561240197) and cpp (0.33144941829360230) differ by less than 4E-4 (4.684541254906804e-06) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.957206e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.821921e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.957921e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.823714e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -152,30 +159,30 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144937378275385] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144937034821881] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 1.4924s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2933s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1966s for 8192 events => throughput is 6.85E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0025s + [COUNTERS] PROGRAM TOTAL : 1.5793s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2964s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.2802s for 8192 events => throughput is 6.40E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0027s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144786561240197) and cpp (0.33144937378275385) differ by less than 4E-4 (4.550249099066761e-06) +OK! xsec from fortran (0.33144786561240197) and cpp (0.33144937034821881) differ by less than 4E-4 (4.539886881094191e-06) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.048957e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.669582e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.041651e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.614973e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -197,30 +204,30 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144939353225550] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144939883924923] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 0.8128s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2926s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5189s for 8192 events => throughput is 1.58E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0013s + [COUNTERS] PROGRAM TOTAL : 0.8721s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2946s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5761s for 8192 events => throughput is 1.42E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0015s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144786561240197) and cpp (0.33144939353225550) differ by less than 4E-4 (4.609834643787281e-06) +OK! xsec from fortran (0.33144786561240197) and cpp (0.33144939883924923) differ by less than 4E-4 (4.62584619276285e-06) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.622272e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.458780e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.613287e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.456486e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -242,30 +249,30 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144939353225550] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144939883924923] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 0.7779s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2950s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4817s for 8192 events => throughput is 1.70E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0012s + [COUNTERS] PROGRAM TOTAL : 0.8324s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2951s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5359s for 8192 events => throughput is 1.53E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0014s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144786561240197) and cpp (0.33144939353225550) differ by less than 4E-4 (4.609834643787281e-06) +OK! xsec from fortran (0.33144786561240197) and cpp (0.33144939883924923) differ by less than 4E-4 (4.62584619276285e-06) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.826080e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.615038e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.802534e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.617243e+04 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -287,30 +294,30 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144947551388249] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144947512238093] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 0.8771s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2920s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5836s for 8192 events => throughput is 1.40E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0015s + [COUNTERS] PROGRAM TOTAL : 0.9197s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2991s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.6189s for 8192 events => throughput is 1.32E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0017s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144786561240197) and cpp (0.33144947551388249) differ by less than 4E-4 (4.857178601991308e-06) +OK! xsec from fortran (0.33144786561240197) and cpp (0.33144947512238093) differ by less than 4E-4 (4.855997415953439e-06) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.430502e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.336496e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.421428e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.341570e+04 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144955535316123] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144805623008405] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 0.7866s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7350s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0271s for 8192 events => throughput is 3.02E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0245s + [COUNTERS] PROGRAM TOTAL : 0.8568s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7429s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0579s for 8192 events => throughput is 1.42E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0560s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144786561240197) and cuda (0.33144955535316123) differ by less than 4E-4 (5.0980589545446264e-06) +OK! xsec from fortran (0.33144786561240197) and cuda (0.33144805623008405) differ by less than 4E-4 (5.751060780934125e-07) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.089397e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.427342e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.388762e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.450556e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.126017e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.235203e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 512 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.254976e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.256194e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.087410e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.240003e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.221892e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.276462e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.084262e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.251882e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK 
-EvtsPerSec[MECalcOnly] (3a) = ( 2.392382e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.720302e+05 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index 0f7d6f4131..80beb65efd 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg - +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 + make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:07:37 +DATE: 2025-09-24_09:43:33 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=0 [UNWEIGHT] Wrote 7 events (found 223 events) - [COUNTERS] PROGRAM TOTAL : 4.5989s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2922s - [COUNTERS] Fortran MEs ( 1 ) : 4.3067s for 8192 events => throughput is 1.90E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.8647s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3010s + [COUNTERS] Fortran MEs ( 1 ) : 4.5637s for 8192 events => throughput is 1.80E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786561240197] fbridge_mode=0 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 4.6012s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2899s - [COUNTERS] Fortran MEs ( 1 ) : 4.3113s for 8192 events => throughput is 1.90E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.8662s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2985s + [COUNTERS] Fortran MEs ( 1 ) : 4.5676s for 8192 events => throughput is 1.79E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,10 +116,10 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786734542164] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 4.8059s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2941s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.5027s for 8192 events => throughput is 1.82E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0091s + [COUNTERS] PROGRAM TOTAL : 5.0375s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2977s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.7301s for 8192 events => throughput is 1.73E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0097s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -123,14 +130,14 @@ OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786734542164) differ OK! 
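Editorial note: the [COUNTERS] throughput figures above are self-consistent — dividing the event count by the reported ME time reproduces the quoted rate. A quick check in Python, using the Fortran counters from the new log above (sketch only):

    # 8192 events in 4.5637 s of Fortran MEs => ~1.80E+03 events/s, as logged
    events = 8192
    me_seconds = 4.5637
    print(f"throughput = {events / me_seconds:.2E} events/s")  # 1.80E+03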
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.881337e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.768670e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.867505e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.766915e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,10 +161,10 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] ChannelId = 112 [XSECTION] Cross section = 0.3314 [0.33144786651655289] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 2.6829s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2920s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.3862s for 8192 events => throughput is 3.43E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0047s + [COUNTERS] PROGRAM TOTAL : 2.7375s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2980s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.4343s for 8192 events => throughput is 3.37E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0052s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -168,14 +175,14 @@ OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786651655289) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.548157e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.422239e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.537868e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.438997e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -197,30 +204,30 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144786627894518] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144786627894512] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 1.3285s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2936s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0325s for 8192 events => throughput is 7.93E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0024s + [COUNTERS] PROGRAM TOTAL : 1.4216s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2955s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1235s for 8192 events => throughput is 7.29E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0026s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (0.33144786561240197) and cpp (0.33144786627894518) differ by less than 2E-4 (2.0110046961008265e-09) +OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786627894512) differ by less than 2E-4 (2.0110044740562216e-09) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.171504e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.443598e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.183239e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.406915e+03 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -242,30 +249,30 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144786627894518] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144786627894512] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 1.1999s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2927s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9051s for 8192 events => throughput is 9.05E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0022s + [COUNTERS] PROGRAM TOTAL : 1.3011s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2981s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0006s for 8192 events => throughput is 8.19E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0024s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786627894518) differ by less than 2E-4 (2.0110046961008265e-09) +OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786627894512) differ by less than 2E-4 (2.0110044740562216e-09) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
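Editorial note: the cross-section checks quote a relative difference in parentheses; it is consistent with |xsec_cpp - xsec_fortran| / xsec_fortran, as the avx2/512y values above show. A minimal sketch (the actual comparison is performed by the tmad test scripts, which may compute it slightly differently):

    # values copied from the avx2/512y comparison above
    xsec_fortran = 0.33144786561240197
    xsec_cpp     = 0.33144786627894512
    rel = abs(xsec_cpp - xsec_fortran) / xsec_fortran
    print(rel)         # ~2.0110e-09, matching the logged figure
    assert rel < 2e-4  # the tolerance quoted in the log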
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.165581e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.412011e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.350878e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.381821e+03 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -287,30 +294,30 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144786627894518] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144786627894512] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 1.4750s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2928s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1793s for 8192 events => throughput is 6.95E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0029s + [COUNTERS] PROGRAM TOTAL : 1.5416s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2972s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.2413s for 8192 events => throughput is 6.60E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0030s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786627894518) differ by less than 2E-4 (2.0110046961008265e-09) +OK! xsec from fortran (0.33144786561240197) and cpp (0.33144786627894512) differ by less than 2E-4 (2.0110044740562216e-09) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.035517e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.701255e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.843003e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.645821e+03 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 123 channels { 112 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 104 [XSECTION] ChannelId = 112 - [XSECTION] Cross section = 0.3314 [0.33144786533876569] fbridge_mode=1 + [XSECTION] Cross section = 0.3314 [0.33144786662983072] fbridge_mode=1 [UNWEIGHT] Wrote 7 events (found 213 events) - [COUNTERS] PROGRAM TOTAL : 0.8136s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7401s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0389s for 8192 events => throughput is 2.11E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0347s + [COUNTERS] PROGRAM TOTAL : 0.8743s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7450s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0728s for 8192 events => throughput is 1.13E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0566s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.33144786561240197) and cuda (0.33144786533876569) differ by less than 2E-4 (8.255786054789382e-10) +OK! xsec from fortran (0.33144786561240197) and cuda (0.33144786662983072) differ by less than 2E-4 (3.0696494235371574e-09) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.142259e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.127471e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.350796e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.156691e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.127674e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.034512e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 512 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.154284e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.039347e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.123213e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.033378e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.173815e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.036749e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.121978e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.037400e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK 
-EvtsPerSec[MECalcOnly] (3a) = ( 1.416494e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.126745e+05 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index 74862dd5f7..0e9d9d2f29 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg - -make USEBUILDDIR=1 BACKEND=cuda +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -make USEBUILDDIR=1 BACKEND=cppsse4 +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
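Editorial note: in the GCHECK command lines above, `-p <blocks> <threads> <iterations>` appears to fix the GPU grid, so `-p 256 32 1` corresponds to 256 x 32 = 8192 events per iteration. The MAX-style configurations were scaled down in the new logs (e.g. from `-p 16384 32 1`, 524288 events, to `-p 512 32 1`, 16384 events), so the MAX/MAX128THR/MAX8THR throughputs are not directly comparable between the two runs. A sketch of the arithmetic (grid semantics inferred from the 8192-event runs elsewhere in the logs, not confirmed here):

    def events_per_iteration(blocks, threads):
        # one event per GPU thread per iteration (inferred convention)
        return blocks * threads

    print(events_per_iteration(256, 32))    # 8192   (the standard GCHECK grid)
    print(events_per_iteration(16384, 32))  # 524288 (old MAX grid)
    print(events_per_iteration(512, 32))    # 16384  (new MAX grid)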
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:10:25 +DATE: 2025-09-24_09:45:59 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282417E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 100.9475s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5365s - [COUNTERS] Fortran MEs ( 1 ) : 100.4109s for 8192 events => throughput is 8.16E+01 events/s + [COUNTERS] PROGRAM TOTAL : 102.4483s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5357s + [COUNTERS] Fortran MEs ( 1 ) : 101.9126s for 8192 events => throughput is 8.04E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282417E-007] fbridge_mode=0 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 100.8105s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5296s - [COUNTERS] Fortran MEs ( 1 ) : 100.2810s for 8192 events => throughput is 8.17E+01 events/s + [COUNTERS] PROGRAM TOTAL : 102.4309s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5346s + [COUNTERS] Fortran MEs ( 1 ) : 101.8963s for 8192 events => throughput is 8.04E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,10 +116,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282475E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 127.1376s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5284s - [COUNTERS] CudaCpp MEs ( 2 ) : 126.4018s for 8192 events => throughput is 6.48E+01 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.2074s + [COUNTERS] PROGRAM TOTAL : 163.4088s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5347s + [COUNTERS] CudaCpp MEs ( 2 ) : 162.5706s for 8192 events => throughput is 5.04E+01 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.3035s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -123,14 +130,14 @@ OK! 
xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561551282475E-007 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.678586e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.226735e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.694101e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.259225e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,10 +161,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282467E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 61.7097s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5331s - [COUNTERS] CudaCpp MEs ( 2 ) : 61.0765s for 8192 events => throughput is 1.34E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.1001s + [COUNTERS] PROGRAM TOTAL : 86.5586s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5341s + [COUNTERS] CudaCpp MEs ( 2 ) : 85.8620s for 8192 events => throughput is 9.54E+01 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.1625s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -168,14 +175,14 @@ OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561551282467E-007 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.591189e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.949901e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.580161e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.747834e+01 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,10 +206,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282467E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 29.3577s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5263s - [COUNTERS] CudaCpp MEs ( 2 ) : 28.7837s for 8192 events => throughput is 2.85E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0477s + [COUNTERS] PROGRAM TOTAL : 40.6226s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5340s + [COUNTERS] CudaCpp MEs ( 2 ) : 40.0118s for 8192 events => throughput is 2.05E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0767s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -213,14 +220,14 @@ OK! 
xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561551282467E-007 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.407090e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.131860e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.415212e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.159412e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,10 +251,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282467E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 26.2469s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5271s - [COUNTERS] CudaCpp MEs ( 2 ) : 25.6788s for 8192 events => throughput is 3.19E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0410s + [COUNTERS] PROGRAM TOTAL : 37.6969s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5334s + [COUNTERS] CudaCpp MEs ( 2 ) : 37.0922s for 8192 events => throughput is 2.21E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0712s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -258,14 +265,14 @@ OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561551282467E-007 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.913687e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.294972e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.895964e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.345904e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,10 +296,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282467E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 26.1607s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5255s - [COUNTERS] CudaCpp MEs ( 2 ) : 25.5871s for 8192 events => throughput is 3.20E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0481s + [COUNTERS] PROGRAM TOTAL : 34.2348s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5355s + [COUNTERS] CudaCpp MEs ( 2 ) : 33.6335s for 8192 events => throughput is 2.44E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0658s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -303,14 +310,14 @@ OK! 
xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561551282467E-007 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.408791e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.512973e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.444614e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.511766e+02 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.357e-07 [2.3572561551282475E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.357e-07 [2.3572561551282422E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 3.3131s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1215s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1040s for 8192 events => throughput is 7.42E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 1.0875s + [COUNTERS] PROGRAM TOTAL : 4.4192s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2237s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.8337s for 8192 events => throughput is 4.47E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 1.3618s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3572561551282417E-007) and cuda (2.3572561551282475E-007) differ by less than 3E-14 (2.4424906541753444e-15) +OK! xsec from fortran (2.3572561551282417E-007) and cuda (2.3572561551282422E-007) differ by less than 3E-14 (2.220446049250313e-16) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
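Editorial note: in the double-precision ggttggg run above, the Fortran and CUDA cross sections now agree to a relative difference of 2.220446049250313e-16, which is exactly the IEEE-754 double-precision machine epsilon (2^-52), i.e. agreement to the last representable digit; the 3E-14 tolerance is met comfortably. This can be verified directly:

    import sys
    # 2^-52 is the spacing between 1.0 and the next representable double
    print(sys.float_info.epsilon)  # 2.220446049250313e-16
    print(2.0**-52)                # same value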
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.491511e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.501671e+03 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.275455e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.526830e+03 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.282089e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.504711e+03 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.552042e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.533077e+03 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 64 128 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.301465e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.507550e+03 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 64 128 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.448921e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.566191e+03 ) sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 1024 8 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.252906e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.504247e+03 ) sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 1024 8 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK 
-EvtsPerSec[MECalcOnly] (3a) = ( 3.241973e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.878509e+03 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index bfa4b4cda4..78f71e7886 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg make USEBUILDDIR=1 BACKEND=cuda - make USEBUILDDIR=1 BACKEND=cppnone - make USEBUILDDIR=1 BACKEND=cppsse4 + + make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:42:40 +DATE: 2025-09-24_10:29:04 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282417E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 100.8152s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5282s - [COUNTERS] Fortran MEs ( 1 ) : 100.2871s for 8192 events => throughput is 8.17E+01 events/s + [COUNTERS] PROGRAM TOTAL : 102.7811s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5356s + [COUNTERS] Fortran MEs ( 1 ) : 102.2455s for 8192 events => throughput is 8.01E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282417E-007] fbridge_mode=0 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 100.7247s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5322s - [COUNTERS] Fortran MEs ( 1 ) : 100.1925s for 8192 events => throughput is 8.18E+01 events/s + [COUNTERS] PROGRAM TOTAL : 102.6282s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5399s + [COUNTERS] Fortran MEs ( 1 ) : 102.0883s for 8192 events => throughput is 8.02E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -100,7 +107,6 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 @@ -108,30 +114,30 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.358e-07 [2.3575849446922190E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.358e-07 [2.3575849656360290E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 112.7914s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5240s - [COUNTERS] CudaCpp MEs ( 2 ) : 112.0829s for 8192 events => throughput is 7.31E+01 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.1845s + [COUNTERS] PROGRAM TOTAL : 149.9291s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5340s + [COUNTERS] CudaCpp MEs ( 2 ) : 149.1165s for 8192 events => throughput is 5.49E+01 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.2786s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3575849446922190E-007) differ by less than 4E-4 (0.00013947977747852391) +OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3575849656360290E-007) differ by less than 4E-4 (0.00013948866230428791) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.631916e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.705349e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.625132e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.750941e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -146,7 +152,6 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 @@ -154,30 +159,30 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.358e-07 [2.3575845178322101E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.358e-07 [2.3575845268372665E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 28.7980s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5271s - [COUNTERS] CudaCpp MEs ( 2 ) : 28.2235s for 8192 events => throughput is 2.90E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0474s + [COUNTERS] PROGRAM TOTAL : 39.0745s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5342s + [COUNTERS] CudaCpp MEs ( 2 ) : 38.4658s for 8192 events => throughput is 2.13E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0745s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3575845178322101E-007) differ by less than 4E-4 (0.0001392986940575991) +OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3575845268372665E-007) differ by less than 4E-4 (0.0001393025142009119) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.386203e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.225201e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.374145e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.224404e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -192,7 +197,6 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 @@ -200,30 +204,30 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.358e-07 [2.3575845169411084E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.358e-07 [2.3575845201396375E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 14.8120s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5245s - [COUNTERS] CudaCpp MEs ( 2 ) : 14.2638s for 8192 events => throughput is 5.74E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0236s + [COUNTERS] PROGRAM TOTAL : 20.3883s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5317s + [COUNTERS] CudaCpp MEs ( 2 ) : 19.8182s for 8192 events => throughput is 4.13E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0383s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3575845169411084E-007) differ by less than 4E-4 (0.0001392983160326544) +OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3575845201396375E-007) differ by less than 4E-4 (0.00013929967291903544) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.872770e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.313431e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.864576e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.334992e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -238,7 +242,6 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 @@ -246,30 +249,30 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.358e-07 [2.3575845169411084E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.358e-07 [2.3575845201396375E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 13.3091s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5262s - [COUNTERS] CudaCpp MEs ( 2 ) : 12.7618s for 8192 events => throughput is 6.42E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0211s + [COUNTERS] PROGRAM TOTAL : 18.8938s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5335s + [COUNTERS] CudaCpp MEs ( 2 ) : 18.3248s for 8192 events => throughput is 4.47E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0355s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3575845169411084E-007) differ by less than 4E-4 (0.0001392983160326544) +OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3575845201396375E-007) differ by less than 4E-4 (0.00013929967291903544) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.728743e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.590348e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.768099e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.692981e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -284,7 +287,6 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 @@ -292,30 +294,30 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.358e-07 [2.3575850859831750E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.358e-07 [2.3575850881931771E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 13.2286s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5280s - [COUNTERS] CudaCpp MEs ( 2 ) : 12.6780s for 8192 events => throughput is 6.46E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0225s + [COUNTERS] PROGRAM TOTAL : 17.3656s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5340s + [COUNTERS] CudaCpp MEs ( 2 ) : 16.7981s for 8192 events => throughput is 4.88E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0336s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3575850859831750E-007) differ by less than 4E-4 (0.00013953971621538663) +OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3575850881931771E-007) differ by less than 4E-4 (0.0001395406537467725) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.948019e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.100289e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.969717e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.096706e+02 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -337,60 +339,60 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.358e-07 [2.3575862304433055E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.357e-07 [2.3572568179359759E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 2.2079s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1084s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5470s for 8192 events => throughput is 1.50E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.5524s + [COUNTERS] PROGRAM TOTAL : 3.8207s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2702s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1821s for 8192 events => throughput is 6.93E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 1.3684s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (2.3572561551282417E-007) and cuda (2.3575862304433055E-007) differ by less than 4E-4 (0.00014002522141920437) +OK! xsec from fortran (2.3572561551282417E-007) and cuda (2.3572568179359759E-007) differ by less than 4E-4 (2.8117764494517417e-07) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.517499e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.967387e+03 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.545233e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.769635e+03 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.140576e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.948508e+03 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.181453e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.760546e+03 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 64 128 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.126165e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.948816e+03 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 64 128 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.164632e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.904736e+03 ) sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 1024 8 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.163932e+04 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 6.951778e+03 ) sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 1024 8 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.073078e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.835102e+03 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index 3a68950921..ccd6dbacb7 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -1,4 +1,7 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg make USEBUILDDIR=1 BACKEND=cuda @@ -9,33 +12,37 @@ make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:26:37 +DATE: 2025-09-24_10:07:44 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282417E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 101.1381s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5302s - [COUNTERS] Fortran MEs ( 1 ) : 100.6080s for 8192 events => throughput is 8.14E+01 events/s + [COUNTERS] PROGRAM TOTAL : 102.4899s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5342s + [COUNTERS] Fortran MEs ( 1 ) : 101.9557s for 8192 events => throughput is 8.03E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561551282417E-007] fbridge_mode=0 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 100.8808s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5357s - [COUNTERS] Fortran MEs ( 1 ) : 100.3451s for 8192 events => throughput is 8.16E+01 events/s + [COUNTERS] PROGRAM TOTAL : 102.5084s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5366s + [COUNTERS] Fortran MEs ( 1 ) : 101.9718s for 8192 events => throughput is 8.03E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,10 +116,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561678995975E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 123.7239s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5356s - [COUNTERS] CudaCpp MEs ( 2 ) : 122.9787s for 8192 events => throughput is 
6.66E+01 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.2095s + [COUNTERS] PROGRAM TOTAL : 164.8069s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5331s + [COUNTERS] CudaCpp MEs ( 2 ) : 163.9850s for 8192 events => throughput is 5.00E+01 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.2889s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -123,14 +130,14 @@ OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561678995975E-007 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.634632e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.294826e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.608909e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.254752e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,10 +161,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561701257335E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 64.5975s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5274s - [COUNTERS] CudaCpp MEs ( 2 ) : 63.9661s for 8192 events => throughput is 1.28E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.1041s + [COUNTERS] PROGRAM TOTAL : 85.1648s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5347s + [COUNTERS] CudaCpp MEs ( 2 ) : 84.4703s for 8192 events => throughput is 9.70E+01 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.1598s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -168,14 +175,14 @@ OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561701257335E-007 OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.549992e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.034210e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.544779e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.028899e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,10 +206,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561705911026E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 28.6856s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5254s - [COUNTERS] CudaCpp MEs ( 2 ) : 28.1150s for 8192 events => throughput is 2.91E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0453s + [COUNTERS] PROGRAM TOTAL : 39.0824s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5320s + [COUNTERS] CudaCpp MEs ( 2 ) : 38.4775s for 8192 events => throughput is 2.13E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0729s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -213,14 +220,14 @@ OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561705911026E-007 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.581303e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.206988e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.574698e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.236323e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,10 +251,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561705911026E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 24.6205s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5315s - [COUNTERS] CudaCpp MEs ( 2 ) : 24.0503s for 8192 events => throughput is 3.41E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0387s + [COUNTERS] PROGRAM TOTAL : 36.0617s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5329s + [COUNTERS] CudaCpp MEs ( 2 ) : 35.4622s for 8192 events => throughput is 2.31E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0666s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -258,14 +265,14 @@ OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561705911026E-007 OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.161373e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.428075e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.184852e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.418588e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,10 +296,10 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.357e-07 [2.3572561705911026E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 25.7441s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5280s - [COUNTERS] CudaCpp MEs ( 2 ) : 25.1699s for 8192 events => throughput is 3.25E+02 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0462s + [COUNTERS] PROGRAM TOTAL : 33.2889s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5333s + [COUNTERS] CudaCpp MEs ( 2 ) : 32.6936s for 8192 events => throughput is 2.51E+02 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0621s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -303,14 +310,14 @@ OK! xsec from fortran (2.3572561551282417E-007) and cpp (2.3572561705911026E-007 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.516660e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.614731e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.515216e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.619097e+02 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 1240 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.357e-07 [2.3572561518129465E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.357e-07 [2.3572561698474940E-007] fbridge_mode=1 [UNWEIGHT] Wrote 18 events (found 285 events) - [COUNTERS] PROGRAM TOTAL : 2.8461s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0822s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8795s for 8192 events => throughput is 9.31E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.8844s + [COUNTERS] PROGRAM TOTAL : 4.1792s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2194s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.6813s for 8192 events => throughput is 4.87E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 1.2785s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (2.3572561551282417E-007) and cuda (2.3572561518129465E-007) differ by less than 2E-4 (1.4064212017217415e-09) +OK! xsec from fortran (2.3572561551282417E-007) and cuda (2.3572561698474940E-007) differ by less than 2E-4 (6.244231132157552e-09) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.415473e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.916582e+03 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.080771e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.946512e+03 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.106752e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.914249e+03 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.156598e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.944163e+03 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 64 128 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.106849e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.911624e+03 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 64 128 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.103409e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.995050e+03 ) sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 1024 8 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.111142e+04 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 4.914631e+03 ) sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 1024 8 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.667428e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.331554e+03 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index 7310cfc72a..94f8459e48 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu + make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone - - make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:09:42 +DATE: 2025-09-24_09:45:12 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737126] fbridge_mode=0 [UNWEIGHT] Wrote 506 events (found 1943 events) - [COUNTERS] PROGRAM TOTAL : 0.5319s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4597s - [COUNTERS] Fortran MEs ( 1 ) : 0.0722s for 8192 events => throughput is 1.13E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.5393s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4659s + [COUNTERS] Fortran MEs ( 1 ) : 0.0734s for 8192 events => throughput is 1.12E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737126] fbridge_mode=0 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4765s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4047s - [COUNTERS] Fortran MEs ( 1 ) : 0.0718s for 8192 events => throughput is 1.14E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4907s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4175s + [COUNTERS] Fortran MEs ( 1 ) : 0.0731s for 8192 events => throughput is 1.12E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,9 +116,9 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737132] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4865s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4077s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0781s for 8192 events => throughput is 1.05E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4907s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4099s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0802s for 8192 events => throughput is 1.02E+05 events/s [COUNTERS] 
CudaCpp HEL ( 3 ) : 0.0007s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -123,14 +130,14 @@ OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504505737132) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.073164e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.028696e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.079140e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.033450e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,10 +161,10 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737170] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4492s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4062s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0425s for 8192 events => throughput is 1.93E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s + [COUNTERS] PROGRAM TOTAL : 0.4548s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4088s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0454s for 8192 events => throughput is 1.80E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -168,14 +175,14 @@ OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504505737170) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.895347e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.786731e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.917908e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.801695e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -197,30 +204,30 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313504505737162] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313504505737170] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4396s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4134s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0257s for 8192 events => throughput is 3.19E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4395s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4127s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0262s for 8192 events => throughput is 3.12E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504505737162) differ by less than 3E-14 (1.7763568394002505e-15) +OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504505737170) differ by less than 3E-14 (2.220446049250313e-15) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.340027e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.148665e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.307491e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.230434e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -242,30 +249,30 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313504505737162] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313504505737170] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4310s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4082s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0224s for 8192 events => throughput is 3.66E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4358s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4102s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0251s for 8192 events => throughput is 3.26E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504505737162) differ by less than 3E-14 (1.7763568394002505e-15) +OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504505737170) differ by less than 3E-14 (2.220446049250313e-15) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.693677e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.280342e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.718907e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.447820e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -287,30 +294,30 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313504505737162] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313504505737170] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4438s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4093s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0340s for 8192 events => throughput is 2.41E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s + [COUNTERS] PROGRAM TOTAL : 0.4500s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4131s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0364s for 8192 events => throughput is 2.25E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504505737162) differ by less than 3E-14 (1.7763568394002505e-15) +OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504505737170) differ by less than 3E-14 (2.220446049250313e-15) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.386493e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.325632e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.395890e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.342134e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -334,10 +341,10 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737173] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.8495s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8451s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.52E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0011s + [COUNTERS] PROGRAM TOTAL : 0.8555s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8481s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0020s for 8192 events => throughput is 4.19E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0055s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -348,44 +355,44 @@ OK! xsec from fortran (0.20313504505737126) and cuda (0.20313504505737173) diffe OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.777000e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.866764e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.265214e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.652053e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.327919e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.284283e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.161258e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.180927e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] 
[hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.316740e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.268671e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.319766e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.275096e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.323054e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.288694e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.646948e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.760684e+06 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index 748c92b28c..41a9d43802 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -1,7 +1,10 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu -make USEBUILDDIR=1 BACKEND=cuda +make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 @@ -9,33 +12,37 @@ make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory 
'/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:10:11 +DATE: 2025-09-24_09:45:44 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737126] fbridge_mode=0 [UNWEIGHT] Wrote 506 events (found 1943 events) - [COUNTERS] PROGRAM TOTAL : 0.5240s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4523s - [COUNTERS] Fortran MEs ( 1 ) : 0.0718s for 8192 events => throughput is 1.14E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.5365s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4630s + [COUNTERS] Fortran MEs ( 1 ) : 0.0736s for 8192 events => throughput is 1.11E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2031 [0.20313504505737126] fbridge_mode=0 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4796s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4074s - [COUNTERS] Fortran MEs ( 1 ) : 0.0721s for 8192 events => throughput is 1.14E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4888s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4156s + [COUNTERS] Fortran MEs ( 1 ) : 0.0732s for 8192 events => throughput is 1.12E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -107,30 +114,30 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313506133732837] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313506139857326] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4786s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4057s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0723s for 8192 events => throughput is 1.13E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4913s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4131s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0776s for 8192 events => throughput is 1.06E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.20313504505737126) and cpp (0.20313506133732837) differ by less than 4E-4 (8.014351782215101e-08) +OK! xsec from fortran (0.20313504505737126) and cpp (0.20313506139857326) differ by less than 4E-4 (8.044501620396716e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.132089e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.064305e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.123977e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.070087e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -152,30 +159,30 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2031 [0.20313502997679400] fbridge_mode=1 + [XSECTION] Cross section = 0.2031 [0.20313502999900010] fbridge_mode=1 [UNWEIGHT] Wrote 499 events (found 1502 events) - [COUNTERS] PROGRAM TOTAL : 0.4346s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4073s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0269s for 8192 events => throughput is 3.04E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4383s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4094s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0285s for 8192 events => throughput is 2.87E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.20313504505737126) and cpp (0.20313502997679400) differ by less than 4E-4 (7.423917058879681e-08) +OK! xsec from fortran (0.20313504505737126) and cpp (0.20313502999900010) differ by less than 4E-4 (7.412985369992242e-08) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.016574e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.827821e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.049161e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.873784e+05 ) sec^-1
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -197,30 +204,30 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 }
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.2031 [0.20313502619857851] fbridge_mode=1
+ [XSECTION] Cross section = 0.2031 [0.20313502617200768] fbridge_mode=1
 [UNWEIGHT] Wrote 499 events (found 1502 events)
- [COUNTERS] PROGRAM TOTAL : 0.4231s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.4098s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0130s for 8192 events => throughput is 6.28E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4241s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4102s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0136s for 8192 events => throughput is 6.03E+05 events/s
 [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (0.20313504505737126) and cpp (0.20313502619857851) differ by less than 4E-4 (9.283869628617936e-08)
+OK! xsec from fortran (0.20313504505737126) and cpp (0.20313502617200768) differ by less than 4E-4 (9.296949998738313e-08)
 *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.215183e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.180969e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.201945e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.191931e+05 ) sec^-1
 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -242,30 +249,30 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 }
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.2031 [0.20313502619857851] fbridge_mode=1
+ [XSECTION] Cross section = 0.2031 [0.20313502617200768] fbridge_mode=1
 [UNWEIGHT] Wrote 499 events (found 1502 events)
- [COUNTERS] PROGRAM TOTAL : 0.4177s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.4051s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0122s for 8192 events => throughput is 6.69E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4243s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4110s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0130s for 8192 events => throughput is 6.31E+05 events/s
 [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s
 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (0.20313504505737126) and cpp (0.20313502619857851) differ by less than 4E-4 (9.283869628617936e-08)
+OK! xsec from fortran (0.20313504505737126) and cpp (0.20313502617200768) differ by less than 4E-4 (9.296949998738313e-08)
 *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.557168e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.453658e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.659565e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.428297e+05 ) sec^-1
 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -287,30 +294,30 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 }
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.2031 [0.20313505300145301] fbridge_mode=1
+ [XSECTION] Cross section = 0.2031 [0.20313505319061453] fbridge_mode=1
 [UNWEIGHT] Wrote 499 events (found 1502 events)
- [COUNTERS] PROGRAM TOTAL : 0.4231s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.4064s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0163s for 8192 events => throughput is 5.02E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4267s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4090s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0173s for 8192 events => throughput is 4.74E+05 events/s
 [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s
 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (0.20313504505737126) and cpp (0.20313505300145301) differ by less than 4E-4 (3.910739154733278e-08)
+OK! xsec from fortran (0.20313504505737126) and cpp (0.20313505319061453) differ by less than 4E-4 (4.003860221146738e-08)
 *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.736521e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.828072e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.799657e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.874410e+05 ) sec^-1
 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 }
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.2031 [0.20313508590887899] fbridge_mode=1
+ [XSECTION] Cross section = 0.2031 [0.20313508403515360] fbridge_mode=1
 [UNWEIGHT] Wrote 499 events (found 1502 events)
- [COUNTERS] PROGRAM TOTAL : 0.8496s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.8457s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.64E+06 events/s
- [COUNTERS] CudaCpp HEL ( 3 ) : 0.0009s
+ [COUNTERS] PROGRAM TOTAL : 0.8572s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.8508s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0012s for 8192 events => throughput is 7.00E+06 events/s
+ [COUNTERS] CudaCpp HEL ( 3 ) : 0.0052s
 *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (0.20313504505737126) and cuda (0.20313508590887899) differ by less than 4E-4 (2.011051698502797e-07)
+OK! xsec from fortran (0.20313504505737126) and cuda (0.20313508403515360) differ by less than 4E-4 (1.9188113165036214e-07)
 *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical
 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.049327e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.377483e+06 ) sec^-1
 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.339018e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.198635e+07 ) sec^-1
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.110522e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.020211e+07 ) sec^-1
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
-Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.423874e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.227757e+07 ) sec^-1
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.090502e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.018085e+07 ) sec^-1
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
-Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.757351e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.194185e+07 ) sec^-1
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.720065e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.014234e+07 ) sec^-1
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
-Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.206204e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.542247e+07 ) sec^-1
 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) ***
diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt
index dd13a39319..5c1471e368 100644
--- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt
@@ -1,41 +1,48 @@
-Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
+HASBLAS=hasBlas
+Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu
-make USEBUILDDIR=1 BACKEND=cuda
-make USEBUILDDIR=1 BACKEND=cppnone
+make USEBUILDDIR=1 BACKEND=cuda
+make USEBUILDDIR=1 BACKEND=cppnone
 make USEBUILDDIR=1 BACKEND=cppsse4
-
 make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+
 make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+
+CUDACPP_RUNTIME_BLASCOLORSUM=
+
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
 OMP_NUM_THREADS=
-DATE: 2024-10-06_10:09:56
+DATE: 2025-09-24_09:45:28
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
-Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu
+Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu
 *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) ***
 --------------------
@@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.2031 [0.20313504505737126] fbridge_mode=0
 [UNWEIGHT] Wrote 506 events (found 1943 events)
- [COUNTERS] PROGRAM TOTAL : 0.5254s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.4537s
- [COUNTERS] Fortran MEs ( 1 ) : 0.0717s for 8192 events => throughput is 1.14E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.5379s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4648s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.0731s for 8192 events => throughput is 1.12E+05 events/s
 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.2031 [0.20313504505737126] fbridge_mode=0
 [UNWEIGHT] Wrote 499 events (found 1502 events)
- [COUNTERS] PROGRAM TOTAL : 0.4842s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.4128s
- [COUNTERS] Fortran MEs ( 1 ) : 0.0714s for 8192 events => throughput is 1.15E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4903s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4168s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.0735s for 8192 events => throughput is 1.11E+05 events/s
 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -109,10 +116,10 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 }
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.2031 [0.20313504495344831] fbridge_mode=1
 [UNWEIGHT] Wrote 499 events (found 1502 events)
- [COUNTERS] PROGRAM TOTAL : 0.4899s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.4117s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0776s for 8192 events => throughput is 1.06E+05 events/s
- [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s
+ [COUNTERS] PROGRAM TOTAL : 0.4927s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4116s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0804s for 8192 events => throughput is 1.02E+05 events/s
+ [COUNTERS] CudaCpp HEL ( 3 ) : 0.0007s
 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -123,14 +130,14 @@ OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504495344831) differ
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.073352e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.028885e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.073996e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.032709e+05 ) sec^-1
 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -152,30 +159,30 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 }
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.2031 [0.20313504495344833] fbridge_mode=1
+ [XSECTION] Cross section = 0.2031 [0.20313504500016025] fbridge_mode=1
 [UNWEIGHT] Wrote 499 events (found 1502 events)
- [COUNTERS] PROGRAM TOTAL : 0.4513s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.4086s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0421s for 8192 events => throughput is 1.94E+05 events/s
- [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s
+ [COUNTERS] PROGRAM TOTAL : 0.4656s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4172s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0479s for 8192 events => throughput is 1.71E+05 events/s
+ [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s
 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504495344833) differ by less than 2E-4 (5.115952106393706e-10)
+OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504500016025) differ by less than 2E-4 (2.816402666638851e-10)
 *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.886911e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.818000e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.898728e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.829372e+05 ) sec^-1
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -197,30 +204,30 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 }
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.2031 [0.20313504510700500] fbridge_mode=1
+ [XSECTION] Cross section = 0.2031 [0.20313504510471836] fbridge_mode=1
 [UNWEIGHT] Wrote 499 events (found 1502 events)
- [COUNTERS] PROGRAM TOTAL : 0.4424s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.4165s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0255s for 8192 events => throughput is 3.22E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4388s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4127s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0255s for 8192 events => throughput is 3.21E+05 events/s
 [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504510700500) differ by less than 2E-4 (2.4433854939331923e-10)
+OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504510471836) differ by less than 2E-4 (2.3308177610203984e-10)
 *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.243245e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.181258e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.311888e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.266270e+05 ) sec^-1
 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -242,30 +249,30 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 }
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.2031 [0.20313504510700500] fbridge_mode=1
+ [XSECTION] Cross section = 0.2031 [0.20313504510471836] fbridge_mode=1
 [UNWEIGHT] Wrote 499 events (found 1502 events)
- [COUNTERS] PROGRAM TOTAL : 0.4308s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.4086s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0217s for 8192 events => throughput is 3.78E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4377s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4132s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0240s for 8192 events => throughput is 3.42E+05 events/s
 [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s
 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504510700500) differ by less than 2E-4 (2.4433854939331923e-10)
+OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504510471836) differ by less than 2E-4 (2.3308177610203984e-10)
 *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.793279e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.489599e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.775522e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.437681e+05 ) sec^-1
 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -287,30 +294,30 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 }
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.2031 [0.20313504510700500] fbridge_mode=1
+ [XSECTION] Cross section = 0.2031 [0.20313504510471836] fbridge_mode=1
 [UNWEIGHT] Wrote 499 events (found 1502 events)
- [COUNTERS] PROGRAM TOTAL : 0.4486s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.4131s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0350s for 8192 events => throughput is 2.34E+05 events/s
- [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s
+ [COUNTERS] PROGRAM TOTAL : 0.4454s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4099s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0349s for 8192 events => throughput is 2.35E+05 events/s
+ [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s
 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504510700500) differ by less than 2E-4 (2.4433854939331923e-10)
+OK! xsec from fortran (0.20313504505737126) and cpp (0.20313504510471836) differ by less than 2E-4 (2.3308177610203984e-10)
 *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.316706e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.358017e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.334216e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.421035e+05 ) sec^-1
 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -334,10 +341,10 @@ DEBUG: MEK processed 8192 events across 5 channels { 1 : 8192 }
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.2031 [0.20313504512110778] fbridge_mode=1
 [UNWEIGHT] Wrote 499 events (found 1502 events)
- [COUNTERS] PROGRAM TOTAL : 0.8511s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.8469s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.63E+06 events/s
- [COUNTERS] CudaCpp HEL ( 3 ) : 0.0011s
+ [COUNTERS] PROGRAM TOTAL : 0.8540s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.8466s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0018s for 8192 events => throughput is 4.47E+06 events/s
+ [COUNTERS] CudaCpp HEL ( 3 ) : 0.0056s
 *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -348,44 +355,44 @@ OK! xsec from fortran (0.20313504505737126) and cuda (0.20313504512110778) diffe
 OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical
 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.929266e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.782717e+06 ) sec^-1
 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.319589e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.427822e+06 ) sec^-1
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.340652e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.290753e+06 ) sec^-1
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
-Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.169068e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.175981e+06 ) sec^-1
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.326566e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.286909e+06 ) sec^-1
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
-Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.337296e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.267053e+06 ) sec^-1
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.337938e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.287822e+06 ) sec^-1
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
-Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.656612e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.756338e+06 ) sec^-1
 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) ***
diff --git a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt
index d2a669114e..9ff238b7e1 100644
--- a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt
@@ -1,41 +1,48 @@
-Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
+HASBLAS=hasBlas
+Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx
 make USEBUILDDIR=1 BACKEND=cuda
-
 make USEBUILDDIR=1 BACKEND=cppnone
 make USEBUILDDIR=1 BACKEND=cppsse4
-make USEBUILDDIR=1 BACKEND=cppavx2
+
+make USEBUILDDIR=1 BACKEND=cppavx2
 make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+
+CUDACPP_RUNTIME_BLASCOLORSUM=
+
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
 OMP_NUM_THREADS=
-DATE: 2024-10-06_10:54:40
+DATE: 2025-09-24_10:44:59
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
-Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx
+Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx
 *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) ***
 --------------------
@@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=0
 [UNWEIGHT] Wrote 3371 events (found 6399 events)
- [COUNTERS] PROGRAM TOTAL : 0.9766s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.9291s
- [COUNTERS] Fortran MEs ( 1 ) : 0.0475s for 8192 events => throughput is 1.72E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.0945s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.0452s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.0493s for 8192 events => throughput is 1.66E+05 events/s
 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=0
 [UNWEIGHT] Wrote 1652 events (found 1657 events)
- [COUNTERS] PROGRAM TOTAL : 0.4581s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.4105s
- [COUNTERS] Fortran MEs ( 1 ) : 0.0476s for 8192 events => throughput is 1.72E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.5080s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4584s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.0497s for 8192 events => throughput is 1.65E+05 events/s
 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -109,10 +116,10 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 }
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 2.016 [2.0160081479755170] fbridge_mode=1
 [UNWEIGHT] Wrote 1652 events (found 1657 events)
- [COUNTERS] PROGRAM TOTAL : 0.4592s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.4086s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0501s for 8192 events => throughput is 1.63E+05 events/s
- [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s
+ [COUNTERS] PROGRAM TOTAL : 0.5057s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4528s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0524s for 8192 events => throughput is 1.56E+05 events/s
+ [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s
 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -123,14 +130,14 @@ OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081479755170) differ b
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.648377e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.608463e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.642355e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.611218e+05 ) sec^-1
 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -154,9 +161,9 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 }
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=1
 [UNWEIGHT] Wrote 1652 events (found 1657 events)
- [COUNTERS] PROGRAM TOTAL : 0.4344s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.4065s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0275s for 8192 events => throughput is 2.98E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4839s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4535s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0300s for 8192 events => throughput is 2.73E+05 events/s
 [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s
 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -168,14 +175,14 @@ OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081479755183) differ b
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.984151e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.771322e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.017550e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.772334e+05 ) sec^-1
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -199,9 +206,9 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 }
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 2.016 [2.0160081479755165] fbridge_mode=1
 [UNWEIGHT] Wrote 1652 events (found 1657 events)
- [COUNTERS] PROGRAM TOTAL : 0.4261s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.4086s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0171s for 8192 events => throughput is 4.80E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4726s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4538s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0183s for 8192 events => throughput is 4.46E+05 events/s
 [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -213,14 +220,14 @@ OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081479755165) differ b
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.938014e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.577287e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.942444e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.549926e+05 ) sec^-1
 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -244,9 +251,9 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 }
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 2.016 [2.0160081479755165] fbridge_mode=1
 [UNWEIGHT] Wrote 1652 events (found 1657 events)
- [COUNTERS] PROGRAM TOTAL : 0.4299s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.4143s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0153s for 8192 events => throughput is 5.37E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4733s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4559s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0170s for 8192 events => throughput is 4.83E+05 events/s
 [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s
 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -258,14 +265,14 @@ OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081479755165) differ b
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.398535e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.674690e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.466636e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.906431e+05 ) sec^-1
 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -287,30 +294,30 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 }
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 2.016 [2.0160081479755179] fbridge_mode=1
+ [XSECTION] Cross section = 2.016 [2.0160081479755165] fbridge_mode=1
 [UNWEIGHT] Wrote 1652 events (found 1657 events)
- [COUNTERS] PROGRAM TOTAL : 0.4391s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.4149s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0238s for 8192 events => throughput is 3.45E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4803s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4545s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0254s for 8192 events => throughput is 3.23E+05 events/s
 [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s
 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081479755179) differ by less than 3E-14 (2.220446049250313e-16)
+OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081479755165) differ by less than 3E-14 (8.881784197001252e-16)
 *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.480162e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.296909e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.526547e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.379324e+05 ) sec^-1
 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 }
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 2.016 [2.0160081479755192] fbridge_mode=1
+ [XSECTION] Cross section = 2.016 [2.0160081479755196] fbridge_mode=1
 [UNWEIGHT] Wrote 1652 events (found 1657 events)
- [COUNTERS] PROGRAM TOTAL : 0.8532s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.8493s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.62E+06 events/s
- [COUNTERS] CudaCpp HEL ( 3 ) : 0.0008s
+ [COUNTERS] PROGRAM TOTAL : 0.8974s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.8915s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0014s for 8192 events => throughput is 5.87E+06 events/s
+ [COUNTERS] CudaCpp HEL ( 3 ) : 0.0045s
 *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (2.0160081479755183) and cuda (2.0160081479755192) differ by less than 3E-14 (4.440892098500626e-16)
+OK! xsec from fortran (2.0160081479755183) and cuda (2.0160081479755196) differ by less than 3E-14 (6.661338147750939e-16)
 *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical
 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.920216e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.947513e+06 ) sec^-1
 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
-Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.457557e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.552068e+06 ) sec^-1
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
-Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.816989e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.263798e+06 ) sec^-1
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
-Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.149758e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.488706e+06 ) sec^-1
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
-Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.802618e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.278324e+06 ) sec^-1
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
-Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.511448e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.756938e+06 ) sec^-1
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
-Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.832166e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.256429e+06 ) sec^-1
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
-Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.514724e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.399736e+07 ) sec^-1
 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) ***
diff --git a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt
index 483bc4166c..bfe8c62394 100644
--- a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt
@@ -1,41 +1,48 @@
-Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
+HASBLAS=hasBlas
+Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx
-make USEBUILDDIR=1 BACKEND=cuda
+make USEBUILDDIR=1 BACKEND=cuda
 make USEBUILDDIR=1 BACKEND=cppnone
-make USEBUILDDIR=1 BACKEND=cppsse4
+make USEBUILDDIR=1 BACKEND=cppsse4
 make USEBUILDDIR=1 BACKEND=cppavx2
 make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+
+CUDACPP_RUNTIME_BLASCOLORSUM=
+
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
 OMP_NUM_THREADS=
-DATE: 2024-10-06_10:55:09
+DATE: 2025-09-24_10:45:32
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
-Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx
+Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx
 *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) ***
 --------------------
@@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=0
 [UNWEIGHT] Wrote 3371 events (found 6399 events)
- [COUNTERS] PROGRAM TOTAL : 0.9638s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.9156s
- [COUNTERS] Fortran MEs ( 1 ) : 0.0482s for 8192 events => throughput is 1.70E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.0965s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.0472s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.0494s for 8192 events => throughput is 1.66E+05 events/s
 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=0
 [UNWEIGHT] Wrote 1652 events (found 1657 events)
- [COUNTERS] PROGRAM TOTAL : 0.4563s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.4085s
- [COUNTERS] Fortran MEs ( 1 ) : 0.0478s for 8192 events => throughput is 1.71E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4977s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4485s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.0492s for 8192 events => throughput is 1.67E+05 events/s
 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -107,26 +114,27 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 }
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 2.016 [2.0160406825242951] fbridge_mode=1
+ [XSECTION] Cross section = 2.016 [2.0160406822335140] fbridge_mode=1
 [UNWEIGHT] Wrote 1653 events (found 1658 events)
- [COUNTERS] PROGRAM TOTAL : 0.4552s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.4076s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0473s for 8192 events => throughput is 1.73E+05 events/s
- [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s
+ [COUNTERS] PROGRAM TOTAL : 0.5040s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4538s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0498s for 8192 events => throughput is 1.64E+05 events/s
+ [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s
 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (2.0160081479755183) and cpp (2.0160406825242951) differ by less than 4E-4 (1.6138103811513815e-05)
+OK! xsec from fortran (2.0160081479755183) and cpp (2.0160406822335140) differ by less than 4E-4 (1.613795957533526e-05)
 *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 ERROR! events.lhe.cpp.1 and events.lhe.ref.1 differ!
-diff /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/events.lhe.cpp.1 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/events.lhe.ref.1 | head -20
-7562,7575d7561
-< 4 1 1E-03 0.1250010E+03 0.7546771E-02 0.1235066E+00
+diff /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/events.lhe.cpp.1 /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/events.lhe.ref.1 | head -20
+8102,8116d8101
+< 5 1 1E-03 0.1250010E+03 0.7546771E-02 0.1235066E+00
 < 21 -1 0 0 503 502 0.00000000000E+00 0.00000000000E+00 0.71320499473E+02 0.71320499473E+02 0.00000000000E+00 0. 1.
 < 21 -1 0 0 502 503 -0.00000000000E+00 -0.00000000000E+00 -0.54771239790E+02 0.54771239790E+02 0.00000000000E+00 0. 1.
-< 5 1 1 2 501 0 0.50303102232E+02 0.36190119942E+02 0.14973002893E+02 0.63925016162E+02 0.47000000000E+01 0. -1.
-< -5 1 1 2 0 501 -0.50303102232E+02 -0.36190119942E+02 0.15762567893E+01 0.62166723101E+02 0.47000000000E+01 0. -1.
+< 25 2 1 2 0 0 0.00000000000E+00 0.00000000000E+00 0.16549259682E+02 0.12609173926E+03 0.12500099485E+03 0. 0.
+< 5 1 3 3 501 0 0.50303102232E+02 0.36190119942E+02 0.14973002893E+02 0.63925016162E+02 0.47000000000E+01 0. -1.
+< -5 1 3 3 0 501 -0.50303102232E+02 -0.36190119942E+02 0.15762567893E+01 0.62166723101E+02 0.47000000000E+01 0. -1.
 <
 < 0 0.12500099E+03
 < 0
diff --git a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt
index b61563e796..ddcc5a005b 100644
--- a/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt
@@ -1,41 +1,48 @@
-Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
+HASBLAS=hasBlas
+Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx
 make USEBUILDDIR=1 BACKEND=cuda
 make USEBUILDDIR=1 BACKEND=cppnone
-
 make USEBUILDDIR=1 BACKEND=cppsse4
+
 make USEBUILDDIR=1 BACKEND=cppavx2
 make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+
+CUDACPP_RUNTIME_BLASCOLORSUM=
+
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
 OMP_NUM_THREADS=
-DATE: 2024-10-06_10:54:54
+DATE: 2025-09-24_10:45:15
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
-Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx
+Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx
 *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) ***
 --------------------
@@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=0
 [UNWEIGHT] Wrote 3371 events (found 6399 events)
- [COUNTERS] PROGRAM TOTAL : 0.9594s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.9118s
- [COUNTERS] Fortran MEs ( 1 ) : 0.0475s for 8192 events => throughput is 1.72E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.1007s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.0512s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.0495s for 8192 events => throughput is 1.65E+05 events/s
 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_heftggbb_x1_fortran > /tmp/
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 2.016 [2.0160081479755183] fbridge_mode=0
 [UNWEIGHT] Wrote 1652 events (found 1657 events)
- [COUNTERS] PROGRAM TOTAL : 0.4589s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.4111s
- [COUNTERS] Fortran MEs ( 1 ) : 0.0478s for 8192 events => throughput is 1.71E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4978s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4481s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.0497s for 8192 events => throughput is 1.65E+05 events/s
 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -100,7 +107,6 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
 Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp'
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 }
 [OPENMPTH] omp_get_max_threads/nproc = 1/4
 [NGOODHEL] ngoodhel/ncomb = 16/16
@@ -108,33 +114,30 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 }
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 2.016 [2.0160081964453331] fbridge_mode=1
+ [XSECTION] Cross section = 2.016 [2.0160081963935692] fbridge_mode=1
 [UNWEIGHT] Wrote 1652 events (found 1657 events)
- [COUNTERS] PROGRAM TOTAL : 0.4600s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.4089s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0507s for 8192 events => throughput is 1.62E+05 events/s
- [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s
+ [COUNTERS] PROGRAM TOTAL : 0.5084s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4555s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0525s for 8192 events => throughput is 1.56E+05 events/s
+ [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s
 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081964453331) differ by less than 2E-4 (2.4042469792817656e-08)
+OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081963935692) differ by less than 2E-4 (2.401679322083794e-08)
 *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.539881e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.508236e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.532971e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.510960e+05 ) sec^-1
 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -149,7 +152,6 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
 Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp'
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 }
 [OPENMPTH] omp_get_max_threads/nproc = 1/4
 [NGOODHEL] ngoodhel/ncomb = 16/16
@@ -157,33 +159,30 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 }
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 2.016 [2.0160081964453336] fbridge_mode=1
+ [XSECTION] Cross section = 2.016 [2.0160081964477738] fbridge_mode=1
 [UNWEIGHT] Wrote 1652 events (found 1657 events)
- [COUNTERS] PROGRAM TOTAL : 0.4363s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.4080s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0279s for 8192 events => throughput is 2.94E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4822s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4518s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0300s for 8192 events => throughput is 2.73E+05 events/s
 [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s
 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081964453336) differ by less than 2E-4 (2.404247001486226e-08)
+OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081964477738) differ by less than 2E-4 (2.4043680380003707e-08)
 *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.824636e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.648349e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.869373e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.666977e+05 ) sec^-1
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -198,7 +197,6 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
 Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp'
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 }
 [OPENMPTH] omp_get_max_threads/nproc = 1/4
 [NGOODHEL] ngoodhel/ncomb = 16/16
@@ -206,33 +204,30 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 }
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 2.016 [2.0160081962974745] fbridge_mode=1
+ [XSECTION] Cross section = 2.016 [2.0160081981450446] fbridge_mode=1
 [UNWEIGHT] Wrote 1652 events (found 1657 events)
- [COUNTERS] PROGRAM TOTAL : 0.4311s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.4138s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0169s for 8192 events => throughput is 4.85E+05 events/s
- [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s
+ [COUNTERS] PROGRAM TOTAL : 0.4719s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4531s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0183s for 8192 events => throughput is 4.48E+05 events/s
+ [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081962974745) differ by less than 2E-4 (2.3969127349587893e-08)
+OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081981450446) differ by less than 2E-4 (2.4885577154520888e-08)
 *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.809707e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.449043e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.724204e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.654852e+05 ) sec^-1
 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -247,7 +242,6 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
 Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp'
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 }
 [OPENMPTH] omp_get_max_threads/nproc = 1/4
 [NGOODHEL] ngoodhel/ncomb = 16/16
@@ -255,33 +249,30 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 }
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 2.016 [2.0160081962974745] fbridge_mode=1
+ [XSECTION] Cross section = 2.016 [2.0160081981450446] fbridge_mode=1
 [UNWEIGHT] Wrote 1652 events (found 1657 events)
- [COUNTERS] PROGRAM TOTAL : 0.4252s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.4093s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0155s for 8192 events => throughput is 5.28E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4690s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4517s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0169s for 8192 events => throughput is 4.85E+05 events/s
 [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s
 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081962974745) differ by less than 2E-4 (2.3969127349587893e-08)
+OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081981450446) differ by less than 2E-4 (2.4885577154520888e-08)
 *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.163712e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.928374e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.204514e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.901177e+05 ) sec^-1
 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -296,7 +287,6 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 1 ! ICONFIG number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
 Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_heftggbb_x1_cudacpp > /tmp/avalassi/output_heftggbb_x1_cudacpp'
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 }
 [OPENMPTH] omp_get_max_threads/nproc = 1/4
 [NGOODHEL] ngoodhel/ncomb = 16/16
@@ -304,33 +294,30 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 }
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 2.016 [2.0160081962970020] fbridge_mode=1
+ [XSECTION] Cross section = 2.016 [2.0160081981450446] fbridge_mode=1
 [UNWEIGHT] Wrote 1652 events (found 1657 events)
- [COUNTERS] PROGRAM TOTAL : 0.4306s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.4058s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0244s for 8192 events => throughput is 3.36E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4788s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4530s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0254s for 8192 events => throughput is 3.23E+05 events/s
 [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s
 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081962970020) differ by less than 2E-4 (2.3968893092529697e-08)
+OK! xsec from fortran (2.0160081479755183) and cpp (2.0160081981450446) differ by less than 2E-4 (2.4885577154520888e-08)
 *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.121651e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.246164e+05 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.119023e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.288651e+05 ) sec^-1
 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -352,60 +339,60 @@ DEBUG: MEK processed 8192 events across 4 channels { 1 : 8192 }
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 2.016 [2.0160081483021330] fbridge_mode=1
+ [XSECTION] Cross section = 2.016 [2.0160081946290331] fbridge_mode=1
 [UNWEIGHT] Wrote 1652 events (found 1657 events)
- [COUNTERS] PROGRAM TOTAL : 0.8574s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.8536s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.63E+06 events/s
- [COUNTERS] CudaCpp HEL ( 3 ) : 0.0008s
+ [COUNTERS] PROGRAM TOTAL : 0.8989s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.8930s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0014s for 8192 events => throughput is 5.90E+06 events/s
+ [COUNTERS] CudaCpp HEL ( 3 ) : 0.0046s
 *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (2.0160081479755183) and cuda (2.0160081483021330) differ by less than 2E-4 (1.6201062713605552e-10)
+OK! xsec from fortran (2.0160081479755183) and cuda (2.0160081946290331) differ by less than 2E-4 (2.31415309137617e-08)
 *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical
 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.018963e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.891035e+06 ) sec^-1
 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
-Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.363694e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.983431e+06 ) sec^-1
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
-Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.820757e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.267970e+06 ) sec^-1
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
-Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.067644e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.492702e+06 ) sec^-1
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
-Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.797704e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.273530e+06 ) sec^-1
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
-Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.465309e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.770175e+06 ) sec^-1
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
-Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.821262e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.254943e+06 ) sec^-1
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
-Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.503862e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.399759e+07 ) sec^-1
 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) ***
diff --git a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt
index d3cb91b8cd..885b8f6235 100644
--- a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt
@@ -1,4 +1,7 @@
-Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
+HASBLAS=hasBlas
+Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx
 make USEBUILDDIR=1 BACKEND=cuda
@@ -6,36 +9,40 @@ make USEBUILDDIR=1 BACKEND=cppnone
 make USEBUILDDIR=1 BACKEND=cppsse4
-
 make USEBUILDDIR=1 BACKEND=cppavx2
+
 make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+
+CUDACPP_RUNTIME_BLASCOLORSUM=
+
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
 OMP_NUM_THREADS=
-DATE: 2024-10-06_10:56:37
+DATE: 2025-09-24_10:47:04
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
-Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx
+Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx
 *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) ***
 --------------------
@@ -56,11 +63,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /t
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=0
+ [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=0
 [UNWEIGHT] Wrote 1 events (found 902 events)
- [COUNTERS] PROGRAM TOTAL : 2.6766s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3643s
- [COUNTERS] Fortran MEs ( 1 ) : 2.3123s for 8192 events => throughput is 3.54E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 2.7247s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3637s
+ [COUNTERS] Fortran MEs ( 1 ) : 2.3610s for 8192 events => throughput is 3.47E+03 events/s
 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -81,11 +88,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /t
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=0
+ [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=0
 [UNWEIGHT] Wrote 230 events (found 851 events)
- [COUNTERS] PROGRAM TOTAL : 2.6640s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3633s
- [COUNTERS] Fortran MEs ( 1 ) : 2.3007s for 8192 events => throughput is 3.56E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 2.7259s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3722s
+ [COUNTERS] Fortran MEs ( 1 ) : 2.3537s for 8192 events => throughput is 3.48E+03 events/s
 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -109,28 +116,28 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 }
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=1
 [UNWEIGHT] Wrote 230 events (found 851 events)
- [COUNTERS] PROGRAM TOTAL : 2.8505s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3633s
- [COUNTERS] CudaCpp MEs ( 2 ) : 2.4822s for 8192 events => throughput is 3.30E+03 events/s
- [COUNTERS] CudaCpp HEL ( 3 ) : 0.0050s
+ [COUNTERS] PROGRAM TOTAL : 2.8837s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3696s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 2.5087s for 8192 events => throughput is 3.27E+03 events/s
+ [COUNTERS] CudaCpp HEL ( 3 ) : 0.0053s
 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381610362728588E-007) differ by less than 3E-14 (0.0)
+OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381610362728588E-007) differ by less than 3E-14 (2.220446049250313e-16)
 *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.457369e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.344967e+03 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.441555e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.331486e+03 ) sec^-1
 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -154,28 +161,28 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 }
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 7.638e-07 [7.6381610362728610E-007] fbridge_mode=1
 [UNWEIGHT] Wrote 230 events (found 851 events)
- [COUNTERS] PROGRAM TOTAL : 1.6655s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3645s
- [COUNTERS] CudaCpp MEs ( 2 ) : 1.2984s for 8192 events => throughput is 6.31E+03 events/s
- [COUNTERS] CudaCpp HEL ( 3 ) : 0.0027s
+ [COUNTERS] PROGRAM TOTAL : 1.7868s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3714s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 1.4123s for 8192 events => throughput is 5.80E+03 events/s
+ [COUNTERS] CudaCpp HEL ( 3 ) : 0.0031s
 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381610362728610E-007) differ by less than 3E-14 (2.220446049250313e-16)
+OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381610362728610E-007) differ by less than 3E-14 (4.440892098500626e-16)
 *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.514132e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.905219e+03 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.544925e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.868019e+03 ) sec^-1
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -197,30 +204,30 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 }
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=1
+ [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=1
 [UNWEIGHT] Wrote 230 events (found 851 events)
- [COUNTERS] PROGRAM TOTAL : 0.9435s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3668s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.5751s for 8192 events => throughput is 1.42E+04 events/s
- [COUNTERS] CudaCpp HEL ( 3 ) : 0.0015s
+ [COUNTERS] PROGRAM TOTAL : 1.0028s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3733s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.6279s for 8192 events => throughput is 1.30E+04 events/s
+ [COUNTERS] CudaCpp HEL ( 3 ) : 0.0017s
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381610362728588E-007) differ by less than 3E-14 (0.0)
+OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381610362728578E-007) differ by less than 3E-14 (0.0)
 *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.460459e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.355383e+04 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.466853e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.352140e+04 ) sec^-1
 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -242,30 +249,30 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 }
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=1
+ [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=1
 [UNWEIGHT] Wrote 230 events (found 851 events)
- [COUNTERS] PROGRAM TOTAL : 0.8804s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3647s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.5144s for 8192 events => throughput is 1.59E+04 events/s
- [COUNTERS] CudaCpp HEL ( 3 ) : 0.0014s
+ [COUNTERS] PROGRAM TOTAL : 0.9343s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3694s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.5633s for 8192 events => throughput is 1.45E+04 events/s
+ [COUNTERS] CudaCpp HEL ( 3 ) : 0.0016s
 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381610362728588E-007) differ by less than 3E-14 (0.0)
+OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381610362728578E-007) differ by less than 3E-14 (0.0)
 *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.641494e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.516928e+04 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.655223e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.520810e+04 ) sec^-1
 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -287,30 +294,30 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 }
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=1
+ [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=1
 [UNWEIGHT] Wrote 230 events (found 851 events)
- [COUNTERS] PROGRAM TOTAL : 1.0440s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3665s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.6757s for 8192 events => throughput is 1.21E+04 events/s
- [COUNTERS] CudaCpp HEL ( 3 ) : 0.0018s
+ [COUNTERS] PROGRAM TOTAL : 1.0825s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3726s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.7080s for 8192 events => throughput is 1.16E+04 events/s
+ [COUNTERS] CudaCpp HEL ( 3 ) : 0.0020s
 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381610362728588E-007) differ by less than 3E-14 (0.0)
+OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381610362728578E-007) differ by less than 3E-14 (0.0)
 *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.221115e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.175579e+04 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.225553e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.172330e+04 ) sec^-1
 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -334,58 +341,58 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 }
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=1
 [UNWEIGHT] Wrote 230 events (found 851 events)
- [COUNTERS] PROGRAM TOTAL : 0.8457s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.8061s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0199s for 8192 events => throughput is 4.13E+05 events/s
- [COUNTERS] CudaCpp HEL ( 3 ) : 0.0198s
+ [COUNTERS] PROGRAM TOTAL : 0.8868s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.8154s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0385s for 8192 events => throughput is 2.13E+05 events/s
+ [COUNTERS] CudaCpp HEL ( 3 ) : 0.0329s
 *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (7.6381610362728588E-007) and cuda (7.6381610362728578E-007) differ by less than 3E-14 (1.1102230246251565e-16)
+OK! xsec from fortran (7.6381610362728578E-007) and cuda (7.6381610362728578E-007) differ by less than 3E-14 (0.0)
 *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical
 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.230611e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.122719e+05 ) sec^-1
 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.541816e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.161966e+05 ) sec^-1
-*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+*** EXECUTE GCHECK(MAX) -p 4096 32 1 --bridge ***
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.854537e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.749173e+05 ) sec^-1
-*** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+*** EXECUTE GCHECK(MAX) -p 4096 32 1 ***
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.229320e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.719568e+05 ) sec^-1
-*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+*** EXECUTE GCHECK(MAX128THR) -p 1024 128 1 --bridge ***
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.859903e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.749862e+05 ) sec^-1
-*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+*** EXECUTE GCHECK(MAX128THR) -p 1024 128 1 ***
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.225591e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.734145e+05 ) sec^-1
-*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+*** EXECUTE GCHECK(MAX8THR) -p 16384 8 1 --bridge ***
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.850975e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.747981e+05 ) sec^-1
-*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+*** EXECUTE GCHECK(MAX8THR) -p 16384 8 1 ***
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.687847e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.787425e+05 ) sec^-1
 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) ***
diff --git a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt
index 10c15cf9d1..9028aeb504 100644
--- a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt
@@ -1,41 +1,48 @@
-Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx
-
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
+HASBLAS=hasBlas
+Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx
 make USEBUILDDIR=1 BACKEND=cuda
 make USEBUILDDIR=1 BACKEND=cppnone
+
+
 make USEBUILDDIR=1 BACKEND=cppsse4
 make USEBUILDDIR=1 BACKEND=cppavx2
-
 make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+
+CUDACPP_RUNTIME_BLASCOLORSUM=
+
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
 OMP_NUM_THREADS=
-DATE: 2024-10-06_10:58:07
+DATE: 2025-09-24_10:48:27
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
-Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx
+Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx
 *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) ***
 --------------------
@@ -56,11 +63,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /t
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=0
+ [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=0
 [UNWEIGHT] Wrote 1 events (found 902 events)
- [COUNTERS] PROGRAM TOTAL : 2.6755s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3603s
- [COUNTERS] Fortran MEs ( 1 ) : 2.3152s for 8192 events => throughput is 3.54E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 2.7248s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3655s
+ [COUNTERS] Fortran MEs ( 1 ) : 2.3593s for 8192 events => throughput is 3.47E+03 events/s
 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -81,11 +88,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /t
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=0
+ [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=0
 [UNWEIGHT] Wrote 230 events (found 851 events)
- [COUNTERS] PROGRAM TOTAL : 2.6754s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3698s
- [COUNTERS] Fortran MEs ( 1 ) : 2.3056s for 8192 events => throughput is 3.55E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 2.7319s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3721s
+ [COUNTERS] Fortran MEs ( 1 ) : 2.3598s for 8192 events => throughput is 3.47E+03 events/s
 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -107,30 +114,30 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 }
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 7.638e-07 [7.6381686438954397E-007] fbridge_mode=1
+ [XSECTION] Cross section = 7.638e-07 [7.6381686572538756E-007] fbridge_mode=1
 [UNWEIGHT] Wrote 230 events (found 851 events)
- [COUNTERS] PROGRAM TOTAL : 2.8067s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3659s
- [COUNTERS] CudaCpp MEs ( 2 ) : 2.4358s for 8192 events => throughput is 3.36E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 2.8065s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3706s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 2.4308s for 8192 events => throughput is 3.37E+03 events/s
 [COUNTERS] CudaCpp HEL ( 3 ) : 0.0050s
 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381686438954397E-007) differ by less than 4E-4 (9.960018576560259e-07)
+OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381686572538756E-007) differ by less than 4E-4 (9.977507651193207e-07)
 *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.485505e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.422260e+03 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.473644e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.438824e+03 ) sec^-1
 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -152,30 +159,30 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 }
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 7.638e-07 [7.6381671483253128E-007] fbridge_mode=1
+ [XSECTION] Cross section = 7.638e-07 [7.6381671512533574E-007] fbridge_mode=1
 [UNWEIGHT] Wrote 230 events (found 851 events)
- [COUNTERS] PROGRAM TOTAL : 1.0546s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3688s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.6842s for 8192 events => throughput is 1.20E+04 events/s
- [COUNTERS] CudaCpp HEL ( 3 ) : 0.0015s
+ [COUNTERS] PROGRAM TOTAL : 1.0936s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3692s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.7227s for 8192 events => throughput is 1.13E+04 events/s
+ [COUNTERS] CudaCpp HEL ( 3 ) : 0.0017s
 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381671483253128E-007) differ by less than 4E-4 (8.001994753481512e-07)
+OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381671512533574E-007) differ by less than 4E-4 (8.005828195933873e-07)
 *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.232148e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.164944e+04 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.242719e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.167862e+04 ) sec^-1
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -197,30 +204,30 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 }
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 7.638e-07 [7.6381672175647812E-007] fbridge_mode=1
+ [XSECTION] Cross section = 7.638e-07 [7.6381672199194947E-007] fbridge_mode=1
 [UNWEIGHT] Wrote 230 events (found 851 events)
- [COUNTERS] PROGRAM TOTAL : 0.6626s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3670s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.2947s for 8192 events => throughput is 2.78E+04 events/s
- [COUNTERS] CudaCpp HEL ( 3 ) : 0.0009s
+ [COUNTERS] PROGRAM TOTAL : 0.6882s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3714s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.3158s for 8192 events => throughput is 2.59E+04 events/s
+ [COUNTERS] CudaCpp HEL ( 3 ) : 0.0010s
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381672175647812E-007) differ by less than 4E-4 (8.092644150359263e-07)
+OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381672199194947E-007) differ by less than 4E-4 (8.095726977686013e-07)
 *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.866680e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.676459e+04 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.814611e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.691626e+04 ) sec^-1
 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -242,30 +249,30 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 }
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 7.638e-07 [7.6381672175647812E-007] fbridge_mode=1
+ [XSECTION] Cross section = 7.638e-07 [7.6381672199194947E-007] fbridge_mode=1
 [UNWEIGHT] Wrote 230 events (found 851 events)
- [COUNTERS] PROGRAM TOTAL : 0.6345s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3672s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.2664s for 8192 events => throughput is 3.07E+04 events/s
- [COUNTERS] CudaCpp HEL ( 3 ) : 0.0008s
+ [COUNTERS] PROGRAM TOTAL : 0.6551s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3702s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.2840s for 8192 events => throughput is 2.88E+04 events/s
+ [COUNTERS] CudaCpp HEL ( 3 ) : 0.0010s
 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381672175647812E-007) differ by less than 4E-4 (8.092644150359263e-07)
+OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381672199194947E-007) differ by less than 4E-4 (8.095726977686013e-07)
 *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.183014e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.977246e+04 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.199503e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.981214e+04 ) sec^-1
 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -287,30 +294,30 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 }
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 7.638e-07 [7.6381686320975603E-007] fbridge_mode=1
+ [XSECTION] Cross section = 7.638e-07 [7.6381686626552808E-007] fbridge_mode=1
 [UNWEIGHT] Wrote 230 events (found 851 events)
- [COUNTERS] PROGRAM TOTAL : 0.7045s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3656s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.3379s for 8192 events => throughput is 2.42E+04 events/s
- [COUNTERS] CudaCpp HEL ( 3 ) : 0.0010s
+ [COUNTERS] PROGRAM TOTAL : 0.7233s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3713s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.3509s for 8192 events => throughput is 2.33E+04 events/s
+ [COUNTERS] CudaCpp HEL ( 3 ) : 0.0012s
 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381686320975603E-007) differ by less than 4E-4 (9.944572607611946e-07)
+OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381686626552808E-007) differ by less than 4E-4 (9.98457925449614e-07)
 *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.460974e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.378670e+04 ) sec^-1
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.436294e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.375697e+04 ) sec^-1
 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 }
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 7.638e-07 [7.6381711031958629E-007] fbridge_mode=1
+ [XSECTION] Cross section = 7.638e-07 [7.6381615658692040E-007] fbridge_mode=1
 [UNWEIGHT] Wrote 230 events (found 851 events)
- [COUNTERS] PROGRAM TOTAL : 0.8419s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.8049s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0197s for 8192 events => throughput is 4.15E+05 events/s
- [COUNTERS] CudaCpp HEL ( 3 ) : 0.0172s
+ [COUNTERS] PROGRAM TOTAL : 0.8824s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.8173s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0336s for 8192 events => throughput is 2.44E+05 events/s
+ [COUNTERS] CudaCpp HEL ( 3 ) : 0.0314s
 *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (7.6381610362728588E-007) and cuda (7.6381711031958629E-007) differ by less than 4E-4 (1.3179773188376487e-06)
+OK! xsec from fortran (7.6381610362728578E-007) and cuda (7.6381615658692040E-007) differ by less than 4E-4 (6.933558260868722e-08)
 *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 OK!
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.233915e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.459531e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.454452e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.481553e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 4096 32 1 --bridge *** +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.300238e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.290559e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 4096 32 1 *** +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.323216e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.192752e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 1024 128 1 --bridge *** +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.294935e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.309301e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 1024 128 1 *** +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.322990e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.053552e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 16384 8 1 --bridge *** +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.292471e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] 
(3a) = ( 4.272688e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 16384 8 1 *** +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.654983e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.911013e+05 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt index 9cff3d3d2c..d798df1f36 100644 --- a/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt @@ -1,4 +1,7 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx make USEBUILDDIR=1 BACKEND=cuda @@ -6,36 +9,40 @@ make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 -make USEBUILDDIR=1 BACKEND=cppavx2 +make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:57:22 +DATE: 2025-09-24_10:47:45 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,11 +63,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /t [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=0 + [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 902 events) - [COUNTERS] PROGRAM TOTAL : 2.6661s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3588s - [COUNTERS] Fortran MEs ( 1 ) : 2.3072s for 8192 events => throughput is 3.55E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.7272s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3656s + [COUNTERS] Fortran MEs ( 1 ) : 2.3616s for 8192 events => throughput is 3.47E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,11 +88,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_smeftggtttt_x1_fortran > /t [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381610362728588E-007] fbridge_mode=0 + [XSECTION] Cross section = 7.638e-07 [7.6381610362728578E-007] fbridge_mode=0 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 2.6664s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3632s - [COUNTERS] Fortran MEs ( 1 ) : 2.3031s for 8192 events => throughput is 3.56E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.7295s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3713s + [COUNTERS] Fortran MEs ( 1 ) : 2.3582s for 8192 events => throughput is 3.47E+03 
events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,28 +116,28 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381608764955655E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 2.8757s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3651s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.5054s for 8192 events => throughput is 3.27E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0052s + [COUNTERS] PROGRAM TOTAL : 2.8976s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3714s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.5208s for 8192 events => throughput is 3.25E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0054s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381608764955655E-007) differ by less than 2E-4 (2.0918293319738268e-08) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381608764955655E-007) differ by less than 2E-4 (2.0918293208715966e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.427512e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.319062e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.426484e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.319549e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,28 +161,28 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381608686521600E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 1.6394s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3662s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.2706s for 8192 events => throughput is 6.45E+03 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0027s + [COUNTERS] PROGRAM TOTAL : 1.7267s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3721s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.3516s for 8192 events => throughput is 6.06E+03 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0030s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381608686521600E-007) differ by less than 2E-4 (2.1945164241365944e-08) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381608686521600E-007) differ by less than 2E-4 (2.1945164130343642e-08) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.733385e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.130825e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.780255e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.109980e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,28 +206,28 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381608826200266E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.9411s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3649s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5747s for 8192 events => throughput is 1.43E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0015s + [COUNTERS] PROGRAM TOTAL : 0.9912s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3711s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.6185s for 8192 events => throughput is 1.32E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0016s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381608826200266E-007) differ by less than 2E-4 (2.0116469379161117e-08) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381608826200266E-007) differ by less than 2E-4 (2.0116469157116512e-08) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.446717e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.371139e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.473262e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.357048e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,28 +251,28 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381608826200266E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.8685s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3656s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5015s for 8192 events => throughput is 1.63E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0014s + [COUNTERS] PROGRAM TOTAL : 0.9295s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3711s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5569s for 8192 events => throughput is 1.47E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0015s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381608826200266E-007) differ by less than 2E-4 (2.0116469379161117e-08) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381608826200266E-007) differ by less than 2E-4 (2.0116469157116512e-08) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.681650e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.528728e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.668117e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.538029e+04 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,28 +296,28 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 7.638e-07 [7.6381608826200266E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 1.0574s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3699s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.6857s for 8192 events => throughput is 1.19E+04 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0018s + [COUNTERS] PROGRAM TOTAL : 1.0827s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3713s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.7094s for 8192 events => throughput is 1.15E+04 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0020s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728588E-007) and cpp (7.6381608826200266E-007) differ by less than 2E-4 (2.0116469379161117e-08) +OK! xsec from fortran (7.6381610362728578E-007) and cpp (7.6381608826200266E-007) differ by less than 2E-4 (2.0116469157116512e-08) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.232369e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.181329e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.216790e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.179267e+04 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 72 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 7.638e-07 [7.6381610372590318E-007] fbridge_mode=1 + [XSECTION] Cross section = 7.638e-07 [7.6381608831823612E-007] fbridge_mode=1 [UNWEIGHT] Wrote 230 events (found 851 events) - [COUNTERS] PROGRAM TOTAL : 0.8397s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8000s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0198s for 8192 events => throughput is 4.13E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0198s + [COUNTERS] PROGRAM TOTAL : 0.8907s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8195s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0383s for 8192 events => throughput is 2.14E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0329s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.6381610362728588E-007) and cuda (7.6381610372590318E-007) differ by less than 2E-4 (1.2911138824733825e-10) +OK! xsec from fortran (7.6381610362728578E-007) and cuda (7.6381608831823612E-007) differ by less than 2E-4 (2.0042847492796056e-08) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.219575e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.100571e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.527801e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.161024e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 4096 32 1 --bridge *** +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.836972e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.747749e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX) -p 4096 32 1 *** +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.176072e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.719794e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 1024 128 1 --bridge *** +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.835271e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.751159e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX128THR) -p 1024 128 1 *** +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.206917e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.733503e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 16384 8 1 --bridge *** +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.823749e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] 
(3a) = ( 1.750316e+05 ) sec^-1 -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +*** EXECUTE GCHECK(MAX8THR) -p 16384 8 1 *** +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.671807e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.790163e+05 ) sec^-1 *** (3-hip) WARNING! SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt index f18eaf3551..d3447e0b2c 100644 --- a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone -make USEBUILDDIR=1 BACKEND=cppsse4 +make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:55:56 +DATE: 2025-09-24_10:46:21 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tm [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=0 [UNWEIGHT] Wrote 1732 events (found 4297 events) - [COUNTERS] PROGRAM TOTAL : 0.6925s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6838s - [COUNTERS] Fortran MEs ( 1 ) : 0.0087s for 8192 events => throughput is 9.43E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.7006s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6920s + [COUNTERS] Fortran MEs ( 1 ) : 0.0087s for 8192 events => throughput is 9.44E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tm [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=0 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4263s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4177s - [COUNTERS] Fortran MEs ( 1 ) : 0.0085s for 8192 events => throughput is 9.62E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4289s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4203s + [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.51E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,10 +116,10 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) 
- [COUNTERS] PROGRAM TOTAL : 0.4276s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4188s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0084s for 8192 events => throughput is 9.73E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4384s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4279s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0102s for 8192 events => throughput is 8.05E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -123,14 +130,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449452343426120) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.916439e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.517606e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.017065e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.556272e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,10 +161,10 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4290s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4241s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0046s for 8192 events => throughput is 1.79E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4412s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4354s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0055s for 8192 events => throughput is 1.50E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -168,14 +175,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449452343426120) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.913729e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.736519e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.928329e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.804687e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,9 +206,9 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426114] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4231s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4198s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0029s for 8192 events => throughput is 2.80E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4304s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4268s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0032s for 8192 events => throughput is 2.57E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -213,14 +220,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449452343426114) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.118646e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.871535e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.327279e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.044911e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,10 +251,10 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426114] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4242s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4212s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0027s for 8192 events => throughput is 3.07E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4292s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4257s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.63E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -258,14 +265,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449452343426114) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.142389e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.039759e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.418661e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.248707e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,9 +296,9 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426114] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4276s + [COUNTERS] PROGRAM TOTAL : 0.4281s [COUNTERS] Fortran Overhead ( 0 ) : 0.4241s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.67E+06 events/s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0036s for 8192 events => throughput is 2.27E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -303,14 +310,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449452343426114) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.810680e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.586266e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.123505e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.787533e+06 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -334,10 +341,10 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426109] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.8704s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8668s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.69E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s + [COUNTERS] PROGRAM TOTAL : 0.8640s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8595s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 1.02E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0037s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -348,44 +355,44 @@ OK! xsec from fortran (0.30449452343426120) and cuda (0.30449452343426109) diffe OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.094441e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.327611e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.576690e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.877091e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.540792e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.655959e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.885377e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.354113e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.486109e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.658621e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.914518e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.391364e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.512059e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.658194e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.224875e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.400250e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt index 9cee2ab297..c88c021ccd 100644 --- a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x make USEBUILDDIR=1 BACKEND=cuda -make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 - +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:56:23 +DATE: 2025-09-24_10:46:50 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,9 +65,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tm [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=0 [UNWEIGHT] Wrote 1732 events (found 4297 events) - [COUNTERS] PROGRAM TOTAL : 0.6965s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6879s - [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.48E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.6976s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6890s + [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.49E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tm [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=0 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4263s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4177s - [COUNTERS] Fortran MEs ( 1 ) : 0.0087s for 8192 events => throughput is 9.43E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4300s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4213s + [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.49E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,9 +116,9 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449446496609361] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4268s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4183s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0082s for 8192 events => throughput is 9.95E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4375s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4271s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0100s for 8192 events => throughput is 8.16E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -123,14 +130,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449446496609361) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.006620e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.685035e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.012762e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.581417e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,9 +161,9 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449446369440458] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4190s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4159s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.94E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4301s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4266s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.49E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -168,14 +175,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449446369440458) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.282555e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.779402e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.369793e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.773921e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -197,30 +204,30 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 - [XSECTION] Cross section = 0.3045 [0.30449446614968528] fbridge_mode=1 + [XSECTION] Cross section = 0.3045 [0.30449446595743795] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4206s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4183s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0020s for 8192 events => throughput is 4.08E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + [COUNTERS] PROGRAM TOTAL : 0.4278s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4257s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0018s for 8192 events => throughput is 4.50E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (0.30449452343426120) and cpp (0.30449446614968528) differ by less than 4E-4 (1.881300697448296e-07) +OK! xsec from fortran (0.30449452343426120) and cpp (0.30449446595743795) differ by less than 4E-4 (1.8876143514923172e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.872977e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.742333e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.148892e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.686508e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -242,30 +249,30 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 - [XSECTION] Cross section = 0.3045 [0.30449446614968528] fbridge_mode=1 + [XSECTION] Cross section = 0.3045 [0.30449446595743795] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4201s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4180s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0018s for 8192 events => throughput is 4.44E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + [COUNTERS] PROGRAM TOTAL : 0.4283s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4262s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0018s for 8192 events => throughput is 4.56E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.30449452343426120) and cpp (0.30449446614968528) differ by less than 4E-4 (1.881300697448296e-07) +OK! xsec from fortran (0.30449452343426120) and cpp (0.30449446595743795) differ by less than 4E-4 (1.8876143514923172e-07) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.886846e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.187003e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.506416e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.926848e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -287,30 +294,30 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 - [XSECTION] Cross section = 0.3045 [0.30449447031649013] fbridge_mode=1 + [XSECTION] Cross section = 0.3045 [0.30449446997188218] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4202s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4176s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.54E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4281s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4258s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0020s for 8192 events => throughput is 4.09E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.30449452343426120) and cpp (0.30449447031649013) differ by less than 4E-4 (1.744457354124762e-07) +OK! xsec from fortran (0.30449452343426120) and cpp (0.30449446997188218) differ by less than 4E-4 (1.7557747322705097e-07) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.376595e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.995899e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.863933e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.536276e+06 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 - [XSECTION] Cross section = 0.3045 [0.30449447352014630] fbridge_mode=1 + [XSECTION] Cross section = 0.3045 [0.30449447100896687] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.8576s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8540s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.64E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s + [COUNTERS] PROGRAM TOTAL : 0.8663s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8620s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.17E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0036s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.30449452343426120) and cuda (0.30449447352014630) differ by less than 4E-4 (1.639245078566276e-07) +OK! xsec from fortran (0.30449452343426120) and cuda (0.30449447100896687) differ by less than 4E-4 (1.7217155090509806e-07) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.209039e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.950959e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.497762e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.932846e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.599688e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.122837e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.103544e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.713910e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.606706e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.139014e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.131283e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.219680e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.229812e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.999475e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.664371e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.855116e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt index 782fee34a5..fb8e1884d6 100644 --- a/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt @@ -1,4 +1,7 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x make USEBUILDDIR=1 BACKEND=cuda @@ -6,36 +9,40 @@ make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 -make USEBUILDDIR=1 BACKEND=cppavx2 +make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:56:09 +DATE: 2025-09-24_10:46:36 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -58,8 +65,8 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tm [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=0 [UNWEIGHT] Wrote 1732 events (found 4297 events) - [COUNTERS] PROGRAM TOTAL : 0.6953s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6866s + [COUNTERS] PROGRAM TOTAL : 0.6987s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6901s [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.49E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** @@ -83,9 +90,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggt1t1_x1_fortran > /tm [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449452343426120] fbridge_mode=0 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4236s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4152s - [COUNTERS] Fortran MEs ( 1 ) : 0.0085s for 8192 events => throughput is 9.68E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4267s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4180s + [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.48E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,10 +116,10 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449453160892032] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4277s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4187s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0086s for 8192 events => throughput is 9.54E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4398s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4292s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0103s for 8192 events => throughput is 7.98E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -123,14 +130,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449453160892032) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.831908e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.422189e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.918457e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.535671e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,10 +161,10 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449453160892032] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4287s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4239s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0045s for 8192 events => throughput is 1.83E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4393s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4334s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0055s for 8192 events => throughput is 1.49E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -168,14 +175,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449453160892032) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.892977e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.767119e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.974211e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.859923e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,9 +206,9 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449453255288433] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4216s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4183s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0029s for 8192 events => throughput is 2.84E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4295s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4260s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.66E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -213,14 +220,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449453255288433) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.237521e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.984842e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.477152e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.183434e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,9 +251,9 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449453255288433] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4270s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4239s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0027s for 8192 events => throughput is 3.00E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4304s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4270s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.75E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -258,14 +265,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449453255288433) differ OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.311234e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.997535e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.507028e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.294958e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,9 +296,9 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] ChannelId = 3 [XSECTION] Cross section = 0.3045 [0.30449453255288433] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.4247s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4212s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0032s for 8192 events => throughput is 2.59E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4289s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4249s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0035s for 8192 events => throughput is 2.33E+06 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -303,14 +310,14 @@ OK! xsec from fortran (0.30449452343426120) and cpp (0.30449453255288433) differ OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.926715e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.666307e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.198931e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.789154e+06 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 6 channels { 3 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 2 [XSECTION] ChannelId = 3 - [XSECTION] Cross section = 0.3045 [0.30449452360186230] fbridge_mode=1 + [XSECTION] Cross section = 0.3045 [0.30449453200208287] fbridge_mode=1 [UNWEIGHT] Wrote 1612 events (found 1617 events) - [COUNTERS] PROGRAM TOTAL : 0.8627s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8591s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.69E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0005s + [COUNTERS] PROGRAM TOTAL : 0.8609s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8564s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 1.05E+07 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0037s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.30449452343426120) and cuda (0.30449452360186230) differ by less than 2E-4 (5.504239286580059e-10) +OK! xsec from fortran (0.30449452343426120) and cuda (0.30449453200208287) differ by less than 2E-4 (2.813785138222613e-08) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.206349e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.339486e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.536038e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.858130e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.506637e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.655045e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.900315e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.353515e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.486873e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.662294e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.921916e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.390355e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.466467e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.659752e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.235205e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.405982e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt index bebebe43ae..1a5ebcb708 100644 --- a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx -make USEBUILDDIR=1 BACKEND=cuda - -make USEBUILDDIR=1 BACKEND=cppnone +make USEBUILDDIR=1 BACKEND=cuda +make USEBUILDDIR=1 BACKEND=cppnone make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:55:13 +DATE: 2025-09-24_10:45:36 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,11 +63,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641911695846957] fbridge_mode=0 + [XSECTION] Cross section = 44.64 [44.641911695846943] fbridge_mode=0 [UNWEIGHT] Wrote 2625 events (found 5368 events) - [COUNTERS] PROGRAM TOTAL : 0.8496s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8074s - [COUNTERS] Fortran MEs ( 1 ) : 0.0422s for 8192 events => throughput is 1.94E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8571s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8136s + [COUNTERS] Fortran MEs ( 1 ) : 0.0436s for 8192 events => throughput is 1.88E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,11 +88,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641911695846957] fbridge_mode=0 + [XSECTION] Cross section = 44.64 [44.641911695846943] fbridge_mode=0 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4529s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4105s - [COUNTERS] Fortran MEs ( 1 ) : 0.0424s for 8192 events => throughput is 1.93E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4584s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4143s + [COUNTERS] Fortran MEs ( 1 ) : 0.0441s for 8192 events => throughput is 1.86E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,28 +116,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641911695846964] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4621s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4167s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0450s for 8192 events => throughput is 1.82E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4712s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4239s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0468s for 8192 events => throughput is 1.75E+05 events/s 
[COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641911695846964) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (44.641911695846943) and cpp (44.641911695846964) differ by less than 3E-14 (4.440892098500626e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.859940e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.795171e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.839978e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.795530e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,28 +161,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641911695846957] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4403s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4153s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0246s for 8192 events => throughput is 3.33E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4493s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4216s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0273s for 8192 events => throughput is 3.00E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641911695846957) differ by less than 3E-14 (0.0) +OK! xsec from fortran (44.641911695846943) and cpp (44.641911695846957) differ by less than 3E-14 (2.220446049250313e-16) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.243144e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.063303e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.273347e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.061071e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -197,30 +204,30 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641911695846950] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641911695846957] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4324s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4161s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0159s for 8192 events => throughput is 5.14E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4397s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4227s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0166s for 8192 events => throughput is 4.94E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641911695846950) differ by less than 3E-14 (1.1102230246251565e-16) +OK! xsec from fortran (44.641911695846943) and cpp (44.641911695846957) differ by less than 3E-14 (2.220446049250313e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.210364e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.021968e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.310117e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.062640e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -242,30 +249,30 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641911695846950] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641911695846957] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4350s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4201s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0146s for 8192 events => throughput is 5.62E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4387s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4225s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0158s for 8192 events => throughput is 5.19E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641911695846950) differ by less than 3E-14 (1.1102230246251565e-16) +OK! xsec from fortran (44.641911695846943) and cpp (44.641911695846957) differ by less than 3E-14 (2.220446049250313e-16) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.704117e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.201601e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.793092e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.417673e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -289,28 +296,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641911695846950] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4434s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4195s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0234s for 8192 events => throughput is 3.49E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4470s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4222s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0245s for 8192 events => throughput is 3.35E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641911695846950) differ by less than 3E-14 (1.1102230246251565e-16) +OK! xsec from fortran (44.641911695846943) and cpp (44.641911695846950) differ by less than 3E-14 (2.220446049250313e-16) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.552376e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.432574e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.639783e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.453192e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641911695846950] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641911695846943] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.8650s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8612s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.66E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0008s + [COUNTERS] PROGRAM TOTAL : 0.8664s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8608s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0013s for 8192 events => throughput is 6.36E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0043s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cuda (44.641911695846950) differ by less than 3E-14 (1.1102230246251565e-16) +OK! xsec from fortran (44.641911695846943) and cuda (44.641911695846943) differ by less than 3E-14 (0.0) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.043338e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.914793e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.325784e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.001985e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.871559e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.910414e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.143094e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.037284e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.865333e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.909403e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.020534e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.073169e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.868423e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.916674e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.708181e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.623185e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt index 2a76a737ac..8da465d5f4 100644 --- a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx + make USEBUILDDIR=1 BACKEND=cuda make USEBUILDDIR=1 BACKEND=cppnone - make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:55:42 +DATE: 2025-09-24_10:46:07 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,11 +63,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641911695846957] fbridge_mode=0 + [XSECTION] Cross section = 44.64 [44.641911695846943] fbridge_mode=0 [UNWEIGHT] Wrote 2625 events (found 5368 events) - [COUNTERS] PROGRAM TOTAL : 0.8397s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7971s - [COUNTERS] Fortran MEs ( 1 ) : 0.0425s for 8192 events => throughput is 1.93E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8588s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8149s + [COUNTERS] Fortran MEs ( 1 ) : 0.0439s for 8192 events => throughput is 1.86E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,11 +88,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641911695846957] fbridge_mode=0 + [XSECTION] Cross section = 44.64 [44.641911695846943] fbridge_mode=0 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4553s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4122s - [COUNTERS] Fortran MEs ( 1 ) : 0.0430s for 8192 events => throughput is 1.90E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4581s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4141s + [COUNTERS] Fortran MEs ( 1 ) : 0.0440s for 8192 events => throughput is 1.86E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -107,30 +114,30 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641906072918047] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641906076692848] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4624s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4193s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0429s for 8192 events => throughput is 1.91E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4686s + 
[COUNTERS] Fortran Overhead ( 0 ) : 0.4240s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0443s for 8192 events => throughput is 1.85E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641906072918047) differ by less than 4E-4 (1.2595627507661078e-07) +OK! xsec from fortran (44.641911695846943) and cpp (44.641906076692848) differ by less than 4E-4 (1.2587171749345316e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.972969e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.876039e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.987350e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.883693e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -152,30 +159,30 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641902189470080] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641902218109820] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4356s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4182s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0171s for 8192 events => throughput is 4.78E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4450s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4258s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0190s for 8192 events => throughput is 4.31E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641902189470080) differ by less than 4E-4 (2.1294735186305758e-07) +OK! xsec from fortran (44.641911695846943) and cpp (44.641902218109820) differ by less than 4E-4 (2.123058078229434e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.748983e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.348521e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.695429e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.342860e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -197,30 +204,30 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641902360436738] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641902360162746] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4310s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4214s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0093s for 8192 events => throughput is 8.83E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + [COUNTERS] PROGRAM TOTAL : 0.4321s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4230s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0089s for 8192 events => throughput is 9.20E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641902360436738) differ by less than 4E-4 (2.0911761755559866e-07) +OK! xsec from fortran (44.641911695846943) and cpp (44.641902360162746) differ by less than 4E-4 (2.0912375486847878e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.169652e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.500870e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.239468e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.292903e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -242,30 +249,30 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641902360436738] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641902360162746] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4221s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4132s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0086s for 8192 events => throughput is 9.50E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0002s + [COUNTERS] PROGRAM TOTAL : 0.4313s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4223s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0088s for 8192 events => throughput is 9.30E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641902360436738) differ by less than 4E-4 (2.0911761755559866e-07) +OK! xsec from fortran (44.641911695846943) and cpp (44.641902360162746) differ by less than 4E-4 (2.0912375486847878e-07) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.627165e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.427731e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.935546e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.637478e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -287,30 +294,30 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641906399820272] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641906412232359] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4293s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4169s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0121s for 8192 events => throughput is 6.75E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4333s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4215s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0115s for 8192 events => throughput is 7.12E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641906399820272) differ by less than 4E-4 (1.1863351012664225e-07) +OK! xsec from fortran (44.641911695846943) and cpp (44.641906412232359) differ by less than 4E-4 (1.1835547319982709e-07) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.774461e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.400687e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.994273e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.434626e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641910992291372] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641910985805701] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.8577s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8540s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.69E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0006s + [COUNTERS] PROGRAM TOTAL : 0.8631s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8579s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0010s for 8192 events => throughput is 8.09E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0042s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cuda (44.641910992291372) differ by less than 4E-4 (1.575997887748315e-08) +OK! xsec from fortran (44.641911695846943) and cuda (44.641910985805701) differ by less than 4E-4 (1.5905260664084153e-08) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.201092e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.390475e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.452650e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.833171e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.883185e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.588623e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.341479e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.095229e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.843740e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.590889e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.360831e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.068812e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.608054e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.535879e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.014740e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.379682e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt index 449e459bdc..de2afdefc1 100644 --- a/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt @@ -1,41 +1,48 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= +HASBLAS=hasBlas +Working directory (build): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx make USEBUILDDIR=1 BACKEND=cuda - make USEBUILDDIR=1 BACKEND=cppnone -make USEBUILDDIR=1 BACKEND=cppsse4 +make USEBUILDDIR=1 BACKEND=cppsse4 make USEBUILDDIR=1 BACKEND=cppavx2 + make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' + +CUDACPP_RUNTIME_BLASCOLORSUM= + +CUDACPP_RUNTIME_CUBLASTF32TENSOR= OMP_NUM_THREADS= -DATE: 2024-10-06_10:55:27 +DATE: 2025-09-24_10:45:51 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (run): /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -56,11 +63,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641911695846957] fbridge_mode=0 + [XSECTION] Cross section = 44.64 [44.641911695846943] fbridge_mode=0 [UNWEIGHT] Wrote 2625 events (found 5368 events) - [COUNTERS] PROGRAM TOTAL : 0.8469s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8046s - [COUNTERS] Fortran MEs ( 1 ) : 0.0423s for 8192 events => throughput is 1.94E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8596s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8158s + [COUNTERS] Fortran MEs ( 1 ) : 0.0438s for 8192 events => throughput is 1.87E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -81,11 +88,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_susyggtt_x1_fortran > /tmp/ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641911695846957] fbridge_mode=0 + [XSECTION] Cross section = 44.64 [44.641911695846943] fbridge_mode=0 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4519s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4098s - [COUNTERS] Fortran MEs ( 1 ) : 0.0422s for 8192 events => throughput is 1.94E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4581s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4140s + [COUNTERS] Fortran MEs ( 1 ) : 0.0441s for 8192 events => throughput is 1.86E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -109,28 +116,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641912938404218] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4618s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4159s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0455s for 8192 events => throughput is 1.80E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4714s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4239s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0471s for 8192 events => throughput is 1.74E+05 events/s 
[COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641912938404218) differ by less than 2E-4 (2.783387209603916e-08) +OK! xsec from fortran (44.641911695846943) and cpp (44.641912938404218) differ by less than 2E-4 (2.7833872318083763e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.833802e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.775167e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.834236e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.783124e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -154,28 +161,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641912938404218] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4439s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4188s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0248s for 8192 events => throughput is 3.30E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4504s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4238s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0262s for 8192 events => throughput is 3.13E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641912938404218) differ by less than 2E-4 (2.783387209603916e-08) +OK! xsec from fortran (44.641911695846943) and cpp (44.641912938404218) differ by less than 2E-4 (2.7833872318083763e-08) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.367073e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.106214e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.340820e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.103998e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -199,28 +206,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641912970378179] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4302s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4144s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0154s for 8192 events => throughput is 5.33E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4398s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4230s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0165s for 8192 events => throughput is 4.97E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641912970378179) differ by less than 2E-4 (2.8550104058666648e-08) +OK! xsec from fortran (44.641911695846943) and cpp (44.641912970378179) differ by less than 2E-4 (2.8550104280711253e-08) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.283261e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.205244e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.353744e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.190124e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -244,28 +251,28 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] ChannelId = 1 [XSECTION] Cross section = 44.64 [44.641912970378179] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4297s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4151s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0143s for 8192 events => throughput is 5.75E+05 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0003s + [COUNTERS] PROGRAM TOTAL : 0.4387s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4231s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0152s for 8192 events => throughput is 5.39E+05 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641912970378179) differ by less than 2E-4 (2.8550104058666648e-08) +OK! xsec from fortran (44.641911695846943) and cpp (44.641912970378179) differ by less than 2E-4 (2.8550104280711253e-08) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.825518e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.454547e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.928231e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.572443e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -287,30 +294,30 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641912970378179] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641912970378172] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.4383s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4159s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0220s for 8192 events => throughput is 3.73E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4487s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4246s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0237s for 8192 events => throughput is 3.45E+05 events/s [COUNTERS] CudaCpp HEL ( 3 ) : 0.0004s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cpp (44.641912970378179) differ by less than 2E-4 (2.8550104058666648e-08) +OK! xsec from fortran (44.641911695846943) and cpp (44.641912970378172) differ by less than 2E-4 (2.8550104058666648e-08) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.615578e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.539675e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.732261e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.512456e+05 ) sec^-1 *** (3-cuda) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -332,60 +339,60 @@ DEBUG: MEK processed 8192 events across 3 channels { 1 : 8192 } [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 44.64 [44.641911674225568] fbridge_mode=1 + [XSECTION] Cross section = 44.64 [44.641912968724782] fbridge_mode=1 [UNWEIGHT] Wrote 1617 events (found 1622 events) - [COUNTERS] PROGRAM TOTAL : 0.8598s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8560s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.69E+06 events/s - [COUNTERS] CudaCpp HEL ( 3 ) : 0.0007s + [COUNTERS] PROGRAM TOTAL : 0.8642s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8585s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0013s for 8192 events => throughput is 6.40E+06 events/s + [COUNTERS] CudaCpp HEL ( 3 ) : 0.0044s *** (3-cuda) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (44.641911695846957) and cuda (44.641911674225568) differ by less than 2E-4 (4.843293543999039e-10) +OK! xsec from fortran (44.641911695846943) and cuda (44.641912968724782) differ by less than 2E-4 (2.8513067462654362e-08) *** (3-cuda) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.907482e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.967399e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.361691e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.032617e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.875077e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.932611e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.567905e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.037431e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.865156e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.923206e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.911973e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.073184e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.881287e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.923583e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.733673e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.621614e+07 ) sec^-1 *** (3-hip) WARNING! 
SKIP MADEVENT_HIP (hip is not supported on this node) *** diff --git a/epochX/cudacpp/tmad/madX.sh b/epochX/cudacpp/tmad/madX.sh index 56fbce5d92..cc0039f545 100755 --- a/epochX/cudacpp/tmad/madX.sh +++ b/epochX/cudacpp/tmad/madX.sh @@ -251,11 +251,11 @@ function getgridmax() elif [ "${ggtt}" == "1" ]; then echo 16384 32 # same total grid dimension as 2048 256 elif [ "${ggttg}" == "1" ]; then - echo 16384 32 # same total grid dimension as 2048 256 + echo 8192 32 # same total grid dimension as 1024 256 (new sep2025) elif [ "${ggttgg}" == "1" ]; then - echo 16384 32 # same total grid dimension as 2048 256 + echo 512 32 # same total grid dimension as 64 256 (new sep2025: even 1024/32 aborts in max8thr mode) elif [ "${ggttggg}" == "1" ]; then - echo 512 32 # same total grid dimension as 64 256 + echo 256 32 # same total grid dimension as 32 256 (new sep2025) elif [ "${gguu}" == "1" ]; then echo 16384 32 # same total grid dimension as 2048 256 elif [ "${gqttq}" == "1" ]; then @@ -267,7 +267,7 @@ function getgridmax() elif [ "${susyggt1t1}" == "1" ]; then echo 16384 32 # same total grid dimension as 2048 256 elif [ "${smeftggtttt}" == "1" ]; then - echo 16384 32 # same total grid dimension as 2048 256 + echo 4096 32 # same total grid dimension as 512 256 (new sep2025) else echo "ERROR! Unknown process" > /dev/stderr; usage fi @@ -478,9 +478,15 @@ function runmadevent() # PART 1 - build madevent ########################################################################## +echo MADGRAPH_CUDA_ARCHITECTURE=${MADGRAPH_CUDA_ARCHITECTURE} +echo MADGRAPH_HIP_ARCHITECTURE=${MADGRAPH_HIP_ARCHITECTURE} + unset GTEST_ROOT unset LOCALGTEST +export HASBLAS=hasBlas +echo HASBLAS=${HASBLAS} + for suff in $suffs; do dir=$(showdir) @@ -511,6 +517,12 @@ if [ "${maketype}" == "-makeonly" ]; then printf "\nMAKE COMPLETED\n"; exit 0; f # PART 2 - run madevent ########################################################################## +unset CUDACPP_RUNTIME_BLASCOLORSUM +printf "\nCUDACPP_RUNTIME_BLASCOLORSUM=$CUDACPP_RUNTIME_BLASCOLORSUM\n" + +unset CUDACPP_RUNTIME_CUBLASTF32TENSOR +printf "\nCUDACPP_RUNTIME_CUBLASTF32TENSOR=$CUDACPP_RUNTIME_CUBLASTF32TENSOR\n" + printf "\nOMP_NUM_THREADS=$OMP_NUM_THREADS\n" printf "\nDATE: $(date '+%Y-%m-%d_%H:%M:%S')\n\n" diff --git a/epochX/cudacpp/tmad/strip10x.sh b/epochX/cudacpp/tmad/strip10x.sh new file mode 100755 index 0000000000..571d134a64 --- /dev/null +++ b/epochX/cudacpp/tmad/strip10x.sh @@ -0,0 +1,11 @@ +#!/bin/sh +# Copyright (C) 2020-2025 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: A. Valassi (Sep 2025) for the MG5aMC CUDACPP plugin. +# Further modified by: A. Valassi (2025) for the MG5aMC CUDACPP plugin. + +cd $(dirname $0) +for log in logs_*/log*.txt ; do + cat $log | awk 'BEGIN{ok=1}; /^\*\*\*/{if ($5=="x10") ok=0; else ok=1}; {if (ok==1) print $0}' > ${log}.new + mv ${log}.new ${log} +done diff --git a/epochX/cudacpp/tput/allTees.sh b/epochX/cudacpp/tput/allTees.sh index 69ef153764..28d282be16 100755 --- a/epochX/cudacpp/tput/allTees.sh +++ b/epochX/cudacpp/tput/allTees.sh @@ -1,8 +1,8 @@ #!/bin/bash -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: A. Valassi (Apr 2022) for the MG5aMC CUDACPP plugin. -# Further modified by: A. Valassi (2022-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: A. Valassi (2022-2025) for the MG5aMC CUDACPP plugin. 
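The strip10x.sh script above is driven by a one-line awk state machine: each log section opens with a '*** ... ***' banner whose fifth whitespace-separated field is the x1/x10 tag (compare '*** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***' in the logs above), and lines are printed only while the most recent banner was not an x10 one. A minimal standalone sketch of the same idiom on synthetic input (illustration only, not part of the repository scripts):

printf '%s\n' \
  '*** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***' \
  'kept: body of the x1 section' \
  '*** (2-none) EXECUTE MADEVENT_CPP x10 (create events.lhe) ***' \
  'dropped: body of the x10 section' \
  | awk 'BEGIN{ok=1}; /^\*\*\*/{if ($5=="x10") ok=0; else ok=1}; {if (ok==1) print $0}'
# prints only the x1 banner and its body line; the x10 banner and its body are stripped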
scrdir=$(cd $(dirname $0); pwd) @@ -20,7 +20,7 @@ if [ "$(hostname)" == "itgold91.cern.ch" ]; then bblds=-cpponly; fi # Usage function usage() { - echo "Usage (1): $0 [-short] [-e] [-sa] [-makeonly] [-nomakeclean] [-hip|-nocuda|-cpponly] [-bsmonly|-nobsm]" + echo "Usage (1): $0 [-short] [-e] [-sa] [-makeonly] [-nomakeclean] [-hip|-nocuda|-cpponly] [-bsmonly|-nobsm|-scalingonly|-blasonly|-blasandscalingonly]" echo "Run tests and check all logs" echo "" echo "Usage (2): $0 -checkonly" @@ -32,7 +32,10 @@ function usage() checkonly=0 ggttggg=-ggttggg rndhst=-curhst -bsm= +sm=1 +bsm=1 +scaling=1 +blas=1 if [ "$1" == "-checkonly" ]; then # Check existing logs without running any tests? checkonly=1 @@ -73,11 +76,35 @@ while [ "${checkonly}" == "0" ] && [ "$1" != "" ]; do if [ "${bblds}" != "" ] && [ "${bblds}" != "$1" ]; then echo "ERROR! Incompatible option $1: backend builds are already defined as '$bblds'"; usage; fi bblds="$1" shift - elif [ "$1" == "-bsmonly" ] && [ "$bsm" != "-nobsm" ]; then - bsm=$1 + elif [ "$1" == "-bsmonly" ] && [ "${sm}${scaling}${bsm}${blas}" == "1111" ]; then + sm=0 + bsm=1 + scaling=0 + blas=0 shift - elif [ "$1" == "-nobsm" ] && [ "$bsm" != "-bsmonly" ]; then - bsm=$1 + elif [ "$1" == "-nobsm" ] && [ "${sm}${scaling}${bsm}${blas}" == "1111" ]; then + sm=1 + bsm=0 + scaling=1 + blas=1 + shift + elif [ "$1" == "-scalingonly" ] && [ "${sm}${scaling}${bsm}${blas}" == "1111" ]; then + sm=0 + bsm=0 + scaling=1 + blas=0 + shift + elif [ "$1" == "-blasonly" ] && [ "${sm}${scaling}${bsm}${blas}" == "1111" ]; then + sm=0 + bsm=0 + scaling=0 + blas=1 + shift + elif [ "$1" == "-blasandscalingonly" ] && [ "${sm}${scaling}${bsm}${blas}" == "1111" ]; then + sm=0 + bsm=0 + scaling=1 + blas=1 shift else usage @@ -88,11 +115,28 @@ done function checklogs() { cd $scrdir/.. - # Print out any errors in the logs - if ! egrep -i '(error|fault|failed)' ./tput/logs_* -r; then echo "No errors found in logs"; fi + # Print out any errors in the logs (exclude scaling logs) + if ! egrep -i '(error|fault|failed)' ./tput/logs_*/*.txt; then echo "No errors found in logs"; fi # Print out any FPEs or '{ }' in the logs echo if ! egrep '(^Floating Point Exception|{ })' tput/logs* -r; then echo "No FPEs or '{ }' found in logs"; fi + # Print out any aborts in the logs (exclude scaling logs) + echo + txt=$(grep Abort ./tput/logs_*/*.txt | sed "s|\:.*SubProcesses/P|: P|") + if [ "${txt}" == "" ]; then + echo "No aborts found in logs" + else + echo "${txt}" + fi + # Print out any asserts/aborts in scaling logs + echo + txt=$(egrep -i '(abort|assert)' ./tput/logs_*/*.scaling | sed "s|\:.*SubProcesses/P|: P|" | sort -u) + if [ "${txt}" == "" ]; then + echo "No aborts or asserts found in scaling logs" + else + echo "${txt}" + fi + # Print out the MEK channelid debugging output (except for '{ }') echo \grep MEK ${scrdir}/logs_*/* | sed "s|${scrdir}/logs_||" | grep -v '{ }' | sed 's|_mad.*DEBUG:||' | sort -u @@ -123,11 +167,11 @@ fi cd $scrdir/.. started="STARTED AT $(date)" -# (36/102) Six logs (double/mixed/float x hrd0/hrd1 x inl0) in each of the six SM processes +# (+36: 36/138) Six logs (double/mixed/float x hrd0/hrd1 x inl0) in each of the six SM processes [sm==1] \rm -rf gg_ttggg${suff}/lib/build.none_* cmd="./tput/teeThroughputX.sh -dmf -hrd -makej -eemumu -ggtt -ggttg -ggttgg -gqttq $ggttggg ${makeclean} ${opts}" tmp1=$(mktemp) -if [ "${bsm}" != "-bsmonly" ]; then +if [ "${sm}" == "1" ]; then $cmd; status=$?
ls -ltr ee_mumu${suff}/lib/build.none_*_inl0_hrd* gg_tt${suff}/lib/build.none_*_inl0_hrd* gg_tt*g${suff}/lib/build.none_*_inl0_hrd* | egrep -v '(total|\./|\.build|_common|^$)' > $tmp1 else @@ -135,86 +179,140 @@ else fi ended1="$cmd\nENDED(1) AT $(date) [Status=$status]" -# (48/102) Four extra logs (double/float x hrd0/hrd1 x inl1) only in three of the six SM processes +# (+18: 54/138) Three scaling logs (double/mixed/float x hrd0 x inl0) in each of the six SM processes [scaling==1] +if [ "${scaling}" == "1" ]; then + if [ "${sm}" == "1" ]; then + cmd="./tput/teeThroughputX.sh -dmf -makej -eemumu -ggtt -ggttg -ggttgg -gqttq $ggttggg -scaling ${opts}" # no rebuild needed + $cmd; status=$? + else + cmd="./tput/teeThroughputX.sh -dmf -makej -eemumu -ggtt -ggttg -ggttgg -gqttq $ggttggg -scaling ${makeclean} ${opts}" # this is the first build + $cmd; status=$? + fi +else + cmd="SKIP '$cmd'"; echo $cmd; status=$? +fi +ended1sc="$cmd\nENDED(1-scaling) AT $(date) [Status=$status]" + +# (+6: 60/138) Three extra logs (double/float x hrd0 x inl0 + blasOn) only in two of the six SM processes (rebuild may be needed) [blas==1] +if [ "${blas}" == "1" ]; then + if [ "${sm}" == "1" ] || [ "${scaling}" == "1" ]; then + cmd="./tput/teeThroughputX.sh -ggtt -ggttgg -dmf -blasOn ${opts}" # no rebuild needed + $cmd; status=$? + else + cmd="./tput/teeThroughputX.sh -ggtt -ggttgg -dmf -blasOn ${makeclean} ${opts}" # this is the first build + $cmd; status=$? + fi +else + cmd="SKIP '$cmd'"; echo $cmd; status=$? +fi +ended2="$cmd\nENDED(2) AT $(date) [Status=$status]" + +# (+6: 66/138) Three scaling logs (double/float x hrd0 x inl0 + blasOn) only in two of the six SM processes [blas==1 || scaling==1] +if [ "${blas}" == "1" ] || [ "${scaling}" == "1" ]; then + cmd="./tput/teeThroughputX.sh -ggtt -ggttgg -dmf -blasOn -scaling ${opts}" # no rebuild needed + $cmd; status=$? +else + cmd="SKIP '$cmd'"; echo $cmd; status=$? +fi +ended2sc="$cmd\nENDED(2-scaling) AT $(date) [Status=$status]" + +# (+12: 78/138) Four extra logs (double/float x hrd0/hrd1 x inl1) only in three of the six SM processes [sm==1] \rm -rf gg_ttg${suff}/lib/build.none_* \rm -rf gg_ttggg${suff}/lib/build.none_* cmd="./tput/teeThroughputX.sh -d_f -hrd -makej -eemumu -ggtt -ggttgg -inlonly ${makeclean} ${opts}" -tmp2=$(mktemp) -if [ "${bsm}" != "-bsmonly" ]; then +tmp3=$(mktemp) +if [ "${sm}" == "1" ]; then $cmd; status=$? - ls -ltr ee_mumu${suff}/lib/build.none_*_inl1_hrd* gg_tt${suff}/lib/build.none_*_inl1_hrd* gg_tt*g${suff}/lib/build.none_*_inl1_hrd* | egrep -v '(total|\./|\.build|_common|^$)' > $tmp2 + ls -ltr ee_mumu${suff}/lib/build.none_*_inl1_hrd* gg_tt${suff}/lib/build.none_*_inl1_hrd* gg_tt*g${suff}/lib/build.none_*_inl1_hrd* | egrep -v '(total|\./|\.build|_common|^$)' > $tmp3 else cmd="SKIP '$cmd'"; echo $cmd; status=$? fi -ended2="$cmd\nENDED(2) AT $(date) [Status=$status]" +ended3="$cmd\nENDED(3) AT $(date) [Status=$status]" -# (60/102) Two extra logs (double/float x hrd0 x inl0 + bridge) in all six SM processes (rebuild from cache) +# (+12: 90/138) Two extra logs (double/float x hrd0 x inl0 + bridge) in all six SM processes (rebuild from cache) [sm==1] cmd="./tput/teeThroughputX.sh -makej -eemumu -ggtt -ggttg -gqttq -ggttgg $ggttggg -d_f -bridge ${makeclean} ${opts}" -if [ "${bsm}" != "-bsmonly" ]; then +if [ "${sm}" == "1" ]; then $cmd; status=$? else cmd="SKIP '$cmd'"; echo $cmd; status=$? 
fi -ended3="$cmd\nENDED(3) AT $(date) [Status=$status]" +ended4="$cmd\nENDED(4) AT $(date) [Status=$status]" -# (66/102) Two extra logs (double/float x hrd0 x inl0 + rmbhst) only in three of the six SM processes (no rebuild needed) +# (+6: 96/138) Two extra logs (double/float x hrd0 x inl0 + rmbhst) only in three of the six SM processes (no rebuild needed) [sm==1] cmd="./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -d_f -rmbhst ${opts}" -if [ "${bsm}" != "-bsmonly" ]; then +if [ "${sm}" == "1" ]; then $cmd; status=$? else cmd="SKIP '$cmd'"; echo $cmd; status=$? fi -ended4="$cmd\nENDED(4) AT $(date) [Status=$status]" +ended5="$cmd\nENDED(5) AT $(date) [Status=$status]" -# (72/102) Two extra logs (double/float x hrd0 x inl0 + rndhst) only in three of the six SM processes (no rebuild needed) +# (+6: 102/138) Two extra logs (double/float x hrd0 x inl0 + rndhst) only in three of the six SM processes (no rebuild needed) [sm==1] cmd="./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -d_f ${rndhst} ${opts}" -if [ "${bsm}" != "-bsmonly" ] && [ "${rndhst}" != "-common" ]; then +if [ "${sm}" == "1" ] && [ "${rndhst}" != "-common" ]; then $cmd; status=$? else cmd="SKIP '$cmd'"; echo $cmd; status=$? fi -ended5="$cmd\nENDED(5) AT $(date) [Status=$status]" +ended6="$cmd\nENDED(6) AT $(date) [Status=$status]" -# (78/102) Two extra logs (double/float x hrd0 x inl0 + common) only in three of the six SM processes (no rebuild needed) +# (+6: 108/138) Two extra logs (double/float x hrd0 x inl0 + common) only in three of the six SM processes (no rebuild needed) [sm==1] cmd="./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -d_f -common ${opts}" -if [ "${bsm}" != "-bsmonly" ]; then +if [ "${sm}" == "1" ]; then $cmd; status=$? else cmd="SKIP '$cmd'"; echo $cmd; status=$? fi -ended6="$cmd\nENDED(6) AT $(date) [Status=$status]" +ended7="$cmd\nENDED(7) AT $(date) [Status=$status]" -# (102/102) Six extra logs (double/mixed/float x hrd0/hrd1 x inl0) only in the four BSM processes +# (+6: 114/138) Three extra logs (double/float x hrd0 x inl0 + noBlas) only in two of the six SM processes (rebuild is needed) [blas==1] +cmd="./tput/teeThroughputX.sh -ggtt -ggttgg -dmf -noBlas ${makeclean} ${opts}" +if [ "${blas}" == "1" ]; then + $cmd; status=$? +else + cmd="SKIP '$cmd'"; echo $cmd; status=$? +fi +ended8="$cmd\nENDED(8) AT $(date) [Status=$status]" + +# (+24: 138/138) Six extra logs (double/mixed/float x hrd0/hrd1 x inl0) only in the four BSM processes [bsm==1] cmd="./tput/teeThroughputX.sh -dmf -hrd -makej -susyggtt -susyggt1t1 -smeftggtttt -heftggbb ${makeclean} ${opts}" -tmp3=$(mktemp) -if [ "${bsm}" != "-nobsm" ]; then +tmp9=$(mktemp) +if [ "${bsm}" == "1" ]; then $cmd; status=$? - ls -ltr susy_gg_tt${suff}/lib/build.none_*_inl0_hrd* susy_gg_t1t1${suff}/lib/build.none_*_inl0_hrd* smeft_gg_tttt${suff}/lib/build.none_*_inl0_hrd* heft_gg_bb${suff}/lib/build.none_*_inl0_hrd* | egrep -v '(total|\./|\.build|_common|^$)' > $tmp2 + ls -ltr susy_gg_tt${suff}/lib/build.none_*_inl0_hrd* susy_gg_t1t1${suff}/lib/build.none_*_inl0_hrd* smeft_gg_tttt${suff}/lib/build.none_*_inl0_hrd* heft_gg_bb${suff}/lib/build.none_*_inl0_hrd* | egrep -v '(total|\./|\.build|_common|^$)' > $tmp9 else cmd="SKIP '$cmd'"; echo $cmd; status=$? 
fi -ended7="$cmd\nENDED(7) AT $(date) [Status=$status]" +ended9="$cmd\nENDED(9) AT $(date) [Status=$status]" echo echo "Build(1):" cat $tmp1 echo -echo "Build(2):" -cat $tmp2 +echo "Build(3):" +cat $tmp3 +echo +echo "Build(9):" +cat $tmp9 echo echo -e "$started" echo -e "$ended1" +echo -e "$ended1sc" echo -e "$ended2" +echo -e "$ended2sc" echo -e "$ended3" echo -e "$ended4" echo -e "$ended5" echo -e "$ended6" echo -e "$ended7" +echo -e "$ended8" +echo -e "$ended9" if [ "$ggttggg" == "" ]; then echo echo "To complete the test for ggttggg type:" echo " ./tput/teeThroughputX.sh -dmf -hrd -makej -ggttggg ${makeclean} ${opts}" + echo " ./tput/teeThroughputX.sh -dmf -makej -ggttggg -scaling ${makeclean} ${opts}" echo " ./tput/teeThroughputX.sh -makej -ggttggg -d_f -bridge ${makeclean} ${opts}" fi diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.scaling new file mode 100644 index 0000000000..f5d01d4a4b --- /dev/null +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +DATE: 2025-09-24_08:15:51 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +2.055845e+06 1 256 +3.753858e+06 2 256 +7.335611e+06 4 256 +1.381152e+07 8 256 +2.631089e+07 16 256 +4.227169e+07 32 256 +4.374783e+07 64 256 +4.667347e+07 128 256 +5.300001e+07 256 256 +5.625799e+07 512 256 +5.575404e+07 1024 256 +### GPU: scaling test 32 +2.730935e+05 1 32 +5.353588e+05 2 32 +1.179560e+06 4 32 +2.202511e+06 8 32 +3.971147e+06 16 32 +8.376963e+06 32 32 +1.455166e+07 64 32 +2.952242e+07 128 32 +4.540844e+07 256 32 +4.671586e+07 512 32 +4.971688e+07 1024 32 +5.165973e+07 2048 32 +5.386422e+07 4096 32 +5.487265e+07 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +9.243679e+05 1 256 +9.852786e+05 2 256 +1.021348e+06 4 256 +### CPU: scaling test 32 +1.016809e+06 1 32 +8.445055e+05 2 32 +9.845700e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.489316e+06 1 256 +1.586913e+06 2 256 +1.430959e+06 4 256 +### CPU: scaling test 32 +1.667709e+06 1 32 +1.336229e+06 2 32 +1.494088e+06 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.628634e+06 1 256 +2.644232e+06 2 256 +2.663047e+06 4 256 +### CPU: scaling test 32 +2.356407e+06 1 32 +2.491921e+06 2 32 +2.561998e+06 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.011800e+06 1 256 +3.045534e+06 2 256 +2.739301e+06 4 256 +### CPU: scaling test 32 +1.316222e+06 1 32 +1.911018e+06 2 32 +2.740313e+06 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.926826e+06 1 256 +1.820645e+06 2 256 +1.899254e+06 4 256 +### CPU: scaling test 32 +8.705825e+05 1 32 +1.101378e+06 2 32 +1.693816e+06 4 32 +========================================================================= + +TEST COMPLETED diff --git 
a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index 2396150f34..7c9c492237 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_08:54:52 +DATE: 2025-09-24_07:41:13 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.715157e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.495446e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.756115e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.556711e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.831304e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.057101e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.722104 sec -INFO: No Floating Point Exceptions have been reported - 2,722,047,064 cycles # 2.855 GHz - 4,240,638,296 instructions # 1.56 insn per cycle - 1.034081868 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.768807 sec + 2,919,942,359 cycles # 2.817 GHz + 4,594,739,998 instructions # 1.57 insn per cycle + 1.096295578 seconds time elapsed +......................................................................... 
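
The `runNcu` profile just below is where this PR's kernel restructuring becomes visible: the old monolithic `sigmaKin` kernel (166 registers per thread) is replaced by what appear to be per-diagram kernels plus a `color_sum_kernel`, at 52, 92 and 18 registers respectively. A rough, illustrative way to see why the register drop matters, assuming V100-class SM limits (65536 registers, 2048 resident threads) and ignoring register-allocation granularity:

```python
# Back-of-envelope illustration (not from this PR): how the lower
# launch__registers_per_thread values reported by ncu below raise the
# occupancy ceiling. Assumes V100-class SM limits (65536 registers,
# 2048 resident threads) and ignores register-allocation granularity.
SM_REGISTERS, SM_MAX_THREADS, WARP = 65536, 2048, 32

def occupancy_ceiling(regs_per_thread):
    # Threads per SM allowed by the register file, rounded down to whole warps
    by_regs = (SM_REGISTERS // regs_per_thread) // WARP * WARP
    return min(by_regs, SM_MAX_THREADS)

for name, regs in [('sigmaKin (old)', 166), ('diagram1', 52),
                   ('diagram2', 92), ('color_sum_kernel', 18)]:
    print(f'{name:18s} {regs:3d} regs/thread -> <= {occupancy_ceiling(regs)} threads/SM')
```

On these assumptions the old kernel tops out around 384 resident threads per SM, while the split kernels allow 1248, 704 and the full 2048 respectively.
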
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 52 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 92 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 18 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 Avg ME (F77/GPU) = 1.2828039868165201E-002 Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.013288e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
1.182482e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.182482e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.758023e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.130964e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.130964e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.662912 sec -INFO: No Floating Point Exceptions have been reported - 19,208,633,801 cycles # 2.880 GHz - 46,193,026,925 instructions # 2.40 insn per cycle - 6.677929994 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 463) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.865866 sec + 19,669,960,331 cycles # 2.864 GHz + 48,570,214,354 instructions # 2.47 insn per cycle + 6.870925701 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 436) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.534189e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.004053e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.004053e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.385509e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.759182e+06 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 1.759182e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.563122 sec -INFO: No Floating Point Exceptions have been reported - 13,135,626,695 cycles # 2.874 GHz - 31,728,680,952 instructions # 2.42 insn per cycle - 4.573724377 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1662) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.964307 sec + 14,204,686,514 cycles # 2.859 GHz + 35,715,585,672 instructions # 2.51 insn per cycle + 4.969392504 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1828) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.938790e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.711147e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.711147e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.974759e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.773943e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.773943e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.694529 sec -INFO: No Floating Point Exceptions have been 
reported - 10,256,024,954 cycles # 2.769 GHz - 19,694,743,800 instructions # 1.92 insn per cycle - 3.707450749 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1909) (512y: 0) (512z: 0) +TOTAL : 3.601424 sec + 9,848,259,104 cycles # 2.732 GHz + 18,994,035,594 instructions # 1.93 insn per cycle + 3.606544786 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1839) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.944800e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.743029e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.743029e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.049473e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.929995e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.929995e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.690433 sec -INFO: No Floating Point Exceptions have been reported - 10,133,821,420 cycles # 2.743 GHz - 19,357,887,145 instructions # 1.91 insn per cycle - 3.703105135 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1647) 
(512y: 180) (512z: 0) +TOTAL : 3.484592 sec + 9,443,607,033 cycles # 2.707 GHz + 18,622,238,372 instructions # 1.97 insn per cycle + 3.489655116 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1716) (512y: 30) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.663763e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.201339e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.201339e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.507592e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.925987e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.925987e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.240790 sec -INFO: No Floating Point Exceptions have been reported - 8,791,817,571 cycles # 2.072 GHz - 15,864,118,825 instructions # 1.80 insn per cycle - 4.252718180 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 871) (512y: 156) (512z: 1258) +TOTAL : 4.589973 sec + 9,300,481,242 cycles # 2.025 GHz + 14,435,776,634 instructions # 1.55 insn per cycle + 4.595177363 seconds time elapsed +=Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 1096) (512y: 40) (512z: 1204) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt index 97960252e7..bd44162217 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,252 +10,220 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_09:36:32 +DATE: 2025-09-24_08:54:03 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 12 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.729675e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.983590e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.983590e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.265806e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.377961e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.377961e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.228883 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 7,241,513,211 cycles # 2.923 GHz - 12,978,693,777 instructions # 1.79 insn per cycle - 2.533005072 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge +TOTAL : 2.391774 sec + 7,602,481,238 cycles # 2.845 GHz + 13,479,408,447 instructions # 1.77 insn per cycle + 2.730155096 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram1": launch__registers_per_thread 52 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "diagram2": launch__registers_per_thread 92 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 18 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 Avg ME (F77/GPU) = 1.2828039868165201E-002 Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 9.954014e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.154803e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.154803e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.445576e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.088395e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.088395e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.972350 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 20,384,148,235 cycles # 2.919 GHz - 46,410,615,309 instructions # 2.28 insn per cycle - 6.984536194 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 463) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 7.289356 sec + 20,861,496,468 cycles # 2.861 GHz + 48,776,484,238 instructions # 2.34 insn per cycle + 7.296242635 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 436) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.493408e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.921090e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.921090e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.325510e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.664094e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.664094e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.877492 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 14,402,886,877 cycles # 2.946 GHz - 32,567,021,239 instructions # 2.26 insn per cycle - 4.890045852 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1662) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.384133 sec + 15,443,300,297 cycles # 2.865 GHz + 36,497,013,571 instructions # 2.36 insn per cycle + 5.391017672 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1828) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.864025e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.539449e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.539449e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.847889e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.533992e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.533992e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.048395 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 11,503,225,226 cycles # 2.834 GHz - 21,048,377,803 instructions # 1.83 insn per cycle - 4.060868426 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1909) (512y: 0) (512z: 0) +TOTAL : 4.043561 sec + 11,058,209,453 cycles # 2.731 GHz + 20,294,980,684 instructions # 1.84 insn per cycle + 4.050288399 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1839) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.889652e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.596697e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.596697e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.944286e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.700153e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.700153e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.001389 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 11,334,121,636 cycles # 2.824 GHz - 20,717,870,984 instructions # 1.83 insn per cycle - 4.014529771 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1647) (512y: 180) (512z: 0) +TOTAL : 3.878834 sec + 10,652,009,033 cycles # 2.742 GHz + 19,921,987,104 instructions # 1.87 insn per cycle + 3.885646041 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1716) (512y: 30) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.585647e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.044820e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.044820e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.429194e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.797955e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.797955e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.655129 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 10,329,600,614 cycles # 2.214 GHz - 17,028,538,054 instructions # 1.65 insn per cycle - 4.667149794 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 871) (512y: 156) (512z: 1258) +TOTAL : 5.047263 sec + 10,578,425,795 cycles # 2.094 GHz + 15,527,346,588 instructions # 1.47 insn per cycle + 5.054089505 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1096) (512y: 40) (512z: 1204) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt index a07615eec8..58c909e9f4 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_09:48:44 +DATE: 2025-09-24_09:13:18 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 12 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.479194e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.613891e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.774308e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.363761e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.973740e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.252514e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 1.350685 sec -INFO: No Floating Point Exceptions have been reported - 4,619,154,070 cycles # 2.910 GHz - 7,244,933,472 
instructions # 1.57 insn per cycle - 1.645096659 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common -==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 1.467223 sec + 4,932,154,711 cycles # 2.846 GHz + 7,711,580,022 instructions # 1.56 insn per cycle + 1.792606628 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common +==PROF== Profiling "diagram1": launch__registers_per_thread 52 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 92 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 18 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 Avg ME (F77/GPU) = 1.2828039868165201E-002 Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: 
FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.031231e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.202853e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.202853e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.789468e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.133945e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.133945e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 6.892564 sec -INFO: No Floating Point Exceptions have been reported - 20,216,212,113 cycles # 2.933 GHz - 46,211,289,901 instructions # 2.29 insn per cycle - 6.898049528 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 463) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 7.217356 sec + 20,725,832,251 cycles # 2.870 GHz + 48,670,692,155 instructions # 2.35 insn per cycle + 7.222545153 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 436) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.575355e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.054940e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.054940e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.391545e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.764575e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.764575e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.814476 sec -INFO: No Floating Point Exceptions have been reported - 14,161,512,947 cycles # 2.938 GHz - 31,718,115,030 instructions # 2.24 insn per cycle - 4.820285845 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1662) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.317482 sec + 15,270,332,925 cycles # 2.870 GHz + 35,716,070,561 instructions # 2.34 insn per cycle + 5.322758311 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1828) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.990481e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.780031e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.780031e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.981842e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.797171e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.797171e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.993697 sec -INFO: No Floating Point Exceptions have been reported - 11,344,220,574 cycles # 2.837 GHz - 19,628,934,109 instructions # 1.73 insn per cycle - 3.999571252 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1909) (512y: 0) (512z: 0) +TOTAL : 3.963389 sec + 10,924,799,733 cycles # 2.754 GHz + 18,895,724,466 instructions # 1.73 insn per cycle + 3.968762806 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1839) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.024448e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.841239e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.841239e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.089877e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.986900e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.986900e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.944999 sec -INFO: No Floating Point Exceptions have been reported - 11,153,243,188 cycles # 2.824 GHz - 19,098,861,484 instructions # 1.71 insn per cycle - 3.950731996 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1647) (512y: 180) (512z: 0) +TOTAL : 3.804740 sec + 10,498,672,406 cycles # 2.756 GHz + 18,322,189,412 instructions # 1.75 insn per cycle + 3.809974126 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1716) (512y: 30) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
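The counter lines printed after each runExe let the derived figures be re-checked: the GHz annotation is cycles per elapsed second, and "insn per cycle" is instructions divided by cycles. A short sketch with the 512y numbers copied from the log above (for these single-process runs the elapsed time reproduces the quoted frequency):

# Re-derive the annotations on the perf-counter lines (512y run above).
# All three inputs are copied verbatim from the log.
cycles       = 10_498_672_406
instructions = 18_322_189_412
elapsed_s    = 3.809974126

print(f"{cycles / elapsed_s / 1e9:.3f} GHz")          # 2.756 GHz
print(f"{instructions / cycles:.2f} insn per cycle")  # 1.75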
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.731970e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.289397e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.289397e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.508177e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.928540e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.928540e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.507668 sec -INFO: No Floating Point Exceptions have been reported - 9,996,448,485 cycles # 2.215 GHz - 15,693,646,767 instructions # 1.57 insn per cycle - 4.513790217 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 871) (512y: 156) (512z: 1258) +TOTAL : 4.968681 sec + 10,404,827,448 cycles # 2.093 GHz + 14,135,132,724 instructions # 1.36 insn per cycle + 4.974084771 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1096) (512y: 40) (512z: 1204) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt index cf4e1a1e41..c37a287617 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_09:45:58 +DATE: 2025-09-24_09:09:19 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 12 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 12 --curhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.516686e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.553796e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.802555e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.361627e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.035071e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.318529e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.994565 sec -INFO: No Floating Point Exceptions have been reported - 3,557,200,491 cycles # 2.898 GHz - 7,056,373,361 
instructions # 1.98 insn per cycle - 1.285636058 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst -==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 1.100243 sec + 3,874,724,836 cycles # 2.841 GHz + 7,570,184,430 instructions # 1.95 insn per cycle + 1.422395260 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst +==PROF== Profiling "diagram1": launch__registers_per_thread 52 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 92 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 18 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 Avg ME (F77/GPU) = 1.2828039868165201E-002 Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: 
FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.036397e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.208868e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.208868e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.804396e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.134896e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.134896e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.480716 sec -INFO: No Floating Point Exceptions have been reported - 19,050,518,676 cycles # 2.938 GHz - 46,087,808,907 instructions # 2.42 insn per cycle - 6.486425223 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 463) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.834548 sec + 19,642,417,445 cycles # 2.872 GHz + 48,568,603,522 instructions # 2.47 insn per cycle + 6.839870974 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 436) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.562645e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.044042e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.044042e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.392923e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.767493e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.767493e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.441615 sec -INFO: No Floating Point Exceptions have been reported - 13,100,732,544 cycles # 2.946 GHz - 31,624,731,275 instructions # 2.41 insn per cycle - 4.447190414 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1662) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.939023 sec + 14,201,994,217 cycles # 2.873 GHz + 35,713,557,240 instructions # 2.51 insn per cycle + 4.944238524 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1828) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.962342e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.741135e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.741135e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.988977e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.808357e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.808357e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.617520 sec -INFO: No Floating Point Exceptions have been reported - 10,105,971,200 cycles # 2.790 GHz - 19,587,417,861 instructions # 1.94 insn per cycle - 3.623303854 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1909) (512y: 0) (512z: 0) +TOTAL : 3.580138 sec + 9,843,208,156 cycles # 2.746 GHz + 18,992,852,718 instructions # 1.93 insn per cycle + 3.585425477 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1839) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.035108e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.854302e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.854302e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.090358e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.998949e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.998949e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.498153 sec -INFO: No Floating Point Exceptions have been reported - 9,879,352,969 cycles # 2.820 GHz - 19,249,039,766 instructions # 1.95 insn per cycle - 3.504047287 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1647) (512y: 180) (512z: 0) +TOTAL : 3.424021 sec + 9,418,501,088 cycles # 2.747 GHz + 18,610,322,009 instructions # 1.98 insn per cycle + 3.429438977 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1716) (512y: 30) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.738426e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.300548e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.300548e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.510092e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.930355e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.930355e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.029997 sec -INFO: No Floating Point Exceptions have been reported - 8,617,786,478 cycles # 2.136 GHz - 15,755,373,979 instructions # 1.83 insn per cycle - 4.035885525 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 871) (512y: 156) (512z: 1258) +TOTAL : 4.586814 sec + 9,309,449,457 cycles # 2.028 GHz + 14,433,282,109 instructions # 1.55 insn per cycle + 4.591899749 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1096) (512y: 40) (512z: 1204) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt index 23a95e9b43..5bde08df61 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,235 +10,216 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_09:43:10 +DATE: 2025-09-24_09:05:24 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.035607e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.566958e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.715605e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.425623e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.947117e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.223633e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 1.900996 sec -INFO: No Floating Point Exceptions have been reported - 6,141,367,935 cycles # 2.877 GHz - 11,470,611,621 instructions # 1.87 insn per cycle - 2.190401749 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst +TOTAL : 2.024476 sec + 6,554,480,812 cycles # 2.850 GHz + 12,028,784,772 instructions # 1.84 insn per cycle + 2.355414793 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram1": launch__registers_per_thread 52 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "diagram2": launch__registers_per_thread 92 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 18 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
 Avg ME (C++/GPU) = 1.282804e-02
 Avg ME (F77/GPU) = 1.2828039868165201E-002
 Relative difference = 1.0277080522138477e-08
 OK (relative difference <= 5E-3)
 =========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd0/check_hip.exe
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 1.040250e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.212161e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.212161e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.805260e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.135528e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.135528e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 6.458941 sec
-INFO: No Floating Point Exceptions have been reported
- 19,062,791,283 cycles # 2.949 GHz
- 46,091,693,422 instructions # 2.42 insn per cycle
- 6.464859061 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 463) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 6.833543 sec
+ 19,634,743,501 cycles # 2.872 GHz
+ 48,568,052,404 instructions # 2.47 insn per cycle
+ 6.838759660 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 436) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039868164916E-002
 Relative difference = 1.0277102699700292e-08
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.576646e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.057103e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.057103e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.393394e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.766191e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.766191e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 4.406292 sec
-INFO: No Floating Point Exceptions have been reported
- 12,965,800,121 cycles # 2.939 GHz
- 31,623,980,844 instructions # 2.44 insn per cycle
- 4.412202935 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1662) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 4.937510 sec
+ 14,192,824,862 cycles # 2.872 GHz
+ 35,714,478,545 instructions # 2.52 insn per cycle
+ 4.942916419 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 1828) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039868164916E-002
 Relative difference = 1.0277102699700292e-08
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.982815e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.782156e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.782156e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.977434e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.796954e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.796954e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.583602 sec
-INFO: No Floating Point Exceptions have been reported
- 10,107,254,042 cycles # 2.816 GHz
- 19,587,412,579 instructions # 1.94 insn per cycle
- 3.589639966 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1909) (512y: 0) (512z: 0)
+TOTAL : 3.598950 sec
+ 9,864,189,293 cycles # 2.738 GHz
+ 18,993,120,819 instructions # 1.93 insn per cycle
+ 3.604424478 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1839) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039868165088E-002
 Relative difference = 1.0277089312025782e-08
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.036151e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.856576e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.856576e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.084517e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.984691e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.984691e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.497964 sec
-INFO: No Floating Point Exceptions have been reported
- 9,879,922,849 cycles # 2.820 GHz
- 19,260,007,955 instructions # 1.95 insn per cycle
- 3.503929332 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1647) (512y: 180) (512z: 0)
+TOTAL : 3.431804 sec
+ 9,421,776,417 cycles # 2.742 GHz
+ 18,621,468,870 instructions # 1.98 insn per cycle
+ 3.437118206 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1716) (512y: 30) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039868165088E-002
 Relative difference = 1.0277089312025782e-08
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.741980e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.303561e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.303561e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.507112e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.920261e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.920261e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 4.020543 sec
-INFO: No Floating Point Exceptions have been reported
- 8,613,807,526 cycles # 2.140 GHz
- 15,755,294,312 instructions # 1.83 insn per cycle
- 4.026429840 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 871) (512y: 156) (512z: 1258)
+TOTAL : 4.592567 sec
+ 9,283,183,314 cycles # 2.020 GHz
+ 14,433,736,941 instructions # 1.55 insn per cycle
+ 4.597725180 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1096) (512y: 40) (512z: 1204)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039868165088E-002
 Relative difference = 1.0277089312025782e-08
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt
index 25ac5b33ed..389d0ed02a 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
 BACKEND=cpp512y (was cppauto)
 OMPFLAGS=
 FPTYPE='m'
@@ -7,236 +10,215 @@ HELINL='0'
 HRDCOD='0'
 HASCURAND=hasCurand
 HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
 Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
 make: Nothing to be done for 'gtestlibs'.
 make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2024-10-06_08:55:23
+DATE: 2025-09-24_07:41:50
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.275982e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.504846e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.746692e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.491499e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.666545e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.879724e+07 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 0.698739 sec
-INFO: No Floating Point Exceptions have been reported
- 2,671,543,996 cycles # 2.868 GHz
- 4,201,680,962 instructions # 1.57 insn per cycle
- 1.042000131 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 154
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.765930 sec
+ 2,948,048,535 cycles # 2.831 GHz
+ 4,629,826,040 instructions # 1.57 insn per cycle
+ 1.098401672 seconds time elapsed
+.........................................................................
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 52
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 80
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 18
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/runTest_cuda.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2
 Avg ME (C++/GPU) = 1.282804e-02
 Avg ME (F77/GPU) = 1.2828039868165201E-002
 Relative difference = 1.0277080522138477e-08
 OK (relative difference <= 5E-3)
 =========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd1/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl0_hrd1/check_hip.exe
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 1.030289e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.210430e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.210430e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.778603e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.132034e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.132034e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 6.580678 sec
-INFO: No Floating Point Exceptions have been reported
- 19,388,414,039 cycles # 2.942 GHz
- 46,168,116,276 instructions # 2.38 insn per cycle
- 6.592554583 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 452) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 6.849593 sec
+ 19,698,450,548 cycles # 2.874 GHz
+ 48,569,342,520 instructions # 2.47 insn per cycle
+ 6.854647316 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 426) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039868164916E-002
 Relative difference = 1.0277102699700292e-08
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.571872e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.069657e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.069657e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.390958e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.763981e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.763981e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 4.454497 sec
-INFO: No Floating Point Exceptions have been reported
- 13,123,917,893 cycles # 2.941 GHz
- 31,665,954,915 instructions # 2.41 insn per cycle
- 4.468095413 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1648) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 4.945278 sec
+ 14,201,944,495 cycles # 2.870 GHz
+ 35,713,685,555 instructions # 2.51 insn per cycle
+ 4.950471310 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 1818) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039868164916E-002
 Relative difference = 1.0277102699700292e-08
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.982748e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.777393e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.777393e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.940285e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.776799e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.776799e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.618164 sec
-INFO: No Floating Point Exceptions have been reported
- 10,210,665,805 cycles # 2.814 GHz
- 19,682,748,403 instructions # 1.93 insn per cycle
- 3.629801888 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1894) (512y: 0) (512z: 0)
+TOTAL : 3.658331 sec
+ 9,992,108,985 cycles # 2.728 GHz
+ 18,993,420,215 instructions # 1.90 insn per cycle
+ 3.663464040 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1821) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039868165090E-002
 Relative difference = 1.0277089176796747e-08
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.010638e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.831487e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.831487e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.098194e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.007315e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.007315e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.575425 sec
-INFO: No Floating Point Exceptions have been reported
- 10,055,677,244 cycles # 2.805 GHz
- 19,379,411,405 instructions # 1.93 insn per cycle
- 3.588891240 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1636) (512y: 178) (512z: 0)
+TOTAL : 3.410724 sec
+ 9,387,426,669 cycles # 2.749 GHz
+ 18,611,041,074 instructions # 1.98 insn per cycle
+ 3.415936625 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1696) (512y: 30) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039868165090E-002
 Relative difference = 1.0277089176796747e-08
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.768631e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.372427e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.372427e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.511079e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.926773e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.926773e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 4.003265 sec
-INFO: No Floating Point Exceptions have been reported
- 8,643,505,927 cycles # 2.154 GHz
- 15,697,303,734 instructions # 1.82 insn per cycle
- 4.017112338 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 833) (512y: 153) (512z: 1240)
+TOTAL : 4.580455 sec
+ 9,266,907,927 cycles # 2.022 GHz
+ 14,433,883,234 instructions # 1.56 insn per cycle
+ 4.585529206 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1076) (512y: 40) (512z: 1204)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
-Avg ME (F77/C++) = 1.2828039868165088E-002
-Relative difference = 1.0277089312025782e-08
+Avg ME (F77/C++) = 1.2828039868165090E-002
+Relative difference = 1.0277089176796747e-08
 OK (relative difference <= 5E-3)
 =========================================================================
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt
index 9d9181639f..6748decf76 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
 BACKEND=cpp512y (was cppauto)
 OMPFLAGS=
 FPTYPE='m'
@@ -7,233 +10,212 @@ HELINL='0'
 HRDCOD='0'
 HASCURAND=hasCurand
 HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
 Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
 make: Nothing to be done for 'gtestlibs'.
 make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2024-10-06_09:26:55
+DATE: 2025-09-24_08:43:46
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.029061e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.569612e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.860356e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.323552e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.961045e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.238128e+07 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 0.695056 sec
-INFO: No Floating Point Exceptions have been reported
- 2,704,879,803 cycles # 2.897 GHz
- 4,231,460,596 instructions # 1.56 insn per cycle
- 0.994220648 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 166
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.795139 sec
+ 3,018,785,307 cycles # 2.812 GHz
+ 4,761,719,789 instructions # 1.58 insn per cycle
+ 1.135917551 seconds time elapsed
+.........................................................................
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 52
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 92
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 18
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/runTest_cuda.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd0/fcheck_cuda.exe 2 64 2
 Avg ME (C++/GPU) = 1.282804e-02
 Avg ME (F77/GPU) = 1.2828039868165201E-002
 Relative difference = 1.0277080522138477e-08
 OK (relative difference <= 5E-3)
 =========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl1_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl1_hrd0/check_hip.exe
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 1.606609e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.069672e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.069672e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.081749e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.276169e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.276169e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 4.367451 sec
-INFO: No Floating Point Exceptions have been reported
- 12,912,062,009 cycles # 2.950 GHz
- 32,678,927,799 instructions # 2.53 insn per cycle
- 4.379017229 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 281) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 6.239151 sec
+ 17,816,549,785 cycles # 2.857 GHz
+ 42,851,983,164 instructions # 2.41 insn per cycle
+ 6.244587137 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 378) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039868164916E-002
 Relative difference = 1.0277102699700292e-08
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.977635e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.819919e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.819919e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.563944e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.054942e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.054942e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.639596 sec
-INFO: No Floating Point Exceptions have been reported
- 10,716,876,159 cycles # 2.936 GHz
- 25,005,426,831 instructions # 2.33 insn per cycle
- 3.651343591 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1246) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 4.448005 sec
+ 12,743,189,658 cycles # 2.863 GHz
+ 30,332,835,629 instructions # 2.38 insn per cycle
+ 4.453458721 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 1656) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039868164916E-002
 Relative difference = 1.0277102699700292e-08
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.209379e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.259757e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.259757e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.047332e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.929117e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.929117e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.300079 sec
-INFO: No Floating Point Exceptions have been reported
- 9,398,178,742 cycles # 2.838 GHz
- 16,938,114,674 instructions # 1.80 insn per cycle
- 3.311853262 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1599) (512y: 0) (512z: 0)
+TOTAL : 3.490476 sec
+ 9,538,570,843 cycles # 2.729 GHz
+ 17,003,613,920 instructions # 1.78 insn per cycle
+ 3.496120642 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1745) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.277311e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.397001e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.397001e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.199768e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.213460e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.213460e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.212263 sec -INFO: No Floating Point Exceptions have been reported - 9,139,009,296 cycles # 2.835 GHz - 16,502,297,129 instructions # 1.81 insn per cycle - 3.223908096 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1355) (512y: 139) (512z: 0) +TOTAL : 3.274613 sec + 8,968,933,674 cycles # 2.735 GHz + 16,216,764,925 instructions # 1.81 insn per cycle + 3.280120332 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1571) (512y: 20) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.921368e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.661482e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.661482e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.524186e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.949184e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.949184e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.727052 sec -INFO: No Floating Point Exceptions have been reported - 8,146,634,535 cycles # 2.180 GHz - 14,661,732,896 instructions # 1.80 insn per cycle - 3.738643291 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1003) (512y: 158) (512z: 946) +TOTAL : 4.546459 sec + 9,195,001,582 cycles # 2.021 GHz + 13,328,215,793 instructions # 1.45 insn per cycle + 4.551986341 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 850) (512y: 32) (512z: 1193) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt index abe54e8953..16f206046e 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_09:27:22 +DATE: 2025-09-24_08:44:21 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.921706e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.715910e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.877358e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.209909e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.602152e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.846272e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.678632 sec -INFO: No Floating Point Exceptions have been reported - 2,636,898,249 cycles # 2.884 GHz - 4,067,260,892 instructions # 1.54 insn per 
cycle - 0.973352356 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.791421 sec + 2,992,715,233 cycles # 2.825 GHz + 4,674,908,446 instructions # 1.56 insn per cycle + 1.118373682 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 52 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 80 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 18 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_d_inl1_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 Avg ME (F77/GPU) = 1.2828039868165201E-002 Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl1_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_d_inl1_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = 
SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.084164e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.941928e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.941928e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.088576e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.283204e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.283204e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.469758 sec -INFO: No Floating Point Exceptions have been reported - 10,217,900,291 cycles # 2.936 GHz - 25,614,437,724 instructions # 2.51 insn per cycle - 3.480862891 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 236) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.198929 sec + 17,796,756,971 cycles # 2.869 GHz + 42,710,158,130 instructions # 2.40 insn per cycle + 6.204593111 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 379) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.313032e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.558172e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.558172e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.587243e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.096649e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.096649e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.173341 sec -INFO: No Floating Point Exceptions have been reported - 9,354,473,123 cycles # 2.939 GHz - 21,650,720,885 instructions # 2.31 insn per cycle - 3.184272296 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1112) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.387338 sec + 12,598,423,061 cycles # 2.869 GHz + 29,976,674,000 instructions # 2.38 insn per cycle + 4.392925762 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1638) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.358550e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.604458e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.604458e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.057092e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.937431e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.937431e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.115986 sec -INFO: No Floating Point Exceptions have been reported - 8,850,186,465 cycles # 2.831 GHz - 16,062,849,181 instructions # 1.81 insn per cycle - 3.126797345 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1497) (512y: 0) (512z: 0) +TOTAL : 3.475327 sec + 9,528,175,479 cycles # 2.738 GHz + 16,858,022,582 instructions # 1.77 insn per cycle + 3.481000779 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1700) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.422935e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.724037e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.724037e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.212448e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.240328e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.240328e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.043301 sec -INFO: No Floating Point Exceptions have been reported - 8,651,791,606 cycles # 2.834 GHz - 15,666,461,627 instructions # 1.81 insn per cycle - 3.054177777 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1264) (512y: 141) (512z: 0) +TOTAL : 3.258089 sec + 8,944,166,150 cycles # 2.741 GHz + 16,099,275,495 instructions # 1.80 insn per cycle + 3.263523085 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1529) (512y: 20) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.052275e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.908416e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.908416e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.536323e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.970964e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.970964e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.514529 sec -INFO: No Floating Point Exceptions have been reported - 7,791,531,975 cycles # 2.211 GHz - 14,393,714,103 instructions # 1.85 insn per cycle - 3.525649878 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1031) (512y: 164) (512z: 876) +TOTAL : 4.514695 sec + 9,168,428,013 cycles # 2.029 GHz + 13,276,409,493 instructions # 1.45 insn per cycle + 4.520383256 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 812) (512y: 32) (512z: 1193) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.scaling new file mode 100644 index 0000000000..d210e40b75 --- /dev/null +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +DATE: 2025-09-24_08:16:33 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +1.995915e+06 1 256 +4.200509e+06 2 256 +8.083870e+06 4 256 +1.608305e+07 8 256 +3.127720e+07 16 256 +4.266755e+07 32 256 +7.804246e+07 64 256 +8.515747e+07 128 256 +9.772246e+07 256 256 +1.061822e+08 512 256 +1.118796e+08 1024 256 +### GPU: scaling test 32 +3.184492e+05 1 32 +6.285479e+05 2 32 +1.262626e+06 4 32 +2.123354e+06 8 32 +4.536835e+06 16 32 +8.483985e+06 32 32 +1.721211e+07 64 32 +3.215777e+07 128 32 +5.609384e+07 256 32 +7.417904e+07 512 32 +9.275336e+07 1024 32 +9.687981e+07 2048 32 +1.055152e+08 4096 32 +1.093946e+08 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.077237e+06 1 256 +1.027248e+06 2 256 +1.058976e+06 4 256 +### CPU: scaling test 32 +1.051456e+06 1 32 +1.083149e+06 2 32 +1.016704e+06 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.464572e+06 1 256 +2.473478e+06 2 256 +2.516181e+06 4 256 +### CPU: scaling test 32 +2.544529e+06 1 32 +1.386963e+06 2 32 +2.294853e+06 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.225139e+06 1 256 +4.673750e+06 2 256 +5.300893e+06 4 256 +### CPU: scaling test 32 +1.913876e+06 1 32 +2.472284e+06 2 32 +3.577518e+06 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +5.736952e+06 1 256 +4.935987e+06 2 256 +5.245257e+06 4 256 +### CPU: scaling test 32 +1.886125e+06 1 32 +4.425697e+06 2 32 +3.692805e+06 4 32 +========================================================================= +scalingTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.704132e+06 1 256 +3.924725e+06 2 256 +4.057052e+06 4 256 +### CPU: scaling test 32 +4.436434e+06 1 32 +4.954711e+06 2 32 +3.376418e+06 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index fa697401ba..2544ccf168 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,236 +10,215 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_08:56:56 +DATE: 2025-09-24_07:43:43 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.318402e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.547340e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.573294e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.831331e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.147275e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.192627e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.580539 sec -INFO: No Floating Point Exceptions have been reported - 2,318,735,379 cycles # 2.865 GHz - 3,612,120,055 instructions # 1.56 insn per cycle - 0.879357898 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 109 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.629958 sec + 2,533,063,587 cycles # 2.828 GHz + 3,919,306,559 instructions # 1.55 insn per cycle + 0.954740453 seconds time elapsed +......................................................................... 
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 40 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 48 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282802e-02 -Avg ME (F77/GPU) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 +Avg ME (F77/GPU) = 1.2828112132410752E-002 +Relative difference = 7.1821224749348815e-06 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no 
SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 1.072197e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.275533e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.275533e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.002594e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.174243e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.174243e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0
-TOTAL : 6.274259 sec
-INFO: No Floating Point Exceptions have been reported
- 18,464,131,410 cycles # 2.940 GHz
- 45,058,020,075 instructions # 2.44 insn per cycle
- 6.281329583 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 411) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 6.656687 sec
+ 19,080,722,877 cycles # 2.866 GHz
+ 47,603,966,972 instructions # 2.49 insn per cycle
+ 6.661349360 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 414) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.282804e-02
Avg ME (F77/C++) = 1.2828039854866802E-002
Relative difference = 1.1313746984080878e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.257463e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.446957e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.446957e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.997743e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.907583e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.907583e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0
-TOTAL : 3.180369 sec
-INFO: No Floating Point Exceptions have been reported
- 9,372,467,471 cycles # 2.941 GHz
- 22,319,965,268 instructions # 2.38 insn per cycle
- 3.189536232 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1954) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 3.532505 sec
+ 10,129,083,857 cycles # 2.864 GHz
+ 24,678,199,043 instructions # 2.44 insn per cycle
+ 3.538004837 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 2237) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.282804e-02
-Avg ME (F77/C++) = 1.2828039280066150E-002
-Relative difference = 5.612189004572479e-08
+Avg ME (F77/C++) = 1.2828039330997854E-002
+Relative difference = 5.215154825545255e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.408379e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.710073e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.710073e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.020145e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.374335e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.374335e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0
-TOTAL : 3.001033 sec
-INFO: No Floating Point Exceptions have been reported
- 8,493,792,111 cycles # 2.825 GHz
- 15,797,222,111 instructions # 1.86 insn per cycle
- 3.010052254 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2565) (512y: 0) (512z: 0)
+TOTAL : 2.458457 sec
+ 6,780,782,058 cycles # 2.757 GHz
+ 13,852,462,300 instructions # 2.04 insn per cycle
+ 2.463155894 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2178) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.282805e-02
-Avg ME (F77/C++) = 1.2828053255361738E-002
-Relative difference = 2.5376902468575066e-07
+Avg ME (F77/C++) = 1.2828053228076897E-002
+Relative difference = 2.5164205752759426e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.426130e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.768067e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.768067e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.125396e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.710524e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.710524e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0
-TOTAL : 2.984448 sec
-INFO: No Floating Point Exceptions have been reported
- 8,427,466,763 cycles # 2.818 GHz
- 15,640,000,146 instructions # 1.86 insn per cycle
- 2.993491493 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2462) (512y: 12) (512z: 0)
+TOTAL : 2.391515 sec
+ 6,609,679,244 cycles # 2.759 GHz
+ 13,637,397,629 instructions # 2.06 insn per cycle
+ 2.396879313 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2078) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.282805e-02
-Avg ME (F77/C++) = 1.2828053255361738E-002
-Relative difference = 2.5376902468575066e-07
+Avg ME (F77/C++) = 1.2828053228076897E-002
+Relative difference = 2.5164205752759426e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.427110e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.709739e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.709739e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.559405e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.068028e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.068028e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0
-TOTAL : 2.984139 sec
-INFO: No Floating Point Exceptions have been reported
- 6,725,622,216 cycles # 2.249 GHz
- 12,910,486,373 instructions # 1.92 insn per cycle
- 2.994013668 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1698) (512y: 16) (512z: 1440)
+TOTAL : 2.839980 sec
+ 6,297,156,841 cycles # 2.214 GHz
+ 11,613,908,330 instructions # 1.84 insn per cycle
+ 2.845523276 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1554) (512y: 0) (512z: 1309)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.282805e-02
-Avg ME (F77/C++) = 1.2828052585973637E-002
-Relative difference = 2.0158743040564767e-07
+Avg ME (F77/C++) = 1.2828052638724330E-002
+Relative difference = 2.0569956691141665e-07
OK (relative difference <= 5E-3)
=========================================================================
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt
index 9136826931..2d596cea9c 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
BACKEND=cpp512y (was cppauto)
OMPFLAGS=
FPTYPE='m'
@@ -7,255 +10,223 @@
HELINL='0'
HRDCOD='0'
HASCURAND=hasCurand
HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
make: Nothing to be done for 'gtestlibs'.
make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2024-10-06_09:37:07
+DATE: 2025-09-24_08:54:44
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 12 --bridge OMP=
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 12 --bridge OMP=
WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288)
-WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288)
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 7.256593e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.121486e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.121486e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.537867e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.260892e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.260892e+07 ) sec^-1
MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0
-TOTAL : 1.691319 sec
-INFO: No Floating Point Exceptions have been reported
-INFO: No Floating Point Exceptions have been reported
- 5,610,006,933 cycles # 2.911 GHz
- 10,218,919,767 instructions # 1.82 insn per cycle
- 1.984436466 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge
+TOTAL : 1.800983 sec
+ 5,877,978,226 cycles # 2.848 GHz
+ 10,604,089,001 instructions # 1.80 insn per cycle
+ 2.121531688 seconds time elapsed
+.........................................................................
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge
WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288)
-WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288)
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 109
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram1": launch__registers_per_thread 40
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
+WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
+==PROF== Profiling "diagram2": launch__registers_per_thread 48
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
+WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2
Avg ME (C++/GPU) = 1.282802e-02
-Avg ME (F77/GPU) = 1.2828112125134794E-002
-Relative difference = 7.1815552823662555e-06
+Avg ME (F77/GPU) = 1.2828112132410752E-002
+Relative difference = 7.1821224749348815e-06
OK (relative difference <= 5E-3)
=========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-WARNING! Instantiate host Bridge (nevt=524288)
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 1.060836e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.248384e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.248384e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.857283e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.151088e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.151088e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0
-TOTAL : 6.418392 sec
-INFO: No Floating Point Exceptions have been reported
-INFO: No Floating Point Exceptions have been reported
- 18,916,088,440 cycles # 2.945 GHz
- 45,156,650,630 instructions # 2.39 insn per cycle
- 6.425565221 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 411) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 6.870612 sec
+ 19,709,472,888 cycles # 2.867 GHz
+ 47,719,122,117 instructions # 2.42 insn per cycle
+ 6.877276018 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 414) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.282804e-02
Avg ME (F77/C++) = 1.2828039854866802E-002
Relative difference = 1.1313746984080878e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-WARNING! Instantiate host Bridge (nevt=524288)
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.163234e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.223206e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.223206e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.916311e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.741610e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.741610e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0
-TOTAL : 3.414716 sec
-INFO: No Floating Point Exceptions have been reported
-INFO: No Floating Point Exceptions have been reported
- 10,073,193,872 cycles # 2.945 GHz
- 23,610,645,909 instructions # 2.34 insn per cycle
- 3.421707000 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1954) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 3.794312 sec
+ 10,876,376,303 cycles # 2.862 GHz
+ 25,980,668,027 instructions # 2.39 insn per cycle
+ 3.800880902 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 2237) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.282804e-02
-Avg ME (F77/C++) = 1.2828039280066150E-002
-Relative difference = 5.612189004572479e-08
+Avg ME (F77/C++) = 1.2828039330997854E-002
+Relative difference = 5.215154825545255e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-WARNING! Instantiate host Bridge (nevt=524288)
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.302389e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.467769e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.467769e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.867500e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.917538e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.917538e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0
-TOTAL : 3.241454 sec
-INFO: No Floating Point Exceptions have been reported
-INFO: No Floating Point Exceptions have been reported
- 9,215,037,610 cycles # 2.837 GHz
- 16,874,646,512 instructions # 1.83 insn per cycle
- 3.248598680 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2565) (512y: 0) (512z: 0)
+TOTAL : 2.702979 sec
+ 7,478,549,066 cycles # 2.761 GHz
+ 14,938,162,567 instructions # 2.00 insn per cycle
+ 2.709529742 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2178) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.282805e-02
-Avg ME (F77/C++) = 1.2828053255361738E-002
-Relative difference = 2.5376902468575066e-07
+Avg ME (F77/C++) = 1.2828053228076897E-002
+Relative difference = 2.5164205752759426e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-WARNING! Instantiate host Bridge (nevt=524288)
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.316990e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.533576e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.533576e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.974162e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.230625e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.230625e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0
-TOTAL : 3.224710 sec
-INFO: No Floating Point Exceptions have been reported
-INFO: No Floating Point Exceptions have been reported
- 9,166,209,661 cycles # 2.837 GHz
- 16,710,284,997 instructions # 1.82 insn per cycle
- 3.231713030 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2462) (512y: 12) (512z: 0)
+TOTAL : 2.623723 sec
+ 7,304,049,284 cycles # 2.778 GHz
+ 14,726,726,525 instructions # 2.02 insn per cycle
+ 2.630269239 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2078) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.282805e-02
-Avg ME (F77/C++) = 1.2828053255361738E-002
-Relative difference = 2.5376902468575066e-07
+Avg ME (F77/C++) = 1.2828053228076897E-002
+Relative difference = 2.5164205752759426e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-WARNING! Instantiate host Bridge (nevt=524288)
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --bridge OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.333210e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.469405e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.469405e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.438003e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.763905e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.763905e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0
-TOTAL : 3.205451 sec
-INFO: No Floating Point Exceptions have been reported
-INFO: No Floating Point Exceptions have been reported
- 7,432,634,328 cycles # 2.315 GHz
- 14,074,642,515 instructions # 1.89 insn per cycle
- 3.212353581 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1698) (512y: 16) (512z: 1440)
+TOTAL : 3.095394 sec
+ 7,047,884,451 cycles # 2.273 GHz
+ 11,323,410,817 instructions # 1.81 insn per cycle
+ 3.101946602 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1554) (512y: 0) (512z: 1309)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.282805e-02
-Avg ME (F77/C++) = 1.2828052585973637E-002
-Relative difference = 2.0158743040564767e-07
+Avg ME (F77/C++) = 1.2828052638724330E-002
+Relative difference = 2.0569956691141665e-07
OK (relative difference <= 5E-3)
=========================================================================
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt
index 3c8228d85b..0c2fad6cf5 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
BACKEND=cpp512y (was cppauto)
OMPFLAGS=
FPTYPE='m'
@@ -7,236 +10,215 @@
HELINL='0'
HRDCOD='0'
HASCURAND=hasCurand
HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
make: Nothing to be done for 'gtestlibs'.
make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2024-10-06_09:49:18
+DATE: 2025-09-24_09:13:58
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 12 --common OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 12 --common OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 1.233592e+08 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.244967e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.184868e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.359598e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.131777e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.186519e+08 ) sec^-1
MeanMatrixElemValue = ( 1.371863e-02 +- 3.269951e-06 ) GeV^0
-TOTAL : 1.220966 sec
-INFO: No Floating Point Exceptions have been reported
- 4,183,681,416 cycles # 2.867 GHz
- 6,662,508,205 instructions # 1.59 insn per cycle
- 1.516447212 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 109
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 1.272802 sec
+ 4,352,229,517 cycles # 2.840 GHz
+ 6,942,191,399 instructions # 1.60 insn per cycle
+ 1.588683612 seconds time elapsed
+.........................................................................
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common
+==PROF== Profiling "diagram1": launch__registers_per_thread 40
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 48
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2
Avg ME (C++/GPU) = 1.282802e-02
-Avg ME (F77/GPU) = 1.2828112125134794E-002
-Relative difference = 7.1815552823662555e-06
+Avg ME (F77/GPU) = 1.2828112132410752E-002
+Relative difference = 7.1821224749348815e-06
OK (relative difference <= 5E-3)
=========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 1.080178e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.275874e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.275874e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.002593e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.173999e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.173999e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0
-TOTAL : 6.537773 sec
-INFO: No Floating Point Exceptions have been reported
- 19,269,764,932 cycles # 2.946 GHz
- 45,190,617,795 instructions # 2.35 insn per cycle
- 6.543013626 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 411) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 7.000453 sec
+ 20,098,479,833 cycles # 2.870 GHz
+ 47,788,547,293 instructions # 2.38 insn per cycle
+ 7.005540408 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 414) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.282804e-02
Avg ME (F77/C++) = 1.2828039854866802E-002
Relative difference = 1.1313746984080878e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.263942e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.453881e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.453881e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.994510e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.919066e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.919066e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0
-TOTAL : 3.487545 sec
-INFO: No Floating Point Exceptions have been reported
- 10,298,424,695 cycles # 2.949 GHz
- 22,355,388,978 instructions # 2.17 insn per cycle
- 3.493059791 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1954) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 3.881179 sec
+ 11,140,538,881 cycles # 2.868 GHz
+ 24,759,550,994 instructions # 2.22 insn per cycle
+ 3.886423602 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 2237) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.282804e-02
-Avg ME (F77/C++) = 1.2828039280066150E-002
-Relative difference = 5.612189004572479e-08
+Avg ME (F77/C++) = 1.2828039330997854E-002
+Relative difference = 5.215154825545255e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.406924e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.701531e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.701531e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.035056e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.439882e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.439882e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0
-TOTAL : 3.322767 sec
-INFO: No Floating Point Exceptions have been reported
- 9,443,809,325 cycles # 2.838 GHz
- 15,664,102,195 instructions # 1.66 insn per cycle
- 3.328357008 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2565) (512y: 0) (512z: 0)
+TOTAL : 2.795045 sec
+ 7,779,386,268 cycles # 2.779 GHz
+ 13,765,224,553 instructions # 1.77 insn per cycle
+ 2.800026900 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2178) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.282805e-02
-Avg ME (F77/C++) = 1.2828053255361738E-002
-Relative difference = 2.5376902468575066e-07
+Avg ME (F77/C++) = 1.2828053228076897E-002
+Relative difference = 2.5164205752759426e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.446360e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.803645e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.803645e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.124621e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.719457e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.719457e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0
-TOTAL : 3.287328 sec
-INFO: No Floating Point Exceptions have been reported
- 9,371,124,961 cycles # 2.847 GHz
- 15,299,944,141 instructions # 1.63 insn per cycle
- 3.292839828 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2462) (512y: 12) (512z: 0)
+TOTAL : 2.744834 sec
+ 7,614,459,531 cycles # 2.770 GHz
+ 13,352,845,576 instructions # 1.75 insn per cycle
+ 2.749922574 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2078) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.282805e-02
-Avg ME (F77/C++) = 1.2828053255361738E-002
-Relative difference = 2.5376902468575066e-07
+Avg ME (F77/C++) = 1.2828053228076897E-002
+Relative difference = 2.5164205752759426e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --common OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.466708e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.777222e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.777222e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.573065e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.100608e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.100608e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0
-TOTAL : 3.269312 sec
-INFO: No Floating Point Exceptions have been reported
- 7,659,274,117 cycles # 2.340 GHz
- 12,573,895,764 instructions # 1.64 insn per cycle
- 3.274843213 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1698) (512y: 16) (512z: 1440)
+TOTAL : 3.180653 sec
+ 7,312,343,971 cycles # 2.297 GHz
+ 11,323,410,817 instructions # 1.55 insn per cycle
+ 3.185765312 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1554) (512y: 0) (512z: 1309)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.282805e-02
-Avg ME (F77/C++) = 1.2828052585973637E-002
-Relative difference = 2.0158743040564767e-07
+Avg ME (F77/C++) = 1.2828052638724330E-002
+Relative difference = 2.0569956691141665e-07
OK (relative difference <= 5E-3)
=========================================================================
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt
index 7f30dafdfd..912bbfb1fe 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
BACKEND=cpp512y (was cppauto)
OMPFLAGS=
FPTYPE='m'
@@ -7,236 +10,215 @@
HELINL='0'
HRDCOD='0'
HASCURAND=hasCurand
HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
make: Nothing to be done for 'gtestlibs'.
make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_09:46:29 +DATE: 2025-09-24_09:09:56 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 12 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 12 --curhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.282321e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.333955e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.369324e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.383056e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.141193e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.196294e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.867789 sec -INFO: No Floating Point Exceptions have been reported - 3,167,199,789 cycles # 2.899 GHz - 6,506,216,930 instructions # 2.05 insn per cycle - 1.149942283 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst -==PROF== Profiling "sigmaKin": launch__registers_per_thread 109 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.930327 sec + 3,356,938,087 cycles # 2.833 GHz + 6,810,076,509 instructions # 2.03 insn per cycle + 1.242803897 seconds time elapsed +......................................................................... 
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst +==PROF== Profiling "diagram1": launch__registers_per_thread 40 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 48 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282802e-02 -Avg ME (F77/GPU) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 +Avg ME (F77/GPU) = 1.2828112132410752E-002 +Relative difference = 7.1821224749348815e-06 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = 
SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.085219e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.281583e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.281583e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.003317e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.175897e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.175897e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.166056 sec -INFO: No Floating Point Exceptions have been reported - 18,234,644,828 cycles # 2.955 GHz - 45,008,398,832 instructions # 2.47 insn per cycle - 6.171760600 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 411) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.648480 sec + 19,101,943,380 cycles # 2.872 GHz + 47,607,190,590 instructions # 2.49 insn per cycle + 6.653713406 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 414) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039854866802E-002 Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.256894e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.462086e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) 
= ( 3.462086e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.002586e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.913648e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.913648e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.159870 sec -INFO: No Floating Point Exceptions have been reported - 9,347,982,513 cycles # 2.954 GHz - 22,275,896,372 instructions # 2.38 insn per cycle - 3.165402193 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1954) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.525616 sec + 10,120,619,743 cycles # 2.867 GHz + 24,678,227,678 instructions # 2.44 insn per cycle + 3.530850186 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2237) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039280066150E-002 -Relative difference = 5.612189004572479e-08 +Avg ME (F77/C++) = 1.2828039330997854E-002 +Relative difference = 5.215154825545255e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.410366e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.712636e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.712636e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.030738e+06 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 5.420413e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.420413e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.983322 sec -INFO: No Floating Point Exceptions have been reported - 8,463,194,185 cycles # 2.833 GHz - 15,755,395,679 instructions # 1.86 insn per cycle - 2.988746216 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2565) (512y: 0) (512z: 0) +TOTAL : 2.452320 sec + 6,811,344,176 cycles # 2.773 GHz + 13,854,164,765 instructions # 2.03 insn per cycle + 2.457451990 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2178) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 +Avg ME (F77/C++) = 1.2828053228076897E-002 +Relative difference = 2.5164205752759426e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.454105e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.801490e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.801490e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.150584e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.795938e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] 
(3a) = ( 5.795938e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.933599 sec -INFO: No Floating Point Exceptions have been reported - 8,319,397,972 cycles # 2.832 GHz - 15,593,973,322 instructions # 1.87 insn per cycle - 2.939101584 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2462) (512y: 12) (512z: 0) +TOTAL : 2.374395 sec + 6,591,965,501 cycles # 2.771 GHz + 13,642,240,290 instructions # 2.07 insn per cycle + 2.379679090 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2078) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 +Avg ME (F77/C++) = 1.2828053228076897E-002 +Relative difference = 2.5164205752759426e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --curhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.469652e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.768397e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.768397e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.575282e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.089269e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.089269e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 
3.270342e-06 ) GeV^0 -TOTAL : 2.922384 sec -INFO: No Floating Point Exceptions have been reported - 6,636,368,959 cycles # 2.267 GHz - 12,865,256,567 instructions # 1.94 insn per cycle - 2.927905791 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1698) (512y: 16) (512z: 1440) +TOTAL : 2.822750 sec + 6,268,766,960 cycles # 2.218 GHz + 11,613,826,957 instructions # 1.85 insn per cycle + 2.828064655 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1554) (512y: 0) (512z: 1309) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052585973637E-002 -Relative difference = 2.0158743040564767e-07 +Avg ME (F77/C++) = 1.2828052638724330E-002 +Relative difference = 2.0569956691141665e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt index e2ecb9b5fd..0463fd0c8a 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,238 +10,219 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_09:43:42 +DATE: 2025-09-24_09:06:05 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 12 --rmbhst OMP= WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.979354e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.311142e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.251832e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.112738e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.116501e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.169674e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 -TOTAL : 1.493081 sec -INFO: No Floating Point Exceptions have been reported - 5,009,051,141 cycles # 2.916 GHz - 9,204,393,500 instructions # 1.84 insn per cycle - 1.774548277 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst +TOTAL : 1.583873 sec + 5,234,229,196 cycles # 2.846 GHz + 9,549,805,518 instructions # 1.82 insn per cycle + 1.896392697 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "sigmaKin": launch__registers_per_thread 109 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram1": launch__registers_per_thread 40 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "diagram2": launch__registers_per_thread 48 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282802e-02 -Avg ME (F77/GPU) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 +Avg ME (F77/GPU) = 1.2828112132410752E-002 +Relative difference = 7.1821224749348815e-06 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.077151e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.276926e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.276926e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.002184e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.173237e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.173237e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 
) GeV^0 -TOTAL : 6.211513 sec -INFO: No Floating Point Exceptions have been reported - 18,299,232,198 cycles # 2.944 GHz - 45,005,768,829 instructions # 2.46 insn per cycle - 6.217115880 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 411) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.663048 sec + 19,100,313,751 cycles # 2.867 GHz + 47,607,216,586 instructions # 2.49 insn per cycle + 6.669193607 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 414) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039854866802E-002 Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.268380e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.460029e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.460029e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.002286e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.921109e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.921109e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.148224 sec -INFO: No Floating Point Exceptions have been reported - 9,293,240,022 cycles # 2.948 GHz - 22,275,553,802 instructions # 2.40 insn per cycle 
- 3.153857529 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1954) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.525124 sec + 10,134,330,212 cycles # 2.871 GHz + 24,677,538,220 instructions # 2.44 insn per cycle + 3.530302654 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2237) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039280066150E-002 -Relative difference = 5.612189004572479e-08 +Avg ME (F77/C++) = 1.2828039330997854E-002 +Relative difference = 5.215154825545255e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.395770e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.675698e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.675698e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.022088e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.399929e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.399929e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.994942 sec -INFO: No Floating Point Exceptions have been reported - 8,447,981,393 cycles # 2.817 GHz - 15,754,576,494 instructions # 1.86 insn per cycle - 3.000419944 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) 
(avx2: 2565) (512y: 0) (512z: 0) +TOTAL : 2.457277 sec + 6,793,614,730 cycles # 2.760 GHz + 13,854,443,684 instructions # 2.04 insn per cycle + 2.462524480 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2178) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 +Avg ME (F77/C++) = 1.2828053228076897E-002 +Relative difference = 2.5164205752759426e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.419912e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.751119e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.751119e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.144844e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.775686e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.775686e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.971435 sec -INFO: No Floating Point Exceptions have been reported - 8,357,800,499 cycles # 2.808 GHz - 15,594,139,449 instructions # 1.87 insn per cycle - 2.977163262 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2462) (512y: 12) (512z: 0) +TOTAL : 2.377475 sec + 6,596,944,634 cycles 
# 2.770 GHz + 13,642,560,323 instructions # 2.07 insn per cycle + 2.382462188 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2078) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 +Avg ME (F77/C++) = 1.2828053228076897E-002 +Relative difference = 2.5164205752759426e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 12 --rmbhst OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.455367e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.730952e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.730952e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.581109e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.109083e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.109083e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.933639 sec -INFO: No Floating Point Exceptions have been reported - 6,669,997,057 cycles # 2.271 GHz - 12,867,351,511 instructions # 1.93 insn per cycle - 2.938851588 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1698) (512y: 16) (512z: 1440) +TOTAL : 2.817117 sec + 6,281,174,018 cycles # 2.226 GHz + 11,613,521,783 instructions # 1.85 insn per cycle + 
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1554) (512y: 0) (512z: 1309)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.282805e-02
-Avg ME (F77/C++) = 1.2828052585973637E-002
-Relative difference = 2.0158743040564767e-07
+Avg ME (F77/C++) = 1.2828052638724330E-002
+Relative difference = 2.0569956691141665e-07
 OK (relative difference <= 5E-3)
 =========================================================================
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt
index 9e915de581..895d76fe30 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
 BACKEND=cpp512y (was cppauto)
 OMPFLAGS=
 FPTYPE='m'
@@ -7,236 +10,215 @@ HELINL='0'
 HRDCOD='0'
 HASCURAND=hasCurand
 HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
 Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
 make: Nothing to be done for 'gtestlibs'.
 make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2024-10-06_08:57:23
+DATE: 2025-09-24_07:44:14
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 1.310707e+08 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.890276e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.030864e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.359826e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.021023e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.056478e+08 ) sec^-1
 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0
-TOTAL : 0.577005 sec
-INFO: No Floating Point Exceptions have been reported
- 2,340,023,876 cycles # 2.880 GHz
- 3,638,052,704 instructions # 1.55 insn per cycle
- 0.886148283 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 79
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.631340 sec
+ 2,532,233,300 cycles # 2.830 GHz
+ 3,970,550,794 instructions # 1.57 insn per cycle
+ 0.951025749 seconds time elapsed
+.........................................................................
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 40
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 40
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/runTest_cuda.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2
 Avg ME (C++/GPU) = 1.282802e-02
-Avg ME (F77/GPU) = 1.2828112125134794E-002
-Relative difference = 7.1815552823662555e-06
+Avg ME (F77/GPU) = 1.2828112132410752E-002
+Relative difference = 7.1821224749348815e-06
 OK (relative difference <= 5E-3)
 =========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd1/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl0_hrd1/check_hip.exe
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 1.074456e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.269687e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.269687e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.000634e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.172912e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.172912e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0
-TOTAL : 6.244443 sec
-INFO: No Floating Point Exceptions have been reported
- 18,377,232,357 cycles # 2.941 GHz
- 45,025,324,964 instructions # 2.45 insn per cycle
- 6.253002386 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 397) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 6.666988 sec
+ 19,115,716,767 cycles # 2.867 GHz
+ 47,603,231,786 instructions # 2.49 insn per cycle
+ 6.671571279 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 400) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039854866802E-002
 Relative difference = 1.1313746984080878e-08
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.251309e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.439034e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.439034e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.995961e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.913123e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.913123e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0
-TOTAL : 3.184453 sec
-INFO: No Floating Point Exceptions have been reported
- 9,383,250,913 cycles # 2.940 GHz
- 22,280,358,761 instructions # 2.37 insn per cycle
- 3.194375038 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1935) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 3.534067 sec
+ 10,148,756,573 cycles # 2.868 GHz
+ 24,677,225,829 instructions # 2.43 insn per cycle
+ 3.539611093 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 2223) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
-Avg ME (F77/C++) = 1.2828039280066150E-002
-Relative difference = 5.612189004572479e-08
+Avg ME (F77/C++) = 1.2828039330997854E-002
+Relative difference = 5.215154825545255e-08
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.403334e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.700033e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.700033e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.044338e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.456169e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.456169e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0
-TOTAL : 3.004384 sec
-INFO: No Floating Point Exceptions have been reported
- 8,513,730,278 cycles # 2.827 GHz
- 15,791,909,505 instructions # 1.85 insn per cycle
- 3.013283160 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2540) (512y: 0) (512z: 0)
+TOTAL : 2.445579 sec
+ 6,775,772,019 cycles # 2.766 GHz
+ 13,851,023,344 instructions # 2.04 insn per cycle
+ 2.450357325 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2150) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.282805e-02
-Avg ME (F77/C++) = 1.2828053255361738E-002
-Relative difference = 2.5376902468575066e-07
+Avg ME (F77/C++) = 1.2828053228076897E-002
+Relative difference = 2.5164205752759426e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.444935e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.799463e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.799463e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.145847e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.775490e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.775490e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0
-TOTAL : 2.959849 sec
-INFO: No Floating Point Exceptions have been reported
- 8,395,161,248 cycles # 2.830 GHz
- 15,634,676,534 instructions # 1.86 insn per cycle
- 2.968734397 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2438) (512y: 10) (512z: 0)
+TOTAL : 2.378684 sec
+ 6,605,443,400 cycles # 2.772 GHz
+ 13,641,787,060 instructions # 2.07 insn per cycle
+ 2.384046135 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2048) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.282805e-02
-Avg ME (F77/C++) = 1.2828053255361738E-002
-Relative difference = 2.5376902468575066e-07
+Avg ME (F77/C++) = 1.2828053228076897E-002
+Relative difference = 2.5164205752759426e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.454317e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.767111e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.767111e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.572831e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.087699e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.087699e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0
-TOTAL : 2.955150 sec
-INFO: No Floating Point Exceptions have been reported
- 6,701,822,130 cycles # 2.263 GHz
- 12,886,633,037 instructions # 1.92 insn per cycle
- 2.963931226 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1669) (512y: 16) (512z: 1427)
+TOTAL : 2.828624 sec
+ 6,290,623,334 cycles # 2.221 GHz
+ 11,612,779,469 instructions # 1.85 insn per cycle
+ 2.833896598 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1524) (512y: 0) (512z: 1309)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.282805e-02
-Avg ME (F77/C++) = 1.2828052564145764E-002
-Relative difference = 1.9988585667912256e-07
+Avg ME (F77/C++) = 1.2828052638724330E-002
+Relative difference = 2.0569956691141665e-07
 OK (relative difference <= 5E-3)
 =========================================================================
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt
index 1fabc46555..c7a807c8b1 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
 BACKEND=cpp512y (was cppauto)
 OMPFLAGS=
 FPTYPE='m'
@@ -7,236 +10,215 @@ HELINL='0'
 HRDCOD='0'
 HASCURAND=hasCurand
 HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
 Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
 make: Nothing to be done for 'gtestlibs'.
 make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2024-10-06_09:27:46
+DATE: 2025-09-24_08:44:57
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 1.309386e+08 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.516838e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.621181e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.330596e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.123948e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.177971e+08 ) sec^-1
 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0
-TOTAL : 0.581015 sec
-INFO: No Floating Point Exceptions have been reported
- 2,337,717,863 cycles # 2.893 GHz
- 3,666,959,770 instructions # 1.57 insn per cycle
- 0.866189287 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 109
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.636606 sec
+ 2,522,939,985 cycles # 2.819 GHz
+ 3,982,740,936 instructions # 1.58 insn per cycle
+ 0.955499259 seconds time elapsed
+.........................................................................
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 40
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 48
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/runTest_cuda.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd0/fcheck_cuda.exe 2 64 2
 Avg ME (C++/GPU) = 1.282802e-02
-Avg ME (F77/GPU) = 1.2828112125134794E-002
-Relative difference = 7.1815552823662555e-06
+Avg ME (F77/GPU) = 1.2828112132410752E-002
+Relative difference = 7.1821224749348815e-06
 OK (relative difference <= 5E-3)
 =========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl1_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl1_hrd0/check_hip.exe
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 1.617887e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.109367e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.109367e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.115184e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.332866e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.332866e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0
-TOTAL : 4.275933 sec
-INFO: No Floating Point Exceptions have been reported
- 12,412,341,686 cycles # 2.900 GHz
- 32,352,281,163 instructions # 2.61 insn per cycle
- 4.283041784 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 290) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 6.020502 sec
+ 17,288,157,810 cycles # 2.870 GHz
+ 43,112,026,250 instructions # 2.49 insn per cycle
+ 6.025655106 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 421) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
-Avg ME (F77/C++) = 1.2828039840314887E-002
-Relative difference = 1.244813035273009e-08
+Avg ME (F77/C++) = 1.2828039854866802E-002
+Relative difference = 1.1313746984080878e-08
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.642717e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.471061e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.471061e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.160220e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.264225e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.264225e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0
-TOTAL : 2.775228 sec
-INFO: No Floating Point Exceptions have been reported
- 8,161,861,180 cycles # 2.934 GHz
- 18,732,698,985 instructions # 2.30 insn per cycle
- 2.782796507 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1534) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 3.296022 sec
+ 9,453,035,600 cycles # 2.865 GHz
+ 22,254,994,493 instructions # 2.35 insn per cycle
+ 3.301184224 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 2055) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
-Avg ME (F77/C++) = 1.2828039283704129E-002
-Relative difference = 5.583829420356249e-08
+Avg ME (F77/C++) = 1.2828039330997854E-002
+Relative difference = 5.215154825545255e-08
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.771950e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.635210e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.635210e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.147395e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.794407e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.794407e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0
-TOTAL : 2.653741 sec
-INFO: No Floating Point Exceptions have been reported
- 7,565,022,779 cycles # 2.844 GHz
- 14,293,093,213 instructions # 1.89 insn per cycle
- 2.661141426 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2234) (512y: 0) (512z: 0)
+TOTAL : 2.374550 sec
+ 6,593,450,184 cycles # 2.772 GHz
+ 12,791,908,931 instructions # 1.94 insn per cycle
+ 2.379938233 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1955) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.282805e-02
-Avg ME (F77/C++) = 1.2828053244447801E-002
-Relative difference = 2.5291823782248813e-07
+Avg ME (F77/C++) = 1.2828053249904769E-002
+Relative difference = 2.5334363125411937e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.799741e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.762487e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.762487e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.287553e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.275464e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.275464e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0
-TOTAL : 2.634363 sec
-INFO: No Floating Point Exceptions have been reported
- 7,504,285,407 cycles # 2.842 GHz
- 13,994,355,792 instructions # 1.86 insn per cycle
- 2.641913370 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2087) (512y: 3) (512z: 0)
+TOTAL : 2.292894 sec
+ 6,391,667,866 cycles # 2.782 GHz
+ 12,449,157,303 instructions # 1.95 insn per cycle
+ 2.298194570 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1817) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.282805e-02
-Avg ME (F77/C++) = 1.2828053244447801E-002
-Relative difference = 2.5291823782248813e-07
+Avg ME (F77/C++) = 1.2828053249904769E-002
+Relative difference = 2.5334363125411937e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.507958e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.890935e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.890935e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.627084e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.223395e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.223395e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0
-TOTAL : 2.900923 sec
-INFO: No Floating Point Exceptions have been reported
- 6,641,718,947 cycles # 2.284 GHz
- 13,481,348,782 instructions # 2.03 insn per cycle
- 2.908502130 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2073) (512y: 1) (512z: 1201)
+TOTAL : 2.779339 sec
+ 6,199,704,984 cycles # 2.227 GHz
+ 11,064,577,835 instructions # 1.78 insn per cycle
+ 2.784571382 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1175) (512y: 0) (512z: 1267)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.282805e-02
-Avg ME (F77/C++) = 1.2828052562326775E-002
-Relative difference = 1.997440588685788e-07
+Avg ME (F77/C++) = 1.2828052595068584E-002
+Relative difference = 2.0229641945836646e-07
 OK (relative difference <= 5E-3)
 =========================================================================
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt
index ddc690e546..217e31f76a 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
 BACKEND=cpp512y (was cppauto)
 OMPFLAGS=
 FPTYPE='m'
@@ -7,236 +10,215 @@ HELINL='0'
 HRDCOD='0'
 HASCURAND=hasCurand
 HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
 Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
 make: Nothing to be done for 'gtestlibs'.
 make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2024-10-06_09:28:10
+DATE: 2025-09-24_08:45:27
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=1]
Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 1.311525e+08 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.893939e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.130206e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.890476e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.949885e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.037118e+08 ) sec^-1
MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0
-TOTAL : 0.580982 sec
-INFO: No Floating Point Exceptions have been reported
- 2,326,498,884 cycles # 2.887 GHz
- 3,595,400,053 instructions # 1.55 insn per cycle
- 0.865243472 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 79
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.647806 sec
+ 2,556,041,252 cycles # 2.823 GHz
+ 3,995,093,038 instructions # 1.56 insn per cycle
+ 0.964151817 seconds time elapsed
+.........................................................................
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 40
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 40
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/runTest_cuda.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_f_inl1_hrd1/fcheck_cuda.exe 2 64 2
Avg ME (C++/GPU) = 1.282802e-02
-Avg ME (F77/GPU) = 1.2828112125134794E-002
-Relative difference = 7.1815552823662555e-06
+Avg ME (F77/GPU) = 1.2828112132410752E-002
+Relative difference = 7.1821224749348815e-06
OK (relative difference <= 5E-3)
=========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl1_hrd1/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_f_inl1_hrd1/check_hip.exe
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 2.199736e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.210916e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.210916e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.130758e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.354394e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.354394e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0
-TOTAL : 3.247253 sec
-INFO: No Floating Point Exceptions have been reported
- 9,460,485,661 cycles # 2.907 GHz
- 25,749,028,052 instructions # 2.72 insn per cycle
- 3.254869601 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 243) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 5.943582 sec
+ 17,072,881,150 cycles # 2.871 GHz
+ 42,123,289,663 instructions # 2.47 insn per cycle
+ 5.948757824 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 386) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.282804e-02
-Avg ME (F77/C++) = 1.2828039838495897E-002
-Relative difference = 1.2589928273811243e-08
+Avg ME (F77/C++) = 1.2828039854866802E-002
+Relative difference = 1.1313746984080878e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.982142e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.480555e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.480555e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.181674e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.305409e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.305409e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0
-TOTAL : 2.498717 sec
-INFO: No Floating Point Exceptions have been reported
- 7,385,528,393 cycles # 2.949 GHz
- 16,812,365,380 instructions # 2.28 insn per cycle
- 2.506313604 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1311) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 3.266811 sec
+ 9,398,737,688 cycles # 2.874 GHz
+ 22,004,483,420 instructions # 2.34 insn per cycle
+ 3.271919523 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 2024) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.282804e-02
-Avg ME (F77/C++) = 1.2828039280066150E-002
-Relative difference = 5.612189004572479e-08
+Avg ME (F77/C++) = 1.2828039330997854E-002
+Relative difference = 5.215154825545255e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.917887e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.065921e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.065921e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.139939e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.774728e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.774728e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0
-TOTAL : 2.542096 sec
-INFO: No Floating Point Exceptions have been reported
- 7,260,793,625 cycles # 2.848 GHz
- 13,703,433,227 instructions # 1.89 insn per cycle
- 2.549878549 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2067) (512y: 0) (512z: 0)
+TOTAL : 2.381212 sec
+ 6,591,131,680 cycles # 2.763 GHz
+ 12,722,446,784 instructions # 1.93 insn per cycle
+ 2.386360015 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1900) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.282805e-02
-Avg ME (F77/C++) = 1.2828053220800939E-002
-Relative difference = 2.5107486628541925e-07
+Avg ME (F77/C++) = 1.2828053264456685E-002
+Relative difference = 2.5447801373846945e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.947392e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.166768e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.166768e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.297363e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.310074e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.310074e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0
-TOTAL : 2.537410 sec
-INFO: No Floating Point Exceptions have been reported
- 7,253,478,894 cycles # 2.851 GHz
- 13,505,585,795 instructions # 1.86 insn per cycle
- 2.545044336 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1935) (512y: 7) (512z: 0)
+TOTAL : 2.286925 sec
+ 6,344,873,106 cycles # 2.769 GHz
+ 12,379,326,234 instructions # 1.95 insn per cycle
+ 2.292021921 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1762) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.282805e-02
-Avg ME (F77/C++) = 1.2828053220800939E-002
-Relative difference = 2.5107486628541925e-07
+Avg ME (F77/C++) = 1.2828053264456685E-002
+Relative difference = 2.5447801373846945e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.612725e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.139660e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.139660e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.631209e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.246457e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.246457e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0
-TOTAL : 2.798296 sec
-INFO: No Floating Point Exceptions have been reported
- 6,447,529,861 cycles # 2.298 GHz
- 13,215,855,857 instructions # 2.05 insn per cycle
- 2.806480502 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2039) (512y: 2) (512z: 1081)
+TOTAL : 2.775253 sec
+ 6,198,296,648 cycles # 2.230 GHz
+ 11,034,050,222 instructions # 1.78 insn per cycle
+ 2.780602894 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1127) (512y: 0) (512z: 1264)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.282805e-02
-Avg ME (F77/C++) = 1.2828052536860923E-002
-Relative difference = 1.977588895209662e-07
+Avg ME (F77/C++) = 1.2828052620534436E-002
+Relative difference = 2.0428158880597908e-07
OK (relative difference <= 5E-3)
=========================================================================
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.scaling
new file mode 100644
index 0000000000..f6314e4b0d
--- /dev/null
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.scaling
@@ -0,0 +1,137 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
+
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
+BACKEND=cpp512y (was cppauto)
+OMPFLAGS=
+FPTYPE='m'
+HELINL='0'
+HRDCOD='0'
+HASCURAND=hasCurand
+HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
+Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
+make: Nothing to be done for 'gtestlibs'.
+
+make USEBUILDDIR=1 BACKEND=cuda
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+
+make USEBUILDDIR=1 BACKEND=cppnone
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+
+make USEBUILDDIR=1 BACKEND=cppsse4
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+
+make USEBUILDDIR=1 BACKEND=cppavx2
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+
+make USEBUILDDIR=1 BACKEND=cpp512y
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+
+make USEBUILDDIR=1 BACKEND=cpp512z
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+
+DATE: 2025-09-24_08:16:12
+
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/check_cuda.exe
+### GPU: scaling test 256
+1.719679e+06 1 256
+3.735508e+06 2 256
+7.293707e+06 4 256
+1.360978e+07 8 256
+2.685655e+07 16 256
+4.383703e+07 32 256
+4.540856e+07 64 256
+4.950126e+07 128 256
+5.330683e+07 256 256
+5.631852e+07 512 256
+5.657810e+07 1024 256
+### GPU: scaling test 32
+3.006756e+05 1 32
+6.061812e+05 2 32
+1.142113e+06 4 32
+2.056274e+06 8 32
+3.922139e+06 16 32
+8.505902e+06 32 32
+1.586109e+07 64 32
+2.915572e+07 128 32
+4.360323e+07 256 32
+4.646757e+07 512 32
+4.804819e+07 1024 32
+5.207345e+07 2048 32
+5.450514e+07 4096 32
+5.497340e+07 8192 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd0/check_hip.exe
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+9.775358e+05 1 256
+9.632153e+05 2 256
+1.050268e+06 4 256
+### CPU: scaling test 32
+9.325368e+05 1 32
+5.475093e+05 2 32
+1.048519e+06 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+1.655394e+06 1 256
+1.632898e+06 2 256
+1.597165e+06 4 256
+### CPU: scaling test 32
+1.718859e+06 1 32
+1.408389e+06 2 32
+1.612985e+06 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+2.947611e+06 1 256
+2.989467e+06 2 256
+2.930193e+06 4 256
+### CPU: scaling test 32
+2.413455e+06 1 32
+2.808742e+06 2 32
+2.888087e+06 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+2.652163e+06 1 256
+3.010862e+06 2 256
+3.216323e+06 4 256
+### CPU: scaling test 32
+2.566570e+06 1 32
+2.745603e+06 2 32
+3.084635e+06 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+1.854589e+06 1 256
+1.864150e+06 2 256
+1.940552e+06 4 256
+### CPU: scaling test 32
+1.188707e+06 1 32
+1.480762e+06 2 32
+1.728398e+06 4 32
+=========================================================================
+
+TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt
index 8e00f9820d..61423e69fe 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
BACKEND=cpp512y (was cppauto)
OMPFLAGS=
FPTYPE='m'
@@ -7,233 +10,212 @@ HELINL='0'
HRDCOD='0'
HASCURAND=hasCurand
HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
make: Nothing to be done for 'gtestlibs'.
make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2024-10-06_08:55:54
+DATE: 2025-09-24_07:42:27
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 7.055673e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.658424e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.851508e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.608721e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.979951e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.218009e+07 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 0.678781 sec
-INFO: No Floating Point Exceptions have been reported
- 2,628,768,348 cycles # 2.876 GHz
- 4,103,389,790 instructions # 1.56 insn per cycle
- 1.044225431 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 166
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.760428 sec
+ 2,903,810,857 cycles # 2.835 GHz
+ 4,637,834,708 instructions # 1.60 insn per cycle
+ 1.081616389 seconds time elapsed
+.........................................................................
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 52
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 92
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/runTest_cuda.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2
Avg ME (C++/GPU) = 1.282804e-02
Avg ME (F77/GPU) = 1.2828039901590279E-002
Relative difference = 7.671454200650844e-09
OK (relative difference <= 5E-3)
=========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd0/check_hip.exe
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 1.011376e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.175905e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.175905e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.680084e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.118472e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.118472e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 6.671756 sec
-INFO: No Floating Point Exceptions have been reported
- 19,661,999,702 cycles # 2.943 GHz
- 46,395,546,050 instructions # 2.36 insn per cycle
- 6.683261433 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 466) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 6.917089 sec
+ 19,877,690,092 cycles # 2.872 GHz
+ 48,645,342,412 instructions # 2.45 insn per cycle
+ 6.922363261 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 436) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.282804e-02
Avg ME (F77/C++) = 1.2828039952548879E-002
Relative difference = 3.6990156841838714e-09
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.631538e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.161697e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.161697e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.456171e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.869473e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.869473e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 4.308846 sec
-INFO: No Floating Point Exceptions have been reported
- 12,713,127,116 cycles # 2.944 GHz
- 31,571,564,120 instructions # 2.48 insn per cycle
- 4.322869208 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1731) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 4.742828 sec
+ 13,638,147,605 cycles # 2.873 GHz
+ 35,198,976,108 instructions # 2.58 insn per cycle
+ 4.748594895 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 1837) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.282804e-02
Avg ME (F77/C++) = 1.2828039952548879E-002
Relative difference = 3.6990156841838714e-09
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.963768e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.746755e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.746755e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.037647e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.902549e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.902549e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.649356 sec
-INFO: No Floating Point Exceptions have been reported
- 10,294,572,937 cycles # 2.814 GHz
- 19,586,622,017 instructions # 1.90 insn per cycle
- 3.662289672 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2045) (512y: 0) (512z: 0)
+TOTAL : 3.506028 sec
+ 9,616,164,024 cycles # 2.739 GHz
+ 18,694,922,165 instructions # 1.94 insn per cycle
+ 3.511882526 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1921) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.282804e-02
Avg ME (F77/C++) = 1.2828039951670679E-002
Relative difference = 3.767475112924841e-09
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.001856e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.818080e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.818080e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.162409e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.131666e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.131666e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.589952 sec
-INFO: No Floating Point Exceptions have been reported
- 10,108,826,304 cycles # 2.808 GHz
- 19,396,692,714 instructions # 1.92 insn per cycle
- 3.602641354 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1799) (512y: 188) (512z: 0)
+TOTAL : 3.325331 sec
+ 9,147,653,531 cycles # 2.747 GHz
+ 18,304,911,433 instructions # 2.00 insn per cycle
+ 3.331056713 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1798) (512y: 30) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.282804e-02
Avg ME (F77/C++) = 1.2828039951670679E-002
Relative difference = 3.767475112924841e-09
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 12 OMP=
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.801777e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.420597e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.420597e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.529227e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.959559e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.959559e+06 ) sec^-1
MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.939332 sec
-INFO: No Floating Point Exceptions have been reported
- 8,555,878,739 cycles # 2.167 GHz
- 15,216,666,169 instructions # 1.78 insn per cycle
- 3.951287451 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 966) (512y: 154) (512z: 1330)
+TOTAL : 4.530804 sec
+ 9,205,145,539 cycles # 2.030 GHz
+ 14,289,269,729 instructions # 1.55 insn per cycle
+ 4.536419747 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1189) (512y: 40) (512z: 1209)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.282804e-02
Avg ME (F77/C++) = 1.2828039951670679E-002
Relative difference = 3.767475112924841e-09
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt
index 0283d4438d..baab060c45 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
BACKEND=cpp512y (was cppauto)
OMPFLAGS=
FPTYPE='m'
@@ -7,233 +10,212 @@ HELINL='0'
HRDCOD='0'
HASCURAND=hasCurand
HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
make: Nothing to be done for 'gtestlibs'.
make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-10-06_08:56:25 +DATE: 2025-09-24_07:43:04 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.048170e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.671940e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.867900e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.499014e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.687427e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.902055e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.677955 sec -INFO: No Floating Point Exceptions have been reported - 2,610,429,449 cycles # 2.847 GHz - 4,074,904,816 instructions # 1.56 insn per 
cycle - 1.028610198 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.769299 sec + 2,927,220,098 cycles # 2.834 GHz + 4,600,691,724 instructions # 1.57 insn per cycle + 1.091683776 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 52 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 80 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 16 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.282804e-02 Avg ME (F77/GPU) = 1.2828039901590279E-002 Relative difference = 7.671454200650844e-09 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.hip_m_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = 
SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.012794e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.178467e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.178467e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.625912e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.111912e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.111912e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.662178 sec -INFO: No Floating Point Exceptions have been reported - 19,608,707,308 cycles # 2.939 GHz - 46,331,953,932 instructions # 2.36 insn per cycle - 6.674225175 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 453) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.954919 sec + 19,977,286,420 cycles # 2.871 GHz + 48,643,751,431 instructions # 2.43 insn per cycle + 6.960548322 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 426) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
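The '-p 2048 256 12' arguments of the check executables appear to request 2048 blocks (or SIMD pages) of 256 events for 12 iterations, about 6.3 million matrix elements per run; dividing by the reported wall time lands in the same ballpark as the EvtsPerSec lines. A back-of-the-envelope sketch, under that reading of the arguments:

# Back-of-the-envelope throughput, assuming '-p nblocks nthreads niterations'.
nblocks, nthreads, niterations = 2048, 256, 12
total_events = nblocks * nthreads * niterations  # 6,291,456 events
total_seconds = 6.954919                         # TOTAL of the none_m_inl0_hrd1 run
print(f"{total_events / total_seconds:.3e} events/s")
# ~9.0e+05, consistent with EvtsPerSec[Rmb+ME] = 9.6e+05 above
# (TOTAL also covers setup outside the timed sampling+ME section)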
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.631371e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.156116e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.156116e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.457537e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.870054e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.870054e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.305744 sec -INFO: No Floating Point Exceptions have been reported - 12,687,194,497 cycles # 2.940 GHz - 31,570,654,619 instructions # 2.49 insn per cycle - 4.317357131 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1724) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.739558 sec + 13,621,642,189 cycles # 2.873 GHz + 35,197,065,818 instructions # 2.58 insn per cycle + 4.744878748 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1827) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.951503e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.723168e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.723168e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.047862e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.919113e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.919113e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.669508 sec -INFO: No Floating Point Exceptions have been reported - 10,337,023,986 cycles # 2.809 GHz - 19,600,398,756 instructions # 1.90 insn per cycle - 3.680210311 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2036) (512y: 0) (512z: 0) +TOTAL : 3.489609 sec + 9,585,163,191 cycles # 2.743 GHz + 18,694,267,994 instructions # 1.95 insn per cycle + 3.495235656 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1903) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.000628e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.813640e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.813640e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.160459e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.127988e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.127988e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.591164 sec -INFO: No Floating Point Exceptions have been reported - 10,093,463,938 cycles # 2.804 GHz - 19,298,137,282 instructions # 1.91 insn per cycle - 3.601580555 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1766) (512y: 191) (512z: 0) +TOTAL : 3.326756 sec + 9,154,692,273 cycles # 2.749 GHz + 18,313,995,483 instructions # 2.00 insn per cycle + 3.332018076 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1778) (512y: 30) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
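The '=Symbols in CPPProcess_cpp.o=' lines tally the SIMD flavours of the instructions compiled into the matrix-element object file, which is how these logs corroborate that each backend really vectorised as requested (e.g. the 512y_m_inl0_hrd1 build above is dominated by avx2-class symbols). A crude approximation of such a tally, classifying disassembly lines by the widest SIMD register they touch (a simplified stand-in, not the script used to produce these logs):

import subprocess

def simd_register_counts(objfile):
    """Crude proxy: classify objdump lines by the widest SIMD register used."""
    asm = subprocess.run(["objdump", "-d", objfile],
                         capture_output=True, text=True, check=True).stdout
    counts = {"zmm": 0, "ymm": 0, "xmm": 0}  # widest register checked first
    for line in asm.splitlines():
        for reg in counts:
            if "%" + reg in line:
                counts[reg] += 1
                break
    return counts

print(simd_register_counts("CPPProcess_cpp.o"))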
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 12 OMP= +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.833398e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.483164e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.483164e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.531353e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.968559e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.968559e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.878021 sec -INFO: No Floating Point Exceptions have been reported - 8,399,559,009 cycles # 2.161 GHz - 15,073,176,103 instructions # 1.79 insn per cycle - 3.888708235 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 959) (512y: 155) (512z: 1296) +TOTAL : 4.525099 sec + 9,221,463,827 cycles # 2.036 GHz + 14,289,434,484 instructions # 1.55 insn per cycle + 4.530688944 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1169) (512y: 40) (512z: 1209) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
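Taken together, the hrd1 runs above show the usual SIMD ladder on this Xeon Silver 4216 host: roughly 1.7x (sse4), 2.6x (avx2) and 2.8x (512y) over the scalar build, with 512z falling back to about 1.8x at the lower measured AVX512 clock (~2.0 GHz vs ~2.9 GHz in the cycle counters). A small sketch computing those ratios from the logged EvtsPerSec[MatrixElems] values:

# Speedups over the scalar 'none' build, from the EvtsPerSec[MatrixElems]
# values logged above for the m_inl0_hrd1 configuration.
throughput = {
    "none": 1.111912e+06, "sse4": 1.870054e+06, "avx2": 2.919113e+06,
    "512y": 3.127988e+06, "512z": 1.968559e+06,
}
for backend, eps in throughput.items():
    print(f"{backend}: x{eps / throughput['none']:.2f}")
# none x1.00, sse4 x1.68, avx2 x2.63, 512y x2.81, 512z x1.77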
DEBUG: MEK (no multichannel) processed 512 events across 2 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 2 channels { 1 : 256, 2 : 256 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.scaling new file mode 100644 index 0000000000..ef99d18fee --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-09-24_08:16:54 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +7.891541e+05 1 256 +1.602544e+06 2 256 +3.143940e+06 4 256 +6.117505e+06 8 256 +1.006415e+07 16 256 +1.015524e+07 32 256 +9.869606e+06 64 256 +1.025593e+07 128 256 +1.045621e+07 256 256 +1.042857e+07 512 256 +1.045995e+07 1024 256 +### GPU: scaling test 32 +1.055482e+05 1 32 +2.141363e+05 2 32 +4.117622e+05 4 32 +8.094196e+05 8 32 +1.660171e+06 16 32 +3.251857e+06 32 32 +6.412223e+06 64 32 +1.071040e+07 128 32 +1.040890e+07 256 32 +9.723823e+06 512 32 +1.003785e+07 1024 32 +1.020642e+07 2048 32 +1.030114e+07 4096 32 +1.031146e+07 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.682639e+05 1 256 +1.744753e+05 2 256 +1.760606e+05 4 256 +### CPU: scaling test 32 +1.477098e+05 1 32 +1.544446e+05 2 32 +1.672586e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.878371e+05 1 256 +2.981327e+05 2 256 +2.993239e+05 4 256 +### CPU: scaling test 32 +2.562542e+05 1 32 +2.522585e+05 2 32 +2.636593e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +5.122900e+05 1 256 +5.152419e+05 2 256 +5.057715e+05 4 256 +### CPU: scaling test 32 +4.742919e+05 1 32 +5.186007e+05 2 32 +5.219122e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +4.908879e+05 1 256 +4.835691e+05 2 256 +5.081233e+05 4 256 +### CPU: scaling test 32 +5.020002e+05 1 32 +4.694044e+05 2 32 +5.132071e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 
256 +3.468816e+05 1 256 +3.430523e+05 2 256 +3.448120e+05 4 256 +### CPU: scaling test 32 +3.438642e+05 1 32 +3.306639e+05 2 32 +3.398922e+05 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index 0abecbd859..12554813ea 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
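The new .scaling files above list one 'throughput nblocks nthreads' triple per line, swept over launch configurations; on this V100S the double-precision gg_ttx run saturates near 1.0e+07 events/s once about 16 blocks of 256 threads (or 128 blocks of 32) are in flight, while the CPU backends are flat from the first row. A sketch of reading such a table and locating the saturation point, assuming that three-column format:

# Parse a .scaling table and report the smallest launch configuration
# that reaches 95% of peak throughput (file name from the diff above).
rows = []
with open("log_ggtt_mad_d_inl0_hrd0.scaling") as f:
    for line in f:
        parts = line.split()
        if len(parts) == 3 and parts[0][0].isdigit():  # 'eps nblocks nthreads'
            rows.append((float(parts[0]), int(parts[1]), int(parts[2])))
peak = max(eps for eps, _, _ in rows)
for eps, nblocks, nthreads in rows:
    if eps >= 0.95 * peak:
        print(f"saturates near {nblocks} x {nthreads} at {eps:.3e} events/s")
        break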
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_08:57:50 +DATE: 2025-09-24_07:44:46 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.424562e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.378226e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.000814e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.404580e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.049333e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.056492e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.532719 sec -INFO: No Floating Point Exceptions have been reported - 2,198,564,055 cycles # 2.860 GHz - 3,137,529,593 instructions # 1.43 insn per cycle - 0.850854779 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.628724 sec + 2,721,117,292 cycles # 2.810 GHz + 4,339,502,991 instructions # 1.59 insn per cycle + 1.033254992 seconds time elapsed +......................................................................... 
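The perf-style counters in these logs are internally consistent: instructions divided by cycles reproduces the printed IPC, and cycles divided by the quoted clock gives the CPU-busy time, slightly below the elapsed time for GPU runs since the host sits partly idle. A quick consistency check on the cuda_d_inl0_hrd0 numbers above:

# Consistency check on the perf counters of the cuda_d_inl0_hrd0 run above.
cycles = 2_721_117_292
instructions = 4_339_502_991
clock_ghz = 2.810
elapsed_s = 1.033254992
print(f"IPC      = {instructions / cycles:.2f}")         # 1.59, as printed
print(f"CPU busy = {cycles / (clock_ghz * 1e9):.3f} s")  # ~0.968 s < 1.033 s elapsed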
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 97 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 70 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 28 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 -Avg ME (F77/GPU) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 +Avg ME (F77/GPU) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.821542e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
1.869016e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.869016e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.729417e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.772089e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.772089e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.902388 sec -INFO: No Floating Point Exceptions have been reported - 17,373,663,633 cycles # 2.939 GHz - 46,051,346,456 instructions # 2.65 insn per cycle - 5.916149203 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.171630 sec + 17,456,416,634 cycles # 2.826 GHz + 46,971,345,188 instructions # 2.69 insn per cycle + 6.177518078 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 690) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.199984e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.364044e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.364044e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.953599e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.091735e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.091735e+05 ) sec^-1 
MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.425369 sec -INFO: No Floating Point Exceptions have been reported - 10,116,123,100 cycles # 2.945 GHz - 27,968,506,728 instructions # 2.76 insn per cycle - 3.436971917 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2536) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.659325 sec + 10,530,690,716 cycles # 2.874 GHz + 29,287,857,198 instructions # 2.78 insn per cycle + 3.665244431 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2918) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.021241e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.422127e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.422127e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.768889e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.129034e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.129034e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.236686 sec -INFO: No Floating Point Exceptions have been reported - 6,226,726,050 cycles # 2.773 GHz - 12,700,169,832 instructions # 2.04 insn per cycle - 2.249020906 seconds time 
elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) +TOTAL : 2.307581 sec + 6,215,258,548 cycles # 2.688 GHz + 12,525,382,616 instructions # 2.02 insn per cycle + 2.313431050 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2736) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.518459e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.996461e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.996461e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.044890e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.444572e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.444572e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.046768 sec -INFO: No Floating Point Exceptions have been reported - 5,709,909,658 cycles # 2.777 GHz - 12,140,194,379 instructions # 2.13 insn per cycle - 2.059786524 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2360) (512y: 144) (512z: 0) +TOTAL : 2.187457 sec + 5,882,745,324 cycles # 2.683 GHz + 12,185,179,748 instructions # 2.07 insn per cycle + 2.193247103 seconds time elapsed +=Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 2586) (512y: 47) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.403513e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.583329e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.583329e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.306178e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.469286e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.469286e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.229693 sec -INFO: No Floating Point Exceptions have been reported - 6,051,702,488 cycles # 1.869 GHz - 8,428,750,265 instructions # 1.39 insn per cycle - 3.242969033 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1441) (512y: 122) (512z: 1802) +TOTAL : 3.279663 sec + 5,830,954,541 cycles # 1.775 GHz + 7,895,679,026 instructions # 1.35 insn per cycle + 3.285445110 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1517) (512y: 65) (512z: 1901) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.scaling new file mode 100644 index 0000000000..c8cf787c12 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-09-24_08:36:22 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +2.202700e+05 1 256 +4.144491e+05 2 256 +8.179658e+05 4 256 +1.562332e+06 8 256 +2.892629e+06 16 256 +4.298762e+06 32 256 +4.894852e+06 64 256 +5.444492e+06 128 256 +5.683171e+06 256 256 +5.820623e+06 512 256 +5.919521e+06 1024 256 +### GPU: scaling test 32 +2.776014e+04 1 32 +5.644799e+04 2 32 +1.100256e+05 4 32 +2.252125e+05 8 32 +4.345424e+05 16 32 +8.321232e+05 32 32 +1.586046e+06 64 32 +2.908073e+06 128 32 +4.301308e+06 256 32 +4.913299e+06 512 32 +5.390513e+06 1024 32 +5.633664e+06 2048 32 +5.787401e+06 4096 32 +5.878205e+06 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.696941e+05 1 256 +1.717741e+05 2 256 +1.790509e+05 4 256 +### CPU: scaling test 32 +1.689002e+05 1 32 +1.571127e+05 2 32 +1.541879e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.839038e+05 1 256 +2.913499e+05 2 256 +2.993763e+05 4 256 +### CPU: scaling test 32 +2.509115e+05 1 32 +2.754181e+05 2 32 +2.843895e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +5.154235e+05 1 256 +5.028185e+05 2 256 +4.765561e+05 4 256 +### CPU: scaling test 32 +4.730649e+05 1 32 +5.173097e+05 2 32 +4.703842e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +5.070823e+05 1 256 +5.471126e+05 2 256 +5.238208e+05 4 256 
+### CPU: scaling test 32 +5.417393e+05 1 32 +5.505282e+05 2 32 +5.522550e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.473801e+05 1 256 +3.466467e+05 2 256 +3.476546e+05 4 256 +### CPU: scaling test 32 +3.427702e+05 1 32 +3.255838e+05 2 32 +3.435098e+05 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.txt new file mode 100644 index 0000000000..5bcf17d672 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_blasOn.txt @@ -0,0 +1,225 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-09-24_08:31:05 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 5.699751e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.094514e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.118806e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.755908 sec + 5,198,972,304 cycles # 2.815 GHz + 7,555,850,717 instructions # 1.45 insn per cycle + 2.202249998 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 97 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 70 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 28 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 2.028807e+00 +Avg ME (F77/GPU) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 +OK (relative difference <= 5E-3) +========================================================================= +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +EvtsPerSec[Rmb+ME] (23) = ( 1.758926e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.803118e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.803118e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 6.070096 sec + 17,462,788,649 cycles # 2.875 GHz + 46,971,670,398 instructions # 2.69 insn per cycle + 6.075826013 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 690) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388515649 +Relative difference = 3.258803992249869e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 2.950618e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.089205e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.089205e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.663339 sec + 10,527,655,944 cycles # 2.870 GHz + 29,286,438,599 instructions # 2.78 insn per cycle + 3.668929376 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2918) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388515654 +Relative difference = 3.2588039900609506e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 4.773688e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.129278e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.129278e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.305290 sec + 6,201,815,154 cycles # 2.685 GHz + 12,525,642,665 instructions # 2.02 insn per cycle + 2.310979411 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2736) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 5.054242e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.447760e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.447760e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.182991 sec + 5,886,113,793 cycles # 2.691 GHz + 12,185,152,362 instructions # 2.07 insn per cycle + 2.188678888 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2586) (512y: 47) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.302489e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.466219e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.466219e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.284220 sec + 5,848,328,873 cycles # 1.779 GHz + 7,896,060,647 instructions # 1.35 insn per cycle + 3.290064278 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1517) (512y: 65) (512z: 1901) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt index 0a62f31f21..9191103607 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,252 +10,220 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:37:36 +DATE: 2025-09-24_08:55:18 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.523249e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.008578e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.008578e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.122019e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.976634e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.976634e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.943118 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 3,438,006,415 cycles # 2.887 GHz - 4,812,518,572 instructions # 1.40 insn per cycle - 1.248014993 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge +TOTAL : 0.933427 sec + 3,692,889,749 cycles # 2.838 GHz + 6,132,016,544 instructions # 1.66 insn per cycle + 1.359401129 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram1": launch__registers_per_thread 97 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "diagram2": launch__registers_per_thread 70 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 28 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 -Avg ME (F77/GPU) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 +Avg ME (F77/GPU) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.806787e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.852935e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.852935e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.747683e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.792018e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.792018e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 6.028463 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 17,701,660,691 cycles # 2.931 GHz - 46,100,592,443 instructions # 2.60 insn per cycle - 6.041454793 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.189057 sec + 17,795,188,257 cycles # 2.873 GHz + 47,029,255,433 instructions # 2.64 insn per cycle + 6.196117113 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 690) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.171570e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.328412e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.328412e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.921364e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.058184e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.058184e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.537488 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 10,436,410,766 cycles # 2.940 GHz - 28,150,415,987 instructions # 2.70 insn per cycle - 3.550700440 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2536) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.779219 sec + 10,873,140,033 cycles # 2.873 GHz + 29,459,756,408 instructions # 2.71 insn per cycle + 3.786041909 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2918) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.940586e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.316252e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.316252e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.690469e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.034578e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.034578e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.355700 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 6,586,554,223 cycles # 2.781 GHz - 12,999,619,553 instructions # 1.97 insn per cycle - 2.369192751 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) +TOTAL : 2.428734 sec + 6,544,737,755 cycles # 2.688 GHz + 12,802,283,348 instructions # 1.96 insn per cycle + 2.435745600 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2736) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.425137e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.877080e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.877080e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.968126e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.352687e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.352687e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.160954 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 6,058,497,746 cycles # 2.788 GHz - 12,422,408,910 instructions # 2.05 insn per cycle - 2.174009213 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2360) (512y: 144) (512z: 0) +TOTAL : 2.302326 sec + 6,222,182,781 cycles # 2.696 GHz + 12,460,650,523 instructions # 2.00 insn per cycle + 2.309301530 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2586) (512y: 47) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.454260e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.633384e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.633384e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.258803e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.419460e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.419460e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.271770 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 6,220,081,356 cycles # 1.894 GHz - 8,655,636,644 instructions # 1.39 insn per cycle - 3.285127387 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1441) (512y: 122) (512z: 1802) +TOTAL : 3.410893 sec + 6,198,364,555 cycles # 1.814 GHz + 8,129,999,388 instructions # 1.31 insn per cycle + 3.417828859 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1517) (512y: 65) (512z: 1901) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt index 70d02af695..04fe28e8ae 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:49:47 +DATE: 2025-09-24_09:14:34 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.202403e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.187841e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.877468e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.237181e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.048925e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.057217e+07 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 0.637594 sec -INFO: No Floating Point Exceptions have been reported - 2,481,390,363 cycles # 2.852 GHz - 3,619,998,982 instructions # 1.46 insn per cycle - 0.928734017 seconds time elapsed -runNcu 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.728904 sec + 3,064,102,873 cycles # 2.835 GHz + 4,857,097,227 instructions # 1.59 insn per cycle + 1.137267130 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common +==PROF== Profiling "diagram1": launch__registers_per_thread 97 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 70 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 28 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 -Avg ME (F77/GPU) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 +Avg ME (F77/GPU) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = 
SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.808108e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.854363e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.854363e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.758915e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.803139e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.803139e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 5.981515 sec -INFO: No Floating Point Exceptions have been reported - 17,441,882,337 cycles # 2.914 GHz - 45,980,812,555 instructions # 2.64 insn per cycle - 5.987317462 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.130886 sec + 17,629,078,696 cycles # 2.874 GHz + 46,986,708,443 instructions # 2.67 insn per cycle + 6.136336124 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 690) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.173867e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.332553e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.332553e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.948603e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.087576e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.087576e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.490197 sec -INFO: No Floating Point Exceptions have been reported - 10,215,611,800 cycles # 2.923 GHz - 27,889,324,001 instructions # 2.73 insn per cycle - 3.495993800 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2536) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.725904 sec + 10,708,180,208 cycles # 2.871 GHz + 29,285,168,892 instructions # 2.73 insn per cycle + 3.731175047 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2918) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
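The cmpExe steps cross-check the average matrix element computed by the C++/CUDA executable against the Fortran (F77) wrapper running on the same events, and accept if the relative difference stays within 5E-3. Reproducing the printed value from the 'none' numbers above shows the difference is computed as |a-b|/|a| (a small self-contained check; the tolerance is the one quoted in the log):

#include <cmath>
#include <cstdio>

int main()
{
  const double aveCpp = 2.028807e+00;       // Avg ME (C++/C++) above
  const double aveF77 = 2.0288063388515649; // Avg ME (F77/C++) above
  const double relDiff = std::fabs( aveCpp - aveF77 ) / std::fabs( aveCpp );
  std::printf( "Relative difference = %.16e\n", relDiff ); // ~3.2588e-07
  std::printf( "%s (relative difference %s 5E-3)\n",
               relDiff <= 5e-3 ? "OK" : "ERROR", relDiff <= 5e-3 ? "<=" : ">" );
  return 0;
}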
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.999819e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.389873e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.389873e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.743537e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.097973e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.097973e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.281339 sec -INFO: No Floating Point Exceptions have been reported - 6,287,168,374 cycles # 2.750 GHz - 12,602,929,813 instructions # 2.00 insn per cycle - 2.287435325 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) +TOTAL : 2.381248 sec + 6,381,646,347 cycles # 2.676 GHz + 12,508,043,189 instructions # 1.96 insn per cycle + 2.386552428 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2736) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.471434e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.936245e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.936245e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.071295e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.472287e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.472287e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.097327 sec -INFO: No Floating Point Exceptions have been reported - 5,814,420,150 cycles # 2.765 GHz - 11,994,829,914 instructions # 2.06 insn per cycle - 2.103345298 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2360) (512y: 144) (512z: 0) +TOTAL : 2.238244 sec + 6,056,118,225 cycles # 2.701 GHz + 12,131,985,515 instructions # 2.00 insn per cycle + 2.243511255 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2586) (512y: 47) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.462865e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.641783e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.641783e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.313116e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.477716e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.477716e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.218108 sec -INFO: No Floating Point Exceptions have been reported - 5,937,437,503 cycles # 1.843 GHz - 8,290,568,638 instructions # 1.40 insn per cycle - 3.224462086 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1441) (512y: 122) (512z: 1802) +TOTAL : 3.336612 sec + 6,011,176,733 cycles # 1.799 GHz + 7,844,203,228 instructions # 1.30 insn per cycle + 3.341856637 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1517) (512y: 65) (512z: 1901) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt index 794a3c9310..f10f1223e6 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:46:56 +DATE: 2025-09-24_09:10:28 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 --curhst OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.311257e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.342288e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.004457e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.236174e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.049477e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.057927e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.566605 sec -INFO: No Floating Point Exceptions have been reported - 2,313,605,054 cycles # 2.893 GHz - 3,600,350,267 instructions # 1.56 insn per cycle - 0.856648834 seconds time elapsed -runNcu 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.664857 sec + 2,876,847,918 cycles # 2.837 GHz + 4,812,114,039 instructions # 1.67 insn per cycle + 1.074444340 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst +==PROF== Profiling "diagram1": launch__registers_per_thread 97 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 70 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 28 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 -Avg ME (F77/GPU) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 +Avg ME (F77/GPU) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = 
SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.824387e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.871256e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.871256e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.756610e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.801195e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.801195e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.855309 sec -INFO: No Floating Point Exceptions have been reported - 17,230,682,954 cycles # 2.940 GHz - 45,932,528,772 instructions # 2.67 insn per cycle - 5.861424268 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.077308 sec + 17,463,617,999 cycles # 2.872 GHz + 46,970,827,220 instructions # 2.69 insn per cycle + 6.082669851 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 690) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.215073e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.378302e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.378302e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.951168e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.089979e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.089979e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.372268 sec -INFO: No Floating Point Exceptions have been reported - 9,959,367,668 cycles # 2.949 GHz - 27,848,270,798 instructions # 2.80 insn per cycle - 3.378265573 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2536) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.661548 sec + 10,532,998,621 cycles # 2.873 GHz + 29,286,182,285 instructions # 2.78 insn per cycle + 3.667011568 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2918) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
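The '=Symbols in CPPProcess_cpp.o=' lines count SIMD instructions in the disassembled object file, confirming that each build really uses its intended ISA: only 128-bit ops in the sse4 build above, zmm ops only in the 512z build below. The repository derives these counts with a helper script; the following standalone sketch is an illustrative equivalent only, and coarser than the log's sse4/avx2/512y/512z split:

#include <cstdio>
#include <cstring>
#include <string>

int main( int argc, char** argv )
{
  // Disassemble the object file and bucket instructions by register width.
  const std::string cmd = std::string( "objdump -d " ) + ( argc > 1 ? argv[1] : "CPPProcess_cpp.o" );
  FILE* pipe = popen( cmd.c_str(), "r" ); // POSIX popen
  if( !pipe ) { std::perror( "popen" ); return 1; }
  long nxmm = 0, nymm = 0, nzmm = 0;
  char line[512];
  while( std::fgets( line, sizeof( line ), pipe ) )
  {
    if( std::strstr( line, "%zmm" ) ) nzmm++;      // 512-bit AVX512 ops (~'512z')
    else if( std::strstr( line, "%ymm" ) ) nymm++; // 256-bit ops ('avx2'+'512y' merged)
    else if( std::strstr( line, "%xmm" ) ) nxmm++; // 128-bit ops (~'sse4')
  }
  pclose( pipe );
  std::printf( "(~sse4: %ld) (avx2+512y: %ld) (512z: %ld)\n", nxmm, nymm, nzmm );
  return 0;
}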
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.999546e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.391220e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.391220e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.769638e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.128119e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.128119e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.206484 sec -INFO: No Floating Point Exceptions have been reported - 6,113,930,208 cycles # 2.765 GHz - 12,581,849,902 instructions # 2.06 insn per cycle - 2.212402360 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) +TOTAL : 2.306780 sec + 6,205,424,366 cycles # 2.685 GHz + 12,524,741,443 instructions # 2.02 insn per cycle + 2.312117670 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2736) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.516180e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.984165e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.984165e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.071431e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.470140e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.470140e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.007710 sec -INFO: No Floating Point Exceptions have been reported - 5,576,628,773 cycles # 2.771 GHz - 12,020,299,868 instructions # 2.16 insn per cycle - 2.013581558 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2360) (512y: 144) (512z: 0) +TOTAL : 2.175899 sec + 5,871,123,047 cycles # 2.693 GHz + 12,184,260,539 instructions # 2.08 insn per cycle + 2.181215362 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2586) (512y: 47) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.502286e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.687963e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.687963e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.314814e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.482786e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.482786e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.102138 sec -INFO: No Floating Point Exceptions have been reported - 5,751,986,200 cycles # 1.852 GHz - 8,297,969,466 instructions # 1.44 insn per cycle - 3.107697215 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1441) (512y: 122) (512z: 1802) +TOTAL : 3.271642 sec + 5,832,726,143 cycles # 1.781 GHz + 7,894,703,614 instructions # 1.35 insn per cycle + 3.276923395 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1517) (512y: 65) (512z: 1901) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_noBlas.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_noBlas.txt new file mode 100644 index 0000000000..886c892c14 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_noBlas.txt @@ -0,0 +1,225 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasNoBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasNoBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-09-24_09:20:18 + +HASBLAS=hasNoBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 9.244710e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.048259e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.056627e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.636988 sec + 2,686,906,350 cycles # 2.813 GHz + 4,300,779,901 instructions # 1.60 insn per cycle + 1.016142360 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 97 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 70 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 28 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe +[ PASSED ] 4 tests. 
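Compared with the 2024 logs, the ncu profiles above no longer show a single monolithic 'sigmaKin' kernel at 214 registers per thread, but three smaller launches: 'diagram1' (97 registers), 'diagram2' (70) and 'color_sum_kernel' (28). The sketch below is a schematic, hypothetical illustration of that split-kernel pattern — per-diagram kernels accumulating color-ordered amplitudes, then a separate color sum; the names, signatures and toy 2x2 color matrix are invented, not the plugin's generated code:

#include <cstdio>
#include <cuda_runtime.h>

// Toy per-diagram kernels: each accumulates its contribution to two
// color-ordered amplitudes ('jamps') per event.
__global__ void diagram1( const double* momenta, double* jamps, int nevt )
{
  const int ievt = blockIdx.x * blockDim.x + threadIdx.x;
  if( ievt < nevt ) jamps[2 * ievt + 0] += momenta[ievt]; // placeholder amplitude
}

__global__ void diagram2( const double* momenta, double* jamps, int nevt )
{
  const int ievt = blockIdx.x * blockDim.x + threadIdx.x;
  if( ievt < nevt ) jamps[2 * ievt + 1] += 0.5 * momenta[ievt]; // placeholder
}

// Toy color sum: |ME|^2 = jamp^T * colorMatrix * jamp per event.
__global__ void color_sum_kernel( const double* jamps, double* me2, int nevt )
{
  const int ievt = blockIdx.x * blockDim.x + threadIdx.x;
  if( ievt >= nevt ) return;
  const double cf[2][2] = { { 2, -1 }, { -1, 2 } }; // invented 2x2 color matrix
  double sum = 0;
  for( int i = 0; i < 2; i++ )
    for( int j = 0; j < 2; j++ )
      sum += jamps[2 * ievt + i] * cf[i][j] * jamps[2 * ievt + j];
  me2[ievt] = sum;
}

int main()
{
  const int nevt = 512;
  double *dMom, *dJamps, *dMe2;
  cudaMalloc( &dMom, nevt * sizeof( double ) );
  cudaMalloc( &dJamps, 2 * nevt * sizeof( double ) );
  cudaMalloc( &dMe2, nevt * sizeof( double ) );
  cudaMemset( dMom, 0, nevt * sizeof( double ) );
  cudaMemset( dJamps, 0, 2 * nevt * sizeof( double ) );
  diagram1<<<2, 256>>>( dMom, dJamps, nevt );         // one launch per diagram...
  diagram2<<<2, 256>>>( dMom, dJamps, nevt );
  color_sum_kernel<<<2, 256>>>( dJamps, dMe2, nevt ); // ...then one color sum
  cudaDeviceSynchronize();
  std::printf( "done: %s\n", cudaGetErrorString( cudaGetLastError() ) );
  return 0;
}

Each launch then needs far fewer registers than the old monolithic kernel, at the cost of staging the intermediate amplitudes through global memory between launches.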
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 2.028807e+00 +Avg ME (F77/GPU) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 +OK (relative difference <= 5E-3) +========================================================================= +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +EvtsPerSec[Rmb+ME] (23) = ( 1.759819e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.803941e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.803941e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 6.066107 sec + 17,450,230,221 cycles # 2.875 GHz + 46,970,996,393 instructions # 2.69 insn per cycle + 6.071702693 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 690) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
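The log headers also echo two runtime knobs, CUDACPP_RUNTIME_BLASCOLORSUM and CUDACPP_RUNTIME_CUBLASTF32TENSOR (both unset in these runs), alongside the build-time HASBLAS=hasBlas/hasNoBlas tag, suggesting that the color sum can alternatively be routed through (cu)BLAS at run time. Purely as an illustration of such an environment-variable switch — the plugin's actual decision logic is an assumption here, not shown:

#include <cstdio>
#include <cstdlib>
#include <cstring>

// Hypothetical reading of the CUDACPP_RUNTIME_BLASCOLORSUM knob: unset or
// empty (as in these logs) keeps the dedicated color-sum kernel.
static bool useBlasColorSum()
{
  const char* env = std::getenv( "CUDACPP_RUNTIME_BLASCOLORSUM" );
  return env != nullptr && env[0] != '\0' && std::strcmp( env, "0" ) != 0;
}

int main()
{
  std::printf( "color sum via %s\n", useBlasColorSum() ? "(cu)BLAS" : "dedicated kernel" );
  return 0;
}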
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388515649 +Relative difference = 3.258803992249869e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 2.945444e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.084219e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.084219e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.668722 sec + 10,531,909,043 cycles # 2.867 GHz + 29,285,880,648 instructions # 2.78 insn per cycle + 3.673985883 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2918) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388515654 +Relative difference = 3.2588039900609506e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 4.776361e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.141048e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.141048e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.303397 sec + 6,215,739,916 cycles # 2.693 GHz + 12,525,182,351 instructions # 2.02 insn per cycle + 2.308825772 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2736) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 5.057506e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.454426e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.454426e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.182222 sec + 5,875,940,555 cycles # 2.687 GHz + 12,184,942,026 instructions # 2.07 insn per cycle + 2.187641932 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2586) (512y: 47) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.310786e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.476862e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.476862e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.275530 sec + 5,832,927,122 cycles # 1.779 GHz + 7,895,012,180 instructions # 1.35 insn per cycle + 3.281078765 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1517) (512y: 65) (512z: 1901) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt index 70a45db399..0f1925f64d 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,235 +10,216 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:44:10 +DATE: 2025-09-24_09:06:37 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.785807e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.291280e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.973584e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.776809e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.049035e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.057384e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.717756 sec -INFO: No Floating Point Exceptions have been reported - 2,755,914,027 cycles # 2.900 GHz - 4,368,405,962 instructions # 1.59 insn per cycle - 1.007006361 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst +TOTAL : 0.825846 sec + 3,321,633,294 cycles # 2.833 GHz + 5,533,137,328 instructions # 1.67 insn per cycle + 1.233631459 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram1": launch__registers_per_thread 97 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "diagram2": launch__registers_per_thread 70 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 28 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
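The runNcu lines above no longer profile a single monolithic sigmaKin kernel: the new profile shows one kernel per Feynman diagram (diagram1, diagram2) plus a separate color_sum_kernel, each needing far fewer registers per thread (97/70/28 versus 214 for the old sigmaKin). The following is only a schematic of that split with hypothetical signatures, not the generated CPPProcess.cu code.

#include <cuda_runtime.h>

using fptype = double;

__global__ void diagram1( const fptype* momenta, fptype* jamps ) // hypothetical signature
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  jamps[ievt] = momenta[ievt]; // placeholder: diagram-1 amplitude for event ievt
}

__global__ void diagram2( const fptype* momenta, fptype* jamps )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  jamps[ievt] += momenta[ievt]; // placeholder: accumulate diagram-2 amplitude
}

__global__ void color_sum_kernel( const fptype* jamps, fptype* me2 )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  me2[ievt] = jamps[ievt] * jamps[ievt]; // placeholder: colour-summed |M|^2
}

int main()
{
  const int nblk = 2048, nthr = 256; // launch shape of "check_cuda.exe -p 2048 256 1" above
  const size_t bytes = size_t( nblk ) * nthr * sizeof( fptype );
  fptype *momenta, *jamps, *me2;
  cudaMalloc( &momenta, bytes );
  cudaMalloc( &jamps, bytes );
  cudaMalloc( &me2, bytes );
  diagram1<<<nblk, nthr>>>( momenta, jamps );
  diagram2<<<nblk, nthr>>>( momenta, jamps );
  color_sum_kernel<<<nblk, nthr>>>( jamps, me2 );
  cudaDeviceSynchronize();
  cudaFree( momenta ); cudaFree( jamps ); cudaFree( me2 );
  return 0;
}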
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 -Avg ME (F77/GPU) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 +Avg ME (F77/GPU) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.829948e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.877608e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.877608e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.759080e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.803410e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.803410e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.839744 sec -INFO: No Floating Point Exceptions have been reported - 17,231,514,699 cycles # 2.948 GHz - 45,931,758,909 instructions # 2.67 insn per cycle - 5.845651027 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.069385 sec + 17,452,545,633 cycles # 2.874 GHz + 46,970,678,458 instructions # 2.69 insn per cycle + 6.074915326 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 690) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: 
FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.215717e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.376174e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.376174e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.952210e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.090467e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.090467e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.370523 sec -INFO: No Floating Point Exceptions have been reported - 9,939,666,586 cycles # 2.945 GHz - 27,847,302,489 instructions # 2.80 insn per cycle - 3.376515027 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2536) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.660961 sec + 10,528,744,959 cycles # 2.872 GHz + 29,285,969,565 instructions # 2.78 insn per cycle + 3.666466666 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2918) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
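The throughput figures follow from the launch configuration: assuming "-p 2048 256 2" requests 2048 blocks of 256 threads for 2 iterations (the usual check_*.exe convention), each run processes 1048576 events, and dividing by the measured TOTAL time lands just below the per-phase EvtsPerSec counters, which exclude setup. A back-of-the-envelope check against the 'none' (no SIMD) run above:

#include <cstdio>

int main()
{
  const long long nEvents = 2048LL * 256 * 2; // assumed meaning of "-p 2048 256 2": 1048576 events
  const double totalSeconds = 6.069385;       // TOTAL of the 'none' run above
  // ~1.73e+05 events/sec, consistent with (and slightly below) the logged
  // EvtsPerSec[Rmb+ME] = 1.759080e+05, since TOTAL also includes setup overhead.
  std::printf( "events = %lld, events/TOTAL = %.6e /sec\n", nEvents, nEvents / totalSeconds );
  return 0;
}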
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.058902e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.451650e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.451650e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.744356e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.100853e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.100853e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.181386 sec -INFO: No Floating Point Exceptions have been reported - 6,074,037,919 cycles # 2.778 GHz - 12,580,567,087 instructions # 2.07 insn per cycle - 2.187203017 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) +TOTAL : 2.319004 sec + 6,208,558,879 cycles # 2.672 GHz + 12,525,519,644 instructions # 2.02 insn per cycle + 2.324628512 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2736) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
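The "Internal loops fptype_sv" lines in these runs encode the SIMD width as lanes of the floating-point type: in double precision, 128-bit SSE4.2 gives VECTOR[2], 256-bit AVX2/512y gives VECTOR[4], and 512-bit AVX512 (512z) gives VECTOR[8], i.e. the register width divided by the 64-bit double:

#include <cstdio>

int main()
{
  const int widths[] = { 128, 256, 512 }; // sse4, avx2/512y, 512z register widths in bits
  for( int bits : widths )
    std::printf( "%3d-bit registers -> VECTOR[%d] for double\n",
                 bits, bits / ( 8 * (int)sizeof( double ) ) ); // 2, 4, 8 lanes
  return 0;
}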
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.484469e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.947491e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.947491e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.050195e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.444774e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.444774e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.020942 sec -INFO: No Floating Point Exceptions have been reported - 5,589,694,694 cycles # 2.759 GHz - 12,020,772,424 instructions # 2.15 insn per cycle - 2.026934215 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2360) (512y: 144) (512z: 0) +TOTAL : 2.184551 sec + 5,886,818,346 cycles # 2.689 GHz + 12,182,424,074 instructions # 2.07 insn per cycle + 2.189855469 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2586) (512y: 47) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.541083e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.728456e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.728456e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.306826e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.470997e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.470997e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.072814 sec -INFO: No Floating Point Exceptions have been reported - 5,724,538,871 cycles # 1.860 GHz - 8,297,304,281 instructions # 1.45 insn per cycle - 3.079169559 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1441) (512y: 122) (512z: 1802) +TOTAL : 3.278572 sec + 5,842,492,465 cycles # 1.780 GHz + 7,895,231,616 instructions # 1.35 insn per cycle + 3.283921228 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1517) (512y: 65) (512z: 1901) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
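The perf counter lines translate directly into the derived figures printed after '#': instructions divided by cycles gives "insn per cycle", and cycles over CPU time gives the GHz annotation (perf normalises to task-clock, so dividing by the elapsed wall time only approximates it). Checking the 512z run above:

#include <cstdio>

int main()
{
  const double cycles = 5842492465.0;  // 512z d_inl0_hrd0 run above
  const double instructions = 7895231616.0;
  const double elapsed = 3.283921228;  // wall time; perf itself divides by task-clock
  std::printf( "insn per cycle ~ %.2f\n", instructions / cycles );   // ~1.35, as logged
  std::printf( "approx GHz     ~ %.3f\n", cycles / elapsed / 1e9 );  // ~1.78 (log: 1.780 GHz)
  return 0;
}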
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt index 03be4a726d..acb6158b6b 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_08:58:15 +DATE: 2025-09-24_07:45:19 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.508928e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.321752e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.002344e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.401233e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.049117e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.056279e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.536365 sec -INFO: No Floating Point Exceptions have been reported - 2,214,194,265 cycles # 2.876 GHz - 3,152,115,430 instructions # 1.42 insn per cycle - 0.834564895 seconds time elapsed -runNcu 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 212 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.624817 sec + 2,770,804,330 cycles # 2.829 GHz + 4,445,203,695 instructions # 1.60 insn per cycle + 1.036109264 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 98 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 70 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 28 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 -Avg ME (F77/GPU) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 +Avg ME (F77/GPU) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] 
[inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.855453e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.904405e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.904405e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.817113e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.864515e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.864515e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.800560 sec -INFO: No Floating Point Exceptions have been reported - 16,903,949,090 cycles # 2.909 GHz - 45,043,853,273 instructions # 2.66 insn per cycle - 5.813534817 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 567) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.879296 sec + 16,915,750,663 cycles # 2.875 GHz + 45,827,908,987 instructions # 2.71 insn per cycle + 5.885040141 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 633) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.339712e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.518637e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.518637e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.964734e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.104969e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.104969e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.288467 sec -INFO: No Floating Point Exceptions have been reported - 9,645,043,566 cycles # 2.925 GHz - 26,807,862,552 instructions # 2.78 insn per cycle - 3.301069690 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2327) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.645473 sec + 10,495,372,754 cycles # 2.875 GHz + 29,278,170,489 instructions # 2.79 insn per cycle + 3.651373133 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2913) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.590385e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.923511e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.923511e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.763804e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.118520e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.118520e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.431911 sec -INFO: No Floating Point Exceptions have been reported - 6,762,097,168 cycles # 2.769 GHz - 14,239,182,198 instructions # 2.11 insn per cycle - 2.443454156 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2711) (512y: 0) (512z: 0) +TOTAL : 2.309471 sec + 6,198,243,265 cycles # 2.678 GHz + 12,520,312,586 instructions # 2.02 insn per cycle + 2.315314098 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2724) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.784038e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.137564e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.137564e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.036657e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.429513e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.429513e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.339078 sec -INFO: No Floating Point Exceptions have been reported - 6,493,835,738 cycles # 2.765 GHz - 13,835,177,964 instructions # 2.13 insn per cycle - 2.350490634 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2360) (512y: 298) (512z: 0) +TOTAL : 2.189307 sec + 5,886,882,915 cycles # 2.683 GHz + 12,178,604,730 instructions # 2.07 insn per cycle + 2.195089043 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2572) (512y: 47) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.400894e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.576119e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.576119e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.276480e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.442165e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.442165e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.231977 sec -INFO: No Floating Point Exceptions have been reported - 6,054,126,925 cycles # 1.868 GHz - 10,181,313,288 instructions # 1.68 insn per cycle - 3.245420113 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1273) (512y: 208) (512z: 1988) +TOTAL : 3.310853 sec + 5,849,370,568 cycles # 1.766 GHz + 7,892,630,392 instructions # 1.35 insn per cycle + 3.316630553 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1503) (512y: 65) (512z: 1901) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt index f94c1448dd..892c6e6f77 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:28:32 +DATE: 2025-09-24_08:45:57 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.445619e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.389644e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.998797e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.203899e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.049403e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.057753e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.532166 sec -INFO: No Floating Point Exceptions have been reported - 2,223,705,741 cycles # 2.888 GHz - 3,137,862,648 instructions # 1.41 insn per cycle - 0.826622030 seconds time elapsed -runNcu 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.628394 sec + 2,751,584,432 cycles # 2.831 GHz + 4,342,569,960 instructions # 1.58 insn per cycle + 1.032472366 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 97 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 70 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 28 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 -Avg ME (F77/GPU) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 +Avg ME (F77/GPU) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl1_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl1_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] 
[inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.243473e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.316891e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.316891e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.884618e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.936373e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.936373e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.822701 sec -INFO: No Floating Point Exceptions have been reported - 14,262,425,677 cycles # 2.951 GHz - 34,462,229,045 instructions # 2.42 insn per cycle - 4.834685593 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 665) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.672505 sec + 16,281,774,044 cycles # 2.868 GHz + 42,306,769,598 instructions # 2.60 insn per cycle + 5.678252169 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 714) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388515649 -Relative difference = 3.258803992249869e-07 +Avg ME (F77/C++) = 2.0288063388515654 +Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.991823e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.134338e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.134338e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.119000e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.273875e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.273875e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.653458 sec -INFO: No Floating Point Exceptions have been reported - 10,828,452,798 cycles # 2.955 GHz - 24,364,594,695 instructions # 2.25 insn per cycle - 3.665357624 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2610) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.470530 sec + 9,977,147,210 cycles # 2.871 GHz + 25,847,622,241 instructions # 2.59 insn per cycle + 3.476158258 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2979) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388515654 -Relative difference = 3.2588039900609506e-07 +Avg ME (F77/C++) = 2.0288063388515649 +Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.588361e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.923011e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.923011e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.125204e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.540174e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.540174e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.432860 sec -INFO: No Floating Point Exceptions have been reported - 6,763,126,248 cycles # 2.768 GHz - 12,520,790,366 instructions # 1.85 insn per cycle - 2.444836798 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3115) (512y: 0) (512z: 0) +TOTAL : 2.154277 sec + 5,808,180,817 cycles # 2.691 GHz + 10,998,718,090 instructions # 1.89 insn per cycle + 2.160064381 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2666) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.983949e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.371900e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.371900e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.604062e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.095977e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.095977e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.251146 sec -INFO: No Floating Point Exceptions have been reported - 6,291,656,449 cycles # 2.782 GHz - 11,662,894,163 instructions # 1.85 insn per cycle - 2.263135736 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2644) (512y: 239) (512z: 0) +TOTAL : 1.978924 sec + 5,298,127,046 cycles # 2.671 GHz + 10,086,761,200 instructions # 1.90 insn per cycle + 1.984504704 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2375) (512y: 47) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.728872e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.941749e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.941749e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.577753e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.771603e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.771603e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.960781 sec -INFO: No Floating Point Exceptions have been reported - 5,563,913,804 cycles # 1.872 GHz - 9,412,295,126 instructions # 1.69 insn per cycle - 2.972906161 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2099) (512y: 282) (512z: 1958) +TOTAL : 3.039803 sec + 5,439,957,246 cycles # 1.787 GHz + 6,988,231,171 instructions # 1.28 insn per cycle + 3.045430879 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1363) (512y: 57) (512z: 1812) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt index 3c1647789f..9240e55d7f 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:28:57 +DATE: 2025-09-24_08:46:29 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.391002e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.323919e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.976474e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.185938e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.045496e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.053733e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.534058 sec -INFO: No Floating Point Exceptions have been reported - 2,225,875,951 cycles # 2.883 GHz - 3,143,824,990 instructions # 1.41 insn per cycle - 0.828954123 seconds time elapsed -runNcu 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 212 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.627663 sec + 2,749,687,654 cycles # 2.830 GHz + 4,365,506,275 instructions # 1.59 insn per cycle + 1.031393421 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 98 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 70 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 28 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl1_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 -Avg ME (F77/GPU) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 +Avg ME (F77/GPU) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl1_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl1_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] 
[inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.586147e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.682611e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.682611e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.884133e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.935469e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.935469e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.200138 sec -INFO: No Floating Point Exceptions have been reported - 12,457,576,414 cycles # 2.958 GHz - 35,030,140,380 instructions # 2.81 insn per cycle - 4.211834896 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 430) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.673558 sec + 16,293,550,728 cycles # 2.870 GHz + 41,861,740,674 instructions # 2.57 insn per cycle + 5.679192911 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 623) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388515649 -Relative difference = 3.258803992249869e-07 +Avg ME (F77/C++) = 2.0288063388515654 +Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.003695e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.145378e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.145378e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.151299e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.310989e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.310989e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.637171 sec -INFO: No Floating Point Exceptions have been reported - 10,771,658,335 cycles # 2.953 GHz - 23,459,809,146 instructions # 2.18 insn per cycle - 3.648522280 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2378) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.435956 sec + 9,881,633,737 cycles # 2.872 GHz + 25,645,447,655 instructions # 2.60 insn per cycle + 3.441581401 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2694) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388515654 -Relative difference = 3.2588039900609506e-07 +Avg ME (F77/C++) = 2.0288063388515649 +Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.029039e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.423785e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.423785e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.173928e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.603025e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.603025e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.231082 sec -INFO: No Floating Point Exceptions have been reported - 6,224,358,348 cycles # 2.777 GHz - 11,980,138,777 instructions # 1.92 insn per cycle - 2.242426635 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2468) (512y: 0) (512z: 0) +TOTAL : 2.135077 sec + 5,748,087,049 cycles # 2.686 GHz + 10,787,985,756 instructions # 1.88 insn per cycle + 2.140683671 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2293) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.044695e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.439218e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.439218e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.811256e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.348034e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.348034e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.224952 sec -INFO: No Floating Point Exceptions have been reported - 6,216,689,838 cycles # 2.781 GHz - 11,219,235,507 instructions # 1.80 insn per cycle - 2.236216110 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2098) (512y: 174) (512z: 0) +TOTAL : 1.912055 sec + 5,134,706,346 cycles # 2.679 GHz + 9,902,833,377 instructions # 1.93 insn per cycle + 1.917709298 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2034) (512y: 40) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.888626e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.118349e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.118349e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.596860e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.795893e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.795893e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.842958 sec -INFO: No Floating Point Exceptions have been reported - 5,376,391,405 cycles # 1.885 GHz - 9,136,626,879 instructions # 1.70 insn per cycle - 2.854254782 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1632) (512y: 208) (512z: 1567) +TOTAL : 3.024884 sec + 5,382,065,012 cycles # 1.777 GHz + 6,925,116,980 instructions # 1.29 insn per cycle + 3.030667974 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 993) (512y: 67) (512z: 1602) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.scaling new file mode 100644 index 0000000000..943c404205 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-09-24_08:17:36 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +8.141613e+05 1 256 +1.655634e+06 2 256 +3.173708e+06 4 256 +6.232407e+06 8 256 +1.307682e+07 16 256 +1.882855e+07 32 256 +1.945238e+07 64 256 +2.009646e+07 128 256 +2.124544e+07 256 256 +2.154722e+07 512 256 +2.149132e+07 1024 256 +### GPU: scaling test 32 +1.045882e+05 1 32 +2.121559e+05 2 32 +4.291687e+05 4 32 +8.027620e+05 8 32 +1.602399e+06 16 32 +3.186378e+06 32 32 +6.087104e+06 64 32 +1.334324e+07 128 32 +1.760445e+07 256 32 +1.935350e+07 512 32 +1.987098e+07 1024 32 +2.043714e+07 2048 32 +2.068730e+07 4096 32 +2.072528e+07 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.705683e+05 1 256 +1.855453e+05 2 256 +1.816065e+05 4 256 +### CPU: scaling test 32 +1.630083e+05 1 32 +1.607148e+05 2 32 +1.671437e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +4.067469e+05 1 256 +3.890007e+05 2 256 +4.167840e+05 4 256 +### CPU: scaling test 32 +3.472185e+05 1 32 +3.961475e+05 2 32 +3.940680e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +8.895313e+05 1 256 +9.759556e+05 2 256 +9.116339e+05 4 256 +### CPU: scaling test 32 +8.794592e+05 1 32 +8.459902e+05 2 32 +8.278360e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +9.033965e+05 1 256 +1.027122e+06 2 256 +9.561667e+05 4 256 +### CPU: scaling test 32 +9.164853e+05 1 32 +8.304139e+05 2 32 +8.973388e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 
256 +7.131913e+05 1 256 +7.291350e+05 2 256 +7.116716e+05 4 256 +### CPU: scaling test 32 +6.108036e+05 1 32 +5.039251e+05 2 32 +7.225230e+05 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index eed598e900..8748bfad0d 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,236 +10,215 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_08:59:31 +DATE: 2025-09-24_07:47:03 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.348925e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.730429e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.847126e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.848748e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.153247e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.172236e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.489368 sec -INFO: No Floating Point Exceptions have been reported - 2,066,464,716 cycles # 2.888 GHz - 2,966,218,976 instructions # 1.44 insn per cycle - 0.775358949 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 124 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.538693 sec + 2,322,514,304 cycles # 2.829 GHz + 3,448,322,190 instructions # 1.48 insn per cycle + 0.878218682 seconds time elapsed +......................................................................... 
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 68 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 48 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028811e+00 -Avg ME (F77/GPU) = 2.0288499356247485 -Relative difference = 1.9191351362116207e-05 +Avg ME (F77/GPU) = 2.0288499490125105 +Relative difference = 1.9197950183795553e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.920704e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
1.976809e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.976809e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.829942e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.880039e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.880039e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.564608 sec -INFO: No Floating Point Exceptions have been reported - 16,407,008,301 cycles # 2.946 GHz - 45,390,324,197 instructions # 2.77 insn per cycle - 5.572247633 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 591) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.816771 sec + 16,702,274,132 cycles # 2.869 GHz + 46,675,030,384 instructions # 2.79 insn per cycle + 5.822310848 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 664) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288198669441044 Relative difference = 6.558289825352968e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.527362e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.867119e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.867119e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.095384e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.377755e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.377755e+05 ) sec^-1 
MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.418751 sec -INFO: No Floating Point Exceptions have been reported - 7,148,582,676 cycles # 2.947 GHz - 17,841,430,692 instructions # 2.50 insn per cycle - 2.426747092 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3144) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.648167 sec + 7,618,074,541 cycles # 2.872 GHz + 18,461,358,812 instructions # 2.42 insn per cycle + 2.653697468 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3497) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 Avg ME (F77/C++) = 2.0288193075684831 Relative difference = 1.515997647531052e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.351940e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.517580e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.517580e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.445048e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.627798e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.627798e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.358362 sec -INFO: No Floating Point Exceptions have been reported - 3,812,563,399 cycles # 2.792 GHz - 8,312,155,726 instructions # 2.18 insn per cycle - 1.366469053 seconds time 
elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3367) (512y: 0) (512z: 0) +TOTAL : 1.328918 sec + 3,566,458,202 cycles # 2.674 GHz + 7,631,838,956 instructions # 2.14 insn per cycle + 1.334330898 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3260) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181869545951 Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.799220e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.010674e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.010674e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.867670e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.017863e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.017863e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.295301 sec -INFO: No Floating Point Exceptions have been reported - 3,622,174,398 cycles # 2.781 GHz - 7,961,498,247 instructions # 2.20 insn per cycle - 1.303182368 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3209) (512y: 20) (512z: 0) +TOTAL : 1.270218 sec + 3,422,297,214 cycles # 2.684 GHz + 7,470,055,154 instructions # 2.18 insn per cycle + 1.275737279 seconds time elapsed +=Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 3167) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181869545951 Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.500324e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.161825e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.161825e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.754154e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.481927e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.481927e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.717843 sec -INFO: No Floating Point Exceptions have been reported - 3,332,199,340 cycles # 1.933 GHz - 6,146,454,565 instructions # 1.84 insn per cycle - 1.725889754 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2251) (512y: 22) (512z: 2155) +TOTAL : 1.639571 sec + 3,053,224,009 cycles # 1.857 GHz + 5,249,590,881 instructions # 1.72 insn per cycle + 1.645301083 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2075) (512y: 5) (512z: 2089) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183148950338 -Relative difference = 1.5521108056421764e-07 +Avg ME (F77/C++) = 2.0288183143129572 +Relative difference = 1.5492417626371624e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.scaling new file mode 100644 index 0000000000..9386878840 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-09-24_08:37:44 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +2.582527e+05 1 256 +5.042869e+05 2 256 +1.010578e+06 4 256 +1.791732e+06 8 256 +3.138159e+06 16 256 +5.095367e+06 32 256 +7.254867e+06 64 256 +7.872948e+06 128 256 +8.393327e+06 256 256 +8.771099e+06 512 256 +8.880832e+06 1024 256 +### GPU: scaling test 32 +3.284527e+04 1 32 +6.671051e+04 2 32 +1.301834e+05 4 32 +2.576938e+05 8 32 +5.019726e+05 16 32 +9.673016e+05 32 32 +1.768771e+06 64 32 +3.070386e+06 128 32 +5.228071e+06 256 32 +7.087152e+06 512 32 +7.969015e+06 1024 32 +8.347439e+06 2048 32 +8.663041e+06 4096 32 +8.808998e+06 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.766619e+05 1 256 +1.809926e+05 2 256 +1.845127e+05 4 256 +### CPU: scaling test 32 +1.630432e+05 1 32 +1.614893e+05 2 32 +1.640721e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.608205e+05 1 256 +3.947164e+05 2 256 +4.206070e+05 4 256 +### CPU: scaling test 32 +3.385312e+05 1 32 +3.602610e+05 2 32 +3.674835e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +9.161246e+05 1 256 +9.657298e+05 2 256 +9.014997e+05 4 256 +### CPU: scaling test 32 +8.745320e+05 1 32 +9.016498e+05 2 32 
+9.102870e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +9.628911e+05 1 256 +9.505774e+05 2 256 +9.477989e+05 4 256 +### CPU: scaling test 32 +7.395424e+05 1 32 +9.462837e+05 2 32 +9.578186e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +7.190607e+05 1 256 +7.431869e+05 2 256 +7.450388e+05 4 256 +### CPU: scaling test 32 +6.064167e+05 1 32 +6.801637e+05 2 32 +5.067120e+05 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.txt new file mode 100644 index 0000000000..c87245821e --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_blasOn.txt @@ -0,0 +1,225 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-09-24_08:32:36 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 8.578502e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.049786e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.079458e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 +TOTAL : 1.303965 sec + 4,770,461,480 cycles # 2.840 GHz + 6,757,920,419 instructions # 1.42 insn per cycle + 1.740408679 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 68 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 48 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 2.028811e+00 +Avg ME (F77/GPU) = 2.0288499532034621 +Relative difference = 1.920001590188648e-05 +OK (relative difference <= 5E-3) +========================================================================= +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +EvtsPerSec[Rmb+ME] (23) = ( 1.833085e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.883191e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.883191e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 +TOTAL : 5.806781 sec + 16,698,080,338 cycles # 2.874 GHz + 46,673,842,711 instructions # 2.80 insn per cycle + 5.812048935 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 664) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028820e+00 +Avg ME (F77/C++) = 2.0288198669441044 +Relative difference = 6.558289825352968e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 4.100807e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.380739e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.380739e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 +TOTAL : 2.644410 sec + 7,612,561,317 cycles # 2.874 GHz + 18,461,919,918 instructions # 2.43 insn per cycle + 2.649591246 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3497) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288193075684831 +Relative difference = 1.515997647531052e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 8.541523e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.750734e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.750734e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.313661 sec + 3,560,755,118 cycles # 2.702 GHz + 7,630,643,958 instructions # 2.14 insn per cycle + 1.318878863 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3260) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288181869545951 +Relative difference = 9.214951531400725e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 8.877946e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.018670e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.018670e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.268624 sec + 3,424,190,966 cycles # 2.689 GHz + 7,470,353,577 instructions # 2.18 insn per cycle + 1.273942336 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3167) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288181869545951 +Relative difference = 9.214951531400725e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.776427e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.508703e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.508703e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.634117 sec + 3,049,238,738 cycles # 1.861 GHz + 5,249,598,512 instructions # 1.72 insn per cycle + 1.639464560 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2075) (512y: 5) (512z: 2089) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288183143129572 +Relative difference = 1.5492417626371624e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt index ba391daf9b..a5afff3307 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,255 +10,223 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:38:02 +DATE: 2025-09-24_08:55:55 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.962971e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.366502e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.366502e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.172268e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.565316e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.565316e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 -TOTAL : 0.683449 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 2,639,955,466 cycles # 2.881 GHz - 4,089,465,491 instructions # 1.55 insn per cycle - 0.973820402 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge +TOTAL : 0.742519 sec + 2,926,334,960 cycles # 2.832 GHz + 4,662,274,955 instructions # 1.59 insn per cycle + 1.090216738 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 124 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram1": launch__registers_per_thread 68 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "diagram2": launch__registers_per_thread 48 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028811e+00 -Avg ME (F77/GPU) = 2.0288499356247485 -Relative difference = 1.9191351362116207e-05 +Avg ME (F77/GPU) = 2.0288499490125105 +Relative difference = 1.9197950183795553e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.921187e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.975107e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.975107e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.825336e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.875215e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.875215e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.590872 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 16,505,827,538 cycles # 2.949 GHz - 45,383,324,587 instructions # 2.75 insn per cycle - 5.597525299 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 591) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.875911 sec + 16,883,161,514 cycles # 2.871 GHz + 46,712,150,637 instructions # 2.77 insn per cycle + 5.882470914 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 664) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288198669441044 Relative difference = 6.558289825352968e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.503675e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.835801e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.835801e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.059731e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.333877e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.333877e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.463825 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 7,301,370,898 cycles # 2.956 GHz - 18,072,803,019 instructions # 2.48 insn per cycle - 2.471007950 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3144) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.718254 sec + 7,814,141,803 cycles # 2.871 GHz + 18,738,072,843 instructions # 2.40 insn per cycle + 2.725028734 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3497) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 Avg ME (F77/C++) = 2.0288193075684831 Relative difference = 1.515997647531052e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.228346e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.356902e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.356902e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.313202e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.453842e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.453842e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.409585 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 3,950,274,134 cycles # 2.790 GHz - 8,500,615,795 instructions # 2.15 insn per cycle - 1.416669722 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3367) (512y: 0) (512z: 0) +TOTAL : 1.400636 sec + 3,761,404,940 cycles # 2.678 GHz + 7,863,817,826 instructions # 2.09 insn per cycle + 1.409322955 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3260) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181869545951 Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.630316e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.908478e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.908478e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.757434e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.003220e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.003220e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.350838 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 3,776,468,219 cycles # 2.783 GHz - 8,150,432,975 instructions # 2.16 insn per cycle - 1.357973048 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3209) (512y: 20) (512z: 0) +TOTAL : 1.333503 sec + 3,624,049,741 cycles # 2.707 GHz + 7,702,682,063 instructions # 2.13 insn per cycle + 1.340053856 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3167) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181869545951 Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.446924e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.088794e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.088794e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.657185e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.366312e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.366312e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.766906 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 3,483,580,907 cycles # 1.964 GHz - 6,352,443,418 instructions # 1.82 insn per cycle - 1.774118995 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2251) (512y: 22) (512z: 2155) +TOTAL : 1.712600 sec + 3,264,565,864 cycles # 1.900 GHz + 5,499,475,316 instructions # 1.68 insn per cycle + 1.719139859 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2075) (512y: 5) (512z: 2089) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183148950338 -Relative difference = 1.5521108056421764e-07 +Avg ME (F77/C++) = 2.0288183143129572 +Relative difference = 1.5492417626371624e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt index eaf1557b5a..f41ca6aec0 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,236 +10,215 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:50:12 +DATE: 2025-09-24_09:15:10 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.125576e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.707303e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.828418e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.873971e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.152674e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.172021e+07 ) sec^-1 MeanMatrixElemValue = ( 2.079446e+00 +- 3.403306e-03 ) GeV^0 -TOTAL : 0.579716 sec -INFO: No Floating Point Exceptions have been reported - 2,336,853,883 cycles # 2.860 GHz - 3,355,823,518 instructions # 1.44 insn per cycle - 0.873538557 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common -==PROF== Profiling "sigmaKin": launch__registers_per_thread 124 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.627107 sec + 2,578,417,587 cycles # 2.833 GHz + 3,892,656,259 instructions # 1.51 insn per cycle + 0.968128619 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common +==PROF== Profiling "diagram1": launch__registers_per_thread 68 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 48 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028811e+00 -Avg ME (F77/GPU) = 2.0288499356247485 -Relative difference = 1.9191351362116207e-05 +Avg ME (F77/GPU) = 2.0288499490125105 +Relative difference = 1.9197950183795553e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.929027e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.983438e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.983438e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.831869e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.882089e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.882089e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 5.578220 sec -INFO: No Floating Point Exceptions have been reported - 16,412,792,219 cycles # 2.940 GHz - 45,364,108,775 instructions # 2.76 insn per cycle - 5.583854256 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 591) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.868440 sec + 16,861,585,353 cycles # 2.871 GHz + 46,704,052,774 instructions # 2.77 insn per cycle + 5.873652139 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 664) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: 
FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288198669441044 Relative difference = 6.558289825352968e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.528116e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.863028e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.863028e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.095423e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.376391e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.376391e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079572e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 2.458830 sec -INFO: No Floating Point Exceptions have been reported - 7,256,357,914 cycles # 2.945 GHz - 17,803,442,746 instructions # 2.45 insn per cycle - 2.464565338 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3144) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.705119 sec + 7,776,009,723 cycles # 2.870 GHz + 18,473,510,094 instructions # 2.38 insn per cycle + 2.710275194 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3497) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 Avg ME (F77/C++) = 2.0288193075684831 Relative difference = 1.515997647531052e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.321630e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.466483e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.466483e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.568159e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.787921e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.787921e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.402951 sec -INFO: No Floating Point Exceptions have been reported - 3,915,341,003 cycles # 2.781 GHz - 8,245,891,296 instructions # 2.11 insn per cycle - 1.408611815 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3367) (512y: 0) (512z: 0) +TOTAL : 1.367811 sec + 3,715,855,776 cycles # 2.708 GHz + 7,614,650,652 instructions # 2.05 insn per cycle + 1.373099751 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3260) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181869545951 Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.769699e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.005525e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.005525e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.949337e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.028339e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.028339e+06 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.339268 sec -INFO: No Floating Point Exceptions have been reported - 3,730,447,512 cycles # 2.775 GHz - 7,861,984,465 instructions # 2.11 insn per cycle - 1.344998375 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3209) (512y: 20) (512z: 0) +TOTAL : 1.317122 sec + 3,582,670,369 cycles # 2.711 GHz + 7,419,662,797 instructions # 2.07 insn per cycle + 1.322406778 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3167) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181869545951 Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --common OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.517692e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.188107e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.188107e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.768785e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.502325e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.502325e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.753383 sec -INFO: No Floating Point Exceptions have been reported - 3,445,483,739 cycles # 1.959 GHz - 6,046,658,237 instructions # 1.75 insn per cycle - 1.759146158 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2251) (512y: 22) (512z: 2155) +TOTAL : 1.693814 sec + 3,215,725,762 cycles # 1.894 GHz + 5,199,626,463 instructions # 1.62 insn per cycle + 1.699070891 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2075) (512y: 5) (512z: 2089) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183148950338 -Relative difference = 1.5521108056421764e-07 +Avg ME (F77/C++) = 2.0288183143129572 +Relative difference = 1.5492417626371624e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt index 0132142a7f..0caf85479e 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,236 +10,215 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:47:21 +DATE: 2025-09-24_09:11:00 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 --curhst OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.231900e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.718618e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.843172e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.873602e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.148657e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.166610e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.520956 sec -INFO: No Floating Point Exceptions have been reported - 2,145,908,279 cycles # 2.880 GHz - 3,342,720,192 instructions # 1.56 insn per cycle - 0.802555619 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst -==PROF== Profiling "sigmaKin": launch__registers_per_thread 124 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.571773 sec + 2,418,396,415 cycles # 2.829 GHz + 3,890,889,196 instructions # 1.61 insn per cycle + 0.911678328 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst +==PROF== Profiling "diagram1": launch__registers_per_thread 68 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 48 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028811e+00 -Avg ME (F77/GPU) = 2.0288499356247485 -Relative difference = 1.9191351362116207e-05 +Avg ME (F77/GPU) = 2.0288499490125105 +Relative difference = 1.9197950183795553e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.929666e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.983661e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.983661e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.831816e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.881917e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.881917e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.518695 sec -INFO: No Floating Point Exceptions have been reported - 16,237,309,072 cycles # 2.940 GHz - 45,332,194,999 instructions # 2.79 insn per cycle - 5.524338903 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 591) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.810849 sec + 16,696,288,253 cycles # 2.871 GHz + 46,674,633,233 instructions # 2.80 insn per cycle + 5.816081482 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 664) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: 
FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288198669441044 Relative difference = 6.558289825352968e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.531812e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.871745e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.871745e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.100794e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.381264e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.381264e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.400101 sec -INFO: No Floating Point Exceptions have been reported - 7,092,917,063 cycles # 2.949 GHz - 17,790,950,300 instructions # 2.51 insn per cycle - 2.405895056 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3144) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.644194 sec + 7,609,575,934 cycles # 2.873 GHz + 18,461,091,440 instructions # 2.43 insn per cycle + 2.649247827 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3497) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 Avg ME (F77/C++) = 2.0288193075684831 Relative difference = 1.515997647531052e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.364764e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.520513e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.520513e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.548599e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.763296e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.763296e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.339869 sec -INFO: No Floating Point Exceptions have been reported - 3,746,789,760 cycles # 2.786 GHz - 8,261,610,745 instructions # 2.20 insn per cycle - 1.345882215 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3367) (512y: 0) (512z: 0) +TOTAL : 1.313536 sec + 3,555,615,490 cycles # 2.698 GHz + 7,630,109,811 instructions # 2.15 insn per cycle + 1.318778185 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3260) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181869545951 Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.818621e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.013746e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.013746e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.881190e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.019080e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.019080e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.275913 sec -INFO: No Floating Point Exceptions have been reported - 3,561,649,230 cycles # 2.781 GHz - 7,911,264,889 instructions # 2.22 insn per cycle - 1.281614236 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3209) (512y: 20) (512z: 0) +TOTAL : 1.267655 sec + 3,421,150,014 cycles # 2.690 GHz + 7,468,696,042 instructions # 2.18 insn per cycle + 1.272664142 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3167) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181869545951 Relative difference = 9.214951531400725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --curhst OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.490214e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.139560e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.139560e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.775565e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.508708e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.508708e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.701983 sec -INFO: No Floating Point Exceptions have been reported - 3,270,370,699 cycles # 1.916 GHz - 6,096,029,839 instructions # 1.86 insn per cycle - 1.707817189 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2251) (512y: 22) (512z: 2155) +TOTAL : 1.635015 sec + 3,048,650,953 cycles # 1.860 GHz + 5,249,745,788 instructions # 1.72 insn per cycle + 1.640374800 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2075) (512y: 5) (512z: 2089) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 2.028818e+00
-Avg ME (F77/C++) = 2.0288183148950338
-Relative difference = 1.5521108056421764e-07
+Avg ME (F77/C++) = 2.0288183143129572
+Relative difference = 1.5492417626371624e-07
OK (relative difference <= 5E-3)
=========================================================================
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_noBlas.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_noBlas.txt
new file mode 100644
index 0000000000..884be120f5
--- /dev/null
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_noBlas.txt
@@ -0,0 +1,225 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
+
+HASBLAS=hasNoBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
+BACKEND=cpp512y (was cppauto)
+OMPFLAGS=
+FPTYPE='m'
+HELINL='0'
+HRDCOD='0'
+HASCURAND=hasCurand
+HASHIPRAND=hasNoHiprand
+HASBLAS=hasNoBlas
+Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
+make: Nothing to be done for 'gtestlibs'.
+
+make USEBUILDDIR=1 BACKEND=cuda
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+
+make USEBUILDDIR=1 BACKEND=cppnone
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+
+make USEBUILDDIR=1 BACKEND=cppsse4
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+
+make USEBUILDDIR=1 BACKEND=cppavx2
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+
+make USEBUILDDIR=1 BACKEND=cpp512y
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+
+make USEBUILDDIR=1 BACKEND=cpp512z
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+
+DATE: 2025-09-24_09:21:30
+
+HASBLAS=hasNoBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+=========================================================================
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+EvtsPerSec[Rmb+ME] (23) = ( 1.887556e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.151817e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.170495e+07 ) sec^-1
+MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0
+TOTAL : 0.539147 sec
+ 2,253,385,532 cycles # 2.821 GHz
+ 3,406,414,544 instructions # 1.51 insn per cycle
+ 0.855586365 seconds time elapsed
+.........................................................................
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 68
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 48
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
+-------------------------------------------------------------------------
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe
+[ PASSED ] 4 tests.
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
+DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2
+Avg ME (C++/GPU) = 2.028811e+00
+Avg ME (F77/GPU) = 2.0288499490125105
+Relative difference = 1.9197950183795553e-05
+OK (relative difference <= 5E-3)
+=========================================================================
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe
+=========================================================================
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
+EvtsPerSec[Rmb+ME] (23) = ( 1.834733e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.885495e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.885495e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0
+TOTAL : 5.801919 sec
+ 16,697,530,123 cycles # 2.876 GHz
+ 46,673,467,044 instructions # 2.80 insn per cycle
+ 5.807192464 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 664) (avx2: 0) (512y: 0) (512z: 0)
+-------------------------------------------------------------------------
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe
+[ PASSED ] 4 tests.
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
+DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+Avg ME (C++/C++) = 2.028820e+00
+Avg ME (F77/C++) = 2.0288198669441044
+Relative difference = 6.558289825352968e-08
+OK (relative difference <= 5E-3)
+=========================================================================
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
+EvtsPerSec[Rmb+ME] (23) = ( 4.095899e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.376126e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.376126e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0
+TOTAL : 2.647195 sec
+ 7,612,237,553 cycles # 2.871 GHz
+ 18,461,327,399 instructions # 2.43 insn per cycle
+ 2.652402692 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 3497) (avx2: 0) (512y: 0) (512z: 0)
+-------------------------------------------------------------------------
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe
+[ PASSED ] 4 tests.
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
+DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+Avg ME (C++/C++) = 2.028819e+00
+Avg ME (F77/C++) = 2.0288193075684831
+Relative difference = 1.515997647531052e-07
+OK (relative difference <= 5E-3)
+=========================================================================
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
+EvtsPerSec[Rmb+ME] (23) = ( 8.573333e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.792686e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.792686e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
+TOTAL : 1.309469 sec
+ 3,553,379,916 cycles # 2.705 GHz
+ 7,630,260,456 instructions # 2.15 insn per cycle
+ 1.314555585 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3260) (512y: 0) (512z: 0)
+-------------------------------------------------------------------------
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe
+[ PASSED ] 4 tests.
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
+DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+Avg ME (C++/C++) = 2.028818e+00
+Avg ME (F77/C++) = 2.0288181869545951
+Relative difference = 9.214951531400725e-08
+OK (relative difference <= 5E-3)
+=========================================================================
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
+EvtsPerSec[Rmb+ME] (23) = ( 8.934339e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.026361e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.026361e+06 ) sec^-1
+MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
+TOTAL : 1.260503 sec
+ 3,420,943,817 cycles # 2.705 GHz
+ 7,469,256,523 instructions # 2.18 insn per cycle
+ 1.265599127 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3167) (512y: 0) (512z: 0)
+-------------------------------------------------------------------------
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe
+[ PASSED ] 4 tests.
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
+DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+Avg ME (C++/C++) = 2.028818e+00
+Avg ME (F77/C++) = 2.0288181869545951
+Relative difference = 9.214951531400725e-08
+OK (relative difference <= 5E-3)
+=========================================================================
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
+EvtsPerSec[Rmb+ME] (23) = ( 6.782654e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.516737e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.516737e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
+TOTAL : 1.632915 sec
+ 3,044,957,630 cycles # 1.860 GHz
+ 5,248,782,492 instructions # 1.72 insn per cycle
+ 1.638128824 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2075) (512y: 5) (512z: 2089)
+-------------------------------------------------------------------------
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe
+[ PASSED ] 4 tests.
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
+DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+Avg ME (C++/C++) = 2.028818e+00
+Avg ME (F77/C++) = 2.0288183143129572
+Relative difference = 1.5492417626371624e-07
+OK (relative difference <= 5E-3)
+=========================================================================
+
+TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt
index 55c92f68ec..759d4f650e 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=

-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
BACKEND=cpp512y (was cppauto)
OMPFLAGS=
FPTYPE='m'
@@ -7,238 +10,219 @@ HELINL='0'
HRDCOD='0'
HASCURAND=hasCurand
HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
make: Nothing to be done for 'gtestlibs'.

make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'

make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'

make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'

make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'

make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'

make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'

-DATE: 2024-10-06_09:44:35
+DATE: 2025-09-24_09:07:10

+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 --rmbhst OMP=
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 --rmbhst OMP=
WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.418560e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.722658e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.839243e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.205380e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.154806e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.174134e+07 ) sec^-1
MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0
-TOTAL : 0.630221 sec
-INFO: No Floating Point Exceptions have been reported
- 2,475,236,721 cycles # 2.897 GHz
- 3,823,734,565 instructions # 1.54 insn per cycle
- 0.911361538 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst
+TOTAL : 0.682695 sec
+ 2,718,889,879 cycles # 2.833 GHz
+ 4,277,705,603 instructions # 1.57 insn per cycle
+ 1.019275785 seconds time elapsed
+.........................................................................
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst
WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 124
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram1": launch__registers_per_thread 68
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
+==PROF== Profiling "diagram2": launch__registers_per_thread 48
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2
Avg ME (C++/GPU) = 2.028811e+00
-Avg ME (F77/GPU) = 2.0288499356247485
-Relative difference = 1.9191351362116207e-05
+Avg ME (F77/GPU) = 2.0288499490125105
+Relative difference = 1.9197950183795553e-05
OK (relative difference <= 5E-3)
=========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 1.933112e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.987540e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.987540e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.831310e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.881527e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.881527e+05 ) sec^-1
MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0
-TOTAL : 5.510619 sec
-INFO: No Floating Point Exceptions have been reported
- 16,239,692,933 cycles # 2.945 GHz
- 45,332,021,728 instructions # 2.79 insn per cycle
- 5.516250908 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 591) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 5.813015 sec
+ 16,694,502,398 cycles # 2.870 GHz
+ 46,672,918,750 instructions # 2.80 insn per cycle
+ 5.818316700 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 664) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 2.028820e+00
Avg ME (F77/C++) = 2.0288198669441044
Relative difference = 6.558289825352968e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 4.528380e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.868469e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.868469e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.098841e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.378793e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.378793e+05 ) sec^-1
MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0
-TOTAL : 2.402631 sec
-INFO: No Floating Point Exceptions have been reported
- 7,087,618,340 cycles # 2.944 GHz
- 17,790,727,043 instructions # 2.51 insn per cycle
- 2.408346877 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 3144) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 2.645691 sec
+ 7,609,537,591 cycles # 2.871 GHz
+ 18,461,181,243 instructions # 2.43 insn per cycle
+ 2.651091423 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 3497) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 2.028819e+00
Avg ME (F77/C++) = 2.0288193075684831
Relative difference = 1.515997647531052e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 8.367783e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.536121e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.536121e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.498934e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.696715e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.696715e+05 ) sec^-1
MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.339197 sec
-INFO: No Floating Point Exceptions have been reported
- 3,748,433,186 cycles # 2.789 GHz
- 8,262,218,774 instructions # 2.20 insn per cycle
- 1.344812605 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3367) (512y: 0) (512z: 0)
+TOTAL : 1.319589 sec
+ 3,558,633,235 cycles # 2.688 GHz
+ 7,630,430,725 instructions # 2.14 insn per cycle
+ 1.324822819 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3260) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 2.028818e+00
Avg ME (F77/C++) = 2.0288181869545951
Relative difference = 9.214951531400725e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 8.816225e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.011910e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.011910e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.947671e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.027956e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.027956e+06 ) sec^-1
MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.274973 sec
-INFO: No Floating Point Exceptions have been reported
- 3,561,414,995 cycles # 2.782 GHz
- 7,912,015,045 instructions # 2.22 insn per cycle
- 1.280637958 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3209) (512y: 20) (512z: 0)
+TOTAL : 1.258901 sec
+ 3,419,186,035 cycles # 2.707 GHz
+ 7,468,947,821 instructions # 2.18 insn per cycle
+ 1.264161665 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3167) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 2.028818e+00
Avg ME (F77/C++) = 2.0288181869545951
Relative difference = 9.214951531400725e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 --rmbhst OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 6.504790e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.157762e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.157762e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.771440e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.505025e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.505025e+05 ) sec^-1
MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.699252 sec
-INFO: No Floating Point Exceptions have been reported
- 3,270,672,138 cycles # 1.919 GHz
- 6,095,863,693 instructions # 1.86 insn per cycle
- 1.704973507 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2251) (512y: 22) (512z: 2155)
+TOTAL : 1.635920 sec
+ 3,047,206,646 cycles # 1.858 GHz
+ 5,249,432,639 instructions # 1.72 insn per cycle
+ 1.641242692 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2075) (512y: 5) (512z: 2089)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 2.028818e+00
-Avg ME (F77/C++) = 2.0288183148950338
-Relative difference = 1.5521108056421764e-07
+Avg ME (F77/C++) = 2.0288183143129572
+Relative difference = 1.5492417626371624e-07
OK (relative difference <= 5E-3)
=========================================================================
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt
index 5e80ecf473..c9756b5eaf 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=

-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
BACKEND=cpp512y (was cppauto)
OMPFLAGS=
FPTYPE='m'
@@ -7,236 +10,215 @@ HELINL='0'
HRDCOD='0'
HASCURAND=hasCurand
HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
make: Nothing to be done for 'gtestlibs'.

make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'

make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'

make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'

make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'

make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'

make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'

-DATE: 2024-10-06_08:59:52
+DATE: 2025-09-24_07:47:34

+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.326131e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.746336e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.856838e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.861885e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.158531e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.176528e+07 ) sec^-1
MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0
-TOTAL : 0.493394 sec
-INFO: No Floating Point Exceptions have been reported
- 2,062,281,894 cycles # 2.861 GHz
- 2,938,913,241 instructions # 1.43 insn per cycle
- 0.784913836 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 126
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.540067 sec
+ 2,319,774,330 cycles # 2.822 GHz
+ 3,451,101,558 instructions # 1.49 insn per cycle
+ 0.879382497 seconds time elapsed
+.........................................................................
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 68
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 48
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/runTest_cuda.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2
Avg ME (C++/GPU) = 2.028811e+00
-Avg ME (F77/GPU) = 2.0288499356247485
-Relative difference = 1.9191351362116207e-05
+Avg ME (F77/GPU) = 2.0288499490125105
+Relative difference = 1.9197950183795553e-05
OK (relative difference <= 5E-3)
=========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd1/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd1/check_hip.exe
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 1.953822e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.011638e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.011638e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.863519e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.915445e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.915445e+05 ) sec^-1
MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0
-TOTAL : 5.471139 sec
-INFO: No Floating Point Exceptions have been reported
- 16,020,529,034 cycles # 2.925 GHz
- 44,492,038,074 instructions # 2.78 insn per cycle
- 5.480388445 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 536) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 5.714163 sec
+ 16,436,244,073 cycles # 2.874 GHz
+ 45,749,025,669 instructions # 2.78 insn per cycle
+ 5.719759349 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 609) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 2.028820e+00
Avg ME (F77/C++) = 2.0288198669441044
Relative difference = 6.558289825352968e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 5.317220e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.788673e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.788673e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.094180e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.374844e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.374844e+05 ) sec^-1
MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0
-TOTAL : 2.075008 sec
-INFO: No Floating Point Exceptions have been reported
- 6,135,177,420 cycles # 2.947 GHz
- 17,131,917,948 instructions # 2.79 insn per cycle
- 2.082995277 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 2863) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 2.648909 sec
+ 7,622,554,704 cycles # 2.873 GHz
+ 18,456,491,216 instructions # 2.42 insn per cycle
+ 2.654484927 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 3494) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 2.028819e+00
Avg ME (F77/C++) = 2.0288193075684831
Relative difference = 1.515997647531052e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 6.077036e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.672972e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.672972e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.518596e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.723041e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.723041e+05 ) sec^-1
MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.827961 sec
-INFO: No Floating Point Exceptions have been reported
- 5,098,745,585 cycles # 2.778 GHz
- 10,277,927,063 instructions # 2.02 insn per cycle
- 1.836088116 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3907) (512y: 0) (512z: 0)
+TOTAL : 1.318146 sec
+ 3,562,654,657 cycles # 2.694 GHz
+ 7,627,422,650 instructions # 2.14 insn per cycle
+ 1.323687691 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3243) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 2.028818e+00
Avg ME (F77/C++) = 2.0288181869545951
Relative difference = 9.214951531400725e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 6.138089e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.753320e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.753320e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.928783e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.025973e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.025973e+06 ) sec^-1
MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.811229 sec
-INFO: No Floating Point Exceptions have been reported
- 5,047,478,028 cycles # 2.778 GHz
- 10,048,355,032 instructions # 1.99 insn per cycle
- 1.819572790 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3806) (512y: 2) (512z: 0)
+TOTAL : 1.261737 sec
+ 3,419,342,835 cycles # 2.700 GHz
+ 7,466,037,637 instructions # 2.18 insn per cycle
+ 1.267292458 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3148) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 2.028818e+00
Avg ME (F77/C++) = 2.0288181869545951
Relative difference = 9.214951531400725e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 4.690006e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.022722e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.022722e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.769269e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.500411e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.500411e+05 ) sec^-1
MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 2.339710 sec
-INFO: No Floating Point Exceptions have been reported
- 4,430,484,038 cycles # 1.888 GHz
- 8,494,687,635 instructions # 1.92 insn per cycle
- 2.347901015 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2746) (512y: 4) (512z: 2754)
+TOTAL : 1.636941 sec
+ 3,055,793,005 cycles # 1.861 GHz
+ 5,247,434,771 instructions # 1.72 insn per cycle
+ 1.642577316 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2056) (512y: 5) (512z: 2089)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 2.028818e+00
-Avg ME (F77/C++) = 2.0288183148950338
-Relative difference = 1.5521108056421764e-07
+Avg ME (F77/C++) = 2.0288183143129572
+Relative difference = 1.5492417626371624e-07
OK (relative difference <= 5E-3)
=========================================================================
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt
index 8666f655aa..c154d25891 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
BACKEND=cpp512y (was cppauto)
OMPFLAGS=
FPTYPE='m'
@@ -7,236 +10,215 @@ HELINL='0'
HRDCOD='0'
HASCURAND=hasCurand
HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
make: Nothing to be done for 'gtestlibs'.
make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-DATE: 2024-10-06_09:29:20
+DATE: 2025-09-24_08:47:00
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=0]
Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.502979e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.757241e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.878370e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.866634e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.147053e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.165837e+07 ) sec^-1
MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0
-TOTAL : 0.488684 sec
-INFO: No Floating Point Exceptions have been reported
- 2,072,092,086 cycles # 2.888 GHz
- 2,980,809,123 instructions # 1.44 insn per cycle
- 0.774128701 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 124
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.538880 sec
+ 2,314,341,368 cycles # 2.820 GHz
+ 3,436,565,941 instructions # 1.48 insn per cycle
+ 0.878082792 seconds time elapsed
+.........................................................................
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 68
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 48
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/runTest_cuda.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd0/fcheck_cuda.exe 2 64 2
Avg ME (C++/GPU) = 2.028811e+00
-Avg ME (F77/GPU) = 2.0288499356247485
-Relative difference = 1.9191351362116207e-05
+Avg ME (F77/GPU) = 2.0288499490125105
+Relative difference = 1.9197950183795553e-05
OK (relative difference <= 5E-3)
=========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl1_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl1_hrd0/check_hip.exe
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 2.497944e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.591831e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.591831e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.008534e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.069275e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.069275e+05 ) sec^-1
MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0
-TOTAL : 4.301034 sec
-INFO: No Floating Point Exceptions have been reported
- 12,652,758,977 cycles # 2.937 GHz
- 34,660,886,060 instructions # 2.74 insn per cycle
- 4.309086604 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 683) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 5.307798 sec
+ 15,234,969,835 cycles # 2.868 GHz
+ 42,317,180,883 instructions # 2.78 insn per cycle
+ 5.313067205 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 720) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 2.028820e+00
-Avg ME (F77/C++) = 2.0288199094356969
-Relative difference = 4.463890496342449e-08
+Avg ME (F77/C++) = 2.0288198612397537
+Relative difference = 6.839455762672188e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 5.170038e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.622090e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.622090e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.541314e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.891142e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.891142e+05 ) sec^-1
MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0
-TOTAL : 2.133248 sec
-INFO: No Floating Point Exceptions have been reported
- 6,307,478,134 cycles # 2.947 GHz
- 14,873,781,997 instructions # 2.36 insn per cycle
- 2.140857047 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 2975) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 2.396788 sec
+ 6,875,094,670 cycles # 2.863 GHz
+ 16,341,506,518 instructions # 2.38 insn per cycle
+ 2.402179834 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 3288) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 2.028819e+00
-Avg ME (F77/C++) = 2.0288193755550310
-Relative difference = 1.8511017053446366e-07
+Avg ME (F77/C++) = 2.0288193161832169
+Relative difference = 1.5584594630759239e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 7.248492e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.104502e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.104502e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.990295e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.036634e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.036634e+06 ) sec^-1
MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.548692 sec
-INFO: No Floating Point Exceptions have been reported
- 4,331,332,767 cycles # 2.784 GHz
- 9,119,017,787 instructions # 2.11 insn per cycle
- 1.556682967 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4456) (512y: 0) (512z: 0)
+TOTAL : 1.253414 sec
+ 3,371,756,397 cycles # 2.681 GHz
+ 6,839,443,832 instructions # 2.03 insn per cycle
+ 1.258730374 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3139) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 2.028818e+00
-Avg ME (F77/C++) = 2.0288182069780305
-Relative difference = 1.0201902325125583e-07
+Avg ME (F77/C++) = 2.0288181848591194
+Relative difference = 9.111665983220217e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 7.353371e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.251881e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.251881e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.825152e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.146044e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.146044e+06 ) sec^-1
MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.529166 sec
-INFO: No Floating Point Exceptions have been reported
- 4,288,032,705 cycles # 2.791 GHz
- 8,709,611,506 instructions # 2.03 insn per cycle
- 1.537124060 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4233) (512y: 0) (512z: 0)
+TOTAL : 1.154321 sec
+ 3,123,268,751 cycles # 2.696 GHz
+ 6,388,429,640 instructions # 2.05 insn per cycle
+ 1.159462810 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2886) (512y: 7) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 2.028818e+00
-Avg ME (F77/C++) = 2.0288182069780305
-Relative difference = 1.0201902325125583e-07
+Avg ME (F77/C++) = 2.0288181848591194
+Relative difference = 9.111665983220217e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 5.411255e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.862053e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.862053e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.255889e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.103086e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.103086e+05 ) sec^-1
MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 2.041395 sec
-INFO: No Floating Point Exceptions have been reported
- 3,904,121,018 cycles # 1.906 GHz
- 7,856,412,999 instructions # 2.01 insn per cycle
- 2.049301951 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4273) (512y: 0) (512z: 2558)
+TOTAL : 1.532528 sec
+ 2,868,652,585 cycles # 1.867 GHz
+ 4,790,852,156 instructions # 1.67 insn per cycle
+ 1.537786537 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1862) (512y: 5) (512z: 1976)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 2.028818e+00
-Avg ME (F77/C++) = 2.0288183246739209
-Relative difference = 1.6003107281264138e-07
+Avg ME (F77/C++) = 2.0288183073280379
+Relative difference = 1.514813246576993e-07
OK (relative difference <= 5E-3)
=========================================================================
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt
index 74b1cf75ec..58c504e753 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
BACKEND=cpp512y (was cppauto)
OMPFLAGS=
FPTYPE='m'
@@ -7,236 +10,215 @@ HELINL='0'
HRDCOD='0'
HASCURAND=hasCurand
HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
make: Nothing to be done for 'gtestlibs'.
make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-DATE: 2024-10-06_09:29:40
+DATE: 2025-09-24_08:47:30
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=1]
Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.573239e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.755917e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.881516e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.874108e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.155948e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.175666e+07 ) sec^-1
MeanMatrixElemValue = ( 2.086719e+00 +- 3.413389e-03 ) GeV^0
-TOTAL : 0.487451 sec
-INFO: No Floating Point Exceptions have been reported
- 2,067,657,057 cycles # 2.894 GHz
- 2,969,147,079 instructions # 1.44 insn per cycle
- 0.771604792 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 126
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.548499 sec
+ 2,349,082,705 cycles # 2.826 GHz
+ 3,444,335,519 instructions # 1.47 insn per cycle
+ 0.888498013 seconds time elapsed
+.........................................................................
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 68
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 48
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/runTest_cuda.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl1_hrd1/fcheck_cuda.exe 2 64 2
Avg ME (C++/GPU) = 2.028811e+00
-Avg ME (F77/GPU) = 2.0288499356247485
-Relative difference = 1.9191351362116207e-05
+Avg ME (F77/GPU) = 2.0288499490125105
+Relative difference = 1.9197950183795553e-05
OK (relative difference <= 5E-3)
=========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl1_hrd1/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl1_hrd1/check_hip.exe
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 2.674902e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.781976e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.781976e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.004031e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.064190e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.064190e+05 ) sec^-1
MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0
-TOTAL : 4.022349 sec
-INFO: No Floating Point Exceptions have been reported
- 11,884,847,246 cycles # 2.950 GHz
- 35,128,022,846 instructions # 2.96 insn per cycle
- 4.030241157 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 453) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 5.318944 sec
+ 15,280,578,599 cycles # 2.871 GHz
+ 42,518,914,092 instructions # 2.78 insn per cycle
+ 5.324125018 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 642) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 2.028820e+00
-Avg ME (F77/C++) = 2.0288199094356969
-Relative difference = 4.463890496342449e-08
+Avg ME (F77/C++) = 2.0288198620546609
+Relative difference = 6.799289200198014e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 5.473588e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.982990e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.982990e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.572000e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.924083e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.924083e+05 ) sec^-1
MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0
-TOTAL : 2.018275 sec
-INFO: No Floating Point Exceptions have been reported
- 5,977,087,994 cycles # 2.951 GHz
- 14,582,659,278 instructions # 2.44 insn per cycle
- 2.026172081 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 2569) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 2.380861 sec
+ 6,844,477,071 cycles # 2.870 GHz
+ 16,273,997,793 instructions # 2.38 insn per cycle
+ 2.386295475 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 3006) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 2.028819e+00
-Avg ME (F77/C++) = 2.0288193583255634
-Relative difference = 1.7661780742548925e-07
+Avg ME (F77/C++) = 2.0288193161832169
+Relative difference = 1.5584594630759239e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 7.377553e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.279187e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.279187e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.075588e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.046375e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.046375e+06 ) sec^-1
MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.524553 sec
-INFO: No Floating Point Exceptions have been reported
- 4,234,763,555 cycles # 2.764 GHz
- 8,897,798,804 instructions # 2.10 insn per cycle
- 1.532761317 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3552) (512y: 0) (512z: 0)
+TOTAL : 1.242091 sec
+ 3,350,889,709 cycles # 2.689 GHz
+ 6,722,008,368 instructions # 2.01 insn per cycle
+ 1.247472305 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2661) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 2.028818e+00
-Avg ME (F77/C++) = 2.0288182107033208
-Relative difference = 1.0385521077446488e-07
+Avg ME (F77/C++) = 2.0288181760115549
+Relative difference = 8.67557144645807e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 7.495273e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.420338e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.420338e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.976555e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.166609e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.166609e+06 ) sec^-1
MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.502506 sec
-INFO: No Floating Point Exceptions have been reported
- 4,214,392,060 cycles # 2.792 GHz
- 8,461,762,117 instructions # 2.01 insn per cycle
- 1.510417354 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3296) (512y: 0) (512z: 0)
+TOTAL : 1.138185 sec
+ 3,060,302,865 cycles # 2.678 GHz
+ 6,320,589,737 instructions # 2.07 insn per cycle
+ 1.143708310 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2450) (512y: 7) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 2.028818e+00
-Avg ME (F77/C++) = 2.0288182107033208
-Relative difference = 1.0385521077446488e-07
+Avg ME (F77/C++) = 2.0288181760115549
+Relative difference = 8.67557144645807e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 5.487070e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.949626e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.949626e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.207768e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.045909e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.045909e+05 ) sec^-1
MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 2.014420 sec
-INFO: No Floating Point Exceptions have been reported
- 3,856,759,695 cycles # 1.908 GHz
- 7,749,847,516 instructions # 2.01 insn per cycle
- 2.022398856 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3289) (512y: 0) (512z: 2110)
+TOTAL : 1.542518 sec
+ 2,878,874,944 cycles # 1.861 GHz
+ 4,765,123,226 instructions # 1.66 insn per cycle
+ 1.547749585 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1355) (512y: 22) (512z: 1745)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183204829693 -Relative difference = 1.5796536184903122e-07 +Avg ME (F77/C++) = 2.0288183073280379 +Relative difference = 1.514813246576993e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.scaling new file mode 100644 index 0000000000..8e04255902 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-09-24_08:17:15 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +8.034851e+05 1 256 +1.570032e+06 2 256 +3.267619e+06 4 256 +6.473638e+06 8 256 +9.903743e+06 16 256 +1.026330e+07 32 256 +9.922907e+06 64 256 +1.019578e+07 128 256 +1.041824e+07 256 256 +1.041387e+07 512 256 +1.043869e+07 1024 256 +### GPU: scaling test 32 +1.089447e+05 1 32 +2.100696e+05 2 32 +4.218867e+05 4 32 +8.222443e+05 8 32 +1.614383e+06 16 32 +3.233783e+06 32 32 +6.305788e+06 64 32 +1.005593e+07 128 32 +1.031516e+07 256 32 +9.639949e+06 512 32 +1.002191e+07 1024 32 +1.025487e+07 2048 32 +1.030770e+07 4096 32 +1.032661e+07 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.651184e+05 1 256 +1.714247e+05 2 256 +1.762711e+05 4 256 +### CPU: scaling test 32 +1.670730e+05 1 32 +1.646840e+05 2 32 +1.678567e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.882857e+05 1 256 +2.987691e+05 2 256 +2.934247e+05 4 256 +### CPU: scaling test 32 +2.791152e+05 1 32 +2.799466e+05 2 32 +2.522058e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +5.293894e+05 1 256 +4.911285e+05 2 256 +5.301102e+05 4 256 +### CPU: scaling test 32 +5.227648e+05 1 32 +4.919785e+05 2 32 +5.143165e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +5.210348e+05 1 256 +5.219334e+05 2 256 +5.165907e+05 4 256 +### CPU: scaling test 32 +5.552375e+05 1 32 +5.620593e+05 2 32 +5.696104e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 
256 +3.434605e+05 1 256 +3.468478e+05 2 256 +3.526863e+05 4 256 +### CPU: scaling test 32 +3.476674e+05 1 32 +3.530762e+05 2 32 +3.452219e+05 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index 46bc87b45e..9512e6842c 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
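
Each "### GPU/CPU: scaling test" block in the .scaling files above lists one measurement per line as <EvtsPerSec> <nblocks> <nthreadsperblock>. A hypothetical helper (the format is assumed from the logs; this is not a repository script) to locate the throughput plateau:

    def scaling_plateau(block):
        """Return peak throughput and the grid sizes within 10% of it."""
        rows = [line.split() for line in block.strip().splitlines()]
        data = [(float(t), int(b), int(g)) for t, b, g in rows]
        peak = max(t for t, _, _ in data)
        near = [(b, g) for t, b, g in data if t >= 0.9 * peak]
        return peak, near

    # e.g. the CUDA "scaling test 256" block above peaks near 1.04e7 events/s
    # and is already within 10% of that plateau from 16 blocks upwards
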
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_08:58:41 +DATE: 2025-09-24_07:45:51 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.456560e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.379988e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.000705e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.420550e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.051392e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.058569e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.534501 sec -INFO: No Floating Point Exceptions have been reported - 2,219,584,721 cycles # 2.878 GHz - 3,138,987,562 instructions # 1.41 insn per cycle - 0.829330920 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.624089 sec + 2,740,298,329 cycles # 2.837 GHz + 4,376,999,754 instructions # 1.60 insn per cycle + 1.027657731 seconds time elapsed +......................................................................... 
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 97 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 70 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.028807e+00 -Avg ME (F77/GPU) = 2.0288063423243874 -Relative difference = 3.241686432649386e-07 +Avg ME (F77/GPU) = 2.0288063943199761 +Relative difference = 2.9853999119330943e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.813220e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 
1.859845e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.859845e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.739456e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.782512e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.782512e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.928691 sec -INFO: No Floating Point Exceptions have been reported - 17,514,594,449 cycles # 2.949 GHz - 46,201,641,620 instructions # 2.64 insn per cycle - 5.940965337 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 617) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.137137 sec + 17,632,069,705 cycles # 2.871 GHz + 47,071,786,706 instructions # 2.67 insn per cycle + 6.142721284 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 690) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.229159e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.395479e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.395479e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.969738e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.110688e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.110688e+05 ) sec^-1 
MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.394540 sec -INFO: No Floating Point Exceptions have been reported - 10,052,901,757 cycles # 2.953 GHz - 27,702,324,481 instructions # 2.76 insn per cycle - 3.406321535 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2581) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.640304 sec + 10,461,431,755 cycles # 2.870 GHz + 28,885,201,179 instructions # 2.76 insn per cycle + 3.646099818 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2958) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.062332e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.465524e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.465524e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.866190e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.240249e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.240249e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.217135 sec -INFO: No Floating Point Exceptions have been reported - 6,171,509,914 cycles # 2.770 GHz - 12,603,170,569 instructions # 2.04 insn per cycle - 2.229995554 seconds time 
elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2773) (512y: 0) (512z: 0) +TOTAL : 2.263992 sec + 6,074,776,768 cycles # 2.677 GHz + 12,331,537,932 instructions # 2.03 insn per cycle + 2.269808090 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2878) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288064057068964 Relative difference = 2.9292737240031234e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.580384e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.068896e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.068896e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.181720e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.599871e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.599871e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.025421 sec -INFO: No Floating Point Exceptions have been reported - 5,651,741,681 cycles # 2.776 GHz - 12,038,443,177 instructions # 2.13 insn per cycle - 2.038138408 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2518) (512y: 146) (512z: 0) +TOTAL : 2.132428 sec + 5,740,054,892 cycles # 2.685 GHz + 11,983,118,985 instructions # 2.09 insn per cycle + 2.138292263 seconds time elapsed +=Symbols in 
CPPProcess_cpp.o= (~sse4: 0) (avx2: 2710) (512y: 49) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288064057068964 Relative difference = 2.9292737240031234e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.630973e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.831034e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.831034e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.349584e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.520024e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.520024e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.034632 sec -INFO: No Floating Point Exceptions have been reported - 5,740,712,408 cycles # 1.885 GHz - 8,225,599,297 instructions # 1.43 insn per cycle - 3.047056631 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1671) (512y: 126) (512z: 1862) +TOTAL : 3.239992 sec + 5,775,149,982 cycles # 1.780 GHz + 7,815,004,616 instructions # 1.35 insn per cycle + 3.245774275 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1669) (512y: 69) (512z: 1931) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288064057068964 Relative difference = 2.9292737240031234e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.scaling new file mode 100644 index 0000000000..238832770b --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
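
The hardware-counter lines in each run (cycles, instructions, elapsed time) are raw counts, and the figures printed after "#" follow by simple division. Checking the 512z numbers quoted just above (values copied verbatim, no new measurements):

    cycles, instructions = 5_775_149_982, 7_815_004_616
    elapsed_s = 3.245774275
    print(f"{instructions / cycles:.2f} insn per cycle")  # 1.35, as printed above
    print(f"{cycles / elapsed_s / 1e9:.2f} GHz")          # ~1.78; perf divides by
                                                          # task-clock, hence 1.780
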
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-09-24_08:37:03 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +2.361105e+05 1 256 +4.555334e+05 2 256 +8.473441e+05 4 256 +1.550614e+06 8 256 +2.631040e+06 16 256 +4.137873e+06 32 256 +4.813875e+06 64 256 +5.487026e+06 128 256 +5.764957e+06 256 256 +5.992209e+06 512 256 +6.106508e+06 1024 256 +### GPU: scaling test 32 +2.958875e+04 1 32 +6.085178e+04 2 32 +1.192154e+05 4 32 +2.314980e+05 8 32 +4.464807e+05 16 32 +8.917880e+05 32 32 +1.537449e+06 64 32 +2.798692e+06 128 32 +3.996433e+06 256 32 +4.779533e+06 512 32 +5.364461e+06 1024 32 +5.738288e+06 2048 32 +5.946428e+06 4096 32 +6.037437e+06 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.685006e+05 1 256 +1.723354e+05 2 256 +1.638859e+05 4 256 +### CPU: scaling test 32 +1.369740e+05 1 32 +1.491240e+05 2 32 +1.579015e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.727240e+05 1 256 +3.013403e+05 2 256 +2.999575e+05 4 256 +### CPU: scaling test 32 +2.693875e+05 1 32 +2.195495e+05 2 32 +2.692487e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +4.889061e+05 1 256 +4.863458e+05 2 256 +5.265835e+05 4 256 +### CPU: scaling test 32 +4.856285e+05 1 32 +5.290699e+05 2 32 +5.203400e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +5.207613e+05 1 256 +5.180923e+05 2 256 +5.299914e+05 4 256 
+### CPU: scaling test 32 +5.124756e+05 1 32 +5.235731e+05 2 32 +5.103201e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.408412e+05 1 256 +3.500331e+05 2 256 +3.527965e+05 4 256 +### CPU: scaling test 32 +3.110934e+05 1 32 +2.928378e+05 2 32 +3.461602e+05 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.txt new file mode 100644 index 0000000000..a87f7a8546 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_blasOn.txt @@ -0,0 +1,225 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-09-24_08:31:51 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 5.870978e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.293708e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.319514e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.390916 sec + 5,054,727,678 cycles # 2.837 GHz + 7,164,050,823 instructions # 1.42 insn per cycle + 1.855053661 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 97 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 70 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe +[ PASSED ] 4 tests. 
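
The runNcu step above collects two SASS metrics per kernel, and the profile shows that the former monolithic sigmaKin kernel (214 registers per thread in the old logs) is now split into diagram1, diagram2 and color_sum_kernel (97, 70 and 26 registers). A sketch of reproducing one such measurement with the Nsight Compute CLI via Python (it assumes ncu is on PATH and the executable path matches the logs above):

    import subprocess

    subprocess.run(
        [
            "ncu",
            "--metrics",
            "launch__registers_per_thread,sm__sass_average_branch_targets_threads_uniform.pct",
            "--kernel-name", "color_sum_kernel",  # or diagram1 / diagram2
            "./build.cuda_m_inl0_hrd0/check_cuda.exe", "-p", "2048", "256", "1",
        ],
        check=True,
    )
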
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 2.028807e+00 +Avg ME (F77/GPU) = 2.0288064033535846 +Relative difference = 2.940873209649997e-07 +OK (relative difference <= 5E-3) +========================================================================= +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +EvtsPerSec[Rmb+ME] (23) = ( 1.739239e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.782320e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.782320e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 6.137107 sec + 17,636,122,301 cycles # 2.872 GHz + 47,073,973,330 instructions # 2.67 insn per cycle + 6.142795081 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 690) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063903750300 +Relative difference = 3.0048445715164216e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 2.974256e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.114750e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.114750e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.634370 sec + 10,446,510,479 cycles # 2.871 GHz + 28,885,820,804 instructions # 2.77 insn per cycle + 3.640145208 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2958) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
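
The "MEK (channelid array)" tallies repeated throughout these logs always split 512 events as { 1 : 192, 2 : 160, 3 : 160 }. That pattern is consistent with channel ids assigned round-robin in pages of 32 events; a purely illustrative re-derivation (the page size is an inference from the counts, not taken from the code):

    from collections import Counter

    nevt, nchan, page = 512, 3, 32  # page size inferred, not from the code
    counts = Counter((ievt // page) % nchan + 1 for ievt in range(nevt))
    assert counts == {1: 192, 2: 160, 3: 160}
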
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063903750300 +Relative difference = 3.0048445715164216e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 4.880489e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.259855e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.259855e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.257451 sec + 6,073,853,734 cycles # 2.685 GHz + 12,332,121,068 instructions # 2.03 insn per cycle + 2.263129601 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2878) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288064057068964 +Relative difference = 2.9292737240031234e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 5.165320e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.583342e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.583342e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.138471 sec + 5,745,518,950 cycles # 2.681 GHz + 11,981,716,362 instructions # 2.09 insn per cycle + 2.143992444 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2710) (512y: 49) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288064057068964 +Relative difference = 2.9292737240031234e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.356813e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.527339e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.527339e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.232530 sec + 5,761,140,182 cycles # 1.780 GHz + 7,815,145,650 instructions # 1.36 insn per cycle + 3.238204918 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1669) (512y: 69) (512z: 1931) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288064057068964 +Relative difference = 2.9292737240031234e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_noBlas.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_noBlas.txt new file mode 100644 index 0000000000..b5b337f1e5 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0_noBlas.txt @@ -0,0 +1,225 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasNoBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasNoBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. 
+ +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + +DATE: 2025-09-24_09:20:54 + +HASBLAS=hasNoBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 9.264552e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.050944e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.059373e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.628494 sec + 2,717,437,197 cycles # 2.834 GHz + 4,420,610,573 instructions # 1.63 insn per cycle + 1.016415872 seconds time elapsed +......................................................................... 
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 97
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 70
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
+-------------------------------------------------------------------------
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe
+[ PASSED ] 4 tests.
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
+DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2
+Avg ME (C++/GPU) = 2.028807e+00
+Avg ME (F77/GPU) = 2.0288063943199761
+Relative difference = 2.9853999119330943e-07
+OK (relative difference <= 5E-3)
+=========================================================================
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe
+=========================================================================
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+FP precision = MIXED (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
+EvtsPerSec[Rmb+ME] (23) = ( 1.741970e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.785210e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.785210e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
+TOTAL : 6.127535 sec
+ 17,619,918,709 cycles # 2.874 GHz
+ 47,072,142,698 instructions # 2.67 insn per cycle
+ 6.132916216 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 690) (avx2: 0) (512y: 0) (512z: 0)
+-------------------------------------------------------------------------
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe
+[ PASSED ] 4 tests.
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
+DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
+Avg ME (C++/C++) = 2.028807e+00
+Avg ME (F77/C++) = 2.0288063903750300
+Relative difference = 3.0048445715164216e-07
+OK (relative difference <= 5E-3)
+=========================================================================
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
+FP precision = MIXED (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
+EvtsPerSec[Rmb+ME] (23) = ( 2.973024e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.113129e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.113129e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
+TOTAL : 3.635704 sec
+ 10,451,978,137 cycles # 2.872 GHz
+ 28,884,800,063 instructions # 2.76 insn per cycle
+ 3.641136554 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 2958) (avx2: 0) (512y: 0) (512z: 0)
+-------------------------------------------------------------------------
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe
+[ PASSED ] 4 tests.
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
+DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
+Avg ME (C++/C++) = 2.028807e+00
+Avg ME (F77/C++) = 2.0288063903750300
+Relative difference = 3.0048445715164216e-07
+OK (relative difference <= 5E-3)
+=========================================================================
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+FP precision = MIXED (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
+EvtsPerSec[Rmb+ME] (23) = ( 4.893038e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.274118e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.274118e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
+TOTAL : 2.252307 sec
+ 6,067,632,575 cycles # 2.688 GHz
+ 12,331,256,225 instructions # 2.03 insn per cycle
+ 2.257676783 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2878) (512y: 0) (512z: 0)
+-------------------------------------------------------------------------
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe
+[ PASSED ] 4 tests.
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
+DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
+Avg ME (C++/C++) = 2.028807e+00
+Avg ME (F77/C++) = 2.0288064057068964
+Relative difference = 2.9292737240031234e-07
+OK (relative difference <= 5E-3)
+=========================================================================
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
+FP precision = MIXED (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
+EvtsPerSec[Rmb+ME] (23) = ( 5.165992e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.582198e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.582198e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
+TOTAL : 2.138263 sec
+ 5,738,986,912 cycles # 2.678 GHz
+ 11,983,127,376 instructions # 2.09 insn per cycle
+ 2.143816952 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2710) (512y: 49) (512z: 0)
+-------------------------------------------------------------------------
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest_cpp.exe
+[ PASSED ] 4 tests.
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
+DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
+Avg ME (C++/C++) = 2.028807e+00
+Avg ME (F77/C++) = 2.0288064057068964
+Relative difference = 2.9292737240031234e-07
+OK (relative difference <= 5E-3)
+=========================================================================
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
+FP precision = MIXED (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
+EvtsPerSec[Rmb+ME] (23) = ( 3.358397e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.528203e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.528203e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
+TOTAL : 3.229987 sec
+ 5,756,957,930 cycles # 1.780 GHz
+ 7,814,408,912 instructions # 1.36 insn per cycle
+ 3.235396193 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1669) (512y: 69) (512z: 1931)
+-------------------------------------------------------------------------
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest_cpp.exe
+[ PASSED ] 4 tests.
+DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
+DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
+Avg ME (C++/C++) = 2.028807e+00
+Avg ME (F77/C++) = 2.0288064057068964
+Relative difference = 2.9292737240031234e-07
+OK (relative difference <= 5E-3)
+=========================================================================
+
+TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt
index ffa5410982..82fc0394a2 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
 
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
 BACKEND=cpp512y (was cppauto)
 OMPFLAGS=
 FPTYPE='m'
@@ -7,233 +10,212 @@ HELINL='0'
 HRDCOD='0'
 HASCURAND=hasCurand
 HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
 Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
 make: Nothing to be done for 'gtestlibs'.
 
 make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 
 make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 
 make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 
 make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 
 make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 
 make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 
-DATE: 2024-10-06_08:59:06
+DATE: 2025-09-24_07:46:30
 
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.422071e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.351796e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.985674e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.382296e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.046566e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.053685e+07 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 0.535641 sec
-INFO: No Floating Point Exceptions have been reported
- 2,214,747,611 cycles # 2.879 GHz
- 3,172,033,471 instructions # 1.43 insn per cycle
- 0.829540839 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 212
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.623847 sec
+ 2,768,474,538 cycles # 2.832 GHz
+ 4,429,449,797 instructions # 1.60 insn per cycle
+ 1.035793687 seconds time elapsed
+.........................................................................
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 98
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 70
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/runTest_cuda.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2
 Avg ME (C++/GPU) = 2.028807e+00
-Avg ME (F77/GPU) = 2.0288063423243874
-Relative difference = 3.241686432649386e-07
+Avg ME (F77/GPU) = 2.0288063943199761
+Relative difference = 2.9853999119330943e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd1/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd1/check_hip.exe
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 1.862163e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.911340e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.911340e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.795413e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.841835e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.841835e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 5.777703 sec
-INFO: No Floating Point Exceptions have been reported
- 17,097,861,095 cycles # 2.954 GHz
- 45,230,787,591 instructions # 2.65 insn per cycle
- 5.789414615 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 568) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 5.947704 sec
+ 17,088,961,971 cycles # 2.871 GHz
+ 45,931,713,417 instructions # 2.69 insn per cycle
+ 5.953595197 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 633) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 2.028807e+00
 Avg ME (F77/C++) = 2.0288063903750300
 Relative difference = 3.0048445715164216e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.356972e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.536408e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.536408e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.986501e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.128158e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.128158e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 3.270231 sec
-INFO: No Floating Point Exceptions have been reported
- 9,665,855,757 cycles # 2.946 GHz
- 26,370,377,514 instructions # 2.73 insn per cycle
- 3.281726897 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 2386) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 3.619942 sec
+ 10,404,246,476 cycles # 2.871 GHz
+ 28,876,204,491 instructions # 2.78 insn per cycle
+ 3.625766762 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 2953) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 2.028807e+00
 Avg ME (F77/C++) = 2.0288063903750300
 Relative difference = 3.0048445715164216e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 4.515319e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.832036e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.832036e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.863046e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.235612e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.235612e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 2.470441 sec
-INFO: No Floating Point Exceptions have been reported
- 6,884,599,220 cycles # 2.774 GHz
- 14,150,233,239 instructions # 2.06 insn per cycle
- 2.482504065 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2896) (512y: 0) (512z: 0)
+TOTAL : 2.265397 sec
+ 6,071,670,861 cycles # 2.675 GHz
+ 12,327,353,870 instructions # 2.03 insn per cycle
+ 2.271397009 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2866) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 2.028807e+00
 Avg ME (F77/C++) = 2.0288064057068964
 Relative difference = 2.9292737240031234e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 4.744762e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.096792e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.096792e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.132632e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.541433e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.541433e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 2.356796 sec
-INFO: No Floating Point Exceptions have been reported
- 6,551,408,744 cycles # 2.767 GHz
- 13,642,717,150 instructions # 2.08 insn per cycle
- 2.368190066 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2535) (512y: 302) (512z: 0)
+TOTAL : 2.149810 sec
+ 5,735,457,307 cycles # 2.663 GHz
+ 11,976,172,204 instructions # 2.09 insn per cycle
+ 2.154912250 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2696) (512y: 49) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 2.028807e+00
 Avg ME (F77/C++) = 2.0288064057068964
 Relative difference = 2.9292737240031234e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_SM_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.568399e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.763148e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.763148e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.326526e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.493190e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.493190e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 3.086761 sec
-INFO: No Floating Point Exceptions have been reported
- 5,741,113,391 cycles # 1.854 GHz
- 9,326,512,235 instructions # 1.62 insn per cycle
- 3.098253222 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1456) (512y: 212) (512z: 2060)
+TOTAL : 3.259408 sec
+ 5,774,724,967 cycles # 1.770 GHz
+ 7,811,134,891 instructions # 1.35 insn per cycle
+ 3.264533598 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1655) (512y: 69) (512z: 1931)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 2.028807e+00
 Avg ME (F77/C++) = 2.0288064057068964
 Relative difference = 2.9292737240031234e-07
diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.scaling
new file mode 100644
index 0000000000..0430c29866
--- /dev/null
+++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.scaling
@@ -0,0 +1,137 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
+
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
+BACKEND=cpp512y (was cppauto)
+OMPFLAGS=
+FPTYPE='m'
+HELINL='0'
+HRDCOD='0'
+HASCURAND=hasCurand
+HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
+Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
+make: Nothing to be done for 'gtestlibs'.
+
+make USEBUILDDIR=1 BACKEND=cuda
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+
+make USEBUILDDIR=1 BACKEND=cppnone
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+
+make USEBUILDDIR=1 BACKEND=cppsse4
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+
+make USEBUILDDIR=1 BACKEND=cppavx2
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+
+make USEBUILDDIR=1 BACKEND=cpp512y
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+
+make USEBUILDDIR=1 BACKEND=cpp512z
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+
+DATE: 2025-09-24_08:17:56
+
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe
+### GPU: scaling test 256
+1.170210e+05 1 256
+2.125984e+05 2 256
+3.959312e+05 4 256
+7.383690e+05 8 256
+1.256510e+06 16 256
+1.302525e+06 32 256
+1.137351e+06 64 256
+1.148455e+06 128 256
+1.155387e+06 256 256
+1.176497e+06 512 256
+1.174053e+06 1024 256
+### GPU: scaling test 32
+1.722763e+04 1 32
+3.648322e+04 2 32
+6.851189e+04 4 32
+1.336462e+05 8 32
+2.554053e+05 16 32
+4.599070e+05 32 32
+8.068325e+05 64 32
+1.288259e+06 128 32
+1.347518e+06 256 32
+1.114318e+06 512 32
+1.117513e+06 1024 32
+1.106788e+06 2048 32
+1.132284e+06 4096 32
+1.131009e+06 8192 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+2.295968e+04 1 256
+2.304103e+04 2 256
+2.327288e+04 4 256
+### CPU: scaling test 32
+2.147053e+04 1 32
+2.182408e+04 2 32
+2.202495e+04 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+4.224019e+04 1 256
+4.240941e+04 2 256
+4.252698e+04 4 256
+### CPU: scaling test 32
+3.980070e+04 1 32
+3.623199e+04 2 32
+4.108926e+04 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+8.181243e+04 1 256
+8.010987e+04 2 256
+8.447301e+04 4 256
+### CPU: scaling test 32
+8.608051e+04 1 32
+8.766103e+04 2 32
+7.450255e+04 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+8.863253e+04 1 256
+8.758686e+04 2 256
+9.198692e+04 4 256
+### CPU: scaling test 32
+9.391931e+04 1 32
+8.586478e+04 2 32
+8.665993e+04 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+7.072923e+04 1 256
+6.991165e+04 2 256
+7.056341e+04 4 256
+### CPU: scaling test 32
+7.025617e+04 1 32
+6.971883e+04 2 32
+6.550582e+04 4 32
+=========================================================================
+
+TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt
index 028292e268..c992d10b68 100644
--- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
 
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
 BACKEND=cpp512y (was cppauto)
 OMPFLAGS=
 FPTYPE='m'
@@ -7,251 +10,215 @@ HELINL='0'
 HRDCOD='0'
 HASCURAND=hasCurand
 HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
 Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
 make: Nothing to be done for 'gtestlibs'.
 
 make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 
 make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 
 make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 
 make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 
 make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 
 make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 
-DATE: 2024-10-06_09:00:14
+DATE: 2025-09-24_07:48:07
 
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 7.612194e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.849217e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.964394e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.112133e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.140622e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.142508e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 0.477846 sec
-INFO: No Floating Point Exceptions have been reported
- 1,998,983,760 cycles # 2.871 GHz
- 2,812,176,587 instructions # 1.41 insn per cycle
- 0.759674168 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.605411 sec
+ 2,421,511,234 cycles # 2.827 GHz
+ 3,594,444,056 instructions # 1.48 insn per cycle
+ 0.913457815 seconds time elapsed
 .........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 1.042987e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.232338e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.242858e+07 ) sec^-1
-MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2
-TOTAL : 0.621402 sec
-INFO: No Floating Point Exceptions have been reported
- 2,510,286,495 cycles # 2.883 GHz
- 3,752,986,245 instructions # 1.50 insn per cycle
- 0.931747637 seconds time elapsed
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 104
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 74
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 48
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/runTest_cuda.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
 Avg ME (C++/GPU) = 1.413122e+00
-Avg ME (F77/GPU) = 1.4131213684418649
-Relative difference = 4.469239988637851e-07
+Avg ME (F77/GPU) = 1.4131213684418646
+Relative difference = 4.4692399902091566e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 2.434605e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.446812e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.446812e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.312222e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.323523e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.323523e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 6.752117 sec
-INFO: No Floating Point Exceptions have been reported
- 19,916,103,310 cycles # 2.949 GHz
- 59,916,518,373 instructions # 3.01 insn per cycle
- 6.756066066 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1199) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 7.107355 sec
+ 20,283,539,678 cycles # 2.853 GHz
+ 59,993,146,243 instructions # 2.96 insn per cycle
+ 7.111396668 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 1421) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.413122e+00
-Avg ME (F77/C++) = 1.4131213684432433
-Relative difference = 4.46923023397472e-07
+Avg ME (F77/C++) = 1.4131213684432431
+Relative difference = 4.4692302355460254e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 4.568526e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.611480e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.611480e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.232536e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.270759e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.270759e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 3.606956 sec
-INFO: No Floating Point Exceptions have been reported
- 10,571,212,167 cycles # 2.928 GHz
- 31,086,653,440 instructions # 2.94 insn per cycle
- 3.611892241 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 5221) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 3.891383 sec
+ 11,201,770,072 cycles # 2.876 GHz
+ 32,050,630,389 instructions # 2.86 insn per cycle
+ 3.895464161 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 5777) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.413122e+00
-Avg ME (F77/C++) = 1.4131213684432433
-Relative difference = 4.46923023397472e-07
+Avg ME (F77/C++) = 1.4131213684432435
+Relative difference = 4.4692302324034146e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 9.091675e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.256165e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.256165e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.513064e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.666589e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.666589e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 1.823998 sec
-INFO: No Floating Point Exceptions have been reported
- 4,999,238,647 cycles # 2.738 GHz
- 11,406,827,724 instructions # 2.28 insn per cycle
- 1.827985092 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4642) (512y: 0) (512z: 0)
+TOTAL : 1.945463 sec
+ 5,194,923,693 cycles # 2.666 GHz
+ 11,932,633,899 instructions # 2.30 insn per cycle
+ 1.949544641 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4713) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.413122e+00
 Avg ME (F77/C++) = 1.4131213684416466
 Relative difference = 4.469241533230934e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.026950e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.047965e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.047965e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.375739e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.559826e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.559826e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 1.617207 sec
-INFO: No Floating Point Exceptions have been reported
- 4,447,500,259 cycles # 2.747 GHz
- 10,665,398,274 instructions # 2.40 insn per cycle
- 1.621167175 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4378) (512y: 92) (512z: 0)
+TOTAL : 1.768434 sec
+ 4,741,825,240 cycles # 2.676 GHz
+ 11,283,684,571 instructions # 2.38 insn per cycle
+ 1.772501235 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4515) (512y: 55) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest_cpp.exe
-INFO:
The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416466 Relative difference = 4.469241533230934e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.168386e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.273905e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.273905e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.970736e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.074987e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.074987e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.309115 sec -INFO: No Floating Point Exceptions have been reported - 4,128,751,307 cycles # 1.785 GHz - 5,972,449,468 instructions # 1.45 insn per cycle - 2.314144205 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1620) (512y: 94) (512z: 3577) +TOTAL : 2.373564 sec + 4,058,654,330 cycles # 1.708 GHz + 5,985,363,778 instructions # 1.47 insn per cycle + 2.377708670 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1486) (512y: 61) (512z: 4162) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684416484 -Relative difference = 4.469241520660492e-07 +Avg ME (F77/C++) = 1.4131213684416482 +Relative difference = 4.4692415222317974e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt index 76636470b0..ad07f72fbe 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,275 +10,223 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
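Each cmpExe block above cross-checks the average matrix element from the C++/CUDA path against the Fortran (F77) path, and the run is accepted when the relative difference stays within 5E-3. A minimal sketch of that acceptance test in Python, using the 512z figures just above (rel_diff is an illustrative helper, not the actual cmpExe code):

  def rel_diff(a, b):
      # relative difference between the two averaged matrix elements
      return abs(a - b) / abs(a)

  avg_me_cpp = 1.413122e+00        # Avg ME (C++/C++)
  avg_me_f77 = 1.4131213684416482  # Avg ME (F77/C++)
  d = rel_diff(avg_me_cpp, avg_me_f77)
  print(d)                         # ~4.469e-07, matching the log
  assert d <= 5e-3                 # 'OK (relative difference <= 5E-3)'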
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-10-06_09:38:23 +DATE: 2025-09-24_08:56:23 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 10 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.472313e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.180220e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.180220e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.524676e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.096044e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.096044e+06 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.504857 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 2,064,539,289 cycles # 2.862 GHz - 3,123,566,672 instructions # 1.51 insn per cycle - 0.778239097 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge +TOTAL : 0.635613 sec + 2,494,153,441 cycles # 2.830 GHz + 3,899,501,791 instructions # 1.56 insn per cycle + 0.938261269 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge OMP= +==PROF== Profiling "diagram1": launch__registers_per_thread 104 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.683325e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.341961e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.341961e+06 ) sec^-1 -MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.833212 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 3,141,452,514 cycles # 2.889 GHz - 4,965,295,428 instructions # 1.58 insn per cycle - 1.145190233 seconds time elapsed +==PROF== Profiling "diagram2": launch__registers_per_thread 74 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 48 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.413122e+00 -Avg ME (F77/GPU) = 1.4131213684418649 -Relative difference = 4.469239988637851e-07 +Avg ME (F77/GPU) = 1.4131213684418646 +Relative difference = 4.4692399902091566e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.439308e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.451643e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.451643e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.329451e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.340969e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.340969e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.745227 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 19,922,326,116 cycles # 2.952 GHz - 59,921,657,661 instructions # 3.01 insn per cycle - 6.749767217 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1199) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 7.063608 sec + 20,321,926,121 cycles # 2.876 GHz + 59,999,205,151 instructions # 2.95 insn per cycle + 7.068112656 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1421) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
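The performance counters in these blocks are self-consistent: the printed clock is roughly cycles divided by elapsed time (perf normalises by task-clock, so an elapsed-time estimate is only approximate), and insn per cycle is instructions divided by cycles. A quick consistency check in Python against the 'none' bridge figures above:

  cycles = 20_321_926_121
  instructions = 59_999_205_151
  elapsed_s = 7.068112656

  print(f'{cycles / elapsed_s / 1e9:.3f} GHz')          # ~2.875 (log: 2.876)
  print(f'{instructions / cycles:.2f} insn per cycle')  # 2.95, as logged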
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684432433 -Relative difference = 4.46923023397472e-07 +Avg ME (F77/C++) = 1.4131213684432431 +Relative difference = 4.4692302355460254e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.590762e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.634359e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.634359e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.221277e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.259857e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.259857e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.596308 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 10,606,556,243 cycles # 2.946 GHz - 31,132,640,347 instructions # 2.94 insn per cycle - 3.600784290 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 5221) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.910772 sec + 11,246,346,443 cycles # 2.873 GHz + 32,099,028,986 instructions # 2.85 insn per cycle + 3.915133075 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5777) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684432433 -Relative difference = 4.46923023397472e-07 +Avg ME (F77/C++) = 1.4131213684432435 +Relative difference = 4.4692302324034146e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.045361e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.212711e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.212711e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.480438e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.636114e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.636114e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.840181 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 5,054,435,549 cycles # 2.741 GHz - 11,457,891,523 instructions # 2.27 insn per cycle - 1.844724432 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4642) (512y: 0) (512z: 0) +TOTAL : 1.962684 sec + 5,238,103,974 cycles # 2.664 GHz + 11,982,359,006 instructions # 2.29 insn per cycle + 1.967119077 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4713) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416466 Relative difference = 4.469241533230934e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.028589e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.049854e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.049854e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.303300e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.490816e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.490816e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.621206 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 4,484,828,096 cycles # 2.760 GHz - 10,715,944,638 instructions # 2.39 insn per cycle - 1.625802151 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4378) (512y: 92) (512z: 0) +TOTAL : 1.791254 sec + 4,799,685,272 cycles # 2.674 GHz + 11,333,725,028 instructions # 2.36 insn per cycle + 1.795648986 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4515) (512y: 55) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416466 Relative difference = 4.469241533230934e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.165257e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.268564e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.268564e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.945215e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.049356e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.049356e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.316443 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 4,162,925,866 cycles # 1.795 GHz - 6,008,954,577 instructions # 1.44 insn per cycle - 2.321140123 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1620) (512y: 94) (512z: 3577) +TOTAL : 2.391856 sec + 4,105,469,145 cycles # 1.714 GHz + 6,023,724,588 instructions # 1.47 insn per cycle + 2.396293037 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1486) (512y: 61) (512z: 4162) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
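The event counts in these bridge runs follow directly from the grid arguments: '-p 64 256 10' requests 64 GPU blocks of 256 threads (the trailing 10 is presumably the iteration count), and the bridge warnings report nevt = gpublocks * gputhreads. The same arithmetic in Python:

  gpublocks, gputhreads = 64, 256            # from '-p 64 256 10'
  assert gpublocks * gputhreads == 16384     # 'Instantiate ... Bridge (nevt=16384, ...)'
  assert 2048 * 256 == 524288                # '-p 2048 256 1' gave nevt=524288 earlier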
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684416484 -Relative difference = 4.469241520660492e-07 +Avg ME (F77/C++) = 1.4131213684416482 +Relative difference = 4.4692415222317974e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt index 49402063e2..d309d70a53 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,251 +10,215 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-10-06_09:00:40 +DATE: 2025-09-24_07:48:50 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.575064e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.921304e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.028957e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.089398e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.117332e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.119247e+06 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.477703 sec -INFO: No Floating Point Exceptions have been reported - 1,994,590,518 cycles # 2.865 GHz - 2,848,992,929 instructions # 1.43 insn per cycle - 0.754407053 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.610175 sec + 2,428,893,625 cycles # 2.825 GHz + 3,658,657,722 instructions # 1.51 insn per cycle + 0.917445904 seconds time elapsed ......................................................................... 
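The runNcu blocks show the most visible structural change between the two runs: the 2024 logs profile a single monolithic sigmaKin kernel (255 registers per thread), while the 2025 logs profile separate per-diagram kernels plus a color_sum_kernel with much lower register pressure. A throwaway Python parser to tabulate these figures, fed with the ==PROF== lines from the bridge log earlier:

  import re

  log_text = '''
  ==PROF== Profiling "diagram1": launch__registers_per_thread 104
  ==PROF== Profiling "diagram2": launch__registers_per_thread 74
  ==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 48
  '''
  prof = re.compile(r'==PROF== Profiling "(\w+)": launch__registers_per_thread (\d+)')
  print({name: int(regs) for name, regs in prof.findall(log_text)})
  # {'diagram1': 104, 'diagram2': 74, 'color_sum_kernel': 48}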
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.042325e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.231825e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.242712e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.616679 sec -INFO: No Floating Point Exceptions have been reported - 2,463,746,118 cycles # 2.874 GHz - 3,716,874,386 instructions # 1.51 insn per cycle - 0.917442132 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 100 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 74 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 48 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
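Read together, the EvtsPerSec[MatrixElems] figures quantify the SIMD scaling on this Xeon Silver 4216 host: relative to the scalar 'none' build, the 2025 d_inl0_hrd0 numbers give roughly 1.8x for sse4, 3.7x for avx2, 4.1x for 512y and 3.0x for 512z (the 512-bit build runs at a lower clock, ~1.7 GHz versus ~2.7-2.9 GHz, which caps its advantage). The arithmetic in Python:

  # EvtsPerSec[MatrixElems] from the 2025 d_inl0_hrd0 blocks, in sec^-1
  evts_per_sec = {
      'none': 2.323523e4,
      'sse4': 4.270759e4,
      'avx2': 8.666589e4,
      '512y': 9.559826e4,
      '512z': 7.074987e4,
  }
  base = evts_per_sec['none']
  for tag, v in evts_per_sec.items():
      print(f'{tag}: x{v / base:.2f}')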
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.413122e+00 -Avg ME (F77/GPU) = 1.4131213684418649 -Relative difference = 4.469239988637851e-07 +Avg ME (F77/GPU) = 1.4131213684418646 +Relative difference = 4.4692399902091566e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_d_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.437110e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.449363e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.449363e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.346165e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.357743e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.357743e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.744189 sec -INFO: No Floating Point Exceptions have been reported - 19,899,963,729 cycles # 2.950 GHz - 60,130,622,589 instructions # 3.02 insn per cycle - 6.748077481 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1322) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 7.004658 sec + 20,124,920,564 cycles # 2.872 GHz + 59,756,165,398 instructions # 2.97 insn per cycle + 7.008608396 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1376) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684432433 -Relative difference = 4.46923023397472e-07 +Avg ME (F77/C++) = 1.4131213684432431 +Relative difference = 4.4692302355460254e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.632122e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.676125e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.676125e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.250499e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.289052e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.289052e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.557509 sec -INFO: No Floating Point Exceptions have been reported - 10,482,296,489 cycles # 2.944 GHz - 30,686,942,862 instructions # 2.93 insn per cycle - 3.561419011 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 5047) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.874731 sec + 11,138,454,129 cycles # 2.872 GHz + 32,036,455,579 instructions # 2.88 insn per cycle + 3.878707170 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5772) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684432433 -Relative difference = 4.46923023397472e-07 +Avg ME (F77/C++) = 1.4131213684432435 +Relative difference = 4.4692302324034146e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.842314e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.999775e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.999775e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.469803e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.622290e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.622290e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.874706 sec -INFO: No Floating Point Exceptions have been reported - 5,138,957,277 cycles # 2.738 GHz - 11,840,408,683 instructions # 2.30 insn per cycle - 1.878700358 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4746) (512y: 0) (512z: 0) +TOTAL : 1.955145 sec + 5,212,436,941 cycles # 2.662 GHz + 11,925,216,027 instructions # 2.29 insn per cycle + 1.959313231 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4695) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416466 Relative difference = 4.469241533230934e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.602387e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.789550e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.789550e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.342400e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.527508e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.527508e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.727070 sec -INFO: No Floating Point Exceptions have been reported - 4,726,480,466 cycles # 2.731 GHz - 11,165,052,550 instructions # 2.36 insn per cycle - 1.731070886 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4403) (512y: 246) (512z: 0) +TOTAL : 1.774348 sec + 4,749,629,736 cycles # 2.673 GHz + 11,276,525,204 instructions # 2.37 insn per cycle + 1.778345112 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4495) (512y: 55) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/runTest_cpp.exe -INFO: 
The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416466 Relative difference = 4.469241533230934e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.101185e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.203049e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.203049e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.953876e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.057374e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.057374e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.329881 sec -INFO: No Floating Point Exceptions have been reported - 4,155,200,887 cycles # 1.781 GHz - 6,223,800,996 instructions # 1.50 insn per cycle - 2.334090572 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1516) (512y: 139) (512z: 3679) +TOTAL : 2.378867 sec + 4,058,382,636 cycles # 1.704 GHz + 5,981,306,817 instructions # 1.47 insn per cycle + 2.382982664 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1466) (512y: 61) (512z: 4162) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684416484 -Relative difference = 4.469241520660492e-07 +Avg ME (F77/C++) = 1.4131213684416482 +Relative difference = 4.4692415222317974e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.scaling new file mode 100644 index 0000000000..7b10b5caca --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + +DATE: 2025-09-24_08:18:42 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +1.269366e+05 1 256 +2.583265e+05 2 256 +4.800203e+05 4 256 +8.615029e+05 8 256 +1.454960e+06 16 256 +2.424968e+06 32 256 +2.687996e+06 64 256 +2.634524e+06 128 256 +2.688194e+06 256 256 +2.801637e+06 512 256 +2.875295e+06 1024 256 +### GPU: scaling test 32 +1.788863e+04 1 32 +3.678728e+04 2 32 +7.099492e+04 4 32 +1.431307e+05 8 32 +2.693658e+05 16 32 +5.634733e+05 32 32 +8.933206e+05 64 32 +1.596539e+06 128 32 +2.441699e+06 256 32 +2.725349e+06 512 32 +2.570366e+06 1024 32 +2.561991e+06 2048 32 +2.633165e+06 4096 32 +2.693494e+06 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.398985e+04 1 256 +2.291356e+04 2 256 +2.398329e+04 4 256 +### CPU: scaling test 32 +2.294231e+04 1 32 +2.313364e+04 2 32 +2.339922e+04 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +7.662482e+04 1 256 +7.711438e+04 2 256 +7.766647e+04 4 256 +### CPU: scaling test 32 +6.733199e+04 1 32 +6.569068e+04 2 32 +7.396390e+04 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.564092e+05 1 256 +1.540938e+05 2 256 +1.563867e+05 4 256 +### CPU: scaling test 32 +1.695939e+05 1 32 +1.711134e+05 2 32 +1.550985e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.839907e+05 1 256 +1.684652e+05 2 256 
+1.765554e+05 4 256 +### CPU: scaling test 32 +1.723748e+05 1 32 +1.697352e+05 2 32 +1.712223e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.432731e+05 1 256 +1.455285e+05 2 256 +1.430022e+05 4 256 +### CPU: scaling test 32 +1.408587e+05 1 32 +1.474549e+05 2 32 +1.416650e+05 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index b4d9344f80..c3559c41cb 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,251 +10,215 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-10-06_09:01:57 +DATE: 2025-09-24_07:50:39 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.641235e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.015793e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.057654e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.614171e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.701563e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.709156e+06 ) sec^-1 MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2 -TOTAL : 0.457409 sec -INFO: No Floating Point Exceptions have been reported - 1,937,244,275 cycles # 2.867 GHz - 2,710,892,637 instructions # 1.40 insn per cycle - 0.733854811 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 226 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.520748 sec + 2,165,328,196 cycles # 2.822 GHz + 3,063,009,682 instructions # 1.41 insn per cycle + 0.829100323 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.672412e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.384843e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.427387e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.630097e+02 +- 4.770717e+02 ) GeV^-2 -TOTAL : 0.509900 sec -INFO: No Floating Point Exceptions have been reported - 2,162,696,786 cycles # 2.871 GHz - 3,100,226,347 instructions # 1.43 insn per cycle - 0.811215095 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 70 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 48 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
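
The cmpExe steps in these logs compare the average matrix element (Avg ME) obtained from the C++/CUDA executable against the one obtained from the Fortran wrapper, and accept the build when the relative difference stays below 5E-3. A minimal C++ sketch of that acceptance arithmetic, assuming the simple definition |F77 - C++| / |C++| (which reproduces the relative differences printed in these logs to all shown digits); the helper name is invented here, and the actual check lives in the madgraph4gpu test driver:

#include <cmath>
#include <cstdio>

// Hypothetical helper illustrating the "OK (relative difference <= 5E-3)" check.
bool acceptAvgME( double avgMEcpp, double avgMEf77, double tol = 5e-3 )
{
  const double relDiff = std::fabs( avgMEf77 - avgMEcpp ) / std::fabs( avgMEcpp );
  std::printf( "Relative difference = %.16g\n", relDiff );
  return relDiff <= tol;
}

int main()
{
  // Values quoted from the cuda_f_inl0_hrd0 cmpExe block in this log:
  // this reproduces "Relative difference = 0.0004350150884479323".
  return acceptAvgME( 1.412607e+00, 1.4132215053590471 ) ? 0 : 1;
}

Note that the float builds agree with Fortran only at the ~4E-4 level, much looser than the ~4.5E-7 agreement of the double-precision runs earlier in this log set, but still well within the 5E-3 tolerance.
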
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.412607e+00 -Avg ME (F77/GPU) = 1.4132214305330990 -Relative difference = 0.0004349621183379836 +Avg ME (F77/GPU) = 1.4132215053590471 +Relative difference = 0.0004350150884479323 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.513642e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.526564e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.526564e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 6.537186 sec -INFO: No Floating Point Exceptions have been reported - 19,278,711,706 cycles # 2.948 GHz - 59,616,757,005 instructions # 3.09 insn per cycle - 6.541004954 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 959) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.396916e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.409518e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.409518e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.009236e+02 +- 5.002644e+01 ) GeV^-2 +TOTAL : 6.855048 sec + 19,662,700,355 cycles # 2.867 GHz + 60,454,183,722 instructions # 3.07 insn per cycle + 6.859049729 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1323) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.412995e+00 -Avg ME (F77/C++) = 1.4129949096991936 -Relative difference = 6.390737857384068e-08 +Avg ME (F77/C++) = 1.4129950871652284 +Relative difference = 6.168827799708488e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.120315e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.259615e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.259615e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.735130e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.868621e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.868621e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 2.035209 sec -INFO: No Floating Point Exceptions have been reported - 6,010,527,138 cycles # 2.949 GHz - 17,061,942,080 instructions # 2.84 insn per cycle - 2.038918474 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 5856) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.136971 sec + 6,147,248,224 cycles # 2.873 GHz + 17,502,229,798 instructions # 2.85 insn per cycle + 2.140918572 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 6412) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.412995e+00 -Avg ME (F77/C++) = 1.4129954647353316 -Relative difference = 3.2890090308261873e-07 +Avg ME (F77/C++) = 1.4129953611754331 +Relative difference = 2.5560984512808326e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.748972e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.811746e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.811746e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.633695e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.690243e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.690243e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 0.954915 sec -INFO: No Floating Point Exceptions have been reported - 2,640,169,352 cycles # 2.756 GHz - 6,187,458,591 instructions # 2.34 insn per cycle - 0.958678404 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5105) (512y: 0) (512z: 0) +TOTAL : 1.021676 sec + 2,743,963,979 cycles # 2.678 GHz + 6,444,168,368 instructions # 2.35 insn per cycle + 1.025607032 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5176) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413313e+00 -Avg ME (F77/C++) = 1.4133132969790267 -Relative difference = 2.1012969292986113e-07 +Avg ME (F77/C++) = 1.4133133105791558 +Relative difference = 2.197525641713777e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.923079e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.998771e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.998771e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.786119e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.853837e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.853837e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 0.870044 sec -INFO: No Floating Point Exceptions have been reported - 2,402,321,989 cycles # 2.751 GHz - 5,790,080,813 instructions # 2.41 insn per cycle - 0.873863245 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4906) (512y: 37) (512z: 0) +TOTAL : 0.935888 sec + 2,515,071,638 cycles # 2.678 GHz + 6,112,307,770 instructions # 2.43 insn per cycle + 0.939956946 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5042) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413313e+00 -Avg ME (F77/C++) = 1.4133132969790267 -Relative difference = 2.1012969292986113e-07 +Avg ME (F77/C++) = 1.4133133105791558 +Relative difference = 2.197525641713777e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.455132e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.498332e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.498332e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.419401e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.462974e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.462974e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.146003 sec -INFO: No Floating Point Exceptions have been reported - 2,072,911,951 cycles # 1.804 GHz - 3,391,607,808 instructions # 1.64 insn per cycle - 1.149850121 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2237) (512y: 37) (512z: 3789) +TOTAL : 1.175080 sec + 2,043,785,731 cycles # 1.734 GHz + 3,407,766,286 instructions # 1.67 insn per cycle + 1.179120501 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2112) (512y: 5) (512z: 4366) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133164033579249 -Relative difference = 2.85398258307829e-07 +Avg ME (F77/C++) = 1.4133163777514426 +Relative difference = 2.672802420482638e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt index 89f1af02c0..0e323dea81 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,275 +10,223 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-10-06_09:38:49 +DATE: 2025-09-24_08:57:00 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 10 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.430077e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.496267e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.496267e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.009071e+02 +- 5.002295e+01 ) GeV^-2 -TOTAL : 0.468595 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 1,993,529,576 cycles # 2.878 GHz - 2,894,144,626 instructions # 1.45 insn per cycle - 0.749153323 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge +EvtsPerSec[Rmb+ME] (23) = ( 1.804908e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.563811e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.563811e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.009070e+02 +- 5.002294e+01 ) GeV^-2 +TOTAL : 0.529866 sec + 2,189,527,812 cycles # 2.825 GHz + 3,234,000,738 instructions # 1.48 insn per cycle + 0.832340190 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 226 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge OMP= +==PROF== Profiling "diagram1": launch__registers_per_thread 70 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.508973e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.254431e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.254431e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.737499e+02 +- 4.776369e+02 ) GeV^-2 -TOTAL : 0.658615 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 2,580,648,777 cycles # 2.888 GHz - 3,894,936,658 instructions # 1.51 insn per cycle - 0.952346890 seconds time elapsed +==PROF== Profiling "diagram2": launch__registers_per_thread 48 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
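
The ncu profiles in the 2025 logs no longer report a single monolithic sigmaKin kernel (226 registers per thread in the 2024 baseline above) but three smaller launches, diagram1, diagram2 and color_sum_kernel, at 70, 48 and 32 registers per thread respectively. A toy CUDA sketch of such a split-kernel structure, purely illustrative (the buffer layout, kernel bodies and NCOLOR value are invented here; the real kernels are code generated by the CUDACPP plugin):

#include <cuda_runtime.h>

constexpr int NCOLOR = 2; // invented toy value

// Each per-diagram kernel accumulates its color amplitudes ("jamps").
__global__ void diagram1( double* jamp, int nevt )
{
  const int ievt = blockIdx.x * blockDim.x + threadIdx.x;
  if( ievt < nevt ) jamp[ievt * NCOLOR + 0] += 1.0; // placeholder amplitude
}

__global__ void diagram2( double* jamp, int nevt )
{
  const int ievt = blockIdx.x * blockDim.x + threadIdx.x;
  if( ievt < nevt ) jamp[ievt * NCOLOR + 1] += 0.5; // placeholder amplitude
}

// The final kernel contracts the jamps into |ME|^2 (toy diagonal color matrix).
__global__ void color_sum_kernel( const double* jamp, double* me, int nevt )
{
  const int ievt = blockIdx.x * blockDim.x + threadIdx.x;
  if( ievt >= nevt ) return;
  double sum = 0.;
  for( int c = 0; c < NCOLOR; c++ ) sum += jamp[ievt * NCOLOR + c] * jamp[ievt * NCOLOR + c];
  me[ievt] = sum;
}

int main()
{
  const int nevt = 64 * 256; // the "-p 64 256" grid used in these logs
  double *jamp, *me;
  cudaMalloc( &jamp, nevt * NCOLOR * sizeof( double ) );
  cudaMalloc( &me, nevt * sizeof( double ) );
  cudaMemset( jamp, 0, nevt * NCOLOR * sizeof( double ) );
  diagram1<<<64, 256>>>( jamp, nevt );
  diagram2<<<64, 256>>>( jamp, nevt );
  color_sum_kernel<<<64, 256>>>( jamp, me, nevt );
  cudaDeviceSynchronize();
  cudaFree( jamp );
  cudaFree( me );
  return 0;
}

Splitting the per-diagram amplitude work from the final color sum keeps each launch small, which is consistent with the much lower register pressure reported per kernel in these profiles.
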
 DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2
 Avg ME (C++/GPU) = 1.412607e+00
-Avg ME (F77/GPU) = 1.4132214305330990
-Relative difference = 0.0004349621183379836
+Avg ME (F77/GPU) = 1.4132215053590471
+Relative difference = 0.0004350150884479323
 OK (relative difference <= 5E-3)
 =========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd0/check_hip.exe
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP=
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 2.506368e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.519408e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.519408e+04 ) sec^-1
-MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2
-TOTAL : 6.559975 sec
-INFO: No Floating Point Exceptions have been reported
-INFO: No Floating Point Exceptions have been reported
- 19,280,512,048 cycles # 2.938 GHz
- 59,619,141,119 instructions # 3.09 insn per cycle
- 6.564243260 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 959) (avx2: 0) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 2.397563e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.409734e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.409734e+04 ) sec^-1
+MeanMatrixElemValue = ( 1.009236e+02 +- 5.002644e+01 ) GeV^-2
+TOTAL : 6.857624 sec
+ 19,700,274,444 cycles # 2.872 GHz
+ 60,456,884,845 instructions # 3.07 insn per cycle
+ 6.861762459 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 1323) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.412995e+00
-Avg ME (F77/C++) = 1.4129949096991936
-Relative difference = 6.390737857384068e-08
+Avg ME (F77/C++) = 1.4129950871652284
+Relative difference = 6.168827799708488e-08
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP=
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 8.092271e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.230160e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.230160e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.707084e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.840722e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.840722e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2
-TOTAL : 2.047307 sec
-INFO: No Floating Point Exceptions have been reported
-INFO: No Floating Point Exceptions have been reported
- 6,043,775,465 cycles # 2.947 GHz
- 17,111,089,922 instructions # 2.83 insn per cycle
- 2.051614364 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 5856) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 2.149611 sec
+ 6,176,579,184 cycles # 2.869 GHz
+ 17,549,611,255 instructions # 2.84 insn per cycle
+ 2.153671487 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 6412) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.412995e+00
-Avg ME (F77/C++) = 1.4129954647353316
-Relative difference = 3.2890090308261873e-07
+Avg ME (F77/C++) = 1.4129953611754331
+Relative difference = 2.5560984512808326e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP=
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.748354e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.809701e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.809701e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.618402e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.675763e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.675763e+05 ) sec^-1
 MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2
-TOTAL : 0.959425 sec
-INFO: No Floating Point Exceptions have been reported
-INFO: No Floating Point Exceptions have been reported
- 2,659,679,748 cycles # 2.761 GHz
- 6,224,393,438 instructions # 2.34 insn per cycle
- 0.963869172 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5105) (512y: 0) (512z: 0)
+TOTAL : 1.036238 sec
+ 2,770,490,321 cycles # 2.665 GHz
+ 6,480,361,312 instructions # 2.34 insn per cycle
+ 1.040373935 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5176) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.413313e+00
-Avg ME (F77/C++) = 1.4133132969790267
-Relative difference = 2.1012969292986113e-07
+Avg ME (F77/C++) = 1.4133133105791558
+Relative difference = 2.197525641713777e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP=
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.927524e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.002486e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.002486e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.776625e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.846255e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.846255e+05 ) sec^-1
 MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2
-TOTAL : 0.872058 sec
-INFO: No Floating Point Exceptions have been reported
-INFO: No Floating Point Exceptions have been reported
- 2,421,094,163 cycles # 2.765 GHz
- 5,826,830,021 instructions # 2.41 insn per cycle
- 0.876372578 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4906) (512y: 37) (512z: 0)
+TOTAL : 0.946083 sec
+ 2,541,484,609 cycles # 2.676 GHz
+ 6,148,267,017 instructions # 2.42 insn per cycle
+ 0.950364845 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5042) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.413313e+00
-Avg ME (F77/C++) = 1.4133132969790267
-Relative difference = 2.1012969292986113e-07
+Avg ME (F77/C++) = 1.4133133105791558
+Relative difference = 2.197525641713777e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP=
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.443486e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.486864e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.486864e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.412409e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.456214e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.456214e+05 ) sec^-1
 MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2
-TOTAL : 1.160150 sec
-INFO: No Floating Point Exceptions have been reported
-INFO: No Floating Point Exceptions have been reported
- 2,098,432,349 cycles # 1.804 GHz
- 3,433,067,927 instructions # 1.64 insn per cycle
- 1.164579445 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2237) (512y: 37) (512z: 3789)
+TOTAL : 1.186787 sec
+ 2,071,054,443 cycles # 1.740 GHz
+ 3,448,257,390 instructions # 1.66 insn per cycle
+ 1.191043055 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2112) (512y: 5) (512z: 4366)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.413316e+00
-Avg ME (F77/C++) = 1.4133164033579249
-Relative difference = 2.85398258307829e-07
+Avg ME (F77/C++) = 1.4133163777514426
+Relative difference = 2.672802420482638e-07
 OK (relative difference <= 5E-3)
 =========================================================================
diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt
index 7537d3c84d..09a2e62956 100644
--- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
 BACKEND=cpp512y (was cppauto)
 OMPFLAGS=
 FPTYPE='m'
@@ -7,251 +10,215 @@ HELINL='0'
 HRDCOD='0'
 HASCURAND=hasCurand
 HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
 Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
 make: Nothing to be done for 'gtestlibs'.
 make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
-DATE: 2024-10-06_09:02:18
+DATE: 2025-09-24_07:51:10
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 1.658659e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.027503e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.066373e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.611135e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.699057e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.706506e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2
-TOTAL : 0.462988 sec
-INFO: No Floating Point Exceptions have been reported
- 1,956,715,427 cycles # 2.872 GHz
- 2,757,694,861 instructions # 1.41 insn per cycle
- 0.742544959 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 226
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.520881 sec
+ 2,178,752,390 cycles # 2.830 GHz
+ 3,100,668,964 instructions # 1.42 insn per cycle
+ 0.830191295 seconds time elapsed
 .........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 2.669827e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.371215e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.415741e+07 ) sec^-1
-MeanMatrixElemValue = ( 6.630097e+02 +- 4.770717e+02 ) GeV^-2
-TOTAL : 0.505811 sec
-INFO: No Floating Point Exceptions have been reported
- 2,123,611,289 cycles # 2.883 GHz
- 3,083,974,467 instructions # 1.45 insn per cycle
- 0.793454464 seconds time elapsed
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 70
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 48
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/runTest_cuda.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2
 Avg ME (C++/GPU) = 1.412607e+00
-Avg ME (F77/GPU) = 1.4132214305330990
-Relative difference = 0.0004349621183379836
+Avg ME (F77/GPU) = 1.4132215053590471
+Relative difference = 0.0004350150884479323
 OK (relative difference <= 5E-3)
 =========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd1/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_f_inl0_hrd1/check_hip.exe
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 2.488365e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.501255e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.501255e+04 ) sec^-1
-MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2
-TOTAL : 6.602997 sec
-INFO: No Floating Point Exceptions have been reported
- 19,409,400,884 cycles # 2.938 GHz
- 59,351,848,666 instructions # 3.06 insn per cycle
- 6.606759387 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1027) (avx2: 0) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 2.400136e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.412501e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.412501e+04 ) sec^-1
+MeanMatrixElemValue = ( 1.009236e+02 +- 5.002644e+01 ) GeV^-2
+TOTAL : 6.845560 sec
+ 19,647,886,375 cycles # 2.869 GHz
+ 60,294,728,788 instructions # 3.07 insn per cycle
+ 6.849528810 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 1290) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.412995e+00
-Avg ME (F77/C++) = 1.4129949096991936
-Relative difference = 6.390737857384068e-08
+Avg ME (F77/C++) = 1.4129950871652284
+Relative difference = 6.168827799708488e-08
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 8.484090e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.633368e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.633368e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.744896e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.876711e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.876711e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2
-TOTAL : 1.948837 sec
-INFO: No Floating Point Exceptions have been reported
- 5,764,162,956 cycles # 2.953 GHz
- 16,849,716,772 instructions # 2.92 insn per cycle
- 1.952678468 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 5610) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 2.133454 sec
+ 6,138,416,779 cycles # 2.873 GHz
+ 17,495,238,901 instructions # 2.85 insn per cycle
+ 2.137501353 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 6406) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.412995e+00
-Avg ME (F77/C++) = 1.4129954647353316
-Relative difference = 3.2890090308261873e-07
+Avg ME (F77/C++) = 1.4129953611754331
+Relative difference = 2.5560984512808326e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.522405e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.569181e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.569181e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.626225e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.681915e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.681915e+05 ) sec^-1
 MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2
-TOTAL : 1.094041 sec
-INFO: No Floating Point Exceptions have been reported
- 3,018,102,108 cycles # 2.750 GHz
- 6,848,568,360 instructions # 2.27 insn per cycle
- 1.098202042 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5735) (512y: 0) (512z: 0)
+TOTAL : 1.026128 sec
+ 2,752,520,788 cycles # 2.674 GHz
+ 6,440,444,830 instructions # 2.34 insn per cycle
+ 1.030155534 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5154) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.413313e+00
-Avg ME (F77/C++) = 1.4133132969790267
-Relative difference = 2.1012969292986113e-07
+Avg ME (F77/C++) = 1.4133133105791558
+Relative difference = 2.197525641713777e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.654265e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.710055e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.710055e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.754480e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.821417e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.821417e+05 ) sec^-1
 MeanMatrixElemValue = ( 1.008857e+02 +- 5.002468e+01 ) GeV^-2
-TOTAL : 1.008735 sec
-INFO: No Floating Point Exceptions have been reported
- 2,794,533,058 cycles # 2.762 GHz
- 6,437,695,564 instructions # 2.30 insn per cycle
- 1.012558685 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5509) (512y: 23) (512z: 0)
+TOTAL : 0.952423 sec
+ 2,524,001,981 cycles # 2.641 GHz
+ 6,108,556,264 instructions # 2.42 insn per cycle
+ 0.956460804 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5018) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.413313e+00
-Avg ME (F77/C++) = 1.4133132969790267
-Relative difference = 2.1012969292986113e-07
+Avg ME (F77/C++) = 1.4133133105791558
+Relative difference = 2.197525641713777e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.323435e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.360072e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.360072e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.412561e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.455444e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.455444e+05 ) sec^-1
 MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2
-TOTAL : 1.258302 sec
-INFO: No Floating Point Exceptions have been reported
- 2,251,923,496 cycles # 1.787 GHz
- 3,755,291,572 instructions # 1.67 insn per cycle
- 1.262174564 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2467) (512y: 28) (512z: 4084)
+TOTAL : 1.180491 sec
+ 2,041,439,261 cycles # 1.725 GHz
+ 3,405,273,640 instructions # 1.67 insn per cycle
+ 1.184475245 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2088) (512y: 5) (512z: 4366)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.413316e+00
-Avg ME (F77/C++) = 1.4133164033579249
-Relative difference = 2.85398258307829e-07
+Avg ME (F77/C++) = 1.4133163777514426
+Relative difference = 2.672802420482638e-07
 OK (relative difference <= 5E-3)
 =========================================================================
diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.scaling
new file mode 100644
index 0000000000..c595edebb2
--- /dev/null
+++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.scaling
@@ -0,0 +1,137 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
+
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
+BACKEND=cpp512y (was cppauto)
+OMPFLAGS=
+FPTYPE='m'
+HELINL='0'
+HRDCOD='0'
+HASCURAND=hasCurand
+HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
+Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
+make: Nothing to be done for 'gtestlibs'.
+
+make USEBUILDDIR=1 BACKEND=cuda
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+
+make USEBUILDDIR=1 BACKEND=cppnone
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+
+make USEBUILDDIR=1 BACKEND=cppsse4
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+
+make USEBUILDDIR=1 BACKEND=cppavx2
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+
+make USEBUILDDIR=1 BACKEND=cpp512y
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+
+make USEBUILDDIR=1 BACKEND=cpp512z
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+
+DATE: 2025-09-24_08:18:19
+
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe
+### GPU: scaling test 256
+1.142195e+05 1 256
+2.043397e+05 2 256
+3.835250e+05 4 256
+7.446403e+05 8 256
+1.295320e+06 16 256
+1.336801e+06 32 256
+1.135266e+06 64 256
+1.147914e+06 128 256
+1.151761e+06 256 256
+1.173317e+06 512 256
+1.173181e+06 1024 256
+### GPU: scaling test 32
+1.750501e+04 1 32
+3.464900e+04 2 32
+6.628367e+04 4 32
+1.319968e+05 8 32
+2.510299e+05 16 32
+4.434520e+05 32 32
+7.899604e+05 64 32
+1.287919e+06 128 32
+1.372098e+06 256 32
+1.120965e+06 512 32
+1.117757e+06 1024 32
+1.107063e+06 2048 32
+1.136625e+06 4096 32
+1.131581e+06 8192 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/check_hip.exe
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+2.271106e+04 1 256
+2.290369e+04 2 256
+2.311083e+04 4 256
+### CPU: scaling test 32
+2.200591e+04 1 32
+2.184564e+04 2 32
+2.090239e+04 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+4.213590e+04 1 256
+4.313308e+04 2 256
+4.335077e+04 4 256
+### CPU: scaling test 32
+4.068659e+04 1 32
+4.056430e+04 2 32
+4.125233e+04 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+8.011697e+04 1 256
+8.566827e+04 2 256
+8.574349e+04 4 256
+### CPU: scaling test 32
+7.598478e+04 1 32
+8.088826e+04 2 32
+8.164364e+04 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+8.891285e+04 1 256
+8.920943e+04 2 256
+9.219800e+04 4 256
+### CPU: scaling test 32
+9.814686e+04 1 32
+9.351694e+04 2 32
+8.970697e+04 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+7.059681e+04 1 256
+7.092777e+04 2 256
+7.113395e+04 4 256
+### CPU: scaling test 32
+6.963531e+04 1 32
+7.054720e+04 2 32
+7.077145e+04 4 32
+=========================================================================
+
+TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt
index 6b4617ba56..87f352e7da 100644
--- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
 BACKEND=cpp512y (was cppauto)
 OMPFLAGS=
 FPTYPE='m'
@@ -7,248 +10,212 @@ HELINL='0'
 HRDCOD='0'
 HASCURAND=hasCurand
 HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
 Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
 make: Nothing to be done for 'gtestlibs'.
 make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
-DATE: 2024-10-06_09:01:06
+DATE: 2025-09-24_07:49:26
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 7.531107e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.896113e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.014318e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.111499e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.140352e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.142296e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 0.482291 sec
-INFO: No Floating Point Exceptions have been reported
- 1,996,726,100 cycles # 2.869 GHz
- 2,875,927,393 instructions # 1.44 insn per cycle
- 0.757518934 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.606172 sec
+ 2,425,493,225 cycles # 2.823 GHz
+ 3,641,776,012 instructions # 1.50 insn per cycle
+ 0.915893126 seconds time elapsed
 .........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 1.039985e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.227093e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.238483e+07 ) sec^-1
-MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2
-TOTAL : 0.618226 sec
-INFO: No Floating Point Exceptions have been reported
- 2,476,524,825 cycles # 2.883 GHz
- 3,787,822,568 instructions # 1.53 insn per cycle
- 0.918414719 seconds time elapsed
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 104
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 74
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 54
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/runTest_cuda.exe
 [ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.413122e+00 -Avg ME (F77/GPU) = 1.4131213755569487 -Relative difference = 4.418889885423659e-07 +Avg ME (F77/GPU) = 1.4131213823321340 +Relative difference = 4.3709450844674974e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.396101e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.408087e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.408087e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.285953e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.297043e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.297043e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.859626 sec -INFO: No Floating Point Exceptions have been reported - 20,206,369,377 cycles # 2.945 GHz - 60,950,595,896 instructions # 3.02 insn per cycle - 6.863727850 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1220) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 7.189052 sec + 20,678,030,694 cycles # 2.875 GHz + 61,089,007,408 instructions # 2.95 insn per cycle + 7.193260515 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1421) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213859069593 Relative difference = 4.345647726386255e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.651759e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.695029e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.695029e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.317569e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.357386e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.357386e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.542669 sec -INFO: No Floating Point Exceptions have been reported - 10,470,195,857 cycles # 2.953 GHz - 30,822,635,750 instructions # 2.94 insn per cycle - 3.546724112 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 5351) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.815341 sec + 10,965,720,375 cycles # 2.872 GHz + 31,668,670,846 instructions # 2.89 insn per cycle + 3.819469924 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5888) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/runTest_cpp.exe -INFO: 
The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213792564823 -Relative difference = 4.392710025734405e-07 +Avg ME (F77/C++) = 1.4131213813302705 +Relative difference = 4.3780348012864624e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.177717e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.345070e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.345070e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.640391e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.799298e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.799298e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.805877 sec -INFO: No Floating Point Exceptions have been reported - 4,960,900,655 cycles # 2.742 GHz - 11,360,293,322 instructions # 2.29 insn per cycle - 1.809915904 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4776) (512y: 0) (512z: 0) +TOTAL : 1.916985 sec + 5,119,164,674 cycles # 2.666 GHz + 11,787,953,551 instructions # 2.30 insn per cycle + 1.921105796 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4783) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause 
SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213600217192 -Relative difference = 4.5288254008796884e-07 +Avg ME (F77/C++) = 1.4131213646773610 +Relative difference = 4.495879612249832e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.047166e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.068679e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.068679e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.502267e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.693578e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.693578e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.585052 sec -INFO: No Floating Point Exceptions have been reported - 4,379,448,731 cycles # 2.757 GHz - 10,610,063,505 instructions # 2.42 insn per cycle - 1.588995755 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4503) (512y: 84) (512z: 0) +TOTAL : 1.744888 sec + 4,669,653,692 cycles # 2.671 GHz + 11,149,149,838 instructions # 2.39 insn per cycle + 1.748994962 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4604) (512y: 45) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, 
FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213600217192 -Relative difference = 4.5288254008796884e-07 +Avg ME (F77/C++) = 1.4131213646773610 +Relative difference = 4.495879612249832e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.890582e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.987179e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.987179e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.008286e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.114228e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.114228e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.401138 sec -INFO: No Floating Point Exceptions have been reported - 4,243,505,288 cycles # 1.765 GHz - 6,171,567,257 instructions # 1.45 insn per cycle - 2.405218093 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2143) (512y: 116) (512z: 3653) +TOTAL : 2.360776 sec + 4,029,709,616 cycles # 1.705 GHz + 5,980,013,055 instructions # 1.48 insn per cycle + 2.365003134 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1534) (512y: 61) (512z: 4253) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213786174055 Relative difference = 4.3972324717191576e-07 diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt index 1a268fb0a6..b29ebb2181 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-10-06_09:01:31 +DATE: 2025-09-24_07:50:02 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.506525e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.876419e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.986419e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.084424e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.111897e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.113751e+06 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.475723 sec -INFO: No Floating Point Exceptions have been reported - 1,989,777,196 cycles # 2.876 GHz - 2,865,221,599 instructions # 1.44 insn per cycle - 0.750464789 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.608218 sec + 2,430,751,971 cycles # 2.832 GHz + 3,656,805,473 instructions # 1.50 insn per cycle + 0.915307420 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.040967e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.229706e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.240646e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.612359 sec -INFO: No Floating Point Exceptions have been reported - 2,465,408,367 cycles # 2.885 GHz - 3,759,784,229 instructions # 1.53 insn per cycle - 0.914073870 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 100 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 74 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 54 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.413122e+00 -Avg ME (F77/GPU) = 1.4131213755569487 -Relative difference = 4.418889885423659e-07 +Avg ME (F77/GPU) = 1.4131213823321340 +Relative difference = 4.3709450844674974e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.hip_m_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.395973e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.407808e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.407808e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.272960e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.283884e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.283884e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.859771 sec -INFO: No Floating Point Exceptions have been reported - 20,239,178,144 cycles # 2.949 GHz - 61,173,779,461 instructions # 3.02 insn per cycle - 6.863706451 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1272) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 7.229703 sec + 20,531,510,190 cycles # 2.839 GHz + 60,852,281,799 instructions # 2.96 insn per cycle + 7.233768348 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1376) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213859069593 Relative difference = 4.345647726386255e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.702334e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.747762e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.747762e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.310001e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.349804e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.349804e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.505938 sec -INFO: No Floating Point Exceptions have been reported - 10,333,154,234 cycles # 2.946 GHz - 30,534,348,115 instructions # 2.95 insn per cycle - 3.510016853 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 5155) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.821568 sec + 10,990,977,687 cycles # 2.874 GHz + 31,654,070,133 instructions # 2.88 insn per cycle + 3.825620122 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5883) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/runTest_cpp.exe -INFO: 
The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213792564823 -Relative difference = 4.392710025734405e-07 +Avg ME (F77/C++) = 1.4131213813302705 +Relative difference = 4.3780348012864624e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.861323e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.018375e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.018375e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.617602e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.775102e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.775102e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.870783 sec -INFO: No Floating Point Exceptions have been reported - 5,160,894,050 cycles # 2.755 GHz - 11,875,310,688 instructions # 2.30 insn per cycle - 1.874839635 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4887) (512y: 0) (512z: 0) +TOTAL : 1.922108 sec + 5,136,210,329 cycles # 2.668 GHz + 11,781,523,745 instructions # 2.29 insn per cycle + 1.926086319 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4765) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause 
SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213600217192 -Relative difference = 4.5288254008796884e-07 +Avg ME (F77/C++) = 1.4131213646773610 +Relative difference = 4.495879612249832e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.768245e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.957717e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.957717e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.469057e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.657949e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.657949e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.697611 sec -INFO: No Floating Point Exceptions have been reported - 4,679,050,155 cycles # 2.751 GHz - 11,168,862,734 instructions # 2.39 insn per cycle - 1.701628470 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4508) (512y: 239) (512z: 0) +TOTAL : 1.750928 sec + 4,685,020,284 cycles # 2.671 GHz + 11,142,278,840 instructions # 2.38 insn per cycle + 1.755079681 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4584) (512y: 45) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, 
FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213600217192 -Relative difference = 4.5288254008796884e-07 +Avg ME (F77/C++) = 1.4131213646773610 +Relative difference = 4.495879612249832e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.922687e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.020028e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.020028e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.046477e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.153047e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.153047e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.390116 sec -INFO: No Floating Point Exceptions have been reported - 4,256,907,095 cycles # 1.778 GHz - 6,411,350,564 instructions # 1.51 insn per cycle - 2.394737171 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2039) (512y: 162) (512z: 3731) +TOTAL : 2.348086 sec + 4,023,618,226 cycles # 1.712 GHz + 5,976,028,235 instructions # 1.49 insn per cycle + 2.352309565 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1514) (512y: 61) (512z: 4253) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 16 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 16 channels { 1 : 64, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213786174055 Relative difference = 4.3972324717191576e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.scaling new file mode 100644 index 0000000000..3c6d7aa243 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-09-24_08:19:04 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +4.827501e+03 1 256 +9.434129e+03 2 256 +1.859154e+04 4 256 +3.692058e+04 8 256 +7.253540e+04 16 256 +1.151504e+05 32 256 +1.046277e+05 64 256 +1.035386e+05 128 256 +1.074867e+05 256 256 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +### GPU: scaling test 32 +6.220421e+02 1 32 +1.240495e+03 2 32 +2.460087e+03 4 32 +4.886577e+03 8 32 +9.554076e+03 16 32 +1.877130e+04 32 32 +3.724483e+04 64 32 +7.268310e+04 128 32 +1.157858e+05 256 32 +1.040650e+05 512 32 +1.010411e+05 1024 32 +1.041504e+05 2048 32 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. 
+========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.780453e+03 1 256 +1.786081e+03 2 256 +1.788120e+03 4 256 +### CPU: scaling test 32 +1.774852e+03 1 32 +1.777928e+03 2 32 +1.779310e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.203589e+03 1 256 +3.221417e+03 2 256 +3.235841e+03 4 256 +### CPU: scaling test 32 +3.128414e+03 1 32 +3.152927e+03 2 32 +3.182204e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +7.284645e+03 1 256 +7.272752e+03 2 256 +7.332492e+03 4 256 +### CPU: scaling test 32 +6.902093e+03 1 32 +6.845651e+03 2 32 +6.849626e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +8.152222e+03 1 256 +8.186030e+03 2 256 +8.253627e+03 4 256 +### CPU: scaling test 32 +8.324421e+03 1 32 +7.984412e+03 2 32 +7.747794e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +6.579293e+03 1 256 +6.584524e+03 2 256 +6.644529e+03 4 256 +### CPU: scaling test 32 +6.545181e+03 1 32 +6.515841e+03 2 32 +6.481920e+03 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index fe9e9669c6..445a870ebb 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:02:40 +DATE: 2025-09-24_07:51:45 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.331120e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.359202e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.361250e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.042151e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.045167e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.045391e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.543918 sec -INFO: No Floating Point Exceptions have been reported - 2,225,694,406 cycles # 2.884 GHz - 3,483,451,829 instructions # 1.57 insn per cycle - 0.837015502 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.657631 sec + 2,689,336,386 cycles # 2.825 GHz + 4,292,001,683 instructions # 1.60 insn per cycle + 1.009047849 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.134422e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.164730e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.165914e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.052190 sec -INFO: No Floating Point Exceptions have been reported - 9,689,726,748 cycles # 2.928 GHz - 22,118,867,491 instructions # 2.28 insn per cycle - 3.368998161 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 106 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 98 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 124 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 -Avg ME (F77/GPU) = 6.6266731198158133E-004 -Relative difference = 2.837296512218831e-07 +Avg ME (F77/GPU) = 6.6266731198158112E-004 +Relative difference = 2.837296515491067e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.884002e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.884932e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.884932e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.775467e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.776300e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.776300e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.713540 sec -INFO: No Floating Point Exceptions have been reported - 25,683,805,881 cycles # 2.947 GHz - 78,963,253,936 instructions # 3.07 insn per cycle - 8.717598721 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4842) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 9.243774 sec + 26,559,464,941 cycles # 2.873 GHz + 80,664,436,977 instructions # 3.04 insn per cycle + 9.247966768 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 6154) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198141133E-004 -Relative difference = 2.8372990776517314e-07 +Avg ME (F77/C++) = 6.6266731198141155E-004 +Relative difference = 2.8372990743794954e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.540501e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.543820e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.543820e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.220602e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.223269e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.223269e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.639000 sec -INFO: No Floating Point Exceptions have been reported - 13,090,618,968 cycles # 2.820 GHz - 39,561,040,325 instructions # 3.02 insn per cycle - 4.644193645 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13192) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.098637 sec + 14,012,498,003 cycles # 2.747 GHz + 41,137,206,711 instructions # 2.94 insn per cycle + 5.102679505 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:20652) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198141122E-004 -Relative difference = 2.837299079287849e-07 +Avg ME (F77/C++) = 6.6266731198141133E-004 +Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.087246e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.103223e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.103223e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.303668e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.317915e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.317915e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.035017 sec -INFO: No Floating Point Exceptions have been reported - 5,608,597,608 cycles # 2.752 GHz - 13,825,354,537 instructions # 2.47 insn per cycle - 2.039075619 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11520) (512y: 0) (512z: 0) +TOTAL : 2.252099 sec + 6,012,721,565 cycles # 2.666 GHz + 14,678,984,496 instructions # 2.44 insn per cycle + 2.256373750 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:14881) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.190120e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.211201e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.211201e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.188749e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.206799e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.206799e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.791765 sec -INFO: No Floating Point Exceptions have been reported - 4,921,067,926 cycles # 2.743 GHz - 12,507,200,724 instructions # 2.54 insn per cycle - 1.798123347 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10439) (512y: 89) (512z: 0) +TOTAL : 2.009058 sec + 5,331,599,497 cycles # 2.649 GHz + 13,558,732,519 instructions # 2.54 insn per cycle + 2.013187817 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:14564) (512y: 45) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.012553e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.024911e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.024911e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.537614e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.549163e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.549163e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.347251 sec -INFO: No Floating Point Exceptions have been reported - 4,147,263,675 cycles # 1.765 GHz - 6,394,266,077 instructions # 1.54 insn per cycle - 2.352573303 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1978) (512y: 101) (512z: 9386) +TOTAL : 2.515421 sec + 4,232,878,333 cycles # 1.681 GHz + 6,889,867,377 instructions # 1.63 insn per cycle + 2.519616006 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1767) (512y: 61) (512z:14451) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.scaling new file mode 100644 index 0000000000..abcd7d198a --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-09-24_08:38:25 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +4.431408e+03 1 256 +8.665003e+03 2 256 +1.697877e+04 4 256 +3.338923e+04 8 256 +6.548357e+04 16 256 +1.044894e+05 32 256 +9.824208e+04 64 256 +9.911505e+04 128 256 +1.037671e+05 256 256 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +### GPU: scaling test 32 +5.663582e+02 1 32 +1.127328e+03 2 32 +2.249680e+03 4 32 +4.449069e+03 8 32 +8.698388e+03 16 32 +1.716415e+04 32 32 +3.362489e+04 64 32 +6.573394e+04 128 32 +1.057867e+05 256 32 +9.794711e+04 512 32 +9.709625e+04 1024 32 +1.012537e+05 2048 32 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. 
+========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.785872e+03 1 256 +1.785429e+03 2 256 +1.786204e+03 4 256 +### CPU: scaling test 32 +1.766508e+03 1 32 +1.772286e+03 2 32 +1.775821e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.247136e+03 1 256 +3.244847e+03 2 256 +3.227010e+03 4 256 +### CPU: scaling test 32 +3.152260e+03 1 32 +3.180242e+03 2 32 +3.227514e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +7.322106e+03 1 256 +7.304087e+03 2 256 +7.292000e+03 4 256 +### CPU: scaling test 32 +6.643951e+03 1 32 +6.794845e+03 2 32 +7.028915e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +8.142400e+03 1 256 +8.227677e+03 2 256 +8.295040e+03 4 256 +### CPU: scaling test 32 +8.162056e+03 1 32 +8.206971e+03 2 32 +8.075921e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +6.655188e+03 1 256 +6.639534e+03 2 256 +6.652549e+03 4 256 +### CPU: scaling test 32 +6.476613e+03 1 32 +6.567240e+03 2 32 +6.717274e+03 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.txt new file mode 100644 index 0000000000..ffa58d7677 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_blasOn.txt @@ -0,0 +1,225 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-09-24_08:33:16 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 9.820656e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.841888e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.843656e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.356954 sec + 4,918,574,281 cycles # 2.839 GHz + 6,948,443,410 instructions # 1.41 insn per cycle + 1.793979006 seconds time elapsed +......................................................................... 
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 106 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 98 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 124 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 6.626675e-04 +Avg ME (F77/GPU) = 6.6266731198158112E-004 +Relative difference = 2.837296515491067e-07 +OK (relative difference <= 5E-3) +========================================================================= +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +EvtsPerSec[Rmb+ME] (23) = ( 1.777070e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.777900e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.777900e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 9.235300 sec + 26,537,227,132 cycles # 2.873 GHz + 80,663,724,754 instructions # 3.04 insn per cycle + 9.239490689 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 6154) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198141155E-004 +Relative difference = 2.8372990743794954e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.219777e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.222590e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.222590e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 5.099665 sec + 14,039,633,443 cycles # 2.752 GHz + 41,136,919,428 instructions # 2.93 insn per cycle + 5.103868350 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:20652) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198141133E-004 +Relative difference = 2.8372990776517314e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 7.313055e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.326878e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.326878e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.249075 sec + 5,996,219,788 cycles # 2.662 GHz + 14,679,465,756 instructions # 2.45 insn per cycle + 2.253155059 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:14881) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 8.261945e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.280295e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.280295e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.991347 sec + 5,330,092,584 cycles # 2.673 GHz + 13,558,700,933 instructions # 2.54 insn per cycle + 1.995510774 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:14564) (512y: 45) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.545564e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.556815e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.556815e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.512636 sec + 4,237,115,233 cycles # 1.685 GHz + 6,890,325,988 instructions # 1.63 insn per cycle + 2.516727175 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1767) (512y: 61) (512z:14451) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt index bc0987eea5..72edeef53a 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,272 +10,220 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:39:37 +DATE: 2025-09-24_08:58:18 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.954093e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.263620e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.263620e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.997819e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.033734e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.033734e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.526732 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 2,227,837,882 cycles # 2.883 GHz - 3,476,505,124 instructions # 1.56 insn per cycle - 0.832118305 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge +TOTAL : 0.650712 sec + 2,666,339,708 cycles # 2.832 GHz + 4,289,519,507 instructions # 1.61 insn per cycle + 1.001668457 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge OMP= +==PROF== Profiling "diagram1": launch__registers_per_thread 106 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.643761e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.124122e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.124122e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.301805 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 10,501,615,955 cycles # 2.935 GHz - 23,489,948,913 instructions # 2.24 insn per cycle - 3.634545913 seconds time elapsed +==PROF== Profiling "diagram2": launch__registers_per_thread 98 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 124 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 -Avg ME (F77/GPU) = 6.6266731198158133E-004 -Relative difference = 2.837296512218831e-07 +Avg ME (F77/GPU) = 6.6266731198158112E-004 +Relative difference = 2.837296515491067e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.879294e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.880182e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.880182e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.777512e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.778338e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.778338e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.737845 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 25,658,913,414 cycles # 2.936 GHz - 78,963,594,343 instructions # 3.08 insn per cycle - 8.742435740 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4842) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 9.237890 sec + 26,569,728,048 cycles # 2.876 GHz + 80,669,309,749 instructions # 3.04 insn per cycle + 9.242234035 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 6154) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198141133E-004 -Relative difference = 2.8372990776517314e-07 +Avg ME (F77/C++) = 6.6266731198141155E-004 +Relative difference = 2.8372990743794954e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.518464e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.521735e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.521735e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.302860e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.305691e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.305691e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.671849 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 13,102,544,659 cycles # 2.802 GHz - 39,572,381,519 instructions # 3.02 insn per cycle - 4.676455621 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13192) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.976384 sec + 14,071,437,640 cycles # 2.826 GHz + 41,149,571,507 instructions # 2.92 insn per cycle + 4.980974280 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:20652) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198141122E-004 -Relative difference = 2.837299079287849e-07 +Avg ME (F77/C++) = 6.6266731198141133E-004 +Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.057114e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.073561e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.073561e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.261515e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.275168e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.275168e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.046600 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 5,627,314,455 cycles # 2.744 GHz - 13,834,298,777 instructions # 2.46 insn per cycle - 2.051219882 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11520) (512y: 0) (512z: 0) +TOTAL : 2.269473 sec + 6,044,083,589 cycles # 2.659 GHz + 14,689,689,145 instructions # 2.43 insn per cycle + 2.274051232 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:14881) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.239341e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.261385e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.261385e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.269552e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.288361e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.288361e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.786219 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 4,931,565,389 cycles # 2.756 GHz - 12,515,991,121 instructions # 2.54 insn per cycle - 1.790909503 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10439) (512y: 89) (512z: 0) +TOTAL : 1.994141 sec + 5,344,979,439 cycles # 2.675 GHz + 13,569,216,111 instructions # 2.54 insn per cycle + 1.998658815 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:14564) (512y: 45) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe 
[ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.038188e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.051446e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.051446e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.570403e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.581912e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.581912e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.341272 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 4,150,945,217 cycles # 1.770 GHz - 6,403,675,117 instructions # 1.54 insn per cycle - 2.345955468 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1978) (512y: 101) (512z: 9386) +TOTAL : 2.507538 sec + 4,251,930,707 cycles # 1.693 GHz + 6,900,759,684 instructions # 1.62 insn per cycle + 2.512191271 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1767) (512y: 61) (512z:14451) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt index be31042fc1..f735859252 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:50:33 +DATE: 2025-09-24_09:15:41 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.314159e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.339458e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.341417e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.036749e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.039258e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.039455e+05 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.518126 sec -INFO: No Floating Point Exceptions have been reported - 2,164,802,026 cycles # 2.881 GHz - 3,409,915,390 instructions # 1.58 insn per cycle - 0.811338657 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --common -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.641624 sec + 2,661,941,432 cycles # 2.834 GHz + 4,378,783,837 instructions # 1.64 insn per cycle + 0.996435485 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.134613e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.165487e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.166746e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 -TOTAL : 3.140406 sec -INFO: No Floating Point Exceptions have been reported - 9,973,053,404 cycles # 2.934 GHz - 20,986,544,572 instructions # 2.10 insn per cycle - 3.455765313 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --common +==PROF== Profiling "diagram1": launch__registers_per_thread 106 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 98 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 124 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 -Avg ME (F77/GPU) = 6.6266731198158133E-004 -Relative difference = 2.837296512218831e-07 +Avg ME (F77/GPU) = 6.6266731198158112E-004 +Relative difference = 2.837296515491067e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.884135e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.885033e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.885033e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.774602e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.775455e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.775455e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 8.712529 sec -INFO: No Floating Point Exceptions have been reported - 25,691,717,185 cycles # 2.948 GHz - 78,960,325,856 instructions # 3.07 insn per cycle - 8.716734440 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4842) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 9.249876 sec + 26,570,238,887 cycles # 2.872 GHz + 80,663,446,059 instructions # 3.04 insn per cycle + 9.253723929 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 6154) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198141133E-004 -Relative difference = 2.8372990776517314e-07 +Avg ME (F77/C++) = 6.6266731198141155E-004 +Relative difference = 2.8372990743794954e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.543458e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.546697e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.546697e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.215926e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.218618e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.218618e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 4.636367 sec -INFO: No Floating Point Exceptions have been reported - 13,067,183,546 cycles # 2.816 GHz - 39,558,454,763 instructions # 3.03 insn per cycle - 4.640590687 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13192) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.107568 sec + 14,024,060,474 cycles # 2.744 GHz + 41,136,719,909 instructions # 2.93 insn per cycle + 5.111745095 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:20652) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198141122E-004 -Relative difference = 2.837299079287849e-07 +Avg ME (F77/C++) = 6.6266731198141133E-004 +Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.084806e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.101064e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.101064e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.317059e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.331542e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.331542e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 2.036679 sec -INFO: No Floating Point Exceptions have been reported - 5,613,470,524 cycles # 2.752 GHz - 13,823,796,455 instructions # 2.46 insn per cycle - 2.040900437 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11520) (512y: 0) (512z: 0) +TOTAL : 2.249705 sec + 6,011,566,666 cycles # 2.669 GHz + 14,678,181,798 instructions # 2.44 insn per cycle + 2.253767973 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:14881) (512y: 0) 
(512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.198723e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.219905e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.219905e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.063615e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.080699e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.080699e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.791160 sec -INFO: No Floating Point Exceptions have been reported - 4,922,288,820 cycles # 2.743 GHz - 12,503,388,745 instructions # 2.54 insn per cycle - 1.795321275 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10439) (512y: 89) (512z: 0) +TOTAL : 2.041817 sec + 5,403,229,519 cycles # 2.642 GHz + 13,557,173,256 instructions # 2.51 insn per cycle + 2.045882374 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:14564) (512y: 45) (512z: 0) 
------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.975365e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.987686e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.987686e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.516082e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.527321e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.527321e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 2.359532 sec -INFO: No Floating Point Exceptions have been reported - 4,155,009,705 cycles # 1.759 GHz - 6,390,945,346 instructions # 1.54 insn per cycle - 2.363732897 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1978) (512y: 101) (512z: 9386) +TOTAL : 2.525144 sec + 4,244,374,081 cycles # 1.679 GHz + 6,887,965,141 instructions # 1.62 insn per cycle + 2.529266849 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1767) (512y: 61) (512z:14451) ------------------------------------------------------------------------- 
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt index 437b6b7cbd..359d241c33 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:47:41 +DATE: 2025-09-24_09:11:37 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --curhst OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.310053e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.334627e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.336677e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.039836e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.042417e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.042618e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.518612 sec -INFO: No Floating Point Exceptions have been reported - 2,156,837,380 cycles # 2.875 GHz - 3,433,389,555 instructions # 1.59 insn per cycle - 0.811650542 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --curhst -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.640741 sec + 2,653,679,481 cycles # 2.830 GHz + 4,331,233,604 instructions # 1.63 insn per cycle + 0.994615706 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.128944e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.159258e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.160487e+05 ) sec^-1
-MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4
-TOTAL : 3.091523 sec
-INFO: No Floating Point Exceptions have been reported
- 9,825,563,648 cycles # 2.933 GHz
- 22,802,776,931 instructions # 2.32 insn per cycle
- 3.405923259 seconds time elapsed
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --curhst
+==PROF== Profiling "diagram1": launch__registers_per_thread 106
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 98
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 124
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100%
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
Avg ME (C++/GPU) = 6.626675e-04
-Avg ME (F77/GPU) = 6.6266731198158133E-004
-Relative difference = 2.837296512218831e-07
+Avg ME (F77/GPU) = 6.6266731198158112E-004
+Relative difference = 2.837296515491067e-07
OK (relative difference <= 5E-3)
=========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 1.890035e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.890938e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.890938e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.779631e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.780498e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.780498e+03 ) sec^-1
MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 8.683864 sec
-INFO: No Floating Point Exceptions have been reported
- 25,635,022,031 cycles # 2.951 GHz
- 78,960,809,140 instructions # 3.08 insn per cycle
- 8.688143049 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 4842) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 9.221957 sec
+ 26,521,385,152 cycles # 2.875 GHz
+ 80,663,083,749 instructions # 3.04 insn per cycle
+ 9.226164416 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 6154) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266731198141133E-004
-Relative difference = 2.8372990776517314e-07
+Avg ME (F77/C++) = 6.6266731198141155E-004
+Relative difference = 2.8372990743794954e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.535619e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.538805e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.538805e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.188674e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.191422e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.191422e+03 ) sec^-1
MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 4.644682 sec
-INFO: No Floating Point Exceptions have been reported
- 13,070,212,228 cycles # 2.812 GHz
- 39,558,910,913 instructions # 3.03 insn per cycle
- 4.648863484 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:13192) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 5.149366 sec
+ 14,036,040,292 cycles # 2.724 GHz
+ 41,137,477,026 instructions # 2.93 insn per cycle
+ 5.153371013 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:20652) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266731198141122E-004
-Relative difference = 2.837299079287849e-07
+Avg ME (F77/C++) = 6.6266731198141133E-004
+Relative difference = 2.8372990776517314e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 7.974136e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.989764e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.989764e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.310052e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.323851e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.323851e+03 ) sec^-1
MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 2.062978 sec
-INFO: No Floating Point Exceptions have been reported
- 5,609,565,523 cycles # 2.715 GHz
- 13,823,736,601 instructions # 2.46 insn per cycle
- 2.067208066 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11520) (512y: 0) (512z: 0)
+TOTAL : 2.249818 sec
+ 6,006,737,973 cycles # 2.666 GHz
+ 14,679,070,171 instructions # 2.44 insn per cycle
+ 2.254048539 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:14881) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
Avg ME (F77/C++) = 6.6266731198157320E-004
Relative difference = 2.837296634927675e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 9.256862e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.278276e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.278276e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.198807e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.216988e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.216988e+03 ) sec^-1
MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.778135 sec
-INFO: No Floating Point Exceptions have been reported
- 4,913,104,520 cycles # 2.758 GHz
- 12,505,156,898 instructions # 2.55 insn per cycle
- 1.782374042 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10439) (512y: 89) (512z: 0)
+TOTAL : 2.006666 sec
+ 5,349,449,687 cycles # 2.662 GHz
+ 13,558,519,738 instructions # 2.53 insn per cycle
+ 2.010742077 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:14564) (512y: 45) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
Avg ME (F77/C++) = 6.6266731198157320E-004
Relative difference = 2.837296634927675e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 7.040533e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.053211e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.053211e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.526022e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.537398e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.537398e+03 ) sec^-1
MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 2.335968 sec
-INFO: No Floating Point Exceptions have been reported
- 4,137,289,106 cycles # 1.769 GHz
- 6,392,511,975 instructions # 1.55 insn per cycle
- 2.340416062 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1978) (512y: 101) (512z: 9386)
+TOTAL : 2.519591 sec
+ 4,240,128,814 cycles # 1.681 GHz
+ 6,889,716,156 instructions # 1.62 insn per cycle
+ 2.523839777 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1767) (512y: 61) (512z:14451)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
Avg ME (F77/C++) = 6.6266731198157320E-004
Relative difference = 2.837296634927675e-07
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_noBlas.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_noBlas.txt
new file mode 100644
index 0000000000..aaf196e26a
--- /dev/null
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_noBlas.txt
@@ -0,0 +1,225 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
+
+HASBLAS=hasNoBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
+BACKEND=cpp512y (was cppauto)
+OMPFLAGS=
+FPTYPE='m'
+HELINL='0'
+HRDCOD='0'
+HASCURAND=hasCurand
+HASHIPRAND=hasNoHiprand
+HASBLAS=hasNoBlas
+Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
+make: Nothing to be done for 'gtestlibs'.
+
+make USEBUILDDIR=1 BACKEND=cuda
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+
+make USEBUILDDIR=1 BACKEND=cppnone
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+
+make USEBUILDDIR=1 BACKEND=cppsse4
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+
+make USEBUILDDIR=1 BACKEND=cppavx2
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+
+make USEBUILDDIR=1 BACKEND=cpp512y
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+
+make USEBUILDDIR=1 BACKEND=cpp512z
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+
+DATE: 2025-09-24_09:22:00
+
+HASBLAS=hasNoBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+=========================================================================
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+FP precision = DOUBLE (NaN/abnormal=0, zero=0)
+EvtsPerSec[Rmb+ME] (23) = ( 1.046882e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.049339e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.049531e+05 ) sec^-1
+MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
+TOTAL : 0.658941 sec
+ 2,619,370,753 cycles # 2.830 GHz
+ 4,315,879,983 instructions # 1.65 insn per cycle
+ 0.985069394 seconds time elapsed
+.........................................................................
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 106
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 98
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 124
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100%
+-------------------------------------------------------------------------
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe
+[ PASSED ] 4 tests.
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
+DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
+Avg ME (C++/GPU) = 6.626675e-04
+Avg ME (F77/GPU) = 6.6266731198158112E-004
+Relative difference = 2.837296515491067e-07
+OK (relative difference <= 5E-3)
+=========================================================================
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe
+=========================================================================
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+FP precision = DOUBLE (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
+EvtsPerSec[Rmb+ME] (23) = ( 1.777719e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.778564e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.778564e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
+TOTAL : 9.231985 sec
+ 26,530,286,433 cycles # 2.873 GHz
+ 80,663,160,739 instructions # 3.04 insn per cycle
+ 9.235966447 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 6154) (avx2: 0) (512y: 0) (512z: 0)
+-------------------------------------------------------------------------
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe
+[ PASSED ] 4 tests.
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
+DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+Avg ME (C++/C++) = 6.626675e-04
+Avg ME (F77/C++) = 6.6266731198141155E-004
+Relative difference = 2.8372990743794954e-07
+OK (relative difference <= 5E-3)
+=========================================================================
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
+FP precision = DOUBLE (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
+EvtsPerSec[Rmb+ME] (23) = ( 3.214661e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.217440e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.217440e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
+TOTAL : 5.107602 sec
+ 14,019,762,133 cycles # 2.743 GHz
+ 41,137,185,286 instructions # 2.93 insn per cycle
+ 5.111806436 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:20652) (avx2: 0) (512y: 0) (512z: 0)
+-------------------------------------------------------------------------
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe
+[ PASSED ] 4 tests.
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
+DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+Avg ME (C++/C++) = 6.626675e-04
+Avg ME (F77/C++) = 6.6266731198141133E-004
+Relative difference = 2.8372990776517314e-07
+OK (relative difference <= 5E-3)
+=========================================================================
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+FP precision = DOUBLE (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
+EvtsPerSec[Rmb+ME] (23) = ( 7.325470e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.339668e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.339668e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
+TOTAL : 2.245120 sec
+ 6,000,634,007 cycles # 2.669 GHz
+ 14,678,782,792 instructions # 2.45 insn per cycle
+ 2.249347946 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:14881) (512y: 0) (512z: 0)
+-------------------------------------------------------------------------
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe
+[ PASSED ] 4 tests.
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
+DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+Avg ME (C++/C++) = 6.626675e-04
+Avg ME (F77/C++) = 6.6266731198157320E-004
+Relative difference = 2.837296634927675e-07
+OK (relative difference <= 5E-3)
+=========================================================================
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
+FP precision = DOUBLE (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
+EvtsPerSec[Rmb+ME] (23) = ( 8.223501e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.241227e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.241227e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
+TOTAL : 2.000737 sec
+ 5,334,637,803 cycles # 2.662 GHz
+ 13,558,380,332 instructions # 2.54 insn per cycle
+ 2.004961011 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:14564) (512y: 45) (512z: 0)
+-------------------------------------------------------------------------
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe
+[ PASSED ] 4 tests.
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
+DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+Avg ME (C++/C++) = 6.626675e-04
+Avg ME (F77/C++) = 6.6266731198157320E-004
+Relative difference = 2.837296634927675e-07
+OK (relative difference <= 5E-3)
+=========================================================================
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
+FP precision = DOUBLE (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
+EvtsPerSec[Rmb+ME] (23) = ( 6.549501e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.560889e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.560889e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
+TOTAL : 2.510605 sec
+ 4,244,550,697 cycles # 1.689 GHz
+ 6,889,807,087 instructions # 1.62 insn per cycle
+ 2.514890598 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1767) (512y: 61) (512z:14451)
+-------------------------------------------------------------------------
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe
+[ PASSED ] 4 tests.
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
+DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+Avg ME (C++/C++) = 6.626675e-04
+Avg ME (F77/C++) = 6.6266731198157320E-004
+Relative difference = 2.837296634927675e-07
+OK (relative difference <= 5E-3)
+=========================================================================
+
+TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt
index f2b15e4b6f..357c24f245 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=

-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
BACKEND=cpp512y (was cppauto)
OMPFLAGS=
FPTYPE='m'
@@ -7,251 +10,216 @@ HELINL='0'
HRDCOD='0'
HASCURAND=hasCurand
HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
make: Nothing to be done for 'gtestlibs'.

make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'

make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'

make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'

make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'

make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'

make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'

-DATE: 2024-10-06_09:44:55
+DATE: 2025-09-24_09:07:38

+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst OMP=
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst OMP=
WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.041462e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.325366e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.327398e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.010689e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.040832e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.041036e+05 ) sec^-1
MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 0.520118 sec
-INFO: No Floating Point Exceptions have been reported
- 2,177,158,293 cycles # 2.891 GHz
- 3,464,316,990 instructions # 1.59 insn per cycle
- 0.812097316 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst
-WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.647797 sec
+ 2,659,414,719 cycles # 2.832 GHz
+ 4,321,757,984 instructions # 1.63 insn per cycle
+ 0.998229949 seconds time elapsed
.........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst OMP=
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst
WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.734798e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.174453e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.175668e+05 ) sec^-1
-MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4
-TOTAL : 3.213650 sec
-INFO: No Floating Point Exceptions have been reported
- 10,150,922,529 cycles # 2.918 GHz
- 23,231,659,490 instructions # 2.29 insn per cycle
- 3.538737264 seconds time elapsed
+==PROF== Profiling "diagram1": launch__registers_per_thread 106
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
+==PROF== Profiling "diagram2": launch__registers_per_thread 98
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 124
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100%
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/runTest_cuda.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
Avg ME (C++/GPU) = 6.626675e-04
-Avg ME (F77/GPU) = 6.6266731198158133E-004
-Relative difference = 2.837296512218831e-07
+Avg ME (F77/GPU) = 6.6266731198158112E-004
+Relative difference = 2.837296515491067e-07
OK (relative difference <= 5E-3)
=========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd0/check_hip.exe
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 1.885407e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.886309e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.886309e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.777828e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.778669e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.778669e+03 ) sec^-1
MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 8.705137 sec
-INFO: No Floating Point Exceptions have been reported
- 25,650,530,800 cycles # 2.946 GHz
- 78,960,008,246 instructions # 3.08 insn per cycle
- 8.709419634 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 4842) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 9.231282 sec
+ 26,561,084,123 cycles # 2.876 GHz
+ 80,664,781,232 instructions # 3.04 insn per cycle
+ 9.235283514 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 6154) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266731198141133E-004
-Relative difference = 2.8372990776517314e-07
+Avg ME (F77/C++) = 6.6266731198141155E-004
+Relative difference = 2.8372990743794954e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.551750e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.554937e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.554937e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.218246e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.221081e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.221081e+03 ) sec^-1
MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 4.623453 sec
-INFO: No Floating Point Exceptions have been reported
- 13,056,946,389 cycles # 2.822 GHz
- 39,559,090,760 instructions # 3.03 insn per cycle
- 4.627712527 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:13192) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 5.102107 sec
+ 14,046,126,931 cycles # 2.752 GHz
+ 41,137,219,329 instructions # 2.93 insn per cycle
+ 5.106189798 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:20652) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266731198141122E-004
-Relative difference = 2.837299079287849e-07
+Avg ME (F77/C++) = 6.6266731198141133E-004
+Relative difference = 2.8372990776517314e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 8.090893e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.106933e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.106933e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.245451e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.259263e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.259263e+03 ) sec^-1
MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 2.033338 sec
-INFO: No Floating Point Exceptions have been reported
- 5,609,780,879 cycles # 2.754 GHz
- 13,824,722,765 instructions # 2.46 insn per cycle
- 2.037509617 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11520) (512y: 0) (512z: 0)
+TOTAL : 2.270158 sec
+ 6,034,991,732 cycles # 2.655 GHz
+ 14,679,285,512 instructions # 2.43 insn per cycle
+ 2.274258617 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:14881) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
Avg ME (F77/C++) = 6.6266731198157320E-004
Relative difference = 2.837296634927675e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 9.188897e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.209893e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.209893e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.183410e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.201402e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.201402e+03 ) sec^-1
MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.791081 sec
-INFO: No Floating Point Exceptions have been reported
- 4,916,057,270 cycles # 2.740 GHz
- 12,505,186,935 instructions # 2.54 insn per cycle
- 1.795355106 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10439) (512y: 89) (512z: 0)
+TOTAL : 2.010584 sec
+ 5,334,939,311 cycles # 2.649 GHz
+ 13,558,672,256 instructions # 2.54 insn per cycle
+ 2.014695525 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:14564) (512y: 45) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
Avg ME (F77/C++) = 6.6266731198157320E-004
Relative difference = 2.837296634927675e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 7.019116e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.031683e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.031683e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.571999e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.583351e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.583351e+03 ) sec^-1
MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 2.343107 sec
-INFO: No Floating Point Exceptions have been reported
- 4,136,898,273 cycles # 1.763 GHz
- 6,392,336,539 instructions # 1.55 insn per cycle
- 2.347534329 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1978) (512y: 101) (512z: 9386)
+TOTAL : 2.502044 sec
+ 4,233,681,993 cycles # 1.690 GHz
+ 6,889,572,033 instructions # 1.63 insn per cycle
+ 2.506227900 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1767) (512y: 61) (512z:14451)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
Avg ME (F77/C++) = 6.6266731198157320E-004
Relative difference = 2.837296634927675e-07
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt
index 99e413a8a3..29ef532558 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=

-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
BACKEND=cpp512y (was cppauto)
OMPFLAGS=
FPTYPE='m'
@@ -7,248 +10,212 @@ HELINL='0'
HRDCOD='0'
HASCURAND=hasCurand
HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
make: Nothing to be done for 'gtestlibs'.

make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:03:14 +DATE: 2025-09-24_07:52:38 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.332738e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.357821e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.359802e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.036183e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.039169e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.039395e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.542209 sec -INFO: No Floating Point Exceptions have been reported - 2,220,139,727 cycles # 2.875 GHz - 3,465,138,857 instructions # 1.56 insn per cycle - 0.835706398 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.662823 sec + 2,690,649,108 cycles # 2.828 GHz + 4,286,045,440 instructions # 1.59 insn per cycle + 1.013510505 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.145716e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.176488e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.177708e+05 ) sec^-1
-MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4
-TOTAL : 3.039240 sec
-INFO: No Floating Point Exceptions have been reported
- 9,630,090,535 cycles # 2.918 GHz
- 21,945,170,652 instructions # 2.28 insn per cycle
- 3.356721463 seconds time elapsed
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 106
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 98
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 124
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100%
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/runTest_cuda.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2
Avg ME (C++/GPU) = 6.626675e-04
-Avg ME (F77/GPU) = 6.6266731198158133E-004
-Relative difference = 2.837296512218831e-07
+Avg ME (F77/GPU) = 6.6266731198158112E-004
+Relative difference = 2.837296515491067e-07
OK (relative difference <= 5E-3)
=========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd1/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl0_hrd1/check_hip.exe
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 1.881580e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.882499e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.882499e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.781146e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.781991e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.781991e+03 ) sec^-1
MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 8.723377 sec
-INFO: No Floating Point Exceptions have been reported
- 25,611,709,249 cycles # 2.935 GHz
- 78,703,444,126 instructions # 3.07 insn per cycle
- 8.727502935 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 4191) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 9.213856 sec
+ 26,488,697,165 cycles # 2.874 GHz
+ 80,598,102,732 instructions # 3.04 insn per cycle
+ 9.217833797 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 6108) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266731198141133E-004
-Relative difference = 2.8372990776517314e-07
+Avg ME (F77/C++) = 6.6266731198141155E-004
+Relative difference = 2.8372990743794954e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.593581e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.596889e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.596889e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.190938e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.193680e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.193680e+03 ) sec^-1
MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 4.571814 sec
-INFO: No Floating Point Exceptions have been reported
- 13,039,592,628 cycles # 2.851 GHz
- 39,453,086,877 instructions # 3.03 insn per cycle
- 4.575893049 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:12966) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 5.145449 sec
+ 14,122,597,527 cycles # 2.743 GHz
+ 41,123,270,337 instructions # 2.91 insn per cycle
+ 5.149556341 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:20833) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266731198141122E-004
-Relative difference = 2.837299079287849e-07
+Avg ME (F77/C++) = 6.6266731198141133E-004
+Relative difference = 2.8372990776517314e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 7.986878e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.003760e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.003760e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.251338e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.264895e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.264895e+03 ) sec^-1
MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 2.061484 sec
-INFO: No Floating Point Exceptions have been reported
- 5,673,128,561 cycles # 2.749 GHz
- 13,911,820,426 instructions # 2.45 insn per cycle
- 2.066505881 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11582) (512y: 0) (512z: 0)
+TOTAL : 2.268038 sec
+ 6,006,066,444 cycles # 2.645 GHz
+ 14,676,192,844 instructions # 2.44 insn per cycle
+ 2.272143843 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:14857) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
Avg ME (F77/C++) = 6.6266731198157320E-004
Relative difference = 2.837296634927675e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 9.098916e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.119150e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.119150e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.177622e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.195568e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.195568e+03 ) sec^-1
MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.809563 sec
-INFO: No Floating Point Exceptions have been reported
- 4,990,015,585 cycles # 2.753 GHz
- 12,604,471,256 instructions # 2.53 insn per cycle
- 1.813650628 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10423) (512y: 241) (512z: 0)
+TOTAL : 2.011600 sec
+ 5,382,012,156 cycles # 2.671 GHz
+ 13,555,448,039 instructions # 2.52 insn per cycle
+ 2.015732823 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:14538) (512y: 45) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
Avg ME (F77/C++) = 6.6266731198157320E-004
Relative difference = 2.837296634927675e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 6.910207e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.922434e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.922434e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.533539e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.545433e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.545433e+03 ) sec^-1
MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 2.380650 sec
-INFO: No Floating Point Exceptions have been reported
- 4,192,440,259 cycles # 1.759 GHz
- 6,502,191,985 instructions # 1.55 insn per cycle
- 2.384674618 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1754) (512y: 193) (512z: 9382)
+TOTAL : 2.516570 sec
+ 4,253,387,112 cycles # 1.688 GHz
+ 6,887,935,639 instructions # 1.62 insn per cycle
+ 2.520747303 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1741) (512y: 61) (512z:14451)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
Avg ME (F77/C++) = 6.6266731198157320E-004
Relative difference = 2.837296634927675e-07
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt
index 76362e2777..4f4356c48c 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
BACKEND=cpp512y (was cppauto)
OMPFLAGS=
FPTYPE='m'
@@ -7,251 +10,215 @@ HELINL='0'
HRDCOD='0'
HASCURAND=hasCurand
HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
make: Nothing to be done for 'gtestlibs'.
make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2024-10-06_09:30:00
+DATE: 2025-09-24_08:48:02
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/check_cuda.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/check_cuda.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=0]
Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.108959e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.129301e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.130870e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.042496e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.045142e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.045338e+05 ) sec^-1
MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 0.545749 sec
-INFO: No Floating Point Exceptions have been reported
- 2,205,865,001 cycles # 2.840 GHz
- 3,412,138,367 instructions # 1.55 insn per cycle
- 0.835130533 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/check_cuda.exe -p 64 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.657301 sec
+ 2,688,164,890 cycles # 2.832 GHz
+ 4,316,171,739 instructions # 1.61 insn per cycle
+ 1.011009554 seconds time elapsed
.........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.747537e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.771352e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.772362e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.317305 sec -INFO: No Floating Point Exceptions have been reported - 10,470,225,400 cycles # 2.928 GHz - 22,893,642,046 instructions # 2.19 insn per cycle - 3.632348979 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 106 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 98 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 124 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 -Avg ME (F77/GPU) = 6.6266731198158122E-004 -Relative difference = 2.837296513854949e-07 +Avg ME (F77/GPU) = 6.6266731198158112E-004 +Relative difference = 2.837296515491067e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl1_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl1_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 4.279433e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.279917e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.279917e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.749826e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.750647e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.750647e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 38.330200 sec -INFO: No Floating Point Exceptions have been reported - 112,786,835,820 cycles # 2.943 GHz - 144,812,254,859 instructions # 1.28 insn per cycle - 38.334547107 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:21273) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 9.379113 sec + 26,934,267,861 cycles # 2.871 GHz + 73,851,988,552 instructions # 2.74 insn per cycle + 9.383150461 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:12960) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198140461E-004 Relative difference = 2.8372991790910424e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.132336e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.134792e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.134792e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.195954e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.198717e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.198717e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 5.242571 sec -INFO: No Floating Point Exceptions have been reported - 14,761,048,074 cycles # 2.814 GHz - 37,609,615,991 instructions # 2.55 insn per cycle - 5.246531710 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:68172) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.137648 sec + 13,984,746,789 cycles # 2.720 GHz + 37,779,059,141 instructions # 2.70 insn per cycle + 5.141898165 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:67022) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198141209E-004 -Relative difference = 2.8372990661989057e-07 +Avg ME (F77/C++) = 6.6266731198141231E-004 +Relative difference = 2.8372990629266697e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.367426e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.381363e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.381363e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.579100e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.594477e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.594477e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.233268 sec -INFO: No Floating Point Exceptions have been reported - 6,121,196,467 cycles # 2.737 GHz - 13,054,881,187 instructions # 2.13 insn per cycle - 2.237420808 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:46946) (512y: 0) (512z: 0) +TOTAL : 2.170436 sec + 5,754,413,095 cycles # 2.647 GHz + 12,930,299,739 instructions # 2.25 insn per cycle + 2.174693824 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:44526) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198156789E-004 -Relative difference = 2.837296715097453e-07 +Avg ME (F77/C++) = 6.6266731198156811E-004 +Relative difference = 2.837296711825217e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.964974e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.985321e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.985321e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.763945e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.784121e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.784121e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.836637 sec -INFO: No Floating Point Exceptions have been reported - 5,064,709,437 cycles # 2.753 GHz - 11,452,008,336 instructions # 2.26 insn per cycle - 1.840705951 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:40486) (512y: 285) (512z: 0) +TOTAL : 1.877679 sec + 4,952,203,180 cycles # 2.633 GHz + 11,588,997,021 instructions # 2.34 insn per cycle + 1.881750512 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:39643) (512y: 55) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198156789E-004 -Relative difference = 2.837296715097453e-07 +Avg ME (F77/C++) = 6.6266731198156811E-004 +Relative difference = 2.837296711825217e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.358991e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.372760e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.372760e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.037416e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.050913e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.050913e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.235964 sec -INFO: No Floating Point Exceptions have been reported - 3,956,538,826 cycles # 1.767 GHz - 5,928,749,634 instructions # 1.50 insn per cycle - 2.240037452 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2444) (512y: 337) (512z:39338) +TOTAL : 2.337052 sec + 3,936,542,106 cycles # 1.682 GHz + 5,928,998,571 instructions # 1.51 insn per cycle + 2.341312783 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1600) (512y: 64) (512z:38942) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198156789E-004 -Relative difference = 2.837296715097453e-07 +Avg ME (F77/C++) = 6.6266731198156811E-004 +Relative difference = 2.837296711825217e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt index 5040f4b335..4a7bf9e008 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,251 +10,215 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2024-10-06_09:31:09
+DATE: 2025-09-24_08:48:56
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/check_cuda.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/check_cuda.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=1]
Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.107076e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.130192e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.131670e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.039955e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.042489e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.042684e+05 ) sec^-1
MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 0.539226 sec
-INFO: No Floating Point Exceptions have been reported
- 2,240,615,938 cycles # 2.902 GHz
- 3,467,491,001 instructions # 1.55 insn per cycle
- 0.828466018 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/check_cuda.exe -p 64 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.659114 sec
+ 2,692,518,276 cycles # 2.825 GHz
+ 4,310,380,031 instructions # 1.60 insn per cycle
+ 1.010529095 seconds time elapsed
.........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/check_cuda.exe -p 2048 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.751881e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.775679e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.776668e+05 ) sec^-1
-MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4
-TOTAL : 3.303070 sec
-INFO: No Floating Point Exceptions have been reported
- 10,434,569,638 cycles # 2.930 GHz
- 24,118,235,140 instructions # 2.31 insn per cycle
- 3.617886016 seconds time elapsed
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/check_cuda.exe -p 64 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 106
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 98
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 124
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100%
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/runTest_cuda.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_d_inl1_hrd1/fcheck_cuda.exe 2 64 2
Avg ME (C++/GPU) = 6.626675e-04
-Avg ME (F77/GPU) = 6.6266731198158122E-004
-Relative difference = 2.837296513854949e-07
+Avg ME (F77/GPU) = 6.6266731198158112E-004
+Relative difference = 2.837296515491067e-07
OK (relative difference <= 5E-3)
=========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl1_hrd1/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_d_inl1_hrd1/check_hip.exe
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 4.241409e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.241886e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.241886e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.754652e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.755482e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.755482e+03 ) sec^-1
MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 38.674103 sec
-INFO: No Floating Point Exceptions have been reported
- 113,958,477,984 cycles # 2.947 GHz
- 144,286,195,418 instructions # 1.27 insn per cycle
- 38.678088373 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:21024) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 9.353085 sec
+ 26,827,161,725 cycles # 2.868 GHz
+ 73,583,039,012 instructions # 2.74 insn per cycle
+ 9.357114519 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:12764) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266731198140450E-004
-Relative difference = 2.83729918072716e-07
+Avg ME (F77/C++) = 6.6266731198140461E-004
+Relative difference = 2.8372991790910424e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.007169e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.009483e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.009483e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.209478e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.212226e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.212226e+03 ) sec^-1
MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 5.460584 sec
-INFO: No Floating Point Exceptions have been reported
- 15,281,187,875 cycles # 2.797 GHz
- 37,839,169,102 instructions # 2.48 insn per cycle
- 5.464853538 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:68594) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 5.115971 sec
+ 13,960,203,452 cycles # 2.727 GHz
+ 37,775,458,315 instructions # 2.71 insn per cycle
+ 5.120088126 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:66935) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266731198141209E-004
-Relative difference = 2.8372990661989057e-07
+Avg ME (F77/C++) = 6.6266731198141231E-004
+Relative difference = 2.8372990629266697e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 7.567317e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.582163e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.582163e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.651963e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.666829e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.666829e+03 ) sec^-1
MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 2.174218 sec
-INFO: No Floating Point Exceptions have been reported
- 6,020,206,289 cycles # 2.765 GHz
- 12,923,983,464 instructions # 2.15 insn per cycle
- 2.178219828 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:46048) (512y: 0) (512z: 0)
+TOTAL : 2.149782 sec
+ 5,705,194,110 cycles # 2.650 GHz
+ 12,866,644,870 instructions # 2.26 insn per cycle
+ 2.153952917 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:43855) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266731198156789E-004
-Relative difference = 2.837296715097453e-07
+Avg ME (F77/C++) = 6.6266731198156811E-004
+Relative difference = 2.837296711825217e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 8.900478e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.920792e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.920792e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.882866e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.903987e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.903987e+03 ) sec^-1
MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.849478 sec
-INFO: No Floating Point Exceptions have been reported
- 5,102,330,026 cycles # 2.754 GHz
- 11,453,366,172 instructions # 2.24 insn per cycle
- 1.853513717 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:40151) (512y: 219) (512z: 0)
+TOTAL : 1.852637 sec
+ 4,930,923,995 cycles # 2.657 GHz
+ 11,551,223,645 instructions # 2.34 insn per cycle
+ 1.856813121 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:39128) (512y: 43) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266731198156789E-004
-Relative difference = 2.837296715097453e-07
+Avg ME (F77/C++) = 6.6266731198156811E-004
+Relative difference = 2.837296711825217e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 7.368242e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.382314e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.382314e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.061104e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.074586e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.074586e+03 ) sec^-1
MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 2.232876 sec
-INFO: No Floating Point Exceptions have been reported
- 3,951,515,189 cycles # 1.767 GHz
- 5,896,746,544 instructions # 1.49 insn per cycle
- 2.236852257 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1959) (512y: 259) (512z:38977)
+TOTAL : 2.329299 sec
+ 3,932,028,853 cycles # 1.686 GHz
+ 5,912,324,948 instructions # 1.50 insn per cycle
+ 2.333629850 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1168) (512y: 48) (512z:38635)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266731198156789E-004
-Relative difference = 2.837296715097453e-07
+Avg ME (F77/C++) = 6.6266731198156821E-004
+Relative difference = 2.8372967101890994e-07
OK (relative difference <= 5E-3)
=========================================================================
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.scaling
new file mode 100644
index 0000000000..28731b3dfa
--- /dev/null
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.scaling
@@ -0,0 +1,137 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
+
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
+BACKEND=cpp512y (was cppauto)
+OMPFLAGS=
+FPTYPE='m'
+HELINL='0'
+HRDCOD='0'
+HASCURAND=hasCurand
+HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
+Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
+make: Nothing to be done for 'gtestlibs'.
+
+make USEBUILDDIR=1 BACKEND=cuda
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+
+make USEBUILDDIR=1 BACKEND=cppnone
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+
+make USEBUILDDIR=1 BACKEND=cppsse4
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+
+make USEBUILDDIR=1 BACKEND=cppavx2
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+
+make USEBUILDDIR=1 BACKEND=cpp512y
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+
+make USEBUILDDIR=1 BACKEND=cpp512z
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+
+DATE: 2025-09-24_08:20:20
+
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe
+### GPU: scaling test 256
+4.874571e+03 1 256
+9.668344e+03 2 256
+1.898243e+04 4 256
+3.892187e+04 8 256
+7.406167e+04 16 256
+1.450850e+05 32 256
+2.285874e+05 64 256
+2.300604e+05 128 256
+2.321676e+05 256 256
+check_cuda.exe: Assertion `code == gpuSuccess' failed.
+check_cuda.exe: Assertion `code == gpuSuccess' failed.
+### GPU: scaling test 32
+6.430243e+02 1 32
+1.275993e+03 2 32
+2.526380e+03 4 32
+5.085785e+03 8 32
+1.005979e+04 16 32
+1.929453e+04 32 32
+3.746383e+04 64 32
+7.436476e+04 128 32
+1.449171e+05 256 32
+2.233797e+05 512 32
+2.278699e+05 1024 32
+2.267486e+05 2048 32
+check_cuda.exe: Assertion `code == gpuSuccess' failed.
+check_cuda.exe: Assertion `code == gpuSuccess' failed.
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+1.815758e+03 1 256
+1.824941e+03 2 256
+1.812890e+03 4 256
+### CPU: scaling test 32
+1.799421e+03 1 32
+1.811217e+03 2 32
+1.817109e+03 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+6.694201e+03 1 256
+6.530240e+03 2 256
+6.618481e+03 4 256
+### CPU: scaling test 32
+6.203984e+03 1 32
+6.493493e+03 2 32
+6.399063e+03 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+1.412108e+04 1 256
+1.422537e+04 2 256
+1.448658e+04 4 256
+### CPU: scaling test 32
+1.445264e+04 1 32
+1.450156e+04 2 32
+1.391386e+04 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+1.559570e+04 1 256
+1.624796e+04 2 256
+1.592395e+04 4 256
+### CPU: scaling test 32
+1.467926e+04 1 32
+1.562225e+04 2 32
+1.527629e+04 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+1.314281e+04 1 256
+1.318480e+04 2 256
+1.327322e+04 4 256
+### CPU: scaling test 32
+1.281594e+04 1 32
+1.304798e+04 2 32
+1.317554e+04 4 32
+=========================================================================
+
+TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt
index c4676334b0..9767c1c873 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
BACKEND=cpp512y (was cppauto)
OMPFLAGS=
FPTYPE='m'
@@ -7,251 +10,215 @@
HELINL='0'
HRDCOD='0'
HASCURAND=hasCurand
HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
make: Nothing to be done for 'gtestlibs'.
make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2024-10-06_09:04:57
+DATE: 2025-09-24_07:55:17
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.476973e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.519601e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.523500e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.264243e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.271869e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.272617e+05 ) sec^-1
MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4
-TOTAL : 0.498075 sec
-INFO: No Floating Point Exceptions have been reported
- 2,049,620,143 cycles # 2.856 GHz
- 3,058,097,989 instructions # 1.49 insn per cycle
- 0.977244524 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.573086 sec
+ 2,350,779,156 cycles # 2.827 GHz
+ 3,450,243,943 instructions # 1.47 insn per cycle
+ 0.888014676 seconds time elapsed
.........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.124860e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.187008e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.189727e+05 ) sec^-1
-MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4
-TOTAL : 1.797790 sec
-INFO: No Floating Point Exceptions have been reported
- 5,916,497,978 cycles # 2.910 GHz
- 12,115,730,956 instructions # 2.05 insn per cycle
- 2.090370837 seconds time elapsed
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 90
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 66
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 72
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100%
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2
Avg ME (C++/GPU) = 6.626454e-04
-Avg ME (F77/GPU) = 6.6262667672387088E-004
-Relative difference = 2.825534762507892e-05
+Avg ME (F77/GPU) = 6.6262666367186696E-004
+Relative difference = 2.827504444018108e-05
OK (relative difference <= 5E-3)
=========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 1.932981e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.933931e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.933931e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.809043e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.809885e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.809885e+03 ) sec^-1
MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4
-TOTAL : 8.490769 sec
-INFO: No Floating Point Exceptions have been reported
- 24,922,868,630 cycles # 2.935 GHz
- 79,110,265,707 instructions # 3.17 insn per cycle
- 8.496015758 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 3572) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 9.070907 sec
+ 26,076,793,123 cycles # 2.874 GHz
+ 81,082,815,652 instructions # 3.11 insn per cycle
+ 9.074860218 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 5120) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
-Avg ME (C++/C++) = 6.627486e-04
-Avg ME (F77/C++) = 6.6274863312764526E-004
-Relative difference = 4.998523613136231e-08
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+Avg ME (C++/C++) = 6.627487e-04
+Avg ME (F77/C++) = 6.6274865557072044E-004
+Relative difference = 6.703789776019192e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 6.975543e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.988298e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.988298e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4
-TOTAL : 2.356100 sec
-INFO: No Floating Point Exceptions have been reported
- 6,536,263,436 cycles # 2.771 GHz
- 20,271,266,485 instructions # 3.10 insn per cycle
- 2.362378155 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:13779) (avx2: 0) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 6.599536e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.610894e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.610894e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.060120e+00 +- 2.367901e+00 ) GeV^-4
+TOTAL : 2.490195 sec
+ 6,817,962,157 cycles # 2.735 GHz
+ 21,064,372,112 instructions # 3.09 insn per cycle
+ 2.494235927 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:21250) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 6.627486e-04
-Avg ME (F77/C++) = 6.6274861442972011E-004
-Relative difference = 2.1772539563413118e-08
+Avg ME (F77/C++) = 6.6274862240394555E-004
+Relative difference = 3.3804591304642774e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.588631e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.595153e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.595153e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.435208e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.440763e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.440763e+04 ) sec^-1
MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 1.038490 sec
-INFO: No Floating Point Exceptions have been reported
- 2,837,721,779 cycles # 2.726 GHz
- 7,066,858,765 instructions # 2.49 insn per cycle
- 1.044464831 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12055) (512y: 0) (512z: 0)
+TOTAL : 1.147813 sec
+ 3,056,395,258 cycles # 2.655 GHz
+ 7,494,074,266 instructions # 2.45 insn per cycle
+ 1.151895136 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:15414) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
-Avg ME (C++/C++) = 6.627194e-04
-Avg ME (F77/C++) = 6.6271938174396888E-004
-Relative difference = 2.7547150614455683e-08
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+Avg ME (C++/C++) = 6.627193e-04
+Avg ME (F77/C++) = 6.6271934460905568E-004
+Relative difference = 6.731214211985233e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.762421e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.770702e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.770702e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.554794e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.561237e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.561237e+04 ) sec^-1
MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 0.936394 sec
-INFO: No Floating Point Exceptions have been reported
- 2,577,125,275 cycles # 2.745 GHz
- 6,404,206,024 instructions # 2.49 insn per cycle
- 0.941322355 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11019) (512y: 44) (512z: 0)
+TOTAL : 1.059930 sec
+ 2,742,793,309 cycles # 2.580 GHz
+ 6,930,877,377 instructions # 2.53 insn per cycle
+ 1.063917437 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:15142) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
-Avg ME (C++/C++) = 6.627194e-04
-Avg ME (F77/C++) = 6.6271938174396888E-004
-Relative difference = 2.7547150614455683e-08
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+Avg ME (C++/C++) = 6.627193e-04
+Avg ME (F77/C++) = 6.6271934460905568E-004
+Relative difference = 6.731214211985233e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.409980e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.415034e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.415034e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.312360e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.316925e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.316925e+04 ) sec^-1
MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4
-TOTAL : 1.170914 sec
-INFO: No Floating Point Exceptions have been reported
- 2,069,436,546 cycles # 1.766 GHz
- 3,304,699,013 instructions # 1.60 insn per cycle
- 1.174781391 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2603) (512y: 44) (512z: 9605)
+TOTAL : 1.254936 sec
+ 2,115,349,761 cycles # 1.681 GHz
+ 3,555,960,059 instructions # 1.68 insn per cycle
+ 1.258998790 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2404) (512y: 5) (512z:14466)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 6.627195e-04
-Avg ME (F77/C++) = 6.6271952779718007E-004
-Relative difference = 4.194411063934945e-08
+Avg ME (F77/C++) = 6.6271953539095291E-004
+Relative difference = 5.340261281526277e-08
OK (relative difference <= 5E-3)
=========================================================================
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.scaling
new file mode 100644
index 0000000000..5aa2f01211
--- /dev/null
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.scaling
@@ -0,0 +1,137 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
+
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
+BACKEND=cpp512y (was cppauto)
+OMPFLAGS=
+FPTYPE='m'
+HELINL='0'
+HRDCOD='0'
+HASCURAND=hasCurand
+HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
+Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
+make: Nothing to be done for 'gtestlibs'.
+
+make USEBUILDDIR=1 BACKEND=cuda
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+
+make USEBUILDDIR=1 BACKEND=cppnone
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-09-24_08:40:35 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +4.512528e+03 1 256 +8.976635e+03 2 256 +1.751058e+04 4 256 +3.430598e+04 8 256 +6.666715e+04 16 256 +1.291068e+05 32 256 +2.050430e+05 64 256 +2.107149e+05 128 256 +2.155656e+05 256 256 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +### GPU: scaling test 32 +5.890329e+02 1 32 +1.152212e+03 2 32 +2.292040e+03 4 32 +4.632888e+03 8 32 +9.198270e+03 16 32 +1.769087e+04 32 32 +3.431557e+04 64 32 +6.676554e+04 128 32 +1.285799e+05 256 32 +2.026054e+05 512 32 +2.091692e+05 1024 32 +2.111970e+05 2048 32 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. 
+========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.824032e+03 1 256 +1.809863e+03 2 256 +1.820597e+03 4 256 +### CPU: scaling test 32 +1.801488e+03 1 32 +1.816095e+03 2 32 +1.818887e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +6.541208e+03 1 256 +6.646471e+03 2 256 +6.674420e+03 4 256 +### CPU: scaling test 32 +6.654268e+03 1 32 +6.445369e+03 2 32 +6.483277e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.403915e+04 1 256 +1.421509e+04 2 256 +1.432262e+04 4 256 +### CPU: scaling test 32 +1.327148e+04 1 32 +1.383509e+04 2 32 +1.411012e+04 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.518580e+04 1 256 +1.604878e+04 2 256 +1.599833e+04 4 256 +### CPU: scaling test 32 +1.493653e+04 1 32 +1.536784e+04 2 32 +1.504996e+04 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.296020e+04 1 256 +1.323754e+04 2 256 +1.324766e+04 4 256 +### CPU: scaling test 32 +1.287590e+04 1 32 +1.305082e+04 2 32 +1.315256e+04 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.txt new file mode 100644 index 0000000000..f26cad1106 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_blasOn.txt @@ -0,0 +1,225 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-09-24_08:35:23 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 2.037973e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.043230e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.043825e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 +TOTAL : 1.288940 sec + 4,705,960,611 cycles # 2.831 GHz + 6,563,046,338 instructions # 1.39 insn per cycle + 1.721562672 seconds time elapsed +......................................................................... 
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 90 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 66 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 72 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 6.626455e-04 +Avg ME (F77/GPU) = 6.6262664686948802E-004 +Relative difference = 2.845130693853636e-05 +OK (relative difference <= 5E-3) +========================================================================= +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +EvtsPerSec[Rmb+ME] (23) = ( 1.810761e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.811590e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.811590e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 +TOTAL : 9.062464 sec + 26,064,650,444 cycles # 2.875 GHz + 81,082,234,132 instructions # 3.11 insn per cycle + 9.066348887 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5120) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627487e-04 +Avg ME (F77/C++) = 6.6274865557072044E-004 +Relative difference = 6.703789776019192e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.534787e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.546219e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.546219e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060120e+00 +- 2.367901e+00 ) GeV^-4 +TOTAL : 2.514677 sec + 6,853,715,855 cycles # 2.722 GHz + 21,065,386,704 instructions # 3.07 insn per cycle + 2.518541485 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:21250) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627486e-04 +Avg ME (F77/C++) = 6.6274862240394555E-004 +Relative difference = 3.3804591304642774e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.442665e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.448022e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.448022e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 1.142061 sec + 3,053,183,512 cycles # 2.666 GHz + 7,494,362,915 instructions # 2.45 insn per cycle + 1.145914831 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:15414) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271934460905568E-004 +Relative difference = 6.731214211985233e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.601681e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.608402e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.608402e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 1.029063 sec + 2,740,562,821 cycles # 2.655 GHz + 6,930,786,130 instructions # 2.53 insn per cycle + 1.033186814 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:15142) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271934460905568E-004 +Relative difference = 6.731214211985233e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.312127e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.316575e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.316575e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 +TOTAL : 1.258103 sec + 2,118,915,872 cycles # 1.682 GHz + 3,556,524,861 instructions # 1.68 insn per cycle + 1.263934339 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2404) (512y: 5) (512z:14466) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271953539095291E-004 +Relative difference = 5.340261281526277e-08 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt index dec260c3af..ad335111b5 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,275 +10,223 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:40:11 +DATE: 2025-09-24_08:59:11 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.924368e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.456718e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.456718e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.147378e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.246468e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.246468e+05 ) sec^-1 MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4 -TOTAL : 0.481369 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 2,011,468,293 cycles # 2.883 GHz - 2,972,689,221 instructions # 1.48 insn per cycle - 0.755097926 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge +TOTAL : 0.557991 sec + 2,292,595,764 cycles # 2.825 GHz + 3,436,278,321 instructions # 1.50 insn per cycle + 0.868439857 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge OMP= +==PROF== Profiling "diagram1": launch__registers_per_thread 90 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.978465e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.128974e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.128974e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.641709e+00 +- 4.994248e+00 ) GeV^-4 -TOTAL : 1.967107 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 6,502,759,539 cycles # 2.928 GHz - 13,854,302,325 instructions # 2.13 insn per cycle - 2.276466534 seconds time elapsed +==PROF== Profiling "diagram2": launch__registers_per_thread 66 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 72 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626454e-04 -Avg ME (F77/GPU) = 6.6262667672387088E-004 -Relative difference = 2.825534762507892e-05 +Avg ME (F77/GPU) = 6.6262666367186696E-004 +Relative difference = 2.827504444018108e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.944212e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.945160e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.945160e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.810608e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.811460e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.811460e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.443358 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 24,934,407,175 cycles # 2.952 GHz - 79,115,502,595 instructions # 3.17 insn per cycle - 8.447759712 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3572) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 9.065521 sec + 26,088,065,408 cycles # 2.877 GHz + 81,087,099,769 instructions # 3.11 insn per cycle + 9.069721762 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5120) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274863312764526E-004 -Relative difference = 4.998523613136231e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627487e-04 +Avg ME (F77/C++) = 6.6274865557072044E-004 +Relative difference = 6.703789776019192e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.020230e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.033459e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.033459e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.344217 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 6,541,090,448 cycles # 2.786 GHz - 20,280,124,954 instructions # 3.10 insn per cycle - 2.348689069 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13779) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.570788e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.582625e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.582625e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060120e+00 +- 2.367901e+00 ) GeV^-4 +TOTAL : 2.503847 sec + 6,867,586,123 cycles # 2.739 GHz + 21,073,651,457 instructions # 3.07 insn per cycle + 2.508233409 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:21250) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program 
aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274861442972011E-004 -Relative difference = 2.1772539563413118e-08 +Avg ME (F77/C++) = 6.6274862240394555E-004 +Relative difference = 3.3804591304642774e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.604920e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.611581e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.611581e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.443366e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.449038e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.449038e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.029784 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 2,846,767,262 cycles # 2.755 GHz - 7,076,446,064 instructions # 2.49 insn per cycle - 1.034215836 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12055) (512y: 0) (512z: 0) +TOTAL : 1.143966 sec + 3,060,195,809 cycles # 2.667 GHz + 7,503,599,826 instructions # 2.45 insn per cycle + 1.148081805 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:15414) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174396888E-004 -Relative difference = 2.7547150614455683e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271934460905568E-004 +Relative difference = 6.731214211985233e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.797566e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.806224e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.806224e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.597626e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.604737e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.604737e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.920078 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 2,539,792,408 cycles # 2.749 GHz - 6,413,266,409 instructions # 2.53 insn per cycle - 0.924434981 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11019) (512y: 44) (512z: 0) +TOTAL : 1.034413 sec + 2,756,912,293 cycles # 2.656 GHz + 6,940,381,664 instructions # 2.52 insn per cycle + 1.038660514 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:15142) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174396888E-004 -Relative difference = 2.7547150614455683e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271934460905568E-004 +Relative difference = 6.731214211985233e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --bridge OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.411104e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.416189e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.416189e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.323709e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.328328e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.328328e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.170311 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 2,078,956,436 cycles # 1.771 GHz - 3,314,205,136 instructions # 1.59 insn per cycle - 1.174679954 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2603) (512y: 44) (512z: 9605) +TOTAL : 1.247183 sec + 2,125,047,261 cycles # 1.699 GHz + 3,566,175,366 instructions # 1.68 insn per cycle + 1.251460212 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2404) (512y: 5) (512z:14466) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952779718007E-004 -Relative difference = 4.194411063934945e-08 +Avg ME (F77/C++) = 6.6271953539095291E-004 +Relative difference = 5.340261281526277e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt index 3ebd5caeb8..95ef8247c5 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,251 +10,215 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:51:07 +DATE: 2025-09-24_09:16:35 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.481675e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.521755e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.525865e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.240709e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.247631e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.248323e+05 ) sec^-1 MeanMatrixElemValue = ( 4.159396e-01 +- 3.238803e-01 ) GeV^-4 -TOTAL : 0.477918 sec -INFO: No Floating Point Exceptions have been reported - 1,990,228,523 cycles # 2.864 GHz - 2,978,927,673 instructions # 1.50 insn per cycle - 0.751663902 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --common -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.554830 sec + 2,312,115,314 cycles # 2.824 GHz + 3,412,833,583 instructions # 1.48 insn per cycle + 0.876321370 seconds time elapsed ......................................................................... 
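
Note on the perf counters above: the "insn per cycle" figure is simply the instruction count divided by the cycle count from the same run. A minimal sketch recomputing it (Python; values copied verbatim from the check_cuda.exe run above, and ipc() is a hypothetical helper, not part of the madgraph4gpu tooling):

# Reproduce the 'insn per cycle' figure reported by perf above.
def ipc(instructions: int, cycles: int) -> float:
    return instructions / cycles

cycles = 2_312_115_314        # cycles counted for the cuda_f_inl0_hrd0 run
instructions = 3_412_833_583  # instructions counted for the same run
print(f"{ipc(instructions, cycles):.2f} insn per cycle")  # -> 1.48
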
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.037728e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.099183e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.101846e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.094367e+02 +- 1.071509e+02 ) GeV^-4 -TOTAL : 1.886731 sec -INFO: No Floating Point Exceptions have been reported - 6,136,710,401 cycles # 2.909 GHz - 13,142,850,218 instructions # 2.14 insn per cycle - 2.175693489 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --common +==PROF== Profiling "diagram1": launch__registers_per_thread 90 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 66 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 72 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626454e-04 -Avg ME (F77/GPU) = 6.6262667672387088E-004 -Relative difference = 2.825534762507892e-05 +Avg ME (F77/GPU) = 6.6262666367186696E-004 +Relative difference = 2.827504444018108e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.941292e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.942240e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.942240e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 8.455097 sec -INFO: No Floating Point Exceptions have been reported - 24,914,950,228 cycles # 2.946 GHz - 79,111,045,664 instructions # 3.18 insn per cycle - 8.459383915 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3572) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.807718e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.808561e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.808561e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 9.079227 sec + 26,084,123,419 cycles # 2.872 GHz + 81,083,696,494 instructions # 3.11 insn per cycle + 9.083053146 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5120) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
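
The "Relative difference" printed by these cmpExe comparisons follows directly from the two averages quoted just above it. A minimal sketch of that check (Python; it assumes the C++ average as the denominator, which reproduces the logged value to the digits shown, and reldiff() is a hypothetical name):

# Recompute the F77-vs-C++ consistency check from the averages above
# (values from the build.cuda_f_inl0_hrd0 comparison in this log).
def reldiff(a: float, b: float) -> float:
    return abs(a - b) / abs(a)

avg_me_cpp_gpu = 6.626454e-04            # Avg ME (C++/GPU)
avg_me_f77_gpu = 6.6262666367186696e-04  # Avg ME (F77/GPU)
rd = reldiff(avg_me_cpp_gpu, avg_me_f77_gpu)
print(f"Relative difference = {rd:.6e}")  # ~2.8275e-05
assert rd <= 5e-3                         # -> OK (relative difference <= 5E-3)
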
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274863312764526E-004 -Relative difference = 4.998523613136231e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627487e-04 +Avg ME (F77/C++) = 6.6274865557072044E-004 +Relative difference = 6.703789776019192e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.977213e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.990041e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.990041e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.589773e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.602019e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.602019e+03 ) sec^-1 MeanMatrixElemValue = ( 4.208457e-01 +- 3.253445e-01 ) GeV^-4 -TOTAL : 2.356205 sec -INFO: No Floating Point Exceptions have been reported - 6,550,546,250 cycles # 2.776 GHz - 20,269,237,886 instructions # 3.09 insn per cycle - 2.360272003 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13779) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.494806 sec + 6,814,877,087 cycles # 2.729 GHz + 21,062,900,930 instructions # 3.09 insn per cycle + 2.498709374 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:21250) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274861442972011E-004 -Relative difference = 2.1772539563413118e-08 +Avg ME (F77/C++) = 6.6274862240394555E-004 +Relative difference = 3.3804591304642774e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.601317e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.608084e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.608084e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.441516e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.447328e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.447328e+04 ) sec^-1 MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 -TOTAL : 1.030095 sec -INFO: No Floating Point Exceptions have been reported - 2,839,431,727 cycles # 2.748 GHz - 7,063,774,184 instructions # 2.49 insn per cycle - 1.034210988 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12055) (512y: 0) (512z: 0) +TOTAL : 1.144362 sec + 3,058,910,732 cycles # 2.667 GHz + 7,493,335,738 instructions # 2.45 insn per cycle + 1.148273072 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:15414) (512y: 0) (512z: 
0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174396888E-004 -Relative difference = 2.7547150614455683e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271934460905568E-004 +Relative difference = 6.731214211985233e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.801735e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.810193e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.810193e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.552103e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.558635e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.558635e+04 ) sec^-1 MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 -TOTAL : 0.916264 sec -INFO: No Floating Point Exceptions have been reported - 2,529,614,240 cycles # 2.751 GHz - 6,399,972,746 instructions # 2.53 insn per cycle - 0.920311559 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11019) (512y: 44) (512z: 0) +TOTAL : 1.063343 sec + 2,755,997,502 cycles # 2.585 GHz + 6,929,610,844 instructions # 2.51 insn per cycle + 1.067272546 seconds time elapsed +=Symbols in CPPProcess_cpp.o= 
(~sse4: 0) (avx2:15142) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174396888E-004 -Relative difference = 2.7547150614455683e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271934460905568E-004 +Relative difference = 6.731214211985233e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --common OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.413582e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.418711e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.418711e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.318308e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.322938e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.322938e+04 ) sec^-1 MeanMatrixElemValue = ( 4.214981e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 1.166574 sec -INFO: No Floating Point Exceptions have been reported - 2,070,023,042 cycles # 1.769 GHz - 3,300,470,940 instructions # 1.59 insn per cycle - 1.170621524 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2603) (512y: 44) (512z: 9605) +TOTAL : 1.250811 sec + 2,123,388,726 cycles # 1.694 GHz + 3,554,459,202 instructions # 1.67 insn per cycle + 1.254752290 seconds 
time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2404) (512y: 5) (512z:14466) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952779718007E-004 -Relative difference = 4.194411063934945e-08 +Avg ME (F77/C++) = 6.6271953539095291E-004 +Relative difference = 5.340261281526277e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt index 8aa78a916d..78f451b7bd 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,251 +10,215 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:48:16 +DATE: 2025-09-24_09:12:30 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --curhst OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.460370e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.501314e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.505347e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.253486e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.260232e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.260954e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.475676 sec -INFO: No Floating Point Exceptions have been reported - 1,998,344,168 cycles # 2.886 GHz - 3,027,104,836 instructions # 1.51 insn per cycle - 0.748859673 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --curhst -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.556466 sec + 2,306,511,731 cycles # 2.823 GHz + 3,405,943,740 instructions # 1.48 insn per cycle + 0.875455709 seconds time elapsed ......................................................................... 
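
The recurring "DEBUG: MEK (channelid array)" lines encode a per-channel event histogram: in these 512-event debug runs, the events land on 16 of the 123 channels (channels 2 through 17), 32 events each. A purely illustrative bookkeeping check (Python):

# Sanity-check the channelid histogram printed in the DEBUG MEK lines:
# 512 events spread over channels 2..17, 32 events each.
histogram = {ch: 32 for ch in range(2, 18)}
assert sum(histogram.values()) == 512  # all 512 events accounted for
assert len(histogram) == 16            # only 16 of the 123 channels are hit
print(f"{sum(histogram.values())} events across {len(histogram)} populated channels")
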
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.172168e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.234506e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.237328e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.821851 sec -INFO: No Floating Point Exceptions have been reported - 6,001,499,639 cycles # 2.924 GHz - 13,042,334,044 instructions # 2.17 insn per cycle - 2.109220847 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --curhst +==PROF== Profiling "diagram1": launch__registers_per_thread 90 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 66 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 72 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626454e-04 -Avg ME (F77/GPU) = 6.6262667672387088E-004 -Relative difference = 2.825534762507892e-05 +Avg ME (F77/GPU) = 6.6262666367186696E-004 +Relative difference = 2.827504444018108e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.941510e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.942442e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.942442e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.815805e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.816651e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.816651e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.452200 sec -INFO: No Floating Point Exceptions have been reported - 24,907,540,526 cycles # 2.946 GHz - 79,109,866,227 instructions # 3.18 insn per cycle - 8.456266423 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3572) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 9.037183 sec + 25,981,381,778 cycles # 2.874 GHz + 81,082,014,607 instructions # 3.12 insn per cycle + 9.041093443 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5120) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274863312764526E-004 -Relative difference = 4.998523613136231e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627487e-04 +Avg ME (F77/C++) = 6.6274865557072044E-004 +Relative difference = 6.703789776019192e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.017369e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.030395e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.030395e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.341887 sec -INFO: No Floating Point Exceptions have been reported - 6,533,658,672 cycles # 2.786 GHz - 20,270,788,705 instructions # 3.10 insn per cycle - 2.345994128 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13779) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.589046e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.600832e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.600832e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060120e+00 +- 2.367901e+00 ) GeV^-4 +TOTAL : 2.493906 sec + 6,799,938,748 cycles # 2.723 GHz + 21,064,168,821 instructions # 3.10 insn per cycle + 2.497884091 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:21250) (avx2: 0) 
(512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274861442972011E-004 -Relative difference = 2.1772539563413118e-08 +Avg ME (F77/C++) = 6.6274862240394555E-004 +Relative difference = 3.3804591304642774e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.604029e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.610893e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.610893e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.442997e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.448430e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.448430e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.027451 sec -INFO: No Floating Point Exceptions have been reported - 2,836,206,155 cycles # 2.751 GHz - 7,065,988,768 instructions # 2.49 insn per cycle - 1.031531216 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12055) (512y: 0) (512z: 0) +TOTAL : 1.141752 sec + 3,049,041,388 cycles # 2.663 GHz + 7,493,851,873 instructions # 2.46 insn per cycle + 1.145791302 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) 
(avx2:15414) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174396888E-004 -Relative difference = 2.7547150614455683e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271934460905568E-004 +Relative difference = 6.731214211985233e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.796598e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.804847e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.804847e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.606353e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.613147e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.613147e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.917896 sec -INFO: No Floating Point Exceptions have been reported - 2,527,698,465 cycles # 2.744 GHz - 6,403,574,368 instructions # 2.53 insn per cycle - 0.921906155 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11019) (512y: 44) (512z: 0) +TOTAL : 1.026037 sec + 2,741,511,931 cycles # 2.664 GHz + 6,930,338,040 instructions # 2.53 insn per cycle + 1.030062983 seconds time elapsed 
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:15142) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174396888E-004 -Relative difference = 2.7547150614455683e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271934460905568E-004 +Relative difference = 6.731214211985233e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --curhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.414079e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.419125e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.419125e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.320960e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.325425e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.325425e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.164994 sec -INFO: No Floating Point Exceptions have been reported - 2,068,678,617 cycles # 1.770 GHz - 3,304,093,166 instructions # 1.60 insn per cycle - 1.169236265 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2603) (512y: 44) (512z: 9605) +TOTAL : 1.246929 sec + 2,118,661,680 cycles # 1.695 GHz + 3,556,802,211 instructions # 1.68 insn 
per cycle + 1.251020606 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2404) (512y: 5) (512z:14466) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952779718007E-004 -Relative difference = 4.194411063934945e-08 +Avg ME (F77/C++) = 6.6271953539095291E-004 +Relative difference = 5.340261281526277e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_noBlas.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_noBlas.txt new file mode 100644 index 0000000000..014c2362f3 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_noBlas.txt @@ -0,0 +1,225 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasNoBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasNoBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-09-24_09:23:46 + +HASBLAS=hasNoBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 2.277839e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.284276e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.284966e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 +TOTAL : 0.572758 sec + 2,262,778,540 cycles # 2.823 GHz + 3,427,514,498 instructions # 1.51 insn per cycle + 0.858446661 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 90 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 66 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 72 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 6.626454e-04 +Avg ME (F77/GPU) = 6.6262666367186696E-004 +Relative difference = 2.827504444018108e-05 +OK (relative difference <= 5E-3) +========================================================================= +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +EvtsPerSec[Rmb+ME] (23) = ( 1.805032e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.805867e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.805867e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 +TOTAL : 9.091007 sec + 26,124,244,226 cycles # 2.873 GHz + 81,083,408,377 instructions # 3.10 insn per cycle + 9.094976493 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5120) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
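The throughput lines above can be read back directly from the command line: '-p 64 256 1' requests 64 blocks of 256 threads for 1 iteration, i.e. 16384 events, and each EvtsPerSec counter divides that event count by the time spent in the corresponding timed section. A minimal sketch of the arithmetic in Python (the evts_per_sec helper is hypothetical; only the 64/256/1 decomposition and the timings are taken from the log above):

def evts_per_sec(blocks: int, threads: int, iterations: int, seconds: float) -> float:
    # events = blocks * threads * iterations; throughput = events / elapsed time
    return blocks * threads * iterations / seconds

# -p 64 256 1 over the ~9.09 s TOTAL of the 'none' run above:
print(evts_per_sec(64, 256, 1, 9.091007))  # ~1802 events/s, within a few permille of EvtsPerSec[Rmb+ME] = 1.805032e+03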
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627487e-04 +Avg ME (F77/C++) = 6.6274865557072044E-004 +Relative difference = 6.703789776019192e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.666139e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.678572e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.678572e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060120e+00 +- 2.367901e+00 ) GeV^-4 +TOTAL : 2.465205 sec + 6,801,628,597 cycles # 2.755 GHz + 21,064,284,503 instructions # 3.10 insn per cycle + 2.469351630 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:21250) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
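The cycle and instruction counts in these listings come from perf stat, and the figures after '#' are derived ratios: instructions over cycles gives the 'insn per cycle' value, and cycles over the measured CPU time gives the effective clock. Checking the sse4 run above with pure arithmetic on the logged counters (the small discrepancy in the GHz figure is expected, since perf normalises by task-clock rather than wall-clock time):

cycles, instructions, seconds = 6_801_628_597, 21_064_284_503, 2.469351630
print(instructions / cycles)   # ~3.10, the logged 'insn per cycle'
print(cycles / seconds / 1e9)  # ~2.75, close to the logged '2.755 GHz'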
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627486e-04 +Avg ME (F77/C++) = 6.6274862240394555E-004 +Relative difference = 3.3804591304642774e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.445029e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.450720e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.450720e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 1.139925 sec + 3,050,112,240 cycles # 2.668 GHz + 7,494,303,135 instructions # 2.46 insn per cycle + 1.144054256 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:15414) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
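The '=Symbols in CPPProcess_cpp.o=' lines summarise which SIMD flavours actually ended up in the compiled object, which is how these logs verify that each backend delivered the vectorisation it promised: the 'none' build above shows only scalar/sse4-class symbols, the avx2 build shows avx2 symbols, and the 512z build further down is dominated by 512z symbols. A rough illustration of such a census, assuming the counts come from classifying disassembled instructions by SIMD register class (the script actually used by the test driver, and its exact categories, may differ):

import re, subprocess

def simd_census(objfile: str) -> dict:
    # Disassemble the object file and count instructions touching each register class
    asm = subprocess.run(["objdump", "-d", objfile], capture_output=True, text=True).stdout
    return {label: len(re.findall(regex, asm))
            for label, regex in (("xmm/sse", r"%xmm"), ("ymm/avx2", r"%ymm"), ("zmm/512z", r"%zmm"))}

On a build.512z_* object one would expect the zmm bucket to dominate, mirroring the (512z:14466) entries in these logs.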
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271934460905568E-004 +Relative difference = 6.731214211985233e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.602494e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.609389e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.609389e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 1.028625 sec + 2,754,615,458 cycles # 2.670 GHz + 6,930,577,140 instructions # 2.52 insn per cycle + 1.032612157 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:15142) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
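Each cmpExe block pairs the C++ (or CUDA) executable with its Fortran wrapper on identical '--common' inputs and demands that the two average matrix elements agree. The logged 'Relative difference' values are consistent with |ME_f77 - ME_cpp| / |ME_cpp|; a minimal sketch of the tolerance check, using the avx2 numbers above:

def rel_diff(me_cpp: float, me_f77: float) -> float:
    return abs(me_f77 - me_cpp) / abs(me_cpp)

r = rel_diff(6.627193e-04, 6.6271934460905568e-04)
print(r)          # ~6.731e-08, as logged above
assert r <= 5e-3  # 'OK (relative difference <= 5E-3)'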
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271934460905568E-004 +Relative difference = 6.731214211985233e-08 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 1.317313e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.321878e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.321878e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 +TOTAL : 1.250158 sec + 2,118,002,606 cycles # 1.690 GHz + 3,556,013,914 instructions # 1.68 insn per cycle + 1.254237663 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2404) (512y: 5) (512z:14466) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271953539095291E-004 +Relative difference = 5.340261281526277e-08 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt index 59696ff16e..6feb1df6d4 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,254 +10,219 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:45:30 +DATE: 2025-09-24_09:08:31 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst OMP= WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.026958e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.513975e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.517845e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.187538e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.276791e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.277504e+05 ) sec^-1 MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4 -TOTAL : 0.478506 sec -INFO: No Floating Point Exceptions have been reported - 1,992,355,788 cycles # 2.865 GHz - 3,027,729,409 instructions # 1.52 insn per cycle - 0.751914958 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.555785 sec + 2,284,015,691 cycles # 2.828 GHz + 3,386,185,673 instructions # 1.48 insn per cycle + 0.866642253 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --rmbhst OMP= +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --rmbhst WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.156008e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.226322e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.229025e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.641709e+00 +- 4.994248e+00 ) GeV^-4 -TOTAL : 1.900625 sec -INFO: No Floating Point Exceptions have been reported - 6,225,372,770 cycles # 2.919 GHz - 12,616,761,411 instructions # 2.03 insn per cycle - 2.188103626 seconds time elapsed +==PROF== Profiling "diagram1": launch__registers_per_thread 90 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "diagram2": launch__registers_per_thread 66 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 72 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626454e-04 -Avg ME (F77/GPU) = 6.6262667672387088E-004 -Relative difference = 2.825534762507892e-05 +Avg ME (F77/GPU) = 6.6262666367186696E-004 +Relative difference = 2.827504444018108e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.942577e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.943527e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.943527e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.811669e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.812527e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] 
(3a) = ( 1.812527e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.447888 sec -INFO: No Floating Point Exceptions have been reported - 24,912,816,300 cycles # 2.948 GHz - 79,110,249,403 instructions # 3.18 insn per cycle - 8.452014602 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3572) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 9.057855 sec + 26,043,974,250 cycles # 2.875 GHz + 81,082,670,791 instructions # 3.11 insn per cycle + 9.061675603 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5120) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274863312764526E-004 -Relative difference = 4.998523613136231e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627487e-04 +Avg ME (F77/C++) = 6.6274865557072044E-004 +Relative difference = 6.703789776019192e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.980733e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.993141e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.993141e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.354354 sec 
-INFO: No Floating Point Exceptions have been reported - 6,535,460,807 cycles # 2.772 GHz - 20,270,869,690 instructions # 3.10 insn per cycle - 2.358646539 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13779) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.616188e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.628081e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.628081e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060120e+00 +- 2.367901e+00 ) GeV^-4 +TOTAL : 2.483899 sec + 6,798,798,697 cycles # 2.734 GHz + 21,064,458,791 instructions # 3.10 insn per cycle + 2.487889319 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:21250) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274861442972011E-004 -Relative difference = 2.1772539563413118e-08 +Avg ME (F77/C++) = 6.6274862240394555E-004 +Relative difference = 3.3804591304642774e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.603543e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.610156e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.610156e+04 ) sec^-1 
+EvtsPerSec[Rmb+ME] (23) = ( 1.444931e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.450581e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.450581e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.027888 sec -INFO: No Floating Point Exceptions have been reported - 2,837,672,612 cycles # 2.752 GHz - 7,066,358,168 instructions # 2.49 insn per cycle - 1.031930682 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12055) (512y: 0) (512z: 0) +TOTAL : 1.140030 sec + 3,048,561,490 cycles # 2.667 GHz + 7,493,968,699 instructions # 2.46 insn per cycle + 1.144111649 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:15414) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174396888E-004 -Relative difference = 2.7547150614455683e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271934460905568E-004 +Relative difference = 6.731214211985233e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.798975e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.807399e+04 ) sec^-1 
-EvtsPerSec[MECalcOnly] (3a) = ( 1.807399e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.598657e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.605554e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.605554e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.916670 sec -INFO: No Floating Point Exceptions have been reported - 2,525,901,356 cycles # 2.745 GHz - 6,403,453,175 instructions # 2.54 insn per cycle - 0.920789172 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11019) (512y: 44) (512z: 0) +TOTAL : 1.030819 sec + 2,744,491,104 cycles # 2.654 GHz + 6,930,618,312 instructions # 2.53 insn per cycle + 1.035010575 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:15142) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174396888E-004 -Relative difference = 2.7547150614455683e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271934460905568E-004 +Relative difference = 6.731214211985233e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 1 --rmbhst OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.406582e+04 ) sec^-1 
-EvtsPerSec[MatrixElems] (3) = ( 1.411589e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.411589e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.321285e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.325896e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.325896e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.171278 sec -INFO: No Floating Point Exceptions have been reported - 2,071,908,739 cycles # 1.764 GHz - 3,303,987,486 instructions # 1.59 insn per cycle - 1.175442581 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2603) (512y: 44) (512z: 9605) +TOTAL : 1.246497 sec + 2,116,511,021 cycles # 1.694 GHz + 3,556,085,472 instructions # 1.68 insn per cycle + 1.250375518 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2404) (512y: 5) (512z:14466) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952779718007E-004 -Relative difference = 4.194411063934945e-08 +Avg ME (F77/C++) = 6.6271953539095291E-004 +Relative difference = 5.340261281526277e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt index fc006f8d57..d1663b8a42 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,251 +10,215 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 
for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:05:24 +DATE: 2025-09-24_07:56:03 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.473150e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.513248e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.516891e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.271341e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.278856e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.279619e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.500476 sec -INFO: No Floating Point Exceptions have been reported - 2,066,687,911 cycles # 2.859 GHz - 3,064,980,702 instructions # 1.48 insn per cycle - 0.941605450 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.570407 sec + 2,331,140,164 cycles # 2.830 GHz + 3,426,106,320 instructions # 1.47 insn per cycle + 0.880168125 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.096999e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.159101e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.161763e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.803372 sec -INFO: No Floating Point Exceptions have been reported - 5,931,019,959 cycles # 2.909 GHz - 12,491,679,666 instructions # 2.11 insn per cycle - 2.096189929 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 90 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 66 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 72 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626454e-04 -Avg ME (F77/GPU) = 6.6262667672387088E-004 -Relative difference = 2.825534762507892e-05 +Avg ME (F77/GPU) = 6.6262666367186696E-004 +Relative difference = 2.827504444018108e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.927739e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.928675e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.928675e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.820817e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.821684e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.821684e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.512686 sec -INFO: No Floating Point Exceptions have been reported - 24,976,995,918 cycles # 2.933 GHz - 78,849,322,260 instructions # 3.16 insn per cycle - 8.521021644 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3092) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 9.012115 sec + 25,906,896,614 cycles # 2.874 GHz + 81,026,549,335 instructions # 3.13 insn per cycle + 9.016202048 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 5064) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274866250177339E-004 -Relative difference = 5.65798569465384e-08 +Avg ME (F77/C++) = 6.6274865557083146E-004 +Relative difference = 6.703773024224362e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.196617e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.210064e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.210064e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.283841 sec -INFO: No Floating Point Exceptions have been reported - 6,462,353,077 cycles # 2.825 GHz - 20,230,287,596 instructions # 3.13 insn per cycle - 2.291660153 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13491) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.595850e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.607195e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.607195e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060120e+00 +- 2.367901e+00 ) GeV^-4 +TOTAL : 2.491441 sec + 6,807,327,813 cycles # 2.729 GHz + 21,056,857,927 instructions # 3.09 insn per cycle + 2.495580061 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:21429) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274861448331612E-004 -Relative difference = 2.1853408865157068e-08 +Avg ME (F77/C++) = 6.6274862240394555E-004 +Relative difference = 3.3804591304642774e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.507603e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.513399e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.513399e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.441697e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.447219e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.447219e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.094262 sec -INFO: No Floating Point Exceptions have been reported - 2,977,852,840 cycles # 2.716 GHz - 7,207,139,157 instructions # 2.42 insn per cycle - 1.100869463 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:12437) (512y: 0) (512z: 0) +TOTAL : 1.142349 sec + 3,055,307,583 cycles # 2.667 GHz + 7,491,997,935 instructions # 2.45 insn per cycle + 1.146425245 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:15384) (512y: 0) (512z: 0) 
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
-Avg ME (C++/C++) = 6.627194e-04
-Avg ME (F77/C++) = 6.6271939668088170E-004
-Relative difference = 5.008331292535666e-09
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
+Avg ME (C++/C++) = 6.627193e-04
+Avg ME (F77/C++) = 6.6271934460905568E-004
+Relative difference = 6.731214211985233e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.740158e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.747960e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.747960e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.613124e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.619942e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.619942e+04 ) sec^-1
MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 0.947565 sec
-INFO: No Floating Point Exceptions have been reported
- 2,615,044,427 cycles # 2.750 GHz
- 6,545,142,442 instructions # 2.50 insn per cycle
- 0.954571468 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11449) (512y: 27) (512z: 0)
+TOTAL : 1.021712 sec
+ 2,737,522,460 cycles # 2.670 GHz
+ 6,928,494,015 instructions # 2.53 insn per cycle
+ 1.025892768 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:15110) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
-Avg ME (C++/C++) = 6.627194e-04
-Avg ME (F77/C++) = 6.6271939668088170E-004
-Relative difference = 5.008331292535666e-09
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
+Avg ME (C++/C++) = 6.627193e-04
+Avg ME (F77/C++) = 6.6271934460905568E-004
+Relative difference = 6.731214211985233e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.344321e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.349023e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.349023e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.322207e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.326725e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.326725e+04 ) sec^-1
MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4
-TOTAL : 1.225060 sec
-INFO: No Floating Point Exceptions have been reported
- 2,140,395,059 cycles # 1.742 GHz
- 3,462,158,546 instructions # 1.62 insn per cycle
- 1.232075146 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3037) (512y: 25) (512z: 9677)
+TOTAL : 1.245540 sec
+ 2,116,694,696 cycles # 1.695 GHz
+ 3,553,801,233 instructions # 1.68 insn per cycle
+ 1.249553015 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2371) (512y: 5) (512z:14569)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 6.627195e-04
-Avg ME (F77/C++) = 6.6271952032316561E-004
-Relative difference = 3.066631594207157e-08
+Avg ME (F77/C++) = 6.6271953539095291E-004
+Relative difference = 5.340261281526277e-08
OK (relative difference <= 5E-3)
=========================================================================
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt
index 507fa267fb..1756d90979 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
BACKEND=cpp512y (was cppauto)
OMPFLAGS=
FPTYPE='m'
@@ -7,251 +10,215 @@ HELINL='0'
HRDCOD='0'
HASCURAND=hasCurand
HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
make: Nothing to be done for 'gtestlibs'.
make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2024-10-06_09:32:18
+DATE: 2025-09-24_08:49:50
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/check_cuda.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/check_cuda.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=0]
Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.570913e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.612300e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.616113e+05 ) sec^-1
-MeanMatrixElemValue = ( 4.059597e+00 +- 2.368053e+00 ) GeV^-4
-TOTAL : 0.500062 sec
-INFO: No Floating Point Exceptions have been reported
- 2,077,093,809 cycles # 2.883 GHz
- 3,095,482,027 instructions # 1.49 insn per cycle
- 0.782648151 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/check_cuda.exe -p 64 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+EvtsPerSec[Rmb+ME] (23) = ( 2.273145e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.279956e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.280681e+05 ) sec^-1
+MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4
+TOTAL : 0.571530 sec
+ 2,325,341,957 cycles # 2.824 GHz
+ 3,397,614,092 instructions # 1.46 insn per cycle
+ 0.880946906 seconds time elapsed
.........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/check_cuda.exe -p 2048 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.624378e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.693284e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.696098e+05 ) sec^-1
-MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4
-TOTAL : 1.736663 sec
-INFO: No Floating Point Exceptions have been reported
- 5,745,039,966 cycles # 2.917 GHz
- 12,243,347,327 instructions # 2.13 insn per cycle
- 2.029186282 seconds time elapsed
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/check_cuda.exe -p 64 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 90
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 66
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 72
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100%
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/runTest_cuda.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd0/fcheck_cuda.exe 2 64 2
Avg ME (C++/GPU) = 6.626454e-04
-Avg ME (F77/GPU) = 6.6262669162351490E-004
-Relative difference = 2.8232862531213374e-05
+Avg ME (F77/GPU) = 6.6262666367365719E-004
+Relative difference = 2.827504173853307e-05
OK (relative difference <= 5E-3)
=========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl1_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl1_hrd0/check_hip.exe
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 5.610943e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.611718e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.611718e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.865260e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.866153e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.866153e+03 ) sec^-1
MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4
-TOTAL : 29.233986 sec
-INFO: No Floating Point Exceptions have been reported
- 86,131,386,822 cycles # 2.946 GHz
- 135,652,659,903 instructions # 1.57 insn per cycle
- 29.237672033 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:15856) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 8.797861 sec
+ 25,249,208,808 cycles # 2.869 GHz
+ 74,028,397,772 instructions # 2.93 insn per cycle
+ 8.801741537 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:13828) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 6.627535e-04
-Avg ME (F77/C++) = 6.6275349717465765E-004
-Relative difference = 4.26303654465793e-09
+Avg ME (F77/C++) = 6.6275353240358159E-004
+Relative difference = 4.8892358250989e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 6.849906e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.862163e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.862163e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.059962e+00 +- 2.367792e+00 ) GeV^-4
-TOTAL : 2.399244 sec
-INFO: No Floating Point Exceptions have been reported
- 6,757,771,203 cycles # 2.813 GHz
- 19,352,943,673 instructions # 2.86 insn per cycle
- 2.403059869 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:69577) (avx2: 0) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 7.122488e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.136140e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.136140e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.059961e+00 +- 2.367791e+00 ) GeV^-4
+TOTAL : 2.307551 sec
+ 6,280,992,865 cycles # 2.718 GHz
+ 19,272,778,154 instructions # 3.07 insn per cycle
+ 2.311503285 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:67926) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 6.627486e-04
-Avg ME (F77/C++) = 6.6274862748188362E-004
-Relative difference = 4.14665283800746e-08
+Avg ME (F77/C++) = 6.6274861529819207E-004
+Relative difference = 2.308294891171356e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.430057e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.435326e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.435326e+04 ) sec^-1
-MeanMatrixElemValue = ( 4.060903e+00 +- 2.367377e+00 ) GeV^-4
-TOTAL : 1.151867 sec
-INFO: No Floating Point Exceptions have been reported
- 3,169,480,733 cycles # 2.744 GHz
- 6,794,963,559 instructions # 2.14 insn per cycle
- 1.155607574 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:49034) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 1.512821e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.519072e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.519072e+04 ) sec^-1
+MeanMatrixElemValue = ( 4.060955e+00 +- 2.367411e+00 ) GeV^-4
+TOTAL : 1.089282 sec
+ 2,902,145,708 cycles # 2.657 GHz
+ 6,574,950,454 instructions # 2.27 insn per cycle
+ 1.093366801 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:45256) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
-Avg ME (C++/C++) = 6.627273e-04
-Avg ME (F77/C++) = 6.6272731568543797E-004
-Relative difference = 2.3668012430631962e-08
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
+Avg ME (C++/C++) = 6.627271e-04
+Avg ME (F77/C++) = 6.6272713594379973E-004
+Relative difference = 5.423620023149683e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.731154e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.739005e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.739005e+04 ) sec^-1
-MeanMatrixElemValue = ( 4.060903e+00 +- 2.367377e+00 ) GeV^-4
-TOTAL : 0.952402 sec
-INFO: No Floating Point Exceptions have been reported
- 2,622,407,179 cycles # 2.744 GHz
- 5,970,044,618 instructions # 2.28 insn per cycle
- 0.956238068 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:42602) (512y: 11) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 1.730871e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.738738e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.738738e+04 ) sec^-1
+MeanMatrixElemValue = ( 4.060955e+00 +- 2.367411e+00 ) GeV^-4
+TOTAL : 0.952726 sec
+ 2,511,239,490 cycles # 2.626 GHz
+ 5,909,367,922 instructions # 2.35 insn per cycle
+ 0.956999585 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:40460) (512y: 5) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
-Avg ME (C++/C++) = 6.627273e-04
-Avg ME (F77/C++) = 6.6272731568543797E-004
-Relative difference = 2.3668012430631962e-08
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
+Avg ME (C++/C++) = 6.627271e-04
+Avg ME (F77/C++) = 6.6272713594379973E-004
+Relative difference = 5.423620023149683e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.414435e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.419474e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.419474e+04 ) sec^-1
-MeanMatrixElemValue = ( 4.060905e+00 +- 2.367377e+00 ) GeV^-4
-TOTAL : 1.165045 sec
-INFO: No Floating Point Exceptions have been reported
- 2,067,228,248 cycles # 1.769 GHz
- 3,495,098,954 instructions # 1.69 insn per cycle
- 1.168981438 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 5208) (512y: 3) (512z:44858)
+EvtsPerSec[Rmb+ME] (23) = ( 1.413637e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.418865e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.418865e+04 ) sec^-1
+MeanMatrixElemValue = ( 4.060957e+00 +- 2.367412e+00 ) GeV^-4
+TOTAL : 1.165430 sec
+ 1,976,421,523 cycles # 1.691 GHz
+ 3,061,294,918 instructions # 1.55 insn per cycle
+ 1.169355115 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2273) (512y: 8) (512z:39431)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
-Avg ME (C++/C++) = 6.627275e-04
-Avg ME (F77/C++) = 6.6272750237027223E-004
-Relative difference = 3.5765412974815996e-09
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/fcheck_cpp.exe 2 64 2
+Avg ME (C++/C++) = 6.627273e-04
+Avg ME (F77/C++) = 6.6272731685135949E-004
+Relative difference = 2.5427290380992266e-08
OK (relative difference <= 5E-3)
=========================================================================
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt
index 2595c32afa..68c286c55a 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
BACKEND=cpp512y (was cppauto)
OMPFLAGS=
FPTYPE='m'
@@ -7,251 +10,215 @@ HELINL='0'
HRDCOD='0'
HASCURAND=hasCurand
HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
make: Nothing to be done for 'gtestlibs'.
make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2024-10-06_09:33:09
+DATE: 2025-09-24_08:50:37
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/check_cuda.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/check_cuda.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=1] [hardcodePARAM=1]
Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.573938e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.613715e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.617455e+05 ) sec^-1
-MeanMatrixElemValue = ( 4.059597e+00 +- 2.368053e+00 ) GeV^-4
-TOTAL : 0.493227 sec
-INFO: No Floating Point Exceptions have been reported
- 2,049,677,908 cycles # 2.879 GHz
- 3,032,655,926 instructions # 1.48 insn per cycle
- 0.769218706 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/check_cuda.exe -p 64 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+EvtsPerSec[Rmb+ME] (23) = ( 2.266253e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.272891e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.273577e+05 ) sec^-1
+MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4
+TOTAL : 0.573598 sec
+ 2,364,434,436 cycles # 2.823 GHz
+ 3,420,738,261 instructions # 1.45 insn per cycle
+ 0.896007812 seconds time elapsed
.........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/check_cuda.exe -p 2048 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.673337e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.742674e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.745488e+05 ) sec^-1
-MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4
-TOTAL : 1.731870 sec
-INFO: No Floating Point Exceptions have been reported
- 5,773,880,906 cycles # 2.919 GHz
- 12,286,627,464 instructions # 2.13 insn per cycle
- 2.034768323 seconds time elapsed
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/check_cuda.exe -p 64 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 90
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 66
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 72
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100%
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/runTest_cuda.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_f_inl1_hrd1/fcheck_cuda.exe 2 64 2
Avg ME (C++/GPU) = 6.626454e-04
-Avg ME (F77/GPU) = 6.6262669162351490E-004
-Relative difference = 2.8232862531213374e-05
+Avg ME (F77/GPU) = 6.6262666367365719E-004
+Relative difference = 2.827504173853307e-05
OK (relative difference <= 5E-3)
=========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl1_hrd1/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_f_inl1_hrd1/check_hip.exe
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 5.600277e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.601076e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.601076e+02 ) sec^-1
-MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4
-TOTAL : 29.289301 sec
-INFO: No Floating Point Exceptions have been reported
- 86,207,606,672 cycles # 2.943 GHz
- 135,355,986,373 instructions # 1.57 insn per cycle
- 29.293063672 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:15471) (avx2: 0) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 1.866837e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.867760e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.867760e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.059969e+00 +- 2.367800e+00 ) GeV^-4
+TOTAL : 8.790005 sec
+ 25,260,397,225 cycles # 2.873 GHz
+ 73,871,409,822 instructions # 2.92 insn per cycle
+ 8.793865449 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:13598) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 6.627535e-04
-Avg ME (F77/C++) = 6.6275349662128086E-004
-Relative difference = 5.098002770919431e-09
+Avg ME (F77/C++) = 6.6275351788769310E-004
+Relative difference = 2.698996393512682e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 6.848001e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.860244e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.860244e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.059962e+00 +- 2.367792e+00 ) GeV^-4
-TOTAL : 2.399823 sec
-INFO: No Floating Point Exceptions have been reported
- 6,855,955,670 cycles # 2.853 GHz
- 19,471,788,292 instructions # 2.84 insn per cycle
- 2.403723205 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:69876) (avx2: 0) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 7.194093e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.207887e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.207887e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.059961e+00 +- 2.367791e+00 ) GeV^-4
+TOTAL : 2.284649 sec
+ 6,280,560,769 cycles # 2.746 GHz
+ 19,274,584,783 instructions # 3.07 insn per cycle
+ 2.288743756 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:67594) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 6.627486e-04
-Avg ME (F77/C++) = 6.6274862799683282E-004
-Relative difference = 4.2243518621014775e-08
+Avg ME (F77/C++) = 6.6274857166543494E-004
+Relative difference = 4.275311189565278e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.455129e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.460639e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.460639e+04 ) sec^-1
-MeanMatrixElemValue = ( 4.060903e+00 +- 2.367377e+00 ) GeV^-4
-TOTAL : 1.132031 sec
-INFO: No Floating Point Exceptions have been reported
- 3,102,391,764 cycles # 2.733 GHz
- 6,715,014,781 instructions # 2.16 insn per cycle
- 1.135898458 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:47692) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 1.508483e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.514409e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.514409e+04 ) sec^-1
+MeanMatrixElemValue = ( 4.060954e+00 +- 2.367410e+00 ) GeV^-4
+TOTAL : 1.092368 sec
+ 2,897,281,270 cycles # 2.643 GHz
+ 6,533,267,590 instructions # 2.25 insn per cycle
+ 1.098506205 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:44412) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
-Avg ME (C++/C++) = 6.627273e-04
-Avg ME (F77/C++) = 6.6272731623419345E-004
-Relative difference = 2.449603850635964e-08
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/fcheck_cpp.exe 2 64 2
+Avg ME (C++/C++) = 6.627272e-04
+Avg ME (F77/C++) = 6.6272715143205624E-004
+Relative difference = 7.328497113229569e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP=
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.738588e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.746518e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.746518e+04 ) sec^-1
-MeanMatrixElemValue = ( 4.060903e+00 +- 2.367377e+00 ) GeV^-4
-TOTAL : 0.948137 sec
-INFO: No Floating Point Exceptions have been reported
- 2,626,199,962 cycles # 2.761 GHz
- 5,966,019,567 instructions # 2.27 insn per cycle
- 0.951931849 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:41858) (512y: 13) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 1.760309e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.768467e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.768467e+04 ) sec^-1
+MeanMatrixElemValue = ( 4.060954e+00 +- 2.367410e+00 ) GeV^-4
+TOTAL : 0.936509 sec
+ 2,500,675,539 cycles # 2.661 GHz
+ 5,890,044,790 instructions # 2.36 insn per cycle
5,890,044,790 instructions # 2.36 insn per cycle + 0.940573254 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:39822) (512y: 3) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627273e-04 -Avg ME (F77/C++) = 6.6272731623419345E-004 -Relative difference = 2.449603850635964e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627272e-04 +Avg ME (F77/C++) = 6.6272715143205624E-004 +Relative difference = 7.328497113229569e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.414552e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.419616e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.419616e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060905e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 1.164736 sec -INFO: No Floating Point Exceptions have been reported - 2,067,746,434 cycles # 1.771 GHz - 3,487,891,958 instructions # 1.69 insn per cycle - 1.168545250 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4171) (512y: 4) (512z:44494) +EvtsPerSec[Rmb+ME] (23) = ( 1.418341e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.423496e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.423496e+04 ) sec^-1 +MeanMatrixElemValue = ( 
4.060957e+00 +- 2.367412e+00 ) GeV^-4 +TOTAL : 1.161440 sec + 1,967,033,501 cycles # 1.689 GHz + 3,051,207,349 instructions # 1.55 insn per cycle + 1.165567150 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1679) (512y: 8) (512z:39086) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 6.627275e-04 -Avg ME (F77/C++) = 6.6272750247886592E-004 -Relative difference = 3.740400032174438e-09 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.627273e-04 +Avg ME (F77/C++) = 6.6272730894095799E-004 +Relative difference = 1.3491156920820374e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.scaling new file mode 100644 index 0000000000..f18d4f805a --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-09-24_08:19:42 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +4.822375e+03 1 256 +9.453850e+03 2 256 +1.857855e+04 4 256 +3.676283e+04 8 256 +7.275826e+04 16 256 +1.153304e+05 32 256 +1.042562e+05 64 256 +1.034053e+05 128 256 +1.073056e+05 256 256 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +### GPU: scaling test 32 +6.199679e+02 1 32 +1.236165e+03 2 32 +2.466449e+03 4 32 +4.895667e+03 8 32 +9.549456e+03 16 32 +1.878846e+04 32 32 +3.717465e+04 64 32 +7.269473e+04 128 32 +1.164270e+05 256 32 +1.038789e+05 512 32 +1.011077e+05 1024 32 +1.035741e+05 2048 32 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. 
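The GPU ".scaling" blocks above tabulate throughput against the launch grid, one "<events/sec> <nblocks> <nthreads>" triple per line. As a reading aid, here is a minimal Python sketch of how such a block can be parsed to locate the peak throughput; the triple format is taken from the log above, while the function name and the embedded sample are illustrative only.

# Illustrative only: parse "<throughput> <nblocks> <nthreads>" triples as they
# appear in the .scaling logs above and report the fastest grid observed.
# The sample lines are copied from the "### GPU: scaling test 256" block.
SAMPLE = """\
4.822375e+03 1 256
9.453850e+03 2 256
1.857855e+04 4 256
3.676283e+04 8 256
7.275826e+04 16 256
1.153304e+05 32 256
1.042562e+05 64 256
1.034053e+05 128 256
1.073056e+05 256 256
"""

def peak_throughput(text):
    """Return (throughput, nblocks, nthreads) for the fastest grid."""
    best = None
    for line in text.splitlines():
        fields = line.split()
        if len(fields) != 3:
            continue  # skip headers, separators and assertion messages
        cand = (float(fields[0]), int(fields[1]), int(fields[2]))
        if best is None or cand[0] > best[0]:
            best = cand
    return best

tput, nb, nt = peak_throughput(SAMPLE)
print(f"peak {tput:.3e} ev/s at grid {nb} x {nt}")  # 1.153e+05 ev/s at 32 x 256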
+========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.757500e+03 1 256 +1.764699e+03 2 256 +1.759218e+03 4 256 +### CPU: scaling test 32 +1.684513e+03 1 32 +1.737222e+03 2 32 +1.742163e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.397695e+03 1 256 +3.406207e+03 2 256 +3.387011e+03 4 256 +### CPU: scaling test 32 +3.380204e+03 1 32 +3.388767e+03 2 32 +3.400425e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +7.323452e+03 1 256 +7.259704e+03 2 256 +7.409045e+03 4 256 +### CPU: scaling test 32 +6.823037e+03 1 32 +7.085153e+03 2 32 +7.085134e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +8.310815e+03 1 256 +8.162898e+03 2 256 +8.171326e+03 4 256 +### CPU: scaling test 32 +7.804838e+03 1 32 +8.373489e+03 2 32 +8.009578e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +6.602547e+03 1 256 +6.595105e+03 2 256 +6.634294e+03 4 256 +### CPU: scaling test 32 +6.535754e+03 1 32 +6.615078e+03 2 32 +6.578849e+03 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index a3a2deda6e..cb75ca85a9 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:03:48 +DATE: 2025-09-24_07:53:31 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.318725e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.347238e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.349358e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.042537e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.045573e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.045875e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.539210 sec -INFO: No Floating Point Exceptions have been reported - 2,220,963,802 cycles # 2.880 GHz - 3,406,426,816 instructions # 1.53 insn per cycle - 0.832307462 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.657183 sec + 2,703,266,341 cycles # 2.831 GHz + 4,321,160,228 instructions # 1.60 insn per cycle + 1.011294033 seconds time elapsed ......................................................................... 
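Every cmpExe block in these logs closes by cross-checking the average matrix element between the C++/CUDA and Fortran paths against a 5E-3 tolerance. A minimal sketch of that check follows, using the C++/GPU and F77/GPU values printed a few lines below; the normalisation |a - b| / |a| is an assumption (the real cmpExe script is not part of this diff), but it does reproduce the printed relative difference.

# Illustrative only: the tolerance check reported as
#   "OK (relative difference <= 5E-3)"
# in the cmpExe blocks of this log. The two Avg ME values are copied from the
# C++/GPU vs F77/GPU comparison below; |a - b| / |a| is an assumed
# normalisation that reproduces the printed relative difference.
TOLERANCE = 5e-3  # the threshold quoted in the logs

def rel_diff(a, b):
    return abs(a - b) / abs(a)

avg_me_cpp = 6.626675e-04            # Avg ME (C++/GPU)
avg_me_f77 = 6.6266730799887004e-04  # Avg ME (F77/GPU)

rd = rel_diff(avg_me_cpp, avg_me_f77)
print(f"Relative difference = {rd}")         # ~2.8974e-07, as printed below
print("OK" if rd <= TOLERANCE else "ERROR")  # OK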
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.134167e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.164785e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.165985e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.047029 sec -INFO: No Floating Point Exceptions have been reported - 9,687,290,131 cycles # 2.924 GHz - 21,862,744,253 instructions # 2.26 insn per cycle - 3.379254641 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 106 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 98 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 92 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 -Avg ME (F77/GPU) = 6.6266732376103494E-004 -Relative difference = 2.659538381540814e-07 +Avg ME (F77/GPU) = 6.6266730799887004E-004 +Relative difference = 2.8973977137440954e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.868179e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.869079e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.869079e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.757037e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.757859e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.757859e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.786228 sec -INFO: No Floating Point Exceptions have been reported - 25,910,148,307 cycles # 2.949 GHz - 79,427,985,275 instructions # 3.07 insn per cycle - 8.790193498 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4775) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 9.340689 sec + 26,866,907,464 cycles # 2.876 GHz + 81,209,946,669 instructions # 3.02 insn per cycle + 9.344779590 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 6154) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731406016235E-004 Relative difference = 2.8059296349552523e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.521065e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.524381e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.524381e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.414499e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.417525e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.417525e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.666859 sec -INFO: No Floating Point Exceptions have been reported - 12,831,991,791 cycles # 2.749 GHz - 38,825,085,312 instructions # 3.03 insn per cycle - 4.671138327 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:13173) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.809619 sec + 13,803,444,492 cycles # 2.868 GHz + 40,349,187,189 instructions # 2.92 insn per cycle + 4.813848928 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:20856) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730246908442E-004 Relative difference = 2.98084507782618e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.087173e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.104021e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.104021e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.393676e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.407811e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.407811e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.035173 sec -INFO: No Floating Point Exceptions have been reported - 5,594,158,972 cycles # 2.744 GHz - 13,617,938,147 instructions # 2.43 insn per cycle - 2.039272194 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11427) (512y: 0) (512z: 0) +TOTAL : 2.224565 sec + 5,935,126,335 cycles # 2.664 GHz + 14,390,194,787 instructions # 2.42 insn per cycle + 2.228759492 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:14913) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730409276857E-004 Relative difference = 2.956342832710188e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.329915e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.351715e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.351715e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.270248e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.299986e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.299986e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.764965 sec -INFO: No Floating Point Exceptions have been reported - 4,865,961,098 cycles # 2.752 GHz - 12,296,280,016 instructions # 2.53 insn per cycle - 1.768959352 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10331) (512y: 80) (512z: 0) +TOTAL : 1.993562 sec + 5,247,295,678 cycles # 2.641 GHz + 13,270,608,923 instructions # 2.53 insn per cycle + 1.998898042 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:14597) (512y: 45) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730409276857E-004 Relative difference = 2.956342832710188e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.944494e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.956947e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.956947e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.626375e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.637949e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.637949e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.368908 sec -INFO: No Floating Point Exceptions have been reported - 4,175,656,001 cycles # 1.761 GHz - 6,394,856,033 instructions # 1.53 insn per cycle - 2.373043514 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1983) (512y: 92) (512z: 9360) +TOTAL : 2.481586 sec + 4,197,558,675 cycles # 1.689 GHz + 6,762,650,295 instructions # 1.61 insn per cycle + 2.485840885 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1800) (512y: 61) (512z:14464) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730409276857E-004 Relative difference = 2.956342832710188e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.scaling b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.scaling new file mode 100644 index 0000000000..1c869da30d --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-09-24_08:39:30 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +4.331545e+03 1 256 +8.426586e+03 2 256 +1.654696e+04 4 256 +3.201027e+04 8 256 +6.406175e+04 16 256 +1.019414e+05 32 256 +9.772810e+04 64 256 +9.873787e+04 128 256 +1.039265e+05 256 256 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +### GPU: scaling test 32 +5.589995e+02 1 32 +1.112129e+03 2 32 +2.206817e+03 4 32 +4.351940e+03 8 32 +8.488479e+03 16 32 +1.664990e+04 32 32 +3.291208e+04 64 32 +6.376795e+04 128 32 +1.034208e+05 256 32 +9.712943e+04 512 32 +9.650168e+04 1024 32 +9.998255e+04 2048 32 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. 
+========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.766381e+03 1 256 +1.762261e+03 2 256 +1.766249e+03 4 256 +### CPU: scaling test 32 +1.745638e+03 1 32 +1.768353e+03 2 32 +1.766116e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.362029e+03 1 256 +3.385990e+03 2 256 +3.394280e+03 4 256 +### CPU: scaling test 32 +3.309168e+03 1 32 +3.404285e+03 2 32 +3.428103e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +6.836698e+03 1 256 +7.294631e+03 2 256 +7.375209e+03 4 256 +### CPU: scaling test 32 +7.320256e+03 1 32 +7.135601e+03 2 32 +7.115901e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +8.133219e+03 1 256 +8.229915e+03 2 256 +8.293007e+03 4 256 +### CPU: scaling test 32 +7.769034e+03 1 32 +8.276380e+03 2 32 +8.151187e+03 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +6.663933e+03 1 256 +6.694036e+03 2 256 +6.647690e+03 4 256 +### CPU: scaling test 32 +6.567727e+03 1 32 +6.528060e+03 2 32 +6.592152e+03 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.txt new file mode 100644 index 0000000000..821cf30e48 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.txt @@ -0,0 +1,225 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-09-24_08:34:20 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM=1 +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 9.776170e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.799139e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.800905e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.375389 sec + 4,991,020,745 cycles # 2.835 GHz + 7,054,085,160 instructions # 1.41 insn per cycle + 1.816386659 seconds time elapsed +......................................................................... 
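With CUDACPP_RUNTIME_BLASCOLORSUM=1 the CUDA run above reports about 9.8e+04 ev/s, versus about 1.05e+05 ev/s in the default log earlier in this diff. A sketch of how the two committed files could be scraped to quantify that difference: the paths match the files touched by this diff (relative to epochX/cudacpp/tput), while the regex and the choice of the first match (the CUDA run comes first in each log) are assumptions.

# Illustrative only: compare the "EvtsPerSec[MECalcOnly]" throughput between
# the default and the CUDACPP_RUNTIME_BLASCOLORSUM=1 ("blasOn") runs by
# scraping the two committed log files; run from epochX/cudacpp/tput.
import re

PATTERN = re.compile(r"EvtsPerSec\[MECalcOnly\]\s*\(3a\)\s*=\s*\(\s*([0-9.e+-]+)\s*\)")

def first_me_throughput(path):
    """First MECalcOnly figure in the log (the CUDA run, in these files)."""
    with open(path) as f:
        return float(PATTERN.search(f.read()).group(1))

base = first_me_throughput("logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt")
blas = first_me_throughput("logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_blasOn.txt")
# On the numbers shown in this diff: 9.800905e+04 / 1.045875e+05, ratio ~0.94.
print(f"default {base:.3e} ev/s, blasOn {blas:.3e} ev/s, ratio {blas/base:.3f}")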
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 106 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 98 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 92 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/runTest_cuda.exe +[ PASSED ] 4 tests. +DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 6.626675e-04 +Avg ME (F77/GPU) = 6.6266733778757203E-004 +Relative difference = 2.447870582934832e-07 +OK (relative difference <= 5E-3) +========================================================================= +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +EvtsPerSec[Rmb+ME] (23) = ( 1.752679e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.753471e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.753471e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 9.363928 sec + 26,884,271,281 cycles # 2.870 GHz + 81,216,096,709 instructions # 3.02 insn per cycle + 9.368036247 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 6154) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731406016235E-004 +Relative difference = 2.8059296349552523e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.414881e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.417986e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.417986e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.808799 sec + 13,800,804,174 cycles # 2.868 GHz + 40,351,723,491 instructions # 2.92 insn per cycle + 4.812876561 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:20856) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730246908442E-004 +Relative difference = 2.98084507782618e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 7.409620e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.424305e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.424305e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.219807 sec + 5,939,784,204 cycles # 2.672 GHz + 14,390,069,622 instructions # 2.42 insn per cycle + 2.223804960 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:14913) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730409276857E-004 +Relative difference = 2.956342832710188e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 8.210020e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.227626e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.227626e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.003983 sec + 5,246,682,786 cycles # 2.615 GHz + 13,269,929,931 instructions # 2.53 insn per cycle + 2.008017898 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:14597) (512y: 45) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730409276857E-004 +Relative difference = 2.956342832710188e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.600241e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.612011e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.612011e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.491377 sec + 4,210,144,393 cycles # 1.688 GHz + 6,762,278,416 instructions # 1.61 insn per cycle + 2.495546734 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1800) (512y: 61) (512z:14464) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730409276857E-004 +Relative difference = 2.956342832710188e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_noBlas.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_noBlas.txt new file mode 100644 index 0000000000..9c6d8ca930 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0_noBlas.txt @@ -0,0 +1,225 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasNoBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasNoBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + +DATE: 2025-09-24_09:22:53 + +HASBLAS=hasNoBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.039625e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.042053e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.042252e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.660363 sec + 2,619,828,821 cycles # 2.834 GHz + 4,281,538,007 instructions # 1.63 insn per cycle + 0.986350069 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 106 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 98 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 92 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/runTest_cuda.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 6.626675e-04 +Avg ME (F77/GPU) = 6.6266730799887004E-004 +Relative difference = 2.8973977137440954e-07 +OK (relative difference <= 5E-3) +========================================================================= +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +EvtsPerSec[Rmb+ME] (23) = ( 1.756398e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.757230e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.757230e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 9.343945 sec + 26,858,148,803 cycles # 2.874 GHz + 81,209,073,817 instructions # 3.02 insn per cycle + 9.347982497 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 6154) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731406016235E-004 +Relative difference = 2.8059296349552523e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 3.417609e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.420711e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.420711e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.805076 sec + 13,803,294,859 cycles # 2.871 GHz + 40,349,153,644 instructions # 2.92 insn per cycle + 4.809203637 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:20856) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730246908442E-004 +Relative difference = 2.98084507782618e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 7.288092e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.302009e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.302009e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.256619 sec + 5,956,235,867 cycles # 2.636 GHz + 14,389,746,485 instructions # 2.42 insn per cycle + 2.260701525 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:14913) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730409276857E-004 +Relative difference = 2.956342832710188e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 8.402558e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.421025e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.421025e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.958309 sec + 5,241,735,696 cycles # 2.672 GHz + 13,269,342,480 instructions # 2.53 insn per cycle + 1.962606340 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:14597) (512y: 45) (512z: 0) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730409276857E-004 +Relative difference = 2.956342832710188e-07 +OK (relative difference <= 5E-3) +========================================================================= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 6.639185e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.650544e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.650544e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.476926 sec + 4,196,346,010 cycles # 1.692 GHz + 6,762,580,177 instructions # 1.61 insn per cycle + 2.481118279 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1800) (512y: 61) (512z:14464) +------------------------------------------------------------------------- +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest_cpp.exe +[ PASSED ] 4 tests. 
+DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } +DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730409276857E-004 +Relative difference = 2.956342832710188e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt index f598011718..1a772ed057 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-10-06_09:04:22 +DATE: 2025-09-24_07:54:24 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.335025e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.357927e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.359916e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.040067e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.043073e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.043284e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.537353 sec -INFO: No Floating Point Exceptions have been reported - 2,216,980,042 cycles # 2.869 GHz - 3,463,326,813 instructions # 1.56 insn per cycle - 0.836472238 seconds time elapsed -runNcu 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.659649 sec + 2,692,289,857 cycles # 2.834 GHz + 4,308,067,581 instructions # 1.60 insn per cycle + 1.010161880 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.141323e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.172030e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.173253e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.034442 sec -INFO: No Floating Point Exceptions have been reported - 9,665,974,027 cycles # 2.922 GHz - 21,248,987,108 instructions # 2.20 insn per cycle - 3.363171619 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 106 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 98 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 92 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 6.626675e-04 -Avg ME (F77/GPU) = 6.6266732376103494E-004 -Relative difference = 2.659538381540814e-07 +Avg ME (F77/GPU) = 6.6266730799887004E-004 +Relative difference = 2.8973977137440954e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.hip_m_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.862251e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.863154e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.863154e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.754147e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.754970e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.754970e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.813876 sec -INFO: No Floating Point Exceptions have been reported - 25,987,730,158 cycles # 2.948 GHz - 79,453,128,863 instructions # 3.06 insn per cycle - 8.817767368 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 4431) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 9.355875 sec + 26,882,377,088 cycles # 2.872 GHz + 81,144,358,037 instructions # 3.02 insn per cycle + 9.359891416 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 6108) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731406016235E-004 Relative difference = 2.8059296349552523e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.512571e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.515785e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.515785e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.398478e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.401707e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.401707e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.675994 sec -INFO: No Floating Point Exceptions have been reported - 12,822,983,844 cycles # 2.741 GHz - 38,780,874,555 instructions # 3.02 insn per cycle - 4.681038643 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:12935) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.832125 sec + 13,873,927,472 cycles # 2.870 GHz + 40,345,879,506 instructions # 2.91 insn per cycle + 4.836479951 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:20851) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730246908442E-004 Relative difference = 2.98084507782618e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.056370e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.072927e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.072927e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.370397e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.384723e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.384723e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.042531 sec -INFO: No Floating Point Exceptions have been reported - 5,590,175,615 cycles # 2.733 GHz - 13,732,675,080 instructions # 2.46 insn per cycle - 2.046647326 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:11510) (512y: 0) (512z: 0) +TOTAL : 2.231393 sec + 5,956,225,034 cycles # 2.665 GHz + 14,386,792,933 instructions # 2.42 insn per cycle + 2.235577955 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:14889) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730409276857E-004 Relative difference = 2.956342832710188e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.148791e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.170046e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.170046e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.240910e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.259069e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.259069e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.800883 sec -INFO: No Floating Point Exceptions have been reported - 4,955,825,709 cycles # 2.749 GHz - 12,423,990,964 instructions # 2.51 insn per cycle - 1.804980058 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:10322) (512y: 240) (512z: 0) +TOTAL : 1.996485 sec + 5,327,672,851 cycles # 2.665 GHz + 13,266,285,040 instructions # 2.49 insn per cycle + 2.000686077 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:14571) (512y: 45) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730409276857E-004 Relative difference = 2.956342832710188e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check_cpp.exe -p 64 256 1 OMP= +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.851374e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.863307e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.863307e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.622398e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.633506e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.633506e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.400794 sec -INFO: No Floating Point Exceptions have been reported - 4,218,682,996 cycles # 1.755 GHz - 6,496,899,309 instructions # 1.54 insn per cycle - 2.406253121 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1806) (512y: 190) (512z: 9358) +TOTAL : 2.483048 sec + 4,190,553,955 cycles # 1.686 GHz + 6,758,161,428 instructions # 1.61 insn per cycle + 2.487181679 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1773) (512y: 61) (512z:14567) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 123 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 123 channels { 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32, 17 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730409276857E-004 Relative difference = 2.956342832710188e-07 diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.scaling new file mode 100644 index 0000000000..601539335a --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.scaling @@ -0,0 +1,118 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +DATE: 2025-09-24_08:21:57 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +2.619045e+02 1 256 +5.125049e+02 2 256 +9.895442e+02 4 256 +1.869472e+03 8 256 +3.332118e+03 16 256 +4.504461e+03 32 256 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. 
+### GPU: scaling test 32 +3.378133e+01 1 32 +6.741007e+01 2 32 +1.342433e+02 4 32 +2.647447e+02 8 32 +5.193102e+02 16 32 +1.007228e+03 32 32 +1.897741e+03 64 32 +3.318151e+03 128 32 +4.525995e+03 256 32 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +5.167657e+01 1 256 +5.126698e+01 2 256 +5.197015e+01 4 256 +### CPU: scaling test 32 +5.039014e+01 1 32 +5.180681e+01 2 32 +5.118270e+01 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +9.706331e+01 1 256 +9.963066e+01 2 256 +9.765425e+01 4 256 +### CPU: scaling test 32 +9.726834e+01 1 32 +9.960643e+01 2 32 +9.770466e+01 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.101095e+02 1 256 +2.150546e+02 2 256 +2.136656e+02 4 256 +### CPU: scaling test 32 +2.098436e+02 1 32 +2.136973e+02 2 32 +2.141888e+02 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.319430e+02 1 256 +2.339110e+02 2 256 +2.313243e+02 4 256 +### CPU: scaling test 32 +2.306710e+02 1 32 +2.300977e+02 2 32 +2.283924e+02 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.509438e+02 1 256 +2.527744e+02 2 256 +2.493814e+02 4 256 +### CPU: scaling test 32 +2.520183e+02 1 32 +2.508423e+02 2 32 +2.486287e+02 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index 17692fc5fb..53a1d8e8ed 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg BACKEND=cpp512y (was 
cppauto) OMPFLAGS= FPTYPE='m' @@ -7,6 +10,7 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make: Nothing to be done for 'all'. @@ -21,218 +25,181 @@ make: Nothing to be done for 'all'. make: Nothing to be done for 'all'. -DATE: 2024-10-06_09:07:10 +DATE: 2025-09-24_07:59:06 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.059500e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.059934e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.060148e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.622335e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.622504e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.622543e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.453264 sec -INFO: No Floating Point Exceptions have been reported - 8,089,923,192 cycles # 2.904 GHz - 15,932,007,883 instructions # 1.97 insn per cycle - 2.843483231 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 3.969322 sec + 12,105,688,469 cycles # 2.856 GHz + 20,287,798,096 instructions # 1.68 insn per cycle + 4.295173144 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.246459e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.248360e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.248591e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 4.019480 sec -INFO: No Floating Point Exceptions have been reported - 12,563,980,059 cycles # 2.886 GHz - 29,860,686,581 instructions # 2.38 insn per cycle - 4.410635015 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 114 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 98 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 255 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
Avg ME (C++/GPU) = 9.872263e-03
-Avg ME (F77/GPU) = 9.8722595284406640E-003
-Relative difference = 3.5164777671934515e-07
+Avg ME (F77/GPU) = 9.8722595284406675E-003
+Relative difference = 3.5164777636791134e-07
OK (relative difference <= 5E-3)
=========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/check_hip.exe
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP=
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 7.535286e+01 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.535490e+01 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.535490e+01 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.045437e+01 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.045534e+01 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.045534e+01 ) sec^-1
MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 7.007645 sec
-INFO: No Floating Point Exceptions have been reported
- 18,987,096,753 cycles # 2.709 GHz
- 53,904,905,030 instructions # 2.84 insn per cycle
- 7.011475835 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:32424) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 10.447866 sec
+ 27,795,495,084 cycles # 2.660 GHz
+ 55,256,291,469 instructions # 1.99 insn per cycle
+ 10.452114933 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:82767) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 9.872263e-03
Avg ME (F77/C++) = 9.8722595285514851E-003
Relative difference = 3.5163655122073967e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP=
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.576045e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.576133e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.576133e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.621858e+01 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.622184e+01 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.622184e+01 ) sec^-1
MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 3.352060 sec
-INFO: No Floating Point Exceptions have been reported
- 9,813,557,960 cycles # 2.925 GHz
- 27,153,109,398 instructions # 2.77 insn per cycle
- 3.355902855 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:96492) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 5.489538 sec
+ 15,617,128,762 cycles # 2.844 GHz
+ 29,177,592,109 instructions # 1.87 insn per cycle
+ 5.493747260 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:222579) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 9.872263e-03
Avg ME (F77/C++) = 9.8722595285514851E-003
Relative difference = 3.5163655122073967e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP=
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.392533e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.392946e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.392946e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.131961e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.132126e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.132126e+02 ) sec^-1
MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 1.558312 sec
-INFO: No Floating Point Exceptions have been reported
- 4,259,121,658 cycles # 2.728 GHz
- 9,591,809,021 instructions # 2.25 insn per cycle
- 1.562248696 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84961) (512y: 0) (512z: 0)
+TOTAL : 2.480198 sec
+ 6,569,472,532 cycles # 2.645 GHz
+ 10,539,989,164 instructions # 1.60 insn per cycle
+ 2.484515280 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:162491) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 9.872263e-03
-Avg ME (F77/C++) = 9.8722595285411531E-003
-Relative difference = 3.516375977906115e-07
+Avg ME (F77/C++) = 9.8722595285411514E-003
+Relative difference = 3.5163759796632844e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP=
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.852746e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.853256e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.853256e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.299322e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.299676e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.299676e+02 ) sec^-1
MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 1.371089 sec
-INFO: No Floating Point Exceptions have been reported
- 3,728,351,942 cycles # 2.713 GHz
- 8,515,110,933 instructions # 2.28 insn per cycle
- 1.374961080 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:80609) (512y: 90) (512z: 0)
+TOTAL : 2.298964 sec
+ 6,068,566,192 cycles # 2.636 GHz
+ 9,593,892,046 instructions # 1.58 insn per cycle
+ 2.303008730 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:162127) (512y: 45) (512z: 0)
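(Aside: the "OK" verdicts in these logs are a plain relative-difference test between the Fortran and C++/CUDA average matrix elements, against the 5E-3 tolerance quoted in the log line itself. A minimal Python sketch of that check, reusing the avx2 figures above; the exact normalisation used by the cmpExe helper is an assumption here:)

    # Minimal sketch: only the two averages and the tolerance are taken from the log.
    me_cpp = 9.872263e-03            # Avg ME (C++/C++)
    me_f77 = 9.8722595285411514e-03  # Avg ME (F77/C++)
    rel_diff = abs(me_f77 - me_cpp) / abs(me_cpp)  # assumed normalisation
    print(rel_diff)                  # ~3.5e-07, in line with the value reported above
    assert rel_diff <= 5e-3          # the "OK (relative difference <= 5E-3)" criterion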
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 9.872263e-03
-Avg ME (F77/C++) = 9.8722595285411531E-003
-Relative difference = 3.516375977906115e-07
+Avg ME (F77/C++) = 9.8722595285411514E-003
+Relative difference = 3.5163759796632844e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP=
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.432608e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.433087e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.433087e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.480717e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.481000e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.481000e+02 ) sec^-1
MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 1.541076 sec
-INFO: No Floating Point Exceptions have been reported
- 2,702,698,179 cycles # 1.750 GHz
- 4,282,306,811 instructions # 1.58 insn per cycle
- 1.545099546 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2856) (512y: 102) (512z:79114)
+TOTAL : 2.131361 sec
+ 3,566,053,422 cycles # 1.670 GHz
+ 4,796,665,630 instructions # 1.35 insn per cycle
+ 2.135773673 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3115) (512y: 61) (512z:174170)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 9.872263e-03
-Avg ME (F77/C++) = 9.8722595285411531E-003
-Relative difference = 3.516375977906115e-07
+Avg ME (F77/C++) = 9.8722595285411514E-003
+Relative difference = 3.5163759796632844e-07
OK (relative difference <= 5E-3)
=========================================================================
diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt
index 1cf857b709..8dbb1e5f31 100644
--- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
BACKEND=cpp512y (was cppauto)
OMPFLAGS=
FPTYPE='m'
@@ -7,6 +10,7 @@ HELINL='0'
HRDCOD='0'
HASCURAND=hasCurand
HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
make: Nothing to be done for 'gtestlibs'.
make: Nothing to be done for 'all'.
@@ -21,242 +25,189 @@ make: Nothing to be done for 'all'.
make: Nothing to be done for 'all'.
-DATE: 2024-10-06_09:40:38
+DATE: 2025-09-24_08:59:57
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 2 --bridge OMP=
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 2 --bridge OMP=
WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256)
-WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256)
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.054825e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.057209e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.057209e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.621086e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.622365e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.622365e+02 ) sec^-1
MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 2.388056 sec
-INFO: No Floating Point Exceptions have been reported
-INFO: No Floating Point Exceptions have been reported
- 7,931,671,790 cycles # 2.924 GHz
- 17,623,602,431 instructions # 2.22 insn per cycle
- 2.770306640 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 1 --bridge
+TOTAL : 3.951447 sec
+ 12,010,276,722 cycles # 2.850 GHz
+ 19,757,141,721 instructions # 1.65 insn per cycle
+ 4.269327289 seconds time elapsed
+.........................................................................
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 1 --bridge
WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256)
-WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256)
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
-.........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge OMP=
+==PROF== Profiling "diagram1": launch__registers_per_thread 114
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
-WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 9.226146e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.260909e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.260909e+03 ) sec^-1
-MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6
-TOTAL : 3.992337 sec
-INFO: No Floating Point Exceptions have been reported
-INFO: No Floating Point Exceptions have been reported
- 12,629,951,963 cycles # 2.926 GHz
- 29,269,734,483 instructions # 2.32 insn per cycle
- 4.375813430 seconds time elapsed
+==PROF== Profiling "diagram2": launch__registers_per_thread 98
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
+WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 255
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100%
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/runTest_cuda.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
Avg ME (C++/GPU) = 9.872263e-03
-Avg ME (F77/GPU) = 9.8722595284406640E-003
-Relative difference = 3.5164777671934515e-07
+Avg ME (F77/GPU) = 9.8722595284406675E-003
+Relative difference = 3.5164777636791134e-07
OK (relative difference <= 5E-3)
=========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd0/check_hip.exe
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-WARNING! Instantiate host Bridge (nevt=256)
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP=
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 7.889828e+01 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.890068e+01 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.890068e+01 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.188685e+01 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.188792e+01 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.188792e+01 ) sec^-1
MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 6.696425 sec
-INFO: No Floating Point Exceptions have been reported
-INFO: No Floating Point Exceptions have been reported
- 18,936,809,312 cycles # 2.827 GHz
- 53,907,854,112 instructions # 2.85 insn per cycle
- 6.700731218 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:32424) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 10.175422 sec
+ 26,841,477,132 cycles # 2.637 GHz
+ 55,257,041,315 instructions # 2.06 insn per cycle
+ 10.180196300 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:82767) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 9.872263e-03
Avg ME (F77/C++) = 9.8722595285514851E-003
Relative difference = 3.5163655122073967e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-WARNING! Instantiate host Bridge (nevt=256)
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP=
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.586455e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.586548e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.586548e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.844962e+01 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.845309e+01 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.845309e+01 ) sec^-1
MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 3.330534 sec
-INFO: No Floating Point Exceptions have been reported
-INFO: No Floating Point Exceptions have been reported
- 9,805,857,457 cycles # 2.941 GHz
- 27,153,288,385 instructions # 2.77 insn per cycle
- 3.335034911 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:96492) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 5.365862 sec
+ 15,274,413,237 cycles # 2.845 GHz
+ 29,178,711,293 instructions # 1.91 insn per cycle
+ 5.370266744 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:222579) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 9.872263e-03
Avg ME (F77/C++) = 9.8722595285514851E-003
Relative difference = 3.5163655122073967e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-WARNING! Instantiate host Bridge (nevt=256)
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP=
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.386158e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.386550e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.386550e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.094443e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.094602e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.094602e+02 ) sec^-1
MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 1.562759 sec
-INFO: No Floating Point Exceptions have been reported
-INFO: No Floating Point Exceptions have been reported
- 4,284,138,212 cycles # 2.735 GHz
- 9,593,930,746 instructions # 2.24 insn per cycle
- 1.567182963 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84961) (512y: 0) (512z: 0)
+TOTAL : 2.523843 sec
+ 6,646,308,975 cycles # 2.630 GHz
+ 10,541,757,592 instructions # 1.59 insn per cycle
+ 2.528288062 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:162491) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 9.872263e-03
-Avg ME (F77/C++) = 9.8722595285411531E-003
-Relative difference = 3.516375977906115e-07
+Avg ME (F77/C++) = 9.8722595285411514E-003
+Relative difference = 3.5163759796632844e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-WARNING! Instantiate host Bridge (nevt=256)
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP=
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.892770e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.893321e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.893321e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.271706e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.271895e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.271895e+02 ) sec^-1
MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 1.359134 sec
-INFO: No Floating Point Exceptions have been reported
-INFO: No Floating Point Exceptions have been reported
- 3,729,884,424 cycles # 2.737 GHz
- 8,517,697,790 instructions # 2.28 insn per cycle
- 1.363667603 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:80609) (512y: 90) (512z: 0)
+TOTAL : 2.326974 sec
+ 6,158,234,114 cycles # 2.643 GHz
+ 9,593,932,881 instructions # 1.56 insn per cycle
+ 2.331315094 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:162127) (512y: 45) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 9.872263e-03
-Avg ME (F77/C++) = 9.8722595285411531E-003
-Relative difference = 3.516375977906115e-07
+Avg ME (F77/C++) = 9.8722595285411514E-003
+Relative difference = 3.5163759796632844e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-WARNING! Instantiate host Bridge (nevt=256)
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP=
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.423206e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.423718e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.423718e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.463912e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.464191e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.464191e+02 ) sec^-1
MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 1.547281 sec
-INFO: No Floating Point Exceptions have been reported
-INFO: No Floating Point Exceptions have been reported
- 2,698,269,777 cycles # 1.739 GHz
- 4,283,935,635 instructions # 1.59 insn per cycle
- 1.552053679 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2856) (512y: 102) (512z:79114)
+TOTAL : 2.146822 sec
+ 3,574,784,477 cycles # 1.662 GHz
+ 4,798,875,146 instructions # 1.34 insn per cycle
+ 2.151446066 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3115) (512y: 61) (512z:174170)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 9.872263e-03
-Avg ME (F77/C++) = 9.8722595285411531E-003
-Relative difference = 3.516375977906115e-07
+Avg ME (F77/C++) = 9.8722595285411514E-003
+Relative difference = 3.5163759796632844e-07
OK (relative difference <= 5E-3)
=========================================================================
diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt
index bc67f5cacf..aa58e44f42 100644
--- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
BACKEND=cpp512y (was cppauto)
OMPFLAGS=
FPTYPE='m'
@@ -7,6 +10,7 @@ HELINL='0'
HRDCOD='0'
HASCURAND=hasCurand
HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
make: Nothing to be done for 'gtestlibs'.
make: Nothing to be done for 'all'.
@@ -21,218 +25,181 @@ make: Nothing to be done for 'all'.
make: Nothing to be done for 'all'.
-DATE: 2024-10-06_09:08:38
+DATE: 2025-09-24_08:02:03
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP=
+Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.058591e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.058974e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.059077e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.623198e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.623366e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.623412e+02 ) sec^-1
MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 2.451568 sec
-INFO: No Floating Point Exceptions have been reported
- 8,115,809,761 cycles # 2.919 GHz
- 18,292,352,744 instructions # 2.25 insn per cycle
- 2.835762935 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 3.971459 sec
+ 12,134,959,910 cycles # 2.856 GHz
+ 20,297,893,372 instructions # 1.67 insn per cycle
+ 4.304261275 seconds time elapsed
.........................................................................
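(Aside: the "==PROF==" lines are Nsight Compute console output; runNcu is presumably a harness wrapper around the ncu CLI. A hypothetical stand-alone invocation collecting the same two metrics quoted in these logs; the wrapper's actual flags are not shown here:)

    ncu --metrics launch__registers_per_thread,sm__sass_average_branch_targets_threads_uniform.pct ./build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 1

Note that where the 2024 logs profiled a single "sigmaKin" kernel, the 2025 logs profile separate "diagram1"/"diagram2" kernels plus a "color_sum_kernel": the per-diagram kernels report far fewer registers per thread (114-116 and 98) than the monolithic kernel did (255), while the colour sum still reports 255.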
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 9.228388e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.230439e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.230672e+03 ) sec^-1
-MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6
-TOTAL : 4.019291 sec
-INFO: No Floating Point Exceptions have been reported
- 12,725,284,497 cycles # 2.922 GHz
- 29,505,773,730 instructions # 2.32 insn per cycle
- 4.410068917 seconds time elapsed
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 116
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 98
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 255
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100%
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/runTest_cuda.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2
Avg ME (C++/GPU) = 9.872263e-03
-Avg ME (F77/GPU) = 9.8722595284406640E-003
-Relative difference = 3.5164777671934515e-07
+Avg ME (F77/GPU) = 9.8722595284406675E-003
+Relative difference = 3.5164777636791134e-07
OK (relative difference <= 5E-3)
=========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd1/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_d_inl0_hrd1/check_hip.exe
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP=
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 7.905987e+01 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.906203e+01 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.906203e+01 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.235365e+01 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.235462e+01 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.235462e+01 ) sec^-1
MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 6.685741 sec
-INFO: No Floating Point Exceptions have been reported
- 18,901,791,742 cycles # 2.826 GHz
- 53,936,334,501 instructions # 2.85 insn per cycle
- 6.689520607 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:32022) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 10.087603 sec
+ 26,870,624,294 cycles # 2.663 GHz
+ 55,196,790,786 instructions # 2.05 insn per cycle
+ 10.091749369 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:82721) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 9.872263e-03
Avg ME (F77/C++) = 9.8722595285514851E-003
Relative difference = 3.5163655122073967e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP=
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.555988e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.556078e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.556078e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.605466e+01 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.605795e+01 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.605795e+01 ) sec^-1
MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 3.395185 sec
-INFO: No Floating Point Exceptions have been reported
- 9,954,308,036 cycles # 2.929 GHz
- 27,130,330,125 instructions # 2.73 insn per cycle
- 3.399134205 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4:96368) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 5.498721 sec
+ 15,622,067,797 cycles # 2.840 GHz
+ 29,174,932,066 instructions # 1.87 insn per cycle
+ 5.502837399 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:222567) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 9.872263e-03
Avg ME (F77/C++) = 9.8722595285514851E-003
Relative difference = 3.5163655122073967e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP=
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.364235e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.364649e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.364649e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.082981e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.083174e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.083174e+02 ) sec^-1
MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 1.571658 sec
-INFO: No Floating Point Exceptions have been reported
- 4,284,967,782 cycles # 2.721 GHz
- 9,585,542,173 instructions # 2.24 insn per cycle
- 1.575575323 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84968) (512y: 0) (512z: 0)
+TOTAL : 2.537103 sec
+ 6,675,570,158 cycles # 2.628 GHz
+ 10,538,789,893 instructions # 1.58 insn per cycle
+ 2.541297219 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:162326) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 9.872263e-03
-Avg ME (F77/C++) = 9.8722595285411531E-003
-Relative difference = 3.516375977906115e-07
+Avg ME (F77/C++) = 9.8722595285411514E-003
+Relative difference = 3.5163759796632844e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP=
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.898680e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.899276e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.899276e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.307808e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.308027e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.308027e+02 ) sec^-1
MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 1.358371 sec
-INFO: No Floating Point Exceptions have been reported
- 3,717,774,700 cycles # 2.731 GHz
- 8,507,853,536 instructions # 2.29 insn per cycle
- 1.362296235 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:80632) (512y: 240) (512z: 0)
+TOTAL : 2.290450 sec
+ 6,052,806,247 cycles # 2.639 GHz
+ 9,590,780,662 instructions # 1.58 insn per cycle
+ 2.294513704 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:161957) (512y: 45) (512z: 0)
------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722595285411531E-003 -Relative difference = 3.516375977906115e-07 +Avg ME (F77/C++) = 9.8722595285411514E-003 +Relative difference = 3.5163759796632844e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.399522e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.400013e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.400013e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.481025e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.481302e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.481302e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.555521 sec -INFO: No Floating Point Exceptions have been reported - 2,693,302,897 cycles # 1.729 GHz - 4,281,674,096 instructions # 1.59 insn per cycle - 1.559394081 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2693) (512y: 184) (512z:79098) +TOTAL : 2.131058 sec + 3,545,703,154 cycles # 1.661 GHz + 4,795,627,357 instructions # 1.35 insn per cycle + 2.135278118 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3051) (512y: 
61) (512z:174170) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722595285411531E-003 -Relative difference = 3.516375977906115e-07 +Avg ME (F77/C++) = 9.8722595285411514E-003 +Relative difference = 3.5163759796632844e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.scaling new file mode 100644 index 0000000000..eadffd584b --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.scaling @@ -0,0 +1,118 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. 
+ +DATE: 2025-09-24_08:28:23 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +2.664820e+02 1 256 +5.275400e+02 2 256 +1.038071e+03 4 256 +2.026475e+03 8 256 +3.877564e+03 16 256 +6.962465e+03 32 256 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +### GPU: scaling test 32 +3.354429e+01 1 32 +6.708439e+01 2 32 +1.339141e+02 4 32 +2.671031e+02 8 32 +5.289090e+02 16 32 +1.039818e+03 32 32 +2.032513e+03 64 32 +3.851816e+03 128 32 +6.770722e+03 256 32 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +5.791936e+01 1 256 +5.716628e+01 2 256 +5.697536e+01 4 256 +### CPU: scaling test 32 +5.805917e+01 1 32 +5.723672e+01 2 32 +5.732940e+01 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.191686e+02 1 256 +2.173201e+02 2 256 +2.184448e+02 4 256 +### CPU: scaling test 32 +2.223714e+02 1 32 +2.214415e+02 2 32 +2.186868e+02 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +4.337721e+02 1 256 +4.299182e+02 2 256 +4.290540e+02 4 256 +### CPU: scaling test 32 +4.317507e+02 1 32 +4.343360e+02 2 32 +4.248740e+02 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +4.636499e+02 1 256 +4.680596e+02 2 256 +4.719526e+02 4 256 +### CPU: scaling test 32 +4.621078e+02 1 32 +4.605198e+02 2 32 +4.721265e+02 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +5.018132e+02 1 256 +5.093203e+02 2 256 
+5.090828e+02 4 256 +### CPU: scaling test 32 +5.115033e+02 1 32 +5.033095e+02 2 32 +5.124353e+02 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index e477be7c61..03a696ae52 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,6 +10,7 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make: Nothing to be done for 'all'. @@ -21,218 +25,181 @@ make: Nothing to be done for 'all'. make: Nothing to be done for 'all'. -DATE: 2024-10-06_09:13:00 +DATE: 2025-09-24_08:10:49 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.207250e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.207995e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.208247e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.665002e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.665156e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.665198e+02 ) sec^-1 MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 -TOTAL : 1.762040 sec -INFO: No Floating Point Exceptions have been reported - 5,937,636,063 cycles # 2.916 GHz - 12,374,083,331 instructions # 2.08 insn per cycle - 2.091996677 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 3.987731 sec + 12,196,113,186 cycles # 2.857 GHz + 20,128,416,611 instructions # 1.65 insn per cycle + 4.324838531 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.149439e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.150073e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.150179e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.856829e-04 +- 8.333437e-05 ) GeV^-6 -TOTAL : 2.066345 sec -INFO: No Floating Point Exceptions have been reported - 6,803,203,568 cycles # 2.918 GHz - 14,656,096,283 instructions # 2.15 insn per cycle - 2.390130877 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 76 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 66 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 255 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 9.849635e-03 -Avg ME (F77/GPU) = 9.8712451931260159E-003 -Relative difference = 0.0021940095370046923 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 9.849634e-03 +Avg ME (F77/GPU) = 9.8712442618092543E-003 +Relative difference = 0.00219401673293189 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 8.548424e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.548685e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.548685e+01 ) sec^-1 -MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 6.179003 sec -INFO: No Floating Point Exceptions have been reported - 18,168,840,210 cycles # 2.939 GHz - 53,911,011,794 instructions # 2.97 insn per cycle - 6.183081263 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:20141) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.736079e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.736198e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.736198e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825039e-06 ) GeV^-6 +TOTAL : 9.206588 sec + 26,269,096,419 cycles # 2.853 GHz + 55,380,240,679 instructions # 2.11 insn per cycle + 9.210704438 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:64102) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087551509E-003 -Relative difference = 2.119780432912131e-08 +Avg ME (F77/C++) = 9.8479612086552360E-003 +Relative difference = 2.118765858747532e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.395658e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.396067e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.396067e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.556967 sec -INFO: No Floating Point Exceptions have been reported - 4,597,936,627 cycles # 2.947 GHz - 13,808,300,252 instructions # 3.00 insn per cycle - 1.560798930 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:97016) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.147645e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.147829e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.147829e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825036e-06 ) GeV^-6 +TOTAL : 2.461099 sec + 7,001,514,428 cycles # 2.841 GHz + 14,820,625,172 instructions # 2.12 insn per cycle + 2.465392683 seconds time elapsed 
+=Symbols in CPPProcess_cpp.o= (~sse4:223175) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.847955e-03 -Avg ME (F77/C++) = 9.8479546896367235E-003 -Relative difference = 3.1515505172940424e-08 +Avg ME (F77/C++) = 9.8479546895692046E-003 +Relative difference = 3.152236130775542e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.833708e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.835461e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.835461e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.292815e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.293506e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.293506e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.774770 sec -INFO: No Floating Point Exceptions have been reported - 2,127,367,774 cycles # 2.734 GHz - 4,836,875,487 instructions # 2.27 insn per cycle - 0.778636721 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:85494) (512y: 0) (512z: 0) +TOTAL : 1.232942 sec + 3,262,019,930 cycles # 2.638 GHz + 5,311,976,962 instructions # 1.63 insn per cycle + 1.236967483 seconds time 
elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:163023) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161091246E-003 -Relative difference = 1.8588029579156084e-08 +Avg ME (F77/C++) = 9.8929728160406412E-003 +Relative difference = 1.859495200918643e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.729108e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.731291e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.731291e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.647089e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.648057e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.648057e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.685221 sec -INFO: No Floating Point Exceptions have been reported - 1,884,703,570 cycles # 2.737 GHz - 4,291,263,737 instructions # 2.28 insn per cycle - 0.689203509 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:81183) (512y: 45) (512z: 0) +TOTAL : 1.139064 sec + 3,024,353,048 cycles # 2.648 GHz + 4,830,348,422 instructions # 1.60 insn per cycle + 1.143303421 
seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:162700) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161091246E-003 -Relative difference = 1.8588029579156084e-08 +Avg ME (F77/C++) = 9.8929728160406412E-003 +Relative difference = 1.859495200918643e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.870048e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.872187e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.872187e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.771101 sec -INFO: No Floating Point Exceptions have been reported - 1,354,646,750 cycles # 1.748 GHz - 2,162,779,823 instructions # 1.60 insn per cycle - 0.775438585 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3481) (512y: 45) (512z:79330) +EvtsPerSec[Rmb+ME] (23) = ( 4.940916e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.942061e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.942061e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826770e-06 ) GeV^-6 +TOTAL : 1.072015 sec + 1,787,397,501 cycles 
# 1.662 GHz + 2,421,351,557 instructions # 1.35 insn per cycle + 1.076381120 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3752) (512y: 5) (512z:174186) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892981e-03 -Avg ME (F77/C++) = 9.8929811982676284E-003 -Relative difference = 2.004124217057488e-08 +Avg ME (F77/C++) = 9.8929811982706624E-003 +Relative difference = 2.0041548855825715e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt index 09d523a948..e897ea9b26 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,6 +10,7 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make: Nothing to be done for 'all'. @@ -21,242 +25,189 @@ make: Nothing to be done for 'all'. make: Nothing to be done for 'all'. 
-DATE: 2024-10-06_09:42:06 +DATE: 2025-09-24_09:02:53 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 2 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.291704e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.296560e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.296560e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187093e-05 +- 9.825663e-06 ) GeV^-6 -TOTAL : 1.680127 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 5,675,085,647 cycles # 2.923 GHz - 11,509,492,893 instructions # 2.03 insn per cycle - 1.997903242 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 1 --bridge +EvtsPerSec[Rmb+ME] (23) = ( 2.665296e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.665912e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.665912e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187094e-05 +- 9.825665e-06 ) GeV^-6 +TOTAL : 3.960752 sec + 12,066,914,409 cycles # 2.855 GHz + 20,727,627,711 instructions # 1.72 insn per cycle + 4.281530016 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge OMP= +==PROF== Profiling "diagram1": launch__registers_per_thread 76 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.120892e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.132073e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.132073e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.856440e-04 +- 8.331091e-05 ) GeV^-6 -TOTAL : 2.037220 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 6,712,310,342 cycles # 2.924 GHz - 13,777,135,261 instructions # 2.05 insn per cycle - 2.354099539 seconds time elapsed +==PROF== Profiling "diagram2": launch__registers_per_thread 66 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 255 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 9.849635e-03 -Avg ME (F77/GPU) = 9.8712451931260159E-003 -Relative difference = 0.0021940095370046923 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 9.849634e-03 +Avg ME (F77/GPU) = 9.8712442618092543E-003 +Relative difference = 0.00219401673293189 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 8.574125e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.574397e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.574397e+01 ) sec^-1 -MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 6.159980 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 18,121,008,944 cycles # 2.940 GHz - 53,916,989,652 instructions # 2.98 insn per cycle - 6.164330765 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:20141) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.618409e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.618525e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.618525e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825039e-06 ) GeV^-6 +TOTAL : 9.400011 sec + 26,716,526,883 cycles # 2.842 GHz + 55,382,608,020 instructions # 2.07 insn per cycle + 9.404255165 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:64102) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087551509E-003 -Relative difference = 2.119780432912131e-08 +Avg ME (F77/C++) = 9.8479612086552360E-003 +Relative difference = 2.118765858747532e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.371688e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.372089e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.372089e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.568419 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 4,624,959,734 cycles # 2.942 GHz - 13,809,578,618 instructions # 2.99 insn per cycle - 1.572870258 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:97016) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.157410e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.157582e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.157582e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825036e-06 ) GeV^-6 +TOTAL : 2.450151 sec + 6,971,837,515 cycles # 2.841 GHz + 14,822,399,094 instructions # 2.13 insn per cycle + 2.454549843 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:223175) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: 
FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.847955e-03 -Avg ME (F77/C++) = 9.8479546896367235E-003 -Relative difference = 3.1515505172940424e-08 +Avg ME (F77/C++) = 9.8479546895692046E-003 +Relative difference = 3.152236130775542e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.853120e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.854860e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.854860e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.206747e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.207489e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.207489e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.772760 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 2,127,660,282 cycles # 2.740 GHz - 4,839,303,130 instructions # 2.27 insn per cycle - 0.777110537 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:85494) (512y: 0) (512z: 0) +TOTAL : 1.259294 sec + 3,331,791,928 cycles # 2.638 GHz + 5,313,983,443 instructions # 1.59 insn per cycle + 1.263735048 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:163023) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161091246E-003 -Relative difference = 1.8588029579156084e-08 +Avg ME (F77/C++) = 9.8929728160406412E-003 +Relative difference = 1.859495200918643e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.707103e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.709607e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.709607e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.558259e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.559041e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.559041e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.687680 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 1,889,891,398 cycles # 2.733 GHz - 4,293,271,631 instructions # 2.27 insn per cycle - 0.692031150 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:81183) (512y: 45) (512z: 0) +TOTAL : 1.161340 sec + 3,067,192,699 cycles # 2.633 GHz + 4,832,392,668 instructions # 1.58 insn per cycle + 1.165831525 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:162700) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161091246E-003 -Relative difference = 1.8588029579156084e-08 +Avg ME (F77/C++) = 9.8929728160406412E-003 +Relative difference = 1.859495200918643e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe -p 1 256 2 --bridge OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.738421e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.740575e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.740575e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.785848 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 1,358,106,687 cycles # 1.720 GHz - 2,165,384,980 instructions # 1.59 insn per cycle - 0.790493646 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3481) (512y: 45) (512z:79330) +EvtsPerSec[Rmb+ME] (23) = ( 4.955195e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.956327e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.956327e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826770e-06 ) GeV^-6 +TOTAL : 1.069338 sec + 1,788,653,322 cycles # 1.667 GHz + 2,423,301,522 instructions # 1.35 insn per cycle + 1.073851131 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3752) (512y: 5) (512z:174186) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892981e-03 -Avg ME (F77/C++) = 9.8929811982676284E-003 -Relative difference = 2.004124217057488e-08 +Avg ME (F77/C++) = 9.8929811982706624E-003 +Relative difference = 2.0041548855825715e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt index 33a64296d4..0bbb0770e4 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,6 +10,7 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make: Nothing to be done for 'all'. @@ -21,218 +25,181 @@ make: Nothing to be done for 'all'. make: Nothing to be done for 'all'. 
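[Editor's note on the new preamble fields] The updated preambles record a HASBLAS=hasBlas build flag, and the refreshed log headers just below add two runtime switches, CUDACPP_RUNTIME_BLASCOLORSUM and CUDACPP_RUNTIME_CUBLASTF32TENSOR, both empty in these runs. A minimal sketch of how such empty-by-default environment toggles are typically read at startup; the helper name and the "non-empty, non-zero enables it" semantics are assumptions, not the plugin's actual code:

```cpp
#include <cstdlib>
#include <string>

// Hypothetical helper (not the plugin's actual code): an unset or empty
// variable keeps the default; any non-empty, non-"0" value enables the feature.
static bool envFlagEnabled( const char* name )
{
  const char* v = std::getenv( name ); // nullptr if the variable is unset
  return v != nullptr && *v != '\0' && std::string( v ) != "0";
}

// Usage sketch:
//   bool useBlasColorSum = envFlagEnabled( "CUDACPP_RUNTIME_BLASCOLORSUM" );
```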
-DATE: 2024-10-06_09:14:03 +DATE: 2025-09-24_08:13:17 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.196404e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.197145e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.197475e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.664150e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.664292e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.664334e+02 ) sec^-1 MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 -TOTAL : 1.762965 sec -INFO: No Floating Point Exceptions have been reported - 5,951,937,078 cycles # 2.924 GHz - 11,910,577,864 instructions # 2.00 insn per cycle - 2.092003198 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 3.986879 sec + 12,144,476,957 cycles # 2.854 GHz + 20,149,489,558 instructions # 1.66 insn per cycle + 4.310343323 seconds time elapsed ......................................................................... 
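[Editor's note on reading the headline figures] The throughput and perf-stat numbers above are simple ratios: the logged EvtsPerSec counters divide the event count by the ME-only timers (so they exceed any wall-clock estimate, since TOTAL includes CUDA initialization), and the perf counters give instructions-per-cycle directly. A self-contained sketch reproducing the arithmetic with numbers copied from the CUDA hrd1 run above, purely illustrative and not the harness's own code:

```cpp
#include <cstdio>

int main()
{
  // From the log: -p 1 256 2 => 1 block x 256 threads x 2 iterations = 512 events
  const double nevt = 1. * 256. * 2.;
  const double totalSec = 3.986879;      // TOTAL time (includes CUDA setup)
  const double cycles = 12144476957.;    // perf: cycles
  const double instructions = 20149489558.; // perf: instructions
  printf( "events/sec (wall) = %.3e\n", nevt / totalSec ); // lower bound only:
  // the logged 2.66e+02 uses the ME-only timer, not the full wall clock
  printf( "insn per cycle    = %.2f\n", instructions / cycles ); // ~1.66 as logged
  return 0;
}
```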
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.150073e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.150749e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.150840e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.856829e-04 +- 8.333437e-05 ) GeV^-6 -TOTAL : 2.074025 sec -INFO: No Floating Point Exceptions have been reported - 6,857,187,374 cycles # 2.930 GHz - 14,190,515,168 instructions # 2.07 insn per cycle - 2.396988151 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 76 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 66 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 255 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 -Avg ME (C++/GPU) = 9.849635e-03 -Avg ME (F77/GPU) = 9.8712451931260107E-003 -Relative difference = 0.0021940095370041636 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 +Avg ME (C++/GPU) = 9.849634e-03 +Avg ME (F77/GPU) = 9.8712442618092543E-003 +Relative difference = 0.00219401673293189 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_f_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 8.597266e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.597536e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.597536e+01 ) sec^-1 -MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 6.144692 sec -INFO: No Floating Point Exceptions have been reported - 18,086,727,911 cycles # 2.942 GHz - 53,895,836,183 instructions # 2.98 insn per cycle - 6.148512893 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:20141) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.631133e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.631247e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.631247e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825039e-06 ) GeV^-6 +TOTAL : 9.378655 sec + 26,695,300,181 cycles # 2.846 GHz + 55,334,943,323 instructions # 2.07 insn per cycle + 9.382767082 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:63872) (avx2: 0) (512y: 0) (512z: 0) 
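[Editor's note on the symbol tallies] The "=Symbols in CPPProcess_cpp.o=" lines classify the generated machine code by SIMD flavour; the much larger counts in the 2025 logs (e.g. ~sse4:63872 vs ~sse4:20141 before) reflect the new split-kernel code generation. A rough sketch of how such a tally can be produced by scanning a disassembly; this register-class heuristic and pipeline are assumptions for illustration, not the repository's actual counting script:

```cpp
#include <cstdio>
#include <cstring>

// Crude heuristic: xmm-only => ~sse4, ymm => avx2, zmm => 512z.
// (The real tally also separates AVX512VL-on-ymm instructions as "512y".)
int main()
{
  FILE* p = popen( "objdump -d CPPProcess_cpp.o", "r" ); // POSIX popen
  if( !p ) return 1;
  long sse4 = 0, avx2 = 0, z512 = 0;
  char line[512];
  while( fgets( line, sizeof line, p ) )
  {
    if( strstr( line, "%zmm" ) ) z512++;
    else if( strstr( line, "%ymm" ) ) avx2++;
    else if( strstr( line, "%xmm" ) ) sse4++;
  }
  pclose( p );
  printf( "(~sse4:%ld) (avx2:%ld) (512z:%ld)\n", sse4, avx2, z512 );
  return 0;
}
```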
------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087572898E-003 -Relative difference = 2.1198021522715588e-08 +Avg ME (F77/C++) = 9.8479612086536921E-003 +Relative difference = 2.118750181351027e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.388656e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.389069e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.389069e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.560721 sec -INFO: No Floating Point Exceptions have been reported - 4,571,260,015 cycles # 2.924 GHz - 13,800,942,063 instructions # 3.02 insn per cycle - 1.564719207 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:96651) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.145880e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.146047e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.146047e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825036e-06 ) GeV^-6 +TOTAL : 2.462822 sec + 6,999,876,095 cycles # 2.839 GHz + 14,819,093,247 instructions # 2.12 insn per cycle + 2.466962797 seconds time elapsed 
+=Symbols in CPPProcess_cpp.o= (~sse4:223171) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.847955e-03 -Avg ME (F77/C++) = 9.8479546896065809E-003 -Relative difference = 3.151856596628469e-08 +Avg ME (F77/C++) = 9.8479546895692046E-003 +Relative difference = 3.152236130775542e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.702410e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.704003e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.704003e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.229766e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.230442e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.230442e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.789887 sec -INFO: No Floating Point Exceptions have been reported - 2,151,012,254 cycles # 2.712 GHz - 4,840,938,021 instructions # 2.25 insn per cycle - 0.793816354 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:85884) (512y: 0) (512z: 0) +TOTAL : 1.251478 sec + 3,312,244,464 cycles # 2.640 GHz + 5,310,879,695 instructions # 1.60 insn per cycle + 1.255633722 seconds time 
elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:162949) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161091923E-003 -Relative difference = 1.85880227405429e-08 +Avg ME (F77/C++) = 9.8929728160406412E-003 +Relative difference = 1.859495200918643e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.657646e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.659745e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.659745e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.640122e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.640921e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.640921e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.691425 sec -INFO: No Floating Point Exceptions have been reported - 1,894,431,690 cycles # 2.727 GHz - 4,294,884,277 instructions # 2.27 insn per cycle - 0.695223368 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:81725) (512y: 25) (512z: 0) +TOTAL : 1.140652 sec + 3,025,617,561 cycles # 2.645 GHz + 4,829,183,559 instructions # 1.60 insn per cycle + 1.144897027 
seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:162624) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161091923E-003 -Relative difference = 1.85880227405429e-08 +Avg ME (F77/C++) = 9.8929728160406412E-003 +Relative difference = 1.859495200918643e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.673392e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.675470e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.675470e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.793743 sec -INFO: No Floating Point Exceptions have been reported - 1,366,656,580 cycles # 1.715 GHz - 2,169,713,805 instructions # 1.59 insn per cycle - 0.797745119 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4092) (512y: 32) (512z:79551) +EvtsPerSec[Rmb+ME] (23) = ( 4.947037e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.948324e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.948324e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826770e-06 ) GeV^-6 +TOTAL : 1.069982 sec + 1,785,300,398 cycles # 
1.663 GHz + 2,420,313,481 instructions # 1.36 insn per cycle + 1.074364371 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3680) (512y: 5) (512z:174186) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.892981e-03 -Avg ME (F77/C++) = 9.8929811982957326E-003 -Relative difference = 2.0044082998332894e-08 +Avg ME (F77/C++) = 9.8929811982706624E-003 +Relative difference = 2.0041548855825715e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.scaling new file mode 100644 index 0000000000..3d5df91db0 --- /dev/null +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.scaling @@ -0,0 +1,118 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. + +make: Nothing to be done for 'all'. 
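[Editor's note on the scaling-test aborts] The new .scaling log that follows shows repeated "Assertion `code == gpuSuccess' failed" aborts once the GPU grid grows past the V100's memory. A minimal sketch of the kind of error-check wrapper that produces such an abort, with gpuSuccess aliased to the CUDA status code; the names are inferred from the log message, not taken from the plugin sources:

```cpp
#include <cassert>
#include <cstdio>
#include <cuda_runtime.h>

#define gpuSuccess cudaSuccess

// Wrap every runtime call: print the error string, then abort via assert,
// which yields exactly the "Assertion `code == gpuSuccess' failed" message.
inline void checkGpu( cudaError_t code, const char* file, int line )
{
  if( code != gpuSuccess )
    fprintf( stderr, "%s:%d %s\n", file, line, cudaGetErrorString( code ) );
  assert( code == gpuSuccess );
}
#define CHECK_GPU( call ) checkGpu( ( call ), __FILE__, __LINE__ )

// Usage sketch: CHECK_GPU( cudaMalloc( &buf, bytes ) );
```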
+ +DATE: 2025-09-24_08:25:12 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +2.624115e+02 1 256 +5.156633e+02 2 256 +1.004282e+03 4 256 +1.930573e+03 8 256 +3.544034e+03 16 256 +4.917510e+03 32 256 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +### GPU: scaling test 32 +3.375082e+01 1 32 +6.739138e+01 2 32 +1.340518e+02 4 32 +2.656472e+02 8 32 +5.223225e+02 16 32 +1.024733e+03 32 32 +1.958721e+03 64 32 +3.529263e+03 128 32 +4.944572e+03 256 32 +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +check_cuda.exe: Assertion `code == gpuSuccess' failed. +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +5.195066e+01 1 256 +5.199377e+01 2 256 +5.165247e+01 4 256 +### CPU: scaling test 32 +5.192832e+01 1 32 +5.136516e+01 2 32 +5.204074e+01 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.023888e+02 1 256 +1.025918e+02 2 256 +1.018366e+02 4 256 +### CPU: scaling test 32 +1.028217e+02 1 32 +1.009493e+02 2 32 +1.030107e+02 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.229650e+02 1 256 +2.211888e+02 2 256 +2.176932e+02 4 256 +### CPU: scaling test 32 +2.234860e+02 1 32 +2.229706e+02 2 32 +2.240995e+02 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.407797e+02 1 256 +2.442304e+02 2 256 +2.440472e+02 4 256 +### CPU: scaling test 32 +2.394156e+02 1 32 +2.430410e+02 2 32 +2.406664e+02 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.619663e+02 1 256 +2.606580e+02 2 256 
+2.595136e+02 4 256 +### CPU: scaling test 32 +2.631129e+02 1 32 +2.603531e+02 2 32 +2.606993e+02 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index a2a6307c02..2f143fdbaa 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,6 +10,7 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make: Nothing to be done for 'all'. @@ -21,215 +25,178 @@ make: Nothing to be done for 'all'. make: Nothing to be done for 'all'. -DATE: 2024-10-06_09:10:06 +DATE: 2025-09-24_08:05:00 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.665934e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.666477e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.666666e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 2.202831 sec -INFO: No Floating Point Exceptions have been reported - 7,373,914,452 cycles # 2.913 GHz - 16,351,055,335 instructions # 2.22 insn per cycle - 2.588547453 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 2.634681e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.634859e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.634901e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 3.911742 sec + 11,945,203,786 cycles # 2.853 GHz + 19,561,426,283 instructions # 1.64 insn per cycle + 4.244700831 seconds time elapsed ......................................................................... 
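[Editor's note on the scaling tables] In the GPU scaling table above, throughput roughly doubles with each doubling of the grid until the V100 starts to saturate (and, beyond 32 blocks, aborts on memory), while the single-threaded CPU columns stay flat as expected. A quick sketch computing the step-to-step speedups from the logged "scaling test 256" numbers, again purely illustrative:

```cpp
#include <cstdio>
#include <utility>
#include <vector>

int main()
{
  // (evts/sec, nblocks) pairs from "### GPU: scaling test 256" above.
  const std::vector<std::pair<double, int>> gpu = {
    { 2.624115e+02, 1 }, { 5.156633e+02, 2 }, { 1.004282e+03, 4 },
    { 1.930573e+03, 8 }, { 3.544034e+03, 16 }, { 4.917510e+03, 32 } };
  for( size_t i = 1; i < gpu.size(); i++ )
    printf( "%2d -> %2d blocks: speedup %.2fx\n",
            gpu[i - 1].second, gpu[i].second, gpu[i].first / gpu[i - 1].first );
  return 0; // the last step drops to ~1.39x: the GPU is saturating
}
```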
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.110897e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.111188e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.111222e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.442430 sec -INFO: No Floating Point Exceptions have been reported - 11,070,694,428 cycles # 2.924 GHz - 25,628,142,124 instructions # 2.31 insn per cycle - 3.841933628 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 114 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 98 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 255 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 9.872263e-03 -Avg ME (F77/GPU) = 9.8722599015656498E-003 -Relative difference = 3.1385249252060663e-07 +Avg ME (F77/GPU) = 9.8722598175428403E-003 +Relative difference = 3.223634904086631e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 7.567548e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.567783e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.567783e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.168610e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.168705e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.168705e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.978728 sec -INFO: No Floating Point Exceptions have been reported - 19,201,924,470 cycles # 2.751 GHz - 54,137,446,015 instructions # 2.82 insn per cycle - 6.982563293 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:32000) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 10.202641 sec + 27,008,826,144 cycles # 2.647 GHz + 55,489,845,751 instructions # 2.05 insn per cycle + 10.206796166 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:82767) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595861831675E-003 Relative difference = 3.457988134687711e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.526848e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.526939e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.526939e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.016248e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.016285e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.016285e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 3.460419 sec -INFO: No Floating Point Exceptions have been reported - 9,442,620,757 cycles # 2.727 GHz - 26,188,001,033 instructions # 2.77 insn per cycle - 3.464377416 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:96049) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.198364 sec + 14,820,744,558 cycles # 2.849 GHz + 28,052,770,640 instructions # 1.89 insn per cycle + 5.202377269 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:222782) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594844308162E-003 Relative difference = 3.5610570575237004e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.548969e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.549418e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.549418e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.173574e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.173760e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.173760e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.489614 sec -INFO: No Floating Point Exceptions have been reported - 4,075,741,004 cycles # 2.731 GHz - 9,249,825,182 instructions # 2.27 insn per cycle - 1.493453651 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:84390) (512y: 0) (512z: 0) +TOTAL : 2.431646 sec + 6,422,559,220 cycles # 2.638 GHz + 10,106,796,156 instructions # 1.57 insn per cycle + 2.435786868 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:162525) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.098256e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.098850e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.098850e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.407760e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.407969e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.407969e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.290484 sec -INFO: No Floating Point Exceptions have been reported - 3,523,951,603 cycles # 2.724 GHz - 8,183,239,467 instructions # 2.32 insn per cycle - 1.294382992 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:80015) (512y: 80) (512z: 0) +TOTAL : 2.195988 sec + 5,811,914,222 cycles # 2.643 GHz + 9,159,041,975 instructions # 1.58 insn per cycle + 2.200185899 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:162162) (512y: 45) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.495372e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.495944e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.495944e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.551665e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.551966e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.551966e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.513924 sec -INFO: No Floating Point Exceptions have been reported - 2,658,314,764 cycles # 1.752 GHz - 4,173,156,780 instructions # 1.57 insn per cycle - 1.517996809 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2615) (512y: 92) (512z:78910) +TOTAL : 2.072914 sec + 3,457,103,850 cycles # 1.665 GHz + 4,585,756,837 instructions # 1.33 insn per cycle + 2.077304614 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3149) (512y: 61) (512z:174183) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt index 67fff86657..bead3fdf60 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,6 +10,7 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make: Nothing to be done for 'all'. @@ -21,215 +25,178 @@ make: Nothing to be done for 'all'. make: Nothing to be done for 'all'. 
-DATE: 2024-10-06_09:11:33 +DATE: 2025-09-24_08:07:54 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.667678e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.668217e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.668387e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 2.202686 sec -INFO: No Floating Point Exceptions have been reported - 7,336,606,843 cycles # 2.899 GHz - 15,241,236,080 instructions # 2.08 insn per cycle - 2.586897924 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 2.631204e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.631383e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.631422e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 3.909015 sec + 11,924,587,141 cycles # 2.856 GHz + 19,714,302,266 instructions # 1.65 insn per cycle + 4.229466301 seconds time elapsed ......................................................................... 
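
Note: throughout these logs, each build is cross-checked by comparing the average matrix element of the C++/CUDA executable against its Fortran counterpart (the paired cmpExe check/fcheck lines), with an OK verdict whenever the relative difference stays below 5E-3. A minimal sketch of such a tolerance check (illustrative only, not the actual madgraph4gpu test driver; the exact definition of the relative difference is an assumption):

  #include <cmath>
  #include <cstdio>
  // Compare two average MEs; 'tol' mirrors the 5E-3 threshold quoted in the logs.
  bool cmpAvgME( double meCpp, double meF77, double tol = 5e-3 )
  {
    const double reldif = std::abs( meCpp - meF77 ) / std::abs( meCpp );
    std::printf( "Relative difference = %g\n", reldif );
    std::printf( "%s (relative difference <= %g)\n", reldif <= tol ? "OK" : "ERROR", tol );
    return reldif <= tol;
  }

The loose 5E-3 threshold is plausibly deliberate: the C++ average is printed with only seven significant digits (e.g. 9.872263e-03), so the observed O(1E-7) differences are at the level of the printed precision rather than a sign of numerical disagreement.
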
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.107552e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.107855e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.107889e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.440073 sec -INFO: No Floating Point Exceptions have been reported - 11,052,276,434 cycles # 2.923 GHz - 25,411,180,343 instructions # 2.30 insn per cycle - 3.836365671 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 116 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 98 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 255 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
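
Note: the most visible functional change in the 2025 ncu profiles is that the old monolithic "sigmaKin" kernel (255 registers/thread) has been replaced by separate per-diagram kernels ("diagram1", "diagram2", ...) plus a "color_sum_kernel", consistent with the new HASBLAS=hasBlas and CUDACPP_RUNTIME_BLASCOLORSUM flags appearing in the log headers. A heavily simplified sketch of this split (assumed structure for illustration only; apart from the kernel names reported by ncu above, all names and signatures here are hypothetical):

  #include <cuComplex.h>
  // Each diagram kernel accumulates per-event color-ordered amplitudes ("jamps")...
  __global__ void diagram1( const double* momenta, cuDoubleComplex* jamps ) { /* wavefunctions and vertices of diagram 1 */ }
  __global__ void diagram2( const double* momenta, cuDoubleComplex* jamps ) { /* wavefunctions and vertices of diagram 2 */ }
  // ...and a final kernel contracts them with the (real) color matrix:
  // |ME|^2 ~ sum_ij jamp_i * C_ij * conj(jamp_j). With jamps stored as an
  // (nevt x ncolor) matrix this contraction is a GEMM, hence the BLAS option.
  __global__ void color_sum_kernel( const cuDoubleComplex* jamps, const double* colorMat, double* me2, int ncolor )
  {
    const int ievt = blockIdx.x * blockDim.x + threadIdx.x;
    double me = 0;
    for( int i = 0; i < ncolor; i++ )
      for( int j = 0; j < ncolor; j++ )
      {
        const cuDoubleComplex prod = cuCmul( jamps[ievt * ncolor + i], cuConj( jamps[ievt * ncolor + j] ) );
        me += colorMat[i * ncolor + j] * cuCreal( prod );
      }
    me2[ievt] = me;
  }

One side effect visible in the profiles is lower per-kernel register pressure: the gq_ttq kernels further below report 80/74/40 registers/thread where the fused sigmaKin previously needed 255.
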
DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 9.872263e-03 -Avg ME (F77/GPU) = 9.8722599015656498E-003 -Relative difference = 3.1385249252060663e-07 +Avg ME (F77/GPU) = 9.8722598175428403E-003 +Relative difference = 3.223634904086631e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.hip_m_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 7.653903e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.654105e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.654105e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.175853e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.175960e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.175960e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.889827 sec -INFO: No Floating Point Exceptions have been reported - 19,201,166,017 cycles # 2.786 GHz - 54,161,677,415 instructions # 2.82 insn per cycle - 6.893652512 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:32202) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 10.191887 sec + 27,066,562,296 cycles # 2.655 GHz + 55,432,709,827 instructions # 2.05 insn per cycle + 10.196042947 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:82721) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595861831675E-003 Relative difference = 3.457988134687711e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.552412e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.552503e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.552503e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.014775e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.014811e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.014811e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 3.403221 sec -INFO: No Floating Point Exceptions have been reported - 9,295,420,050 cycles # 2.729 GHz - 26,089,296,035 instructions # 2.81 insn per cycle - 3.407123949 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:95935) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.204912 sec + 14,810,174,964 cycles # 2.844 GHz + 28,051,432,256 instructions # 1.89 insn per cycle + 5.208973811 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:222585) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594844308162E-003 Relative difference = 3.5610570575237004e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.556434e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.556900e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.556900e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.185805e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.185996e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.185996e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.488620 sec -INFO: No Floating Point Exceptions have been reported - 4,059,104,235 cycles # 2.721 GHz - 9,213,839,753 instructions # 2.27 insn per cycle - 1.492560916 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:83864) (512y: 0) (512z: 0) +TOTAL : 2.419296 sec + 6,397,293,732 cycles # 2.640 GHz + 10,105,068,555 instructions # 1.58 insn per cycle + 2.423434285 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:162459) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.125241e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.125840e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.125840e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.353455e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.353659e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.353659e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.282211 sec -INFO: No Floating Point Exceptions have been reported - 3,511,408,538 cycles # 2.732 GHz - 8,168,208,932 instructions # 2.33 insn per cycle - 1.286095846 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:79421) (512y: 230) (512z: 0) +TOTAL : 2.246140 sec + 5,927,041,948 cycles # 2.635 GHz + 9,157,270,352 instructions # 1.54 insn per cycle + 2.250289465 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2:162094) (512y: 45) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.517573e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.518129e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.518129e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.574266e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.574580e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.574580e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.503444 sec -INFO: No Floating Point Exceptions have been reported - 2,622,176,822 cycles # 1.740 GHz - 4,167,750,292 instructions # 1.59 insn per cycle - 1.507552292 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1879) (512y: 174) (512z:78884) +TOTAL : 2.053659 sec + 3,414,953,228 cycles # 1.660 GHz + 4,584,587,859 instructions # 1.34 insn per cycle + 2.057980570 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3085) (512y: 61) (512z:174183) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 1240 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 1240 channels { 1 : 32, 2 : 32, 4 : 32, 5 : 32, 7 : 32, 8 : 32, 14 : 32, 15 : 32, 16 : 32, 18 : 32, 19 : 32, 20 : 32, 22 : 32, 23 : 32, 24 : 32, 26 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.scaling new file mode 100644 index 0000000000..0c9d4dd187 --- /dev/null +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +DATE: 2025-09-24_08:20:53 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +6.114382e+05 1 256 +1.184804e+06 2 256 +2.295799e+06 4 256 +4.299152e+06 8 256 +6.781749e+06 16 256 +6.234987e+06 32 256 +4.824464e+06 64 256 +5.008277e+06 128 256 +5.002497e+06 256 256 +5.105111e+06 512 256 +5.144414e+06 1024 256 +### GPU: scaling test 32 +7.442778e+04 1 32 +1.705130e+05 2 32 +3.324580e+05 4 32 +6.361197e+05 8 32 +1.276311e+06 16 32 +2.506315e+06 32 32 +4.622242e+06 64 32 +6.583981e+06 128 32 +6.111331e+06 256 32 +4.906391e+06 512 32 +5.038809e+06 1024 32 +4.994793e+06 2048 32 +5.116613e+06 4096 32 +5.145819e+06 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +9.883864e+04 1 256 +1.012169e+05 2 256 +1.022878e+05 4 256 +### CPU: scaling test 32 +9.538686e+04 1 32 +9.747153e+04 2 32 +9.954397e+04 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +1.671310e+05 1 256 +1.759690e+05 2 256 +1.760443e+05 4 256 +### CPU: scaling test 32 +1.499075e+05 1 32 +1.562893e+05 2 32 +1.522789e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.113537e+05 1 256 +3.215123e+05 2 256 +2.807500e+05 4 256 +### CPU: scaling 
test 32 +2.961647e+05 1 32 +2.870406e+05 2 32 +2.943923e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +3.149885e+05 1 256 +3.152403e+05 2 256 +3.263600e+05 4 256 +### CPU: scaling test 32 +3.402265e+05 1 32 +3.182797e+05 2 32 +3.367517e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.236251e+05 1 256 +2.336378e+05 2 256 +2.275319e+05 4 256 +### CPU: scaling test 32 +2.341184e+05 1 32 +2.208305e+05 2 32 +2.274892e+05 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index 468f6865a8..a3350c15b3 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-10-06_09:05:51 +DATE: 2025-09-24_07:56:49 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.906944e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.902591e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.013821e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.369412e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.858836e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.893773e+06 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.458221 sec -INFO: No Floating Point Exceptions have been reported - 1,930,997,109 cycles # 2.858 GHz - 2,724,198,211 instructions # 1.41 insn per cycle - 0.805328419 seconds time elapsed -runNcu 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.493507 sec + 2,094,463,287 cycles # 2.825 GHz + 2,949,447,803 instructions # 1.41 insn per cycle + 0.799505845 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.002453e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.463176e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.675243e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.544639 sec -INFO: No Floating Point Exceptions have been reported - 2,250,691,324 cycles # 2.871 GHz - 3,190,813,390 instructions # 1.42 insn per cycle - 0.843484638 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 80 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 74 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 40 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
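
Note: the new *.scaling files added above report one line per grid configuration in the format "throughput nblocks nthreads"; presumably the first column is the measured events/sec for that grid, with nblocks x nthreads events per iteration. On this V100S the gq_ttq GPU throughput climbs roughly linearly with the grid, peaks around 6.8e6/s at 16x256, and settles near 5e6/s once the device is saturated, while the CPU builds stay flat in the grid size, as expected for a host loop. A trivial sketch of what one row encodes (assumed formula; names hypothetical):

  // One row of a .scaling file, under the assumption that
  // throughput = (nblocks * nthreads * niterations) / elapsed_seconds.
  double scalingRowThroughput( int nblocks, int nthreads, int niter, double elapsedSec )
  {
    const long nevt = (long)nblocks * nthreads; // events per iteration
    return (double)nevt * niter / elapsedSec;   // EvtsPerSec, column 1
  }
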
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.424749e-01 Avg ME (F77/GPU) = 0.14247482467490466 Relative difference = 5.286902838873106e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.052668e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.075406e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.075406e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.011082e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.032940e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.032940e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.578445 sec -INFO: No Floating Point Exceptions have been reported - 4,629,037,835 cycles # 2.928 GHz - 13,193,545,970 instructions # 2.85 insn per cycle - 1.584589009 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 707) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.641008 sec + 4,728,634,035 cycles # 2.877 GHz + 13,391,733,023 instructions # 2.83 insn per cycle + 1.644935255 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 751) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467499481 Relative difference = 5.286896511435107e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.869817e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.940106e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.940106e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.734169e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.798767e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.798767e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.895982 sec -INFO: No Floating Point Exceptions have been reported - 2,636,174,950 cycles # 2.931 GHz - 7,556,706,256 instructions # 2.87 insn per cycle - 0.901753059 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3099) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.964620 sec + 2,778,184,561 cycles # 2.870 GHz + 7,810,599,541 instructions # 2.81 insn per cycle + 0.968650361 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3027) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467499475 Relative difference = 5.286896515331313e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.170738e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.377041e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.377041e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.999275e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.191668e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.191668e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.538337 sec -INFO: No Floating Point Exceptions have been reported - 1,492,365,440 cycles # 2.760 GHz - 3,161,633,609 instructions # 2.12 insn per cycle - 0.543901971 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2991) (512y: 0) (512z: 0) +TOTAL : 0.566602 sec + 1,524,796,725 cycles # 2.675 GHz + 3,239,090,759 instructions # 2.12 insn per cycle + 0.570567687 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2908) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492589 Relative difference = 5.286901348574438e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.502118e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.753079e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.753079e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.186463e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.401752e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.401752e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.488254 sec -INFO: No Floating Point Exceptions have been reported - 1,345,193,436 cycles # 2.734 GHz - 3,015,805,712 instructions # 2.24 insn per cycle - 0.494320620 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2749) (512y: 104) (512z: 0) +TOTAL : 0.534680 sec + 1,443,469,002 cycles # 2.683 GHz + 3,113,047,726 instructions # 2.16 insn per cycle + 0.538728930 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2729) (512y: 51) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
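
Note: the "Internal loops fptype_sv" line identifies the SIMD mode of each build: SCALAR for none, VECTOR[2] doubles per 128-bit SSE4.2 register, VECTOR[4] per 256-bit AVX2/512y register, and VECTOR[8] per 512-bit AVX512 'z' register. That is the mechanism behind the roughly 3x step from none (1.03e5/s) to avx2 (3.19e5/s) seen in this gq_ttq log. A minimal sketch of the idea using GCC vector extensions (illustrative; the plugin's actual vector types are defined in its own headers):

  // A 256-bit vector of 4 doubles, processing 4 events in lockstep per operation.
  typedef double fptype_v __attribute__( ( vector_size( 32 ) ) );
  fptype_v fpmadd( const fptype_v a, const fptype_v b, const fptype_v c )
  {
    return a * b + c; // the compiler emits one AVX2 mul+add (or fma) per 4 events
  }
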
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.424749e-01
Avg ME (F77/C++) = 0.14247482467492589
Relative difference = 5.286901348574438e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.340176e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.450488e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.450488e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.232076e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.339196e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.339196e+05 ) sec^-1
MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.722488 sec
-INFO: No Floating Point Exceptions have been reported
- 1,326,137,037 cycles # 1.826 GHz
- 1,964,340,659 instructions # 1.48 insn per cycle
- 0.728328312 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1379) (512y: 106) (512z: 2218)
+TOTAL : 0.756449 sec
+ 1,339,965,318 cycles # 1.764 GHz
+ 1,936,048,197 instructions # 1.44 insn per cycle
+ 0.760584848 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1540) (512y: 75) (512z: 2387)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.424749e-01
Avg ME (F77/C++) = 0.14247482467492589
Relative difference = 5.286901348574438e-07
diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt
index a32e85fd77..f54f9f771f 100644
--- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
BACKEND=cpp512y (was cppauto)
OMPFLAGS=
FPTYPE='m'
@@ -7,272 +10,220 @@ HELINL='0'
HRDCOD='0'
HASCURAND=hasCurand
HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
make: Nothing to be done for 'gtestlibs'.
make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
-DATE: 2024-10-06_09:39:10
+DATE: 2025-09-24_08:57:37
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 10 --bridge OMP=
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 10 --bridge OMP=
WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
-WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
-Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.313371e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.590831e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.590831e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.054488e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.251839e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.251839e+06 ) sec^-1
MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.487212 sec
-INFO: No Floating Point Exceptions have been reported
-INFO: No Floating Point Exceptions have been reported
- 2,046,207,140 cycles # 2.880 GHz
- 3,015,907,255 instructions # 1.47 insn per cycle
- 0.769534809 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge
+TOTAL : 0.519849 sec
+ 2,167,632,544 cycles # 2.822 GHz
+ 3,174,291,906 instructions # 1.46 insn per cycle
+ 0.825102214 seconds time elapsed
+.........................................................................
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge
WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
-WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
-.........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge OMP=
+==PROF== Profiling "diagram1": launch__registers_per_thread 80
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288)
-WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288)
-Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.228660e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.270938e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.270938e+07 ) sec^-1
-MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2
-TOTAL : 0.758730 sec
-INFO: No Floating Point Exceptions have been reported
-INFO: No Floating Point Exceptions have been reported
- 2,917,079,859 cycles # 2.883 GHz
- 4,489,082,127 instructions # 1.54 insn per cycle
- 1.069078440 seconds time elapsed
+==PROF== Profiling "diagram2": launch__registers_per_thread 74
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
+WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 40
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/runTest_cuda.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
Avg ME (C++/GPU) = 1.424749e-01
Avg ME (F77/GPU) = 0.14247482467490466
Relative difference = 5.286902838873106e-07
OK (relative difference <= 5E-3)
=========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd0/check_hip.exe
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP=
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 1.058535e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.081557e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.081557e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.006346e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.028061e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.028061e+05 ) sec^-1
MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 1.574537 sec
-INFO: No Floating Point Exceptions have been reported
-INFO: No Floating Point Exceptions have been reported
- 4,656,483,821 cycles # 2.950 GHz
- 13,198,201,576 instructions # 2.83 insn per cycle
- 1.579077435 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 707) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 1.657332 sec
+ 4,772,691,593 cycles # 2.873 GHz
+ 13,397,727,029 instructions # 2.81 insn per cycle
+ 1.661626356 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 751) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.424749e-01
Avg ME (F77/C++) = 0.14247482467499481
Relative difference = 5.286896511435107e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP=
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.861172e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.931943e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.931943e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.722779e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.788329e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.788329e+05 ) sec^-1
MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.907508 sec
-INFO: No Floating Point Exceptions have been reported
-INFO: No Floating Point Exceptions have been reported
- 2,678,662,656 cycles # 2.939 GHz
- 7,605,263,564 instructions # 2.84 insn per cycle
- 0.912202227 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 3099) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 0.979997 sec
+ 2,824,593,348 cycles # 2.871 GHz
+ 7,859,167,507 instructions # 2.78 insn per cycle
+ 0.984421558 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 3027) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.424749e-01
Avg ME (F77/C++) = 0.14247482467499475
Relative difference = 5.286896515331313e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP=
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.153263e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.357026e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.357026e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.894342e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.081563e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.081563e+05 ) sec^-1
MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.547067 sec
-INFO: No Floating Point Exceptions have been reported
-INFO: No Floating Point Exceptions have been reported
- 1,524,781,245 cycles # 2.767 GHz
- 3,210,388,287 instructions # 2.11 insn per cycle
- 0.551691801 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2991) (512y: 0) (512z: 0)
+TOTAL : 0.595363 sec
+ 1,574,100,751 cycles # 2.628 GHz
+ 3,287,987,619 instructions # 2.09 insn per cycle
+ 0.599823612 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2908) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.424749e-01
Avg ME (F77/C++) = 0.14247482467492589
Relative difference = 5.286901348574438e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP=
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.508777e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.767060e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.767060e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.137842e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.355195e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.355195e+05 ) sec^-1
MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.494747 sec
-INFO: No Floating Point Exceptions have been reported
-INFO: No Floating Point Exceptions have been reported
- 1,383,177,469 cycles # 2.773 GHz
- 3,064,481,068 instructions # 2.22 insn per cycle
- 0.499446571 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2749) (512y: 104) (512z: 0)
+TOTAL : 0.552181 sec
+ 1,491,380,997 cycles # 2.683 GHz
+ 3,163,058,147 instructions # 2.12 insn per cycle
+ 0.556586621 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2729) (512y: 51) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.424749e-01
Avg ME (F77/C++) = 0.14247482467492589
Relative difference = 5.286901348574438e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP=
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.351157e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.462501e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.462501e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.211794e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.320275e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.320275e+05 ) sec^-1
MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.725065 sec
-INFO: No Floating Point Exceptions have been reported
-INFO: No Floating Point Exceptions have been reported
- 1,357,891,290 cycles # 1.863 GHz
- 2,000,455,329 instructions # 1.47 insn per cycle
- 0.729577819 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1379) (512y: 106) (512z: 2218)
+TOTAL : 0.771732 sec
+ 1,381,853,144 cycles # 1.782 GHz
+ 1,973,253,744 instructions # 1.43 insn per cycle
+ 0.776188665 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1540) (512y: 75) (512z: 2387)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.424749e-01
Avg ME (F77/C++) = 0.14247482467492589
Relative difference = 5.286901348574438e-07
diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt
index 67eac99bab..df00e76677 100644
--- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
BACKEND=cpp512y (was cppauto)
OMPFLAGS=
FPTYPE='m'
@@ -7,248 +10,212 @@ HELINL='0'
HRDCOD='0'
HASCURAND=hasCurand
HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
make: Nothing to be done for 'gtestlibs'.
make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
-DATE: 2024-10-06_09:06:04
+DATE: 2025-09-24_07:57:13
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 1.866343e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.840904e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.947003e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.374602e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.850583e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.886296e+06 ) sec^-1
MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.463809 sec
-INFO: No Floating Point Exceptions have been reported
- 1,942,418,108 cycles # 2.861 GHz
- 2,721,411,859 instructions # 1.40 insn per cycle
- 0.812650633 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.494162 sec
+ 2,088,078,987 cycles # 2.820 GHz
+ 2,936,679,024 instructions # 1.41 insn per cycle
+ 0.798930083 seconds time elapsed
.........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 2.997280e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.399599e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.603946e+07 ) sec^-1
-MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2
-TOTAL : 0.538885 sec
-INFO: No Floating Point Exceptions have been reported
- 2,239,160,610 cycles # 2.873 GHz
- 3,203,384,758 instructions # 1.43 insn per cycle
- 0.836856412 seconds time elapsed
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 80
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 74
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 40
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/runTest_cuda.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2
Avg ME (C++/GPU) = 1.424749e-01
Avg ME (F77/GPU) = 0.14247482467490466
Relative difference = 5.286902838873106e-07
OK (relative difference <= 5E-3)
=========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd1/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_d_inl0_hrd1/check_hip.exe
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 1.060643e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.083213e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.083213e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.011141e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.032924e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.032924e+05 ) sec^-1
MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 1.565121 sec
-INFO: No Floating Point Exceptions have been reported
- 4,623,795,988 cycles # 2.948 GHz
- 13,181,888,102 instructions # 2.85 insn per cycle
- 1.571833324 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 692) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 1.640764 sec
+ 4,726,378,497 cycles # 2.875 GHz
+ 13,388,410,697 instructions # 2.83 insn per cycle
+ 1.644936414 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 746) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.424749e-01
Avg ME (F77/C++) = 0.14247482467499481
Relative difference = 5.286896511435107e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.878003e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.949625e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.949625e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.733886e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.797913e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.797913e+05 ) sec^-1
MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.892385 sec
-INFO: No Floating Point Exceptions have been reported
- 2,641,116,720 cycles # 2.947 GHz
- 7,555,506,374 instructions # 2.86 insn per cycle
- 0.899472366 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 3093) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 0.964517 sec
+ 2,777,828,465 cycles # 2.870 GHz
+ 7,808,249,189 instructions # 2.81 insn per cycle
+ 0.968518484 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 3022) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.424749e-01
Avg ME (F77/C++) = 0.14247482467499475
Relative difference = 5.286896515331313e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.178148e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.383095e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.383095e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.010096e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.204120e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.204120e+05 ) sec^-1
MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.535311 sec
-INFO: No Floating Point Exceptions have been reported
- 1,491,222,481 cycles # 2.767 GHz
- 3,161,019,864 instructions # 2.12 insn per cycle
- 0.541387025 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2976) (512y: 0) (512z: 0)
+TOTAL : 0.564633 sec
+ 1,525,088,852 cycles # 2.684 GHz
+ 3,238,233,498 instructions # 2.12 insn per cycle
+ 0.568864130 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2894) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.424749e-01
Avg ME (F77/C++) = 0.14247482467492589
Relative difference = 5.286901348574438e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.523592e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.778898e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.778898e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.162469e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.377065e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.377065e+05 ) sec^-1
MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.485060 sec
-INFO: No Floating Point Exceptions have been reported
- 1,349,314,232 cycles # 2.763 GHz
- 3,012,812,614 instructions # 2.23 insn per cycle
- 0.489068736 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2726) (512y: 104) (512z: 0)
+TOTAL : 0.538225 sec
+ 1,445,712,987 cycles # 2.669 GHz
+ 3,111,497,224 instructions # 2.15 insn per cycle
+ 0.542313151 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2713) (512y: 51) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.424749e-01
Avg ME (F77/C++) = 0.14247482467492589
Relative difference = 5.286901348574438e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.347943e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.459729e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.459729e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.241673e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.349775e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.349775e+05 ) sec^-1
MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.720112 sec
-INFO: No Floating Point Exceptions have been reported
- 1,326,103,986 cycles # 1.833 GHz
- 1,962,664,460 instructions # 1.48 insn per cycle
- 0.726078775 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1356) (512y: 106) (512z: 2218)
+TOTAL : 0.753239 sec
+ 1,337,847,804 cycles # 1.768 GHz
+ 1,935,134,575 instructions # 1.45 insn per cycle
+ 0.757158715 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1524) (512y: 75) (512z: 2387)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492589 Relative difference = 5.286901348574438e-07 diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.scaling new file mode 100644 index 0000000000..246da016ac --- /dev/null +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.scaling @@ -0,0 +1,137 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= + +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +BACKEND=cpp512y (was cppauto) +OMPFLAGS= +FPTYPE='m' +HELINL='0' +HRDCOD='0' +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas +Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) +make: Nothing to be done for 'gtestlibs'. + +make USEBUILDDIR=1 BACKEND=cuda +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +make USEBUILDDIR=1 BACKEND=cppnone +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +make USEBUILDDIR=1 BACKEND=cppsse4 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +make USEBUILDDIR=1 BACKEND=cppavx2 +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +make USEBUILDDIR=1 BACKEND=cpp512y +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +make USEBUILDDIR=1 BACKEND=cpp512z +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' + +DATE: 2025-09-24_08:21:36 + +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe +### GPU: scaling test 256 +6.410931e+05 1 256 +1.277149e+06 2 256 +2.468261e+06 4 256 +4.940702e+06 8 256 +8.703302e+06 16 256 +1.240333e+07 32 256 +1.176459e+07 64 256 +1.135181e+07 128 256 +1.232077e+07 256 256 +1.259780e+07 512 256 +1.305679e+07 1024 256 +### GPU: scaling test 32 +8.109047e+04 1 32 +1.670604e+05 2 32 +3.290091e+05 4 32 +6.250900e+05 8 32 +1.280682e+06 16 32 +2.510148e+06 32 32 +5.005487e+06 64 32 +8.591415e+06 128 32 +1.209171e+07 256 32 +1.153836e+07 512 32 +1.111319e+07 1024 32 +1.175548e+07 2048 32 +1.195470e+07 4096 32 +1.227044e+07 8192 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +9.905691e+04 1 256 +1.034121e+05 2 256 +1.047026e+05 4 256 +### CPU: scaling test 32 +9.038706e+04 1 32 +9.069344e+04 2 32 +1.041458e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +2.561891e+05 1 256 +2.730252e+05 2 256 +2.836441e+05 4 256 +### CPU: scaling test 32 +2.501329e+05 1 32 +2.570952e+05 2 32 +2.554758e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +5.750509e+05 1 256 +6.287586e+05 2 256 +6.269094e+05 4 256 +### CPU: scaling test 32 +5.674363e+05 1 32 +5.794110e+05 2 32 +5.625484e+05 4 32 +========================================================================= +scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +6.160214e+05 1 256 +6.181977e+05 2 256 +6.142127e+05 4 256 +### CPU: scaling test 32 +6.002739e+05 1 32 +6.110485e+05 2 32 +4.928480e+05 4 32 +========================================================================= +scalingTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe +### CPU: scaling test 256 +4.823290e+05 1 256 +4.859783e+05 2 256 +4.896342e+05 4 256 +### CPU: scaling test 32 +4.753134e+05 1 32 +4.306893e+05 2 32 +4.931574e+05 4 32 +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index fa95ebd131..0aa114dc67 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-10-06_09:06:45 +DATE: 2025-09-24_07:58:23 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.818001e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.982501e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.122889e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.029737e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.178192e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.192162e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 -TOTAL : 0.452282 sec -INFO: No Floating Point Exceptions have been reported - 1,920,727,034 cycles # 2.860 GHz - 2,694,517,558 instructions # 1.40 insn per cycle - 0.728408510 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 169 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.470353 sec + 2,024,797,775 cycles # 2.821 GHz + 2,795,474,884 instructions # 1.38 insn per cycle + 0.775690419 seconds time elapsed ......................................................................... 
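The "insn per cycle" annotation in each perf block is simply the ratio of the two counters above it; for the CUDA float run just shown:

cycles = 2_024_797_775        # cycles counter from the run above
instructions = 2_795_474_884  # instructions counter from the run above
print(f"{instructions / cycles:.2f} insn per cycle")  # -> 1.38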
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.287877e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.320334e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.683236e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.571360e+02 +- 2.114020e+02 ) GeV^-2 -TOTAL : 0.495314 sec -INFO: No Floating Point Exceptions have been reported - 2,079,539,950 cycles # 2.850 GHz - 2,952,237,418 instructions # 1.42 insn per cycle - 0.786339466 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 54 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 50 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
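The new .scaling files introduced in this diff carry plain data lines of the form "<throughput> <gpublocks> <gputhreads>" between "### ... scaling test" headers. A small sketch (not part of the repository) for extracting the peak throughput per threads-per-block setting from such a file:

from collections import defaultdict

def peak_throughput(path: str) -> dict[int, float]:
    # data lines look like "6.410931e+05 1 256"; everything else is skipped
    best: dict[int, float] = defaultdict(float)
    with open(path) as f:
        for line in f:
            parts = line.split()
            if len(parts) == 3 and parts[0][:1].isdigit():
                tput, _blocks, threads = float(parts[0]), int(parts[1]), int(parts[2])
                best[threads] = max(best[threads], tput)
    return dict(best)

# e.g. peak_throughput("log_gqttq_mad_f_inl0_hrd0.scaling")
#      -> {256: 1.305679e+07, 32: 1.227044e+07} (the GPU rows dominate)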
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.424226e-01 -Avg ME (F77/GPU) = 0.14247487904286338 -Relative difference = 0.0003670698531228044 +Avg ME (F77/GPU) = 0.14247487915986667 +Relative difference = 0.00036707067464478155 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.109567e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.134660e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.134660e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.041265e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.064683e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.064683e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 1.494759 sec -INFO: No Floating Point Exceptions have been reported - 4,403,081,916 cycles # 2.940 GHz - 12,951,948,710 instructions # 2.94 insn per cycle - 1.498420981 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 645) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.592374 sec + 4,584,720,385 cycles # 2.874 GHz + 13,173,886,622 instructions # 2.87 insn per cycle + 1.596196458 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 692) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will 
cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 -Avg ME (F77/C++) = 0.14246861273719524 -Relative difference = 8.940352641194861e-08 +Avg ME (F77/C++) = 0.14246861273712064 +Relative difference = 8.940300273875148e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.886806e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.066754e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.066754e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.712937e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.876014e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.876014e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 0.584675 sec -INFO: No Floating Point Exceptions have been reported - 1,726,276,919 cycles # 2.937 GHz - 4,542,407,737 instructions # 2.63 insn per cycle - 0.588476135 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3627) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.621667 sec + 1,792,962,809 cycles # 2.869 GHz + 4,663,415,265 instructions # 2.60 insn per cycle + 0.625726960 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3615) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 Avg ME (F77/C++) = 0.14246862329122401 Relative difference = 1.6348320966878032e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.651382e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.346145e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.346145e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.565327e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.255556e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.255556e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.307816 sec -INFO: No Floating Point Exceptions have been reported - 856,647,676 cycles # 2.754 GHz - 1,917,830,464 instructions # 2.24 insn per cycle - 0.311794908 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3580) (512y: 0) (512z: 0) +TOTAL : 0.312612 sec + 851,138,638 cycles # 2.694 GHz + 1,911,731,220 instructions # 2.25 insn per cycle + 0.316680357 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3459) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247491543012991 Relative difference = 1.0830068962165901e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.083995e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.890169e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.890169e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.826176e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.593217e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.593217e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.287118 sec -INFO: No Floating Point Exceptions have been reported - 801,284,784 cycles # 2.760 GHz - 1,834,043,941 instructions # 2.29 insn per cycle - 0.290894624 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3400) (512y: 22) (512z: 0) +TOTAL : 0.299534 sec + 816,595,199 cycles # 2.696 GHz + 1,843,533,646 instructions # 2.26 insn per cycle + 0.303518277 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3333) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247491543012991 Relative difference = 1.0830068962165901e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.500723e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.948038e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.948038e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.425940e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.876492e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.876492e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.384030 sec -INFO: No Floating Point Exceptions have been reported - 726,928,592 cycles # 1.877 GHz - 1,308,660,654 instructions # 1.80 insn per cycle - 0.387900268 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1964) (512y: 24) (512z: 2435) +TOTAL : 0.390882 sec + 717,534,817 cycles # 1.821 GHz + 1,247,139,337 instructions # 1.74 insn per cycle + 0.394851024 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2075) (512y: 5) (512z: 2570) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
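The "=Symbols in CPPProcess_cpp.o=" counters track how many SIMD instructions of each family end up in the object file (note how the 512y bucket drops to 0 in the float builds with this compiler). A hypothetical sketch of how such counts could be produced by register width; the real tput scripts evidently also separate a "512y" bucket (AVX-512 instructions operating on 256-bit ymm registers), which a width-only classification like this one cannot distinguish from plain avx2:

import re, subprocess, sys

def count_simd(objfile: str) -> dict[str, int]:
    # disassemble with objdump and bucket instructions by widest register used
    asm = subprocess.run(["objdump", "-d", objfile],
                         capture_output=True, text=True, check=True).stdout
    counts = {"xmm": 0, "ymm": 0, "zmm": 0}
    for line in asm.splitlines():
        for reg in ("zmm", "ymm", "xmm"):
            if re.search(rf"%{reg}\d+", line):
                counts[reg] += 1
                break  # count each instruction once, at its widest register
    return counts

if __name__ == "__main__":
    print(count_simd(sys.argv[1]))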
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247491576758442 Relative difference = 1.1066920862943416e-07 diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt index 5a6a874489..e0303b7ed7 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,272 +10,220 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-10-06_09:39:24 +DATE: 2025-09-24_08:57:58 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 10 --bridge OMP= WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.958276e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.362856e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.362856e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.669508e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.781327e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.781327e+06 ) sec^-1 MeanMatrixElemValue = ( 2.017654e+01 +- 1.429183e+01 ) GeV^-2 -TOTAL : 0.467586 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 1,958,269,299 cycles # 2.868 GHz - 2,873,921,299 instructions # 1.47 insn per cycle - 0.741370031 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge +TOTAL : 0.482541 sec + 2,080,194,939 cycles # 2.814 GHz + 2,973,065,127 instructions # 1.43 insn per cycle + 0.797267871 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 169 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 --bridge OMP= +==PROF== Profiling "diagram1": launch__registers_per_thread 54 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.867040e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.953002e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.953002e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.609941e+02 +- 2.115589e+02 ) GeV^-2 -TOTAL : 0.638465 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 2,513,600,406 cycles # 2.877 GHz - 3,810,036,638 instructions # 1.52 insn per cycle - 0.930171723 seconds time elapsed +==PROF== Profiling "diagram2": launch__registers_per_thread 50 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.424226e-01 -Avg ME (F77/GPU) = 0.14247487904286338 -Relative difference = 0.0003670698531228044 +Avg ME (F77/GPU) = 0.14247487915986667 +Relative difference = 0.00036707067464478155 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= 
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.115307e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.140507e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.140507e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.033819e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.056984e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.056984e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 1.490082 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 4,418,597,373 cycles # 2.958 GHz - 12,956,387,401 instructions # 2.93 insn per cycle - 1.494530314 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 645) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.608143 sec + 4,605,847,309 cycles # 2.859 GHz + 13,177,560,156 instructions # 2.86 insn per cycle + 1.612112149 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 692) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
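Comparing the two CUDA float runs in this diff quantifies the cost of the host-side Bridge (workflow CURHST+RMBHST+BRDDEV instead of CURDEV+RMBDEV+MESDEV); the values below are the EvtsPerSec[MatrixElems] figures from the plain and --bridge logs above:

no_bridge = 1.178192e+07  # plain CUDA float run
bridge    = 9.781327e+06  # --bridge CUDA float run
print(f"bridge retains {bridge / no_bridge:.0%} of the throughput")  # ~83%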
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 -Avg ME (F77/C++) = 0.14246861273719524 -Relative difference = 8.940352641194861e-08 +Avg ME (F77/C++) = 0.14246861273712064 +Relative difference = 8.940300273875148e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.871197e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.051268e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.051268e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.693966e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.860851e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.860851e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 0.592243 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 1,749,393,716 cycles # 2.936 GHz - 4,590,457,409 instructions # 2.62 insn per cycle - 0.596762261 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3627) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.631429 sec + 1,821,115,428 cycles # 2.868 GHz + 4,710,567,012 instructions # 2.59 insn per cycle + 0.635587274 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3615) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 
tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 Avg ME (F77/C++) = 0.14246862329122401 Relative difference = 1.6348320966878032e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.650062e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.340176e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.340176e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.470965e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.164241e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.164241e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.311783 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 875,769,738 cycles # 2.776 GHz - 1,954,803,706 instructions # 2.23 insn per cycle - 0.316080972 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3580) (512y: 0) (512z: 0) +TOTAL : 0.322773 sec + 876,800,208 cycles # 2.687 GHz + 1,948,167,782 instructions # 2.22 insn per cycle + 0.326941987 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3459) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247491543012991 Relative difference = 1.0830068962165901e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.042794e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.845843e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.845843e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.766714e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.528672e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.528672e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.293361 sec -INFO: No Floating Point Exceptions have been reported -INFO: No Floating Point Exceptions have been reported - 825,335,769 cycles # 2.779 GHz - 1,870,845,111 instructions # 2.27 insn per cycle - 0.297556229 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3400) (512y: 22) (512z: 0) +TOTAL : 0.306939 sec + 839,001,914 cycles # 2.704 GHz + 1,879,706,999 instructions # 2.24 insn per cycle + 0.310854322 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3333) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.424749e-01
Avg ME (F77/C++) = 0.14247491543012991
Relative difference = 1.0830068962165901e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe -p 64 256 10 --bridge OMP=
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 4.484934e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.935540e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.935540e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.381614e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.819126e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.819126e+05 ) sec^-1
MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2
-TOTAL : 0.390040 sec
-INFO: No Floating Point Exceptions have been reported
-INFO: No Floating Point Exceptions have been reported
- 749,752,693 cycles # 1.904 GHz
- 1,350,296,093 instructions # 1.80 insn per cycle
- 0.394449871 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1964) (512y: 24) (512z: 2435)
+TOTAL : 0.399460 sec
+ 742,424,746 cycles # 1.843 GHz
+ 1,287,364,599 instructions # 1.73 insn per cycle
+ 0.403527066 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2075) (512y: 5) (512z: 2570)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.424749e-01
Avg ME (F77/C++) = 0.14247491576758442
Relative difference = 1.1066920862943416e-07
diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt
index cea07bf7e8..bb4764fe5e 100644
--- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
BACKEND=cpp512y (was cppauto)
OMPFLAGS=
FPTYPE='m'
@@ -7,248 +10,212 @@ HELINL='0'
HRDCOD='0'
HASCURAND=hasCurand
HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
make: Nothing to be done for 'gtestlibs'.
make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
-DATE: 2024-10-06_09:06:58
+DATE: 2025-09-24_07:58:43
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 2.801672e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.945717e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.092440e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.032335e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.184891e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.198749e+07 ) sec^-1
MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2
-TOTAL : 0.453252 sec
-INFO: No Floating Point Exceptions have been reported
- 1,914,636,683 cycles # 2.859 GHz
- 2,699,162,883 instructions # 1.41 insn per cycle
- 0.727606605 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 169
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.471162 sec
+ 2,022,729,281 cycles # 2.819 GHz
+ 2,796,281,381 instructions # 1.38 insn per cycle
+ 0.775833852 seconds time elapsed
.........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.322683e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.438723e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.801307e+07 ) sec^-1
-MeanMatrixElemValue = ( 2.571360e+02 +- 2.114020e+02 ) GeV^-2
-TOTAL : 0.493317 sec
-INFO: No Floating Point Exceptions have been reported
- 2,100,361,107 cycles # 2.862 GHz
- 2,955,351,040 instructions # 1.41 insn per cycle
- 0.791031778 seconds time elapsed
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 54
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 50
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/runTest_cuda.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2
Avg ME (C++/GPU) = 1.424226e-01
-Avg ME (F77/GPU) = 0.14247487904286338
-Relative difference = 0.0003670698531228044
+Avg ME (F77/GPU) = 0.14247487915986667
+Relative difference = 0.00036707067464478155
OK (relative difference <= 5E-3)
=========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd1/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_f_inl0_hrd1/check_hip.exe
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 1.112466e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.138003e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.138003e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.040661e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.064089e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.064089e+05 ) sec^-1
MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2
-TOTAL : 1.490381 sec
-INFO: No Floating Point Exceptions have been reported
- 4,405,341,411 cycles # 2.950 GHz
- 12,928,117,316 instructions # 2.93 insn per cycle
- 1.494164072 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 630) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 1.592943 sec
+ 4,581,299,500 cycles # 2.870 GHz
+ 13,170,651,939 instructions # 2.87 insn per cycle
+ 1.596909897 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 686) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.424686e-01
-Avg ME (F77/C++) = 0.14246861273719524
-Relative difference = 8.940352641194861e-08
+Avg ME (F77/C++) = 0.14246861273712064
+Relative difference = 8.940300273875148e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.897278e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.076728e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.076728e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.701985e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.865053e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.865053e+05 ) sec^-1
MeanMatrixElemValue = ( 2.018564e+01 +- 1.429903e+01 ) GeV^-2
-TOTAL : 0.582482 sec
-INFO: No Floating Point Exceptions have been reported
- 1,724,294,786 cycles # 2.945 GHz
- 4,536,655,836 instructions # 2.63 insn per cycle
- 0.586223274 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 3611) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 0.624193 sec
+ 1,795,013,511 cycles # 2.860 GHz
+ 4,662,177,938 instructions # 2.60 insn per cycle
+ 0.628349991 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 3609) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.424686e-01
Avg ME (F77/C++) = 0.14246862329122401
Relative difference = 1.6348320966878032e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 5.690817e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.397497e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.397497e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.294900e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.922794e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.922794e+05 ) sec^-1
MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2
-TOTAL : 0.305315 sec
-INFO: No Floating Point Exceptions have been reported
- 857,155,838 cycles # 2.779 GHz
- 1,914,615,212 instructions # 2.23 insn per cycle
- 0.309003061 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3549) (512y: 0) (512z: 0)
+TOTAL : 0.328192 sec
+ 851,932,871 cycles # 2.569 GHz
+ 1,910,738,870 instructions # 2.24 insn per cycle
+ 0.332201917 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3441) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.424749e-01
Avg ME (F77/C++) = 0.14247491543012991
Relative difference = 1.0830068962165901e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 6.056800e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.870570e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.870570e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.776657e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.523494e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.523494e+05 ) sec^-1
MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2
-TOTAL : 0.288177 sec
-INFO: No Floating Point Exceptions have been reported
- 804,254,194 cycles # 2.761 GHz
- 1,829,977,116 instructions # 2.28 insn per cycle
- 0.291930002 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3364) (512y: 22) (512z: 0)
+TOTAL : 0.301663 sec
+ 816,360,354 cycles # 2.677 GHz
+ 1,842,622,689 instructions # 2.26 insn per cycle
+ 0.305601859 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3313) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.424749e-01
Avg ME (F77/C++) = 0.14247491543012991
Relative difference = 1.0830068962165901e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 4.550897e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.994144e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.994144e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.462455e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.903984e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.903984e+05 ) sec^-1
MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2
-TOTAL : 0.380837 sec
-INFO: No Floating Point Exceptions have been reported
- 727,485,601 cycles # 1.894 GHz
- 1,306,171,995 instructions # 1.80 insn per cycle
- 0.384559776 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1928) (512y: 24) (512z: 2435)
+TOTAL : 0.387603 sec
+ 714,478,577 cycles # 1.828 GHz
+ 1,245,972,447 instructions # 1.74 insn per cycle
+ 0.391617356 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2055) (512y: 5) (512z: 2570)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.424749e-01
Avg ME (F77/C++) = 0.14247491576758442
Relative difference = 1.1066920862943416e-07
diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.scaling b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.scaling
new file mode 100644
index 0000000000..f37650a768
--- /dev/null
+++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.scaling
@@ -0,0 +1,137 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
+
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
+BACKEND=cpp512y (was cppauto)
+OMPFLAGS=
+FPTYPE='m'
+HELINL='0'
+HRDCOD='0'
+HASCURAND=hasCurand
+HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
+Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
+make: Nothing to be done for 'gtestlibs'.
+
+make USEBUILDDIR=1 BACKEND=cuda
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+
+make USEBUILDDIR=1 BACKEND=cppnone
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+
+make USEBUILDDIR=1 BACKEND=cppsse4
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+
+make USEBUILDDIR=1 BACKEND=cppavx2
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+
+make USEBUILDDIR=1 BACKEND=cpp512y
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+
+make USEBUILDDIR=1 BACKEND=cpp512z
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+
+DATE: 2025-09-24_08:21:14
+
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe
+### GPU: scaling test 256
+6.159458e+05 1 256
+1.208237e+06 2 256
+2.338380e+06 4 256
+4.313977e+06 8 256
+6.563882e+06 16 256
+5.902898e+06 32 256
+4.772882e+06 64 256
+4.983398e+06 128 256
+5.014489e+06 256 256
+5.095821e+06 512 256
+5.127323e+06 1024 256
+### GPU: scaling test 32
+8.355877e+04 1 32
+1.680315e+05 2 32
+3.138740e+05 4 32
+6.033111e+05 8 32
+1.307680e+06 16 32
+2.486421e+06 32 32
+4.782041e+06 64 32
+6.585802e+06 128 32
+5.416696e+06 256 32
+4.888900e+06 512 32
+5.023808e+06 1024 32
+4.990474e+06 2048 32
+5.087796e+06 4096 32
+5.145084e+06 8192 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd0/check_hip.exe
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+9.882800e+04 1 256
+9.861797e+04 2 256
+1.017932e+05 4 256
+### CPU: scaling test 32
+8.655082e+04 1 32
+8.835276e+04 2 32
+9.886339e+04 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+1.755553e+05 1 256
+1.752340e+05 2 256
+1.807067e+05 4 256
+### CPU: scaling test 32
+1.569051e+05 1 32
+1.602524e+05 2 32
+1.672982e+05 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+2.888491e+05 1 256
+3.042597e+05 2 256
+3.124640e+05 4 256
+### CPU: scaling test 32
+3.269877e+05 1 32
+3.302374e+05 2 32
+3.240244e+05 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+2.938152e+05 1 256
+3.510081e+05 2 256
+3.508687e+05 4 256
+### CPU: scaling test 32
+2.931611e+05 1 32
+3.253190e+05 2 32
+3.534994e+05 4 32
+=========================================================================
+scalingTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check_cpp.exe
+### CPU: scaling test 256
+2.357370e+05 1 256
+2.318594e+05 2 256
+2.369869e+05 4 256
+### CPU: scaling test 32
+2.357587e+05 1 32
+2.340439e+05 2 32
+2.340131e+05 4 32
+=========================================================================
+
+TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt
index cb0b82e9a4..b2e458fe3f 100644
--- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
BACKEND=cpp512y (was cppauto)
OMPFLAGS=
FPTYPE='m'
@@ -7,251 +10,215 @@ HELINL='0'
HRDCOD='0'
HASCURAND=hasCurand
HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
make: Nothing to be done for 'gtestlibs'.
make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
-DATE: 2024-10-06_09:06:18
+DATE: 2025-09-24_07:57:38
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 1.883484e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.876597e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.990293e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.375747e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.851582e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.885501e+06 ) sec^-1
MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.458412 sec
-INFO: No Floating Point Exceptions have been reported
- 1,935,066,146 cycles # 2.866 GHz
- 2,699,989,812 instructions # 1.40 insn per cycle
- 0.733387527 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.492978 sec
+ 2,090,896,556 cycles # 2.821 GHz
+ 2,929,508,251 instructions # 1.40 insn per cycle
+ 0.798572309 seconds time elapsed
.........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.013974e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.497451e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.709351e+07 ) sec^-1
-MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2
-TOTAL : 0.541801 sec
-INFO: No Floating Point Exceptions have been reported
- 2,287,504,645 cycles # 2.883 GHz
- 3,220,826,671 instructions # 1.41 insn per cycle
- 0.850636557 seconds time elapsed
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 80
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 74
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 40
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/runTest_cuda.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2
Avg ME (C++/GPU) = 1.424749e-01
Avg ME (F77/GPU) = 0.14247482577104625
Relative difference = 5.209967070245855e-07
OK (relative difference <= 5E-3)
=========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd0/check_hip.exe
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 1.050634e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.073472e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.073472e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.006109e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.027526e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.027526e+05 ) sec^-1
MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 1.580404 sec
-INFO: No Floating Point Exceptions have been reported
- 4,643,189,098 cycles # 2.932 GHz
- 13,180,741,468 instructions # 2.84 insn per cycle
- 1.584505840 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 681) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 1.649213 sec
+ 4,749,858,129 cycles # 2.875 GHz
+ 13,378,773,035 instructions # 2.82 insn per cycle
+ 1.653102536 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 751) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.424749e-01
Avg ME (F77/C++) = 0.14247482734618697
Relative difference = 5.099411406595165e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.871761e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.941517e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.941517e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.760665e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.827247e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.827247e+05 ) sec^-1
MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.895197 sec
-INFO: No Floating Point Exceptions have been reported
- 2,647,990,030 cycles # 2.947 GHz
- 7,474,565,418 instructions # 2.82 insn per cycle
- 0.899253220 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 3152) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 0.950509 sec
+ 2,735,086,948 cycles # 2.867 GHz
+ 7,712,753,081 instructions # 2.82 insn per cycle
+ 0.954580383 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 3103) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247482734618697
-Relative difference = 5.099411406595165e-07
+Avg ME (F77/C++) = 0.14247482733329694
+Relative difference = 5.100316128927506e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.201825e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.415489e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.415489e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.009534e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.203444e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.203444e+05 ) sec^-1
MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.532048 sec
-INFO: No Floating Point Exceptions have been reported
- 1,472,019,476 cycles # 2.748 GHz
- 3,129,064,583 instructions # 2.13 insn per cycle
- 0.536341858 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3133) (512y: 0) (512z: 0)
+TOTAL : 0.564962 sec
+ 1,496,426,483 cycles # 2.634 GHz
+ 3,194,660,103 instructions # 2.13 insn per cycle
+ 0.568933388 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3041) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247482643254802
-Relative difference = 5.163537715318965e-07
+Avg ME (F77/C++) = 0.14247482641080925
+Relative difference = 5.165063512315125e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP=
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.569463e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.831852e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.831852e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.240557e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.464165e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.464165e+05 ) sec^-1
MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.479328 sec
-INFO: No Floating Point Exceptions have been reported
- 1,320,483,901 cycles # 2.736 GHz
- 2,983,197,107 instructions # 2.26 insn per cycle
- 0.483280271 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2895) (512y: 110) (512z: 0)
+TOTAL : 0.525679 sec
+ 1,411,813,396 cycles # 2.668 GHz
+ 3,068,209,697 instructions # 2.17 insn per cycle
+ 0.529681189 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2857) (512y: 57) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482643254802 -Relative difference = 5.163537715318965e-07 +Avg ME (F77/C++) = 0.14247482641080925 +Relative difference = 5.165063512315125e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.268192e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.372574e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.372574e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.244340e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.353724e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.353724e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.745303 sec -INFO: No Floating Point Exceptions have been reported - 1,365,795,021 cycles # 1.824 GHz - 1,991,870,632 instructions # 1.46 insn per cycle - 0.749335143 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1679) (512y: 108) (512z: 2251) +TOTAL : 0.752766 sec + 1,326,592,697 cycles # 1.755 GHz + 1,920,429,812 instructions # 1.45 insn per cycle + 0.756964252 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1504) (512y: 61) (512z: 2443) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482643254802 -Relative difference = 5.163537715318965e-07 +Avg ME (F77/C++) = 0.14247482641080925 +Relative difference = 5.165063512315125e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt index 222758fe32..cbf9654863 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,251 +10,215 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-10-06_09:06:32 +DATE: 2025-09-24_07:58:02 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.879429e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.807541e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.902111e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.398861e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.873691e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.908270e+06 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.462530 sec -INFO: No Floating Point Exceptions have been reported - 1,930,179,746 cycles # 2.847 GHz - 2,724,788,037 instructions # 1.41 insn per cycle - 0.736957830 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.494658 sec + 2,092,705,523 cycles # 2.813 GHz + 2,940,970,011 instructions # 1.41 insn per cycle + 0.801023593 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.958663e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.373563e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.576283e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.543078 sec -INFO: No Floating Point Exceptions have been reported - 2,226,045,922 cycles # 2.831 GHz - 3,151,460,121 instructions # 1.42 insn per cycle - 0.843097781 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 80 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 74 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 40 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.424749e-01 Avg ME (F77/GPU) = 0.14247482577104625 Relative difference = 5.209967070245855e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.hip_m_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.049471e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.072251e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.072251e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.997666e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.021105e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.021105e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.581795 sec -INFO: No Floating Point Exceptions have been reported - 4,647,850,638 cycles # 2.932 GHz - 13,168,659,581 instructions # 2.83 insn per cycle - 1.585735048 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 666) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.659397 sec + 4,758,579,494 cycles # 2.862 GHz + 13,376,500,790 instructions # 2.81 insn per cycle + 1.663454307 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 746) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482734618697 Relative difference = 5.099411406595165e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.863863e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.934907e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.934907e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.766458e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.832690e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.832690e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.898950 sec -INFO: No Floating Point Exceptions have been reported - 2,647,565,316 cycles # 2.935 GHz - 7,477,127,209 instructions # 2.82 insn per cycle - 0.902852166 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3141) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.946987 sec + 2,726,321,707 cycles # 2.869 GHz + 7,710,654,552 instructions # 2.83 insn per cycle + 0.951101922 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3098) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482734618697 -Relative difference = 5.099411406595165e-07 +Avg ME (F77/C++) = 0.14247482733329694 +Relative difference = 5.100316128927506e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.193877e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.403471e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.403471e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.061100e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.260572e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.260572e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.533092 sec -INFO: No Floating Point Exceptions have been reported - 1,474,101,191 cycles # 2.747 GHz - 3,129,731,788 instructions # 2.12 insn per cycle - 0.537323582 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3111) (512y: 0) (512z: 0) +TOTAL : 0.555271 sec + 1,496,591,734 cycles # 2.679 GHz + 3,193,488,754 instructions # 2.13 insn per cycle + 0.559342364 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3027) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482643254802 -Relative difference = 5.163537715318965e-07 +Avg ME (F77/C++) = 0.14247482641080925 +Relative difference = 5.165063512315125e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.595782e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.860984e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.860984e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.243284e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.466918e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.466918e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.475643 sec -INFO: No Floating Point Exceptions have been reported - 1,319,166,719 cycles # 2.754 GHz - 2,983,572,989 instructions # 2.26 insn per cycle - 0.479589426 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2871) (512y: 110) (512z: 0) +TOTAL : 0.525318 sec + 1,415,387,088 cycles # 2.677 GHz + 3,066,968,624 instructions # 2.17 insn per cycle + 0.529321884 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2841) (512y: 57) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482643254802 -Relative difference = 5.163537715318965e-07 +Avg ME (F77/C++) = 0.14247482641080925 +Relative difference = 5.165063512315125e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check_cpp.exe -p 64 256 10 OMP= +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.265955e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.372021e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.372021e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.256974e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.366147e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.366147e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.745301 sec -INFO: No Floating Point Exceptions have been reported - 1,365,993,831 cycles # 1.825 GHz - 1,991,757,917 instructions # 1.46 insn per cycle - 0.749395729 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1655) (512y: 108) (512z: 2251) +TOTAL : 0.748391 sec + 1,323,661,803 cycles # 1.761 GHz + 1,919,358,353 instructions # 1.45 insn per cycle + 0.752535445 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1488) (512y: 61) (512z: 2443) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
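The throughput figures that change throughout this diff all come from lines of the form "EvtsPerSec[MatrixElems] (3) = ( ... ) sec^-1". A small sketch for extracting them from a tput log, handy when comparing the old (GPU2023) and new (GPU2025) numbers side by side; the parser below is hypothetical and not part of the repository:

import re

THROUGHPUT = re.compile(r"EvtsPerSec\[MatrixElems\]\s*\(3\)\s*=\s*\(\s*([0-9.eE+-]+)\s*\)")

def matrix_elem_rates(log_text: str) -> list[float]:
    # Return every MatrixElems throughput (events/sec) found in the log,
    # in the order the backends are exercised (cuda, none, sse4, avx2, 512y, 512z).
    return [float(m.group(1)) for m in THROUGHPUT.finditer(log_text)]

Running it over the removed and added sides of a hunk yields matched lists whose ratios quantify the per-backend slowdown or speedup.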
DEBUG: MEK (no multichannel) processed 512 events across 5 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 5 channels { 1 : 128, 2 : 96, 3 : 96, 4 : 96, 5 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482643254802 -Relative difference = 5.163537715318965e-07 +Avg ME (F77/C++) = 0.14247482641080925 +Relative difference = 5.165063512315125e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt index 9b3f75797b..1bf17797d0 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2024-10-06_10:01:13 +DATE: 2025-09-24_09:36:21 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.147069e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.778623e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.394888e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.507440e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.405568e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.464017e+06 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 0.535117 sec -INFO: No Floating Point Exceptions have been reported - 2,222,375,781 cycles # 2.890 GHz - 3,181,150,200 instructions # 1.43 insn per cycle - 0.828824866 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 228 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.640440 sec + 2,809,701,143 cycles # 2.827 GHz + 4,494,636,207 instructions # 1.60 insn per cycle + 1.053147110 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 84 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 94 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 4.313472e+00 Avg ME (F77/GPU) = 4.3134710926110280 Relative difference = 2.1036162329561614e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.628496e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.666122e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.666122e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.576789e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.612176e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.612176e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 6.554723 sec -INFO: No Floating Point Exceptions have been reported - 19,293,957,259 cycles # 2.941 GHz - 51,936,518,995 instructions # 2.69 insn per cycle - 6.561734499 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.761250 sec + 19,468,040,698 cycles # 2.878 GHz + 52,866,873,079 instructions # 2.72 insn per cycle + 6.766705663 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 705) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926105795 Relative difference = 2.1036172727915933e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.914767e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.044981e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.044981e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.667809e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.780884e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.780884e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.713846 sec -INFO: No Floating Point Exceptions have been reported - 10,942,394,234 cycles # 2.942 GHz - 30,809,451,561 instructions # 2.82 insn per cycle - 3.720459537 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2915) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.041971 sec + 11,631,270,997 cycles # 2.874 GHz + 32,470,600,741 instructions # 2.79 insn per cycle + 4.047477918 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3252) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926105795 Relative difference = 2.1036172727915933e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.701521e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.038587e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.038587e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.446930e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.756366e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.756366e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.349075 sec -INFO: No Floating Point Exceptions have been reported - 6,518,044,155 cycles # 2.767 GHz - 13,691,830,614 instructions # 2.10 insn per cycle - 2.356266703 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2941) (512y: 0) (512z: 0) +TOTAL : 2.469249 sec + 6,658,104,030 cycles # 2.692 GHz + 13,758,243,349 instructions # 2.07 insn per cycle + 2.474625313 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3017) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926107935 Relative difference = 2.103616776553298e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.169544e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.582169e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.582169e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.710312e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.054677e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.054677e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.151067 sec -INFO: No Floating Point Exceptions have been reported - 5,973,431,908 cycles # 2.768 GHz - 13,032,735,919 instructions # 2.18 insn per cycle - 2.158817844 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2667) (512y: 146) (512z: 0) +TOTAL : 2.336944 sec + 6,266,032,072 cycles # 2.676 GHz + 13,326,118,877 instructions # 2.13 insn per cycle + 2.342347316 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2845) (512y: 49) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926107935 Relative difference = 2.103616776553298e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.442417e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.620453e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.620453e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.171604e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.321823e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.321823e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.166223 sec -INFO: No Floating Point Exceptions have been reported - 5,879,580,303 cycles # 1.853 GHz - 8,614,888,302 instructions # 1.47 insn per cycle - 3.173636028 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1506) (512y: 128) (512z: 1946) +TOTAL : 3.416630 sec + 6,077,886,706 cycles # 1.777 GHz + 8,372,353,704 instructions # 1.38 insn per cycle + 3.422247419 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1562) (512y: 71) (512z: 2101) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
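The "=Symbols in CPPProcess_cpp.o=" tallies track how many SSE4/AVX2/AVX-512 instructions the compiler emitted for each build, which is why they shift when the code generation changes. A rough, hypothetical way to reproduce a count of that kind (the repository's actual tooling may differ) is to scan a disassembly for a register class, e.g. zmm registers for the 512z column:

import re
import subprocess

def count_zmm_instructions(objfile: str) -> int:
    # Heuristic: count disassembled instructions that touch an AVX-512 zmm register.
    asm = subprocess.run(["objdump", "-d", objfile],
                         capture_output=True, text=True, check=True).stdout
    return sum(1 for line in asm.splitlines() if re.search(r"%zmm\d+", line))

This is only an approximation of the per-ISA instruction counts printed above.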
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926107935 Relative difference = 2.103616776553298e-07 diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd1.txt index fe94934cb0..274cf5b9f8 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_d_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2024-10-06_10:01:39 +DATE: 2025-09-24_09:36:59 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.155696e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.811430e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.416776e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.495635e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.395599e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.454803e+06 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 0.531667 sec -INFO: No Floating Point Exceptions have been reported - 2,222,115,079 cycles # 2.893 GHz - 3,196,008,298 instructions # 1.44 insn per cycle - 0.825144177 seconds time elapsed 
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 216 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.641751 sec + 2,814,921,189 cycles # 2.828 GHz + 4,486,858,306 instructions # 1.59 insn per cycle + 1.054520115 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 85 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 94 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 4.313472e+00 Avg ME (F77/GPU) = 4.3134710926110280 Relative difference = 2.1036162329561614e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_d_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_d_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] 
[hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.710634e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.751435e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.751435e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.649275e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.688267e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.688267e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 6.246482 sec -INFO: No Floating Point Exceptions have been reported - 18,390,828,933 cycles # 2.942 GHz - 50,070,723,541 instructions # 2.72 insn per cycle - 6.253313848 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 626) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.468573 sec + 18,605,591,168 cycles # 2.874 GHz + 50,910,754,622 instructions # 2.74 insn per cycle + 6.474203070 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 635) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
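The EvtsPerSec figures can be cross-checked against the launch parameters. Assuming the usual meaning of -p in these logs (blocks, threads per block, iterations), a run like "-p 2048 256 2" processes 2048*256*2 events, and dividing by the TOTAL wall time lands in the same ballpark as the reported rates; TOTAL also includes setup, so the naive estimate comes out slightly lower. A back-of-the-envelope in Python:

    # Cross-check of the reported throughput, assuming the -p arguments
    # mean "blocks threads iterations"; numbers are copied from the
    # none_d_inl0_hrd1 run above.
    blocks, threads, iterations = 2048, 256, 2
    events = blocks * threads * iterations   # 1048576 events in total
    total_seconds = 6.468573                 # TOTAL from the log above
    print(f"{events / total_seconds:.3e} events/s")  # ~1.62e+05 vs 1.65e+05 reported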
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926105795 Relative difference = 2.1036172727915933e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.069031e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.214398e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.214398e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.706155e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.823800e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.823800e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.535906 sec -INFO: No Floating Point Exceptions have been reported - 10,415,008,507 cycles # 2.940 GHz - 29,198,189,749 instructions # 2.80 insn per cycle - 3.543300262 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2733) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.985573 sec + 11,479,073,541 cycles # 2.877 GHz + 31,953,213,577 instructions # 2.78 insn per cycle + 3.991237832 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3206) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926105795 Relative difference = 2.1036172727915933e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.327920e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.613203e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.613203e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.472224e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.788973e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.788973e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.541514 sec -INFO: No Floating Point Exceptions have been reported - 7,032,477,509 cycles # 2.760 GHz - 15,175,173,386 instructions # 2.16 insn per cycle - 2.548867076 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3020) (512y: 0) (512z: 0) +TOTAL : 2.455792 sec + 6,602,535,558 cycles # 2.684 GHz + 13,513,270,122 instructions # 2.05 insn per cycle + 2.461238995 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2933) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926107935 Relative difference = 2.103616776553298e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.529226e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.840126e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.840126e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.769500e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.125335e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.125335e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 2.433970 sec -INFO: No Floating Point Exceptions have been reported - 6,732,593,285 cycles # 2.759 GHz - 14,647,151,783 instructions # 2.18 insn per cycle - 2.441354685 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2621) (512y: 302) (512z: 0) +TOTAL : 2.308652 sec + 6,213,743,119 cycles # 2.688 GHz + 13,125,333,077 instructions # 2.11 insn per cycle + 2.314078971 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2770) (512y: 49) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
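The "=Symbols in CPPProcess_cpp.o=" counters above tally how many SSE4, AVX2 and AVX512 instructions the compiler emitted into the object file, which is how these logs verify that each BACKEND really produced the intended SIMD code. A rough Python sketch of how such counts can be produced by disassembling the object and bucketing instructions by register class; the patterns are simplified stand-ins, not the repository's actual instruction lists:

    # Rough sketch: disassemble the object file and count vector
    # instructions by register class. The regexes are simplified
    # stand-ins, not the actual lists behind the "=Symbols=" lines.
    import re
    import subprocess

    asm = subprocess.run(["objdump", "-d", "CPPProcess_cpp.o"],
                         capture_output=True, text=True).stdout
    counts = {
        "~sse4": len(re.findall(r"%xmm", asm)),  # 128-bit xmm registers
        "avx2":  len(re.findall(r"%ymm", asm)),  # 256-bit ymm registers
        "512z":  len(re.findall(r"%zmm", asm)),  # 512-bit zmm registers
    }
    print(" ".join(f"({k}: {v})" for k, v in counts.items()))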
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926107935 Relative difference = 2.103616776553298e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.326729e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.490201e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.490201e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.175393e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.326517e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.326517e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 3.271504 sec -INFO: No Floating Point Exceptions have been reported - 6,070,928,941 cycles # 1.852 GHz - 10,360,391,243 instructions # 1.71 insn per cycle - 3.278977914 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1268) (512y: 214) (512z: 2129) +TOTAL : 3.412394 sec + 6,060,342,842 cycles # 1.774 GHz + 8,345,690,212 instructions # 1.38 insn per cycle + 3.418017007 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1517) (512y: 71) (512z: 2096) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
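The double-precision (FPTYPE=d) builds above agree with Fortran at the ~2e-7 level, while the single-precision (FPTYPE=f) logs that follow agree only at the ~4e-5 level. This is the expected precision hierarchy: a float carries roughly 7 significant decimal digits against a double's ~16, so float results cannot agree beyond a few times 1e-7 even in the best case, and accumulation over many terms pushes the observed difference to the ~1e-5 scale. The machine epsilons make the point:

    # Single vs double precision: the C++/F77 agreement seen in these
    # logs (~4e-5 for FLOAT, ~2e-7 for DOUBLE) tracks the significand.
    import numpy as np
    print(np.finfo(np.float32).eps)  # ~1.19e-07 (about 7 decimal digits)
    print(np.finfo(np.float64).eps)  # ~2.22e-16 (about 16 decimal digits)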
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313472e+00 Avg ME (F77/C++) = 4.3134710926107935 Relative difference = 2.103616776553298e-07 diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt index 8cd2c74f38..5df5f2e017 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,242 +10,215 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2024-10-06_10:03:02 +DATE: 2025-09-24_09:38:46 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.465620e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.510965e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.608079e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.672770e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.850350e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.862111e+07 ) sec^-1 MeanMatrixElemValue = ( 7.154219e+00 +- 1.620281e-01 ) GeV^0 -TOTAL : 0.492110 sec -INFO: No Floating Point Exceptions have been reported - 2,084,727,455 cycles # 2.877 GHz - 2,955,736,176 instructions # 1.42 insn per cycle - 0.784112386 seconds time elapsed 
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 131 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.546796 sec + 2,347,767,839 cycles # 2.827 GHz + 3,518,985,878 instructions # 1.50 insn per cycle + 0.889028718 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 60 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 64 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 4.313490e+00 -Avg ME (F77/GPU) = 4.3136695491848513 -Relative difference = 4.162503792787837e-05 +Avg ME (F77/GPU) = 4.3136696910951287 +Relative difference = 4.165793710634106e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, 
FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.686557e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.727704e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.727704e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.603617e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.641812e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.641812e+05 ) sec^-1 MeanMatrixElemValue = ( 7.175644e+00 +- 1.658767e-01 ) GeV^0 -TOTAL : 6.305463 sec -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 18,635,302,225 cycles # 2.953 GHz - 51,219,407,083 instructions # 2.75 insn per cycle - 6.310992251 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 625) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.628001 sec + 19,095,296,333 cycles # 2.879 GHz + 52,570,684,440 instructions # 2.75 insn per cycle + 6.633186048 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 694) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313574e+00 Avg ME (F77/C++) = 4.3135738277342170 Relative difference = 3.9935743068669333e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.043062e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.307407e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.307407e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.650976e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.873298e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.873298e+05 ) sec^-1 MeanMatrixElemValue = ( 7.175642e+00 +- 1.658767e-01 ) GeV^0 -TOTAL : 2.681205 sec -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 7,932,523,130 cycles # 2.953 GHz - 19,317,767,787 instructions # 2.44 insn per cycle - 2.686665617 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3542) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.961111 sec + 8,533,073,654 cycles # 2.878 GHz + 20,129,758,419 instructions # 2.36 insn per cycle + 2.966298075 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3870) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313572e+00 Avg ME (F77/C++) = 4.3135722697479650 Relative difference = 6.253470796314402e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.901471e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.926003e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.926003e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.018131e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.075080e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.075080e+05 ) sec^-1 MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 -TOTAL : 1.413719 sec -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 3,953,020,388 cycles # 2.786 GHz - 8,832,668,299 instructions # 2.23 insn per cycle - 1.419629254 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3715) (512y: 0) (512z: 0) +TOTAL : 1.395955 sec + 3,785,475,901 cycles # 2.703 GHz + 8,259,140,872 instructions # 2.18 insn per cycle + 1.401234976 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3564) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, 
FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313565e+00 -Avg ME (F77/C++) = 4.3135645242873579 -Relative difference = 1.1028294269894893e-07 +Avg ME (F77/C++) = 4.3135645270813257 +Relative difference = 1.096352260831459e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.392997e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.544307e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.544307e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.448216e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.626503e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.626503e+05 ) sec^-1 MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 -TOTAL : 1.337803 sec -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 3,735,491,375 cycles # 2.782 GHz - 8,430,906,889 instructions # 2.26 insn per cycle - 1.343508069 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3541) (512y: 20) (512z: 0) +TOTAL : 1.329067 sec + 3,609,245,916 cycles # 2.707 GHz + 8,052,417,503 instructions # 2.23 insn per cycle + 1.334245097 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3450) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313565e+00 -Avg ME (F77/C++) = 4.3135645242873579 -Relative difference = 1.1028294269894893e-07 +Avg ME (F77/C++) = 4.3135645270813257 +Relative difference = 1.096352260831459e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.024352e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.578236e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.578236e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.458703e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.125669e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.125669e+05 ) sec^-1 MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0 -TOTAL : 1.827995 sec -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 3,508,723,607 cycles # 1.915 GHz - 6,244,798,669 instructions # 1.78 insn per cycle - 1.833521857 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2325) (512y: 22) (512z: 2290) +TOTAL : 1.712295 sec + 3,171,872,574 cycles # 1.848 GHz + 5,491,453,610 instructions # 1.73 insn per cycle + 1.717258512 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2147) (512y: 5) (512z: 2290) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 4.313564e+00 -Avg ME (F77/C++) = 4.3135643536224961 -Relative difference = 8.197919301304478e-08 +Avg ME (F77/C++) = 4.3135642320849001 +Relative difference = 5.380351369373482e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd1.txt index 1ff1d26090..ef6aa4541d 100644 --- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_f_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,246 +10,215 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx' -DATE: 2024-10-06_10:03:24 +DATE: 2025-09-24_09:39:18 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.690902e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.615208e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.727767e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.671134e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.848561e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.860377e+07 ) sec^-1 MeanMatrixElemValue = ( 7.154219e+00 +- 1.620281e-01 ) GeV^0 -TOTAL : 0.493976 sec -INFO: No Floating Point Exceptions have been reported - 2,066,790,877 cycles # 2.843 GHz - 2,969,404,210 instructions # 1.44 insn per cycle - 0.785535997 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 125 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.546514 sec + 2,342,088,165 cycles # 2.823 GHz + 3,542,032,932 instructions # 1.51 insn per cycle + 0.888457994 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 60 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 64 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
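The new ncu profiles in these logs split the old single sigmaKin kernel into per-diagram kernels ("diagram1", "diagram2") plus a separate "color_sum_kernel", each with far fewer registers per thread. In MadGraph-style codes the colour sum is the quadratic form |M|^2 = J^H C J of the colour amplitudes (jamps) with the constant colour matrix. A toy numpy illustration of that reduction; the matrix and amplitudes below are placeholders, not the actual values for this HEFT gg > bb~ process:

    # Toy illustration of a colour sum, |M|^2 = J^H C J: a quadratic form
    # of the colour amplitudes with the constant, symmetric colour matrix.
    # Placeholder values, not those of gg > bb~ in HEFT.
    import numpy as np

    C = np.array([[16.0, -2.0],
                  [-2.0, 16.0]]) / 3.0        # placeholder colour matrix
    J = np.array([1.0 + 0.5j, -0.25 + 1.0j])  # placeholder colour amplitudes
    me2 = float(np.real(np.conj(J) @ C @ J))  # real by hermiticity of the form
    print(me2)

Factoring this step out into its own kernel is what makes the BLAS-based colour sum (the HASBLAS=hasBlas and CUDACPP_RUNTIME_BLASCOLORSUM knobs in the headers above) possible at runtime.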
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2
Avg ME (C++/GPU) = 4.313490e+00
-Avg ME (F77/GPU) = 4.3136695491848513
-Relative difference = 4.162503792787837e-05
+Avg ME (F77/GPU) = 4.3136696910951287
+Relative difference = 4.165793710634106e-05
OK (relative difference <= 5E-3)
=========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_f_inl0_hrd1/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_f_inl0_hrd1/check_hip.exe
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 1.736131e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.779781e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.779781e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.641763e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.681776e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.681776e+05 ) sec^-1
MeanMatrixElemValue = ( 7.175644e+00 +- 1.658767e-01 ) GeV^0
-TOTAL : 6.127979 sec
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
- 18,032,140,147 cycles # 2.940 GHz
- 49,602,643,371 instructions # 2.75 insn per cycle
- 6.133935412 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 613) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 6.475013 sec
+ 18,644,002,819 cycles # 2.878 GHz
+ 51,007,439,568 instructions # 2.74 insn per cycle
+ 6.480509328 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 665) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 4.313574e+00
Avg ME (F77/C++) = 4.3135738277342170
Relative difference = 3.9935743068669333e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 4.506367e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.839198e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.839198e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.684584e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.908730e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.908730e+05 ) sec^-1
MeanMatrixElemValue = ( 7.175642e+00 +- 1.658767e-01 ) GeV^0
-TOTAL : 2.414203 sec
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
- 7,115,995,603 cycles # 2.942 GHz
- 18,533,869,751 instructions # 2.60 insn per cycle
- 2.419892180 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 3252) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 2.934573 sec
+ 8,454,211,342 cycles # 2.877 GHz
+ 19,835,014,469 instructions # 2.35 insn per cycle
+ 2.939713714 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 3811) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 4.313572e+00
Avg ME (F77/C++) = 4.3135722697479650
Relative difference = 6.253470796314402e-08
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 5.374488e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.825683e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.825683e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.065369e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.131864e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.131864e+05 ) sec^-1
MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0
-TOTAL : 2.037733 sec
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
- 5,644,200,229 cycles # 2.763 GHz
- 10,848,148,808 instructions # 1.92 insn per cycle
- 2.043741542 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4274) (512y: 0) (512z: 0)
+TOTAL : 1.387082 sec
+ 3,764,013,420 cycles # 2.705 GHz
+ 8,137,849,027 instructions # 2.16 insn per cycle
+ 1.392187853 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3474) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 4.313565e+00
-Avg ME (F77/C++) = 4.3135645242873579
-Relative difference = 1.1028294269894893e-07
+Avg ME (F77/C++) = 4.3135645270813257
+Relative difference = 1.096352260831459e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 5.433283e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.894901e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.894901e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.480903e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.668132e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.668132e+05 ) sec^-1
MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0
-TOTAL : 2.017462 sec
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
- 5,594,464,289 cycles # 2.767 GHz
- 10,554,918,385 instructions # 1.89 insn per cycle
- 2.022782231 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 4138) (512y: 12) (512z: 0)
+TOTAL : 1.323763 sec
+ 3,586,004,374 cycles # 2.700 GHz
+ 7,955,445,372 instructions # 2.22 insn per cycle
+ 1.328913707 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3370) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 4.313565e+00
-Avg ME (F77/C++) = 4.3135645242873579
-Relative difference = 1.1028294269894893e-07
+Avg ME (F77/C++) = 4.3135645270813257
+Relative difference = 1.096352260831459e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 4.364066e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.648223e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.648223e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.488913e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.159953e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.159953e+05 ) sec^-1
MeanMatrixElemValue = ( 7.198861e+00 +- 1.710281e-01 ) GeV^0
-TOTAL : 2.491143 sec
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
- 4,639,687,839 cycles # 1.859 GHz
- 8,661,216,579 instructions # 1.87 insn per cycle
- 2.496647539 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2799) (512y: 0) (512z: 2885)
+TOTAL : 1.704713 sec
+ 3,166,186,323 cycles # 1.853 GHz
+ 5,481,656,768 instructions # 1.73 insn per cycle
+ 1.709836890 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2090) (512y: 5) (512z: 2290)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 4.313564e+00
-Avg ME (F77/C++) = 4.3135643536224961
-Relative difference = 8.197919301304478e-08
+Avg ME (F77/C++) = 4.3135642320849001
+Relative difference = 5.380351369373482e-08
OK (relative difference <= 5E-3)
=========================================================================
diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt
index 12c9da87af..eeeb59d618 100644
--- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd0.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx
BACKEND=cpp512y (was cppauto)
OMPFLAGS=
FPTYPE='m'
@@ -7,246 +10,215 @@ HELINL='0'
HRDCOD='0'
HASCURAND=hasCurand
HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
make: Nothing to be done for 'gtestlibs'.
make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
-DATE: 2024-10-06_10:02:06
+DATE: 2025-09-24_09:37:33
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP=
+Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.131914e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.755854e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.359452e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.517528e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.419175e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.478423e+06 ) sec^-1
MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0
-TOTAL : 0.535565 sec
-INFO: No Floating Point Exceptions have been reported
- 2,204,224,001 cycles # 2.864 GHz
- 3,121,247,303 instructions # 1.42 insn per cycle
- 0.828499405 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 228
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.634900 sec
+ 2,788,760,911 cycles # 2.835 GHz
+ 4,444,778,417 instructions # 1.59 insn per cycle
+ 1.042872548 seconds time elapsed
+.........................................................................
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 84
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 94
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/runTest_cuda.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2
Avg ME (C++/GPU) = 4.313472e+00
-Avg ME (F77/GPU) = 4.3134711012809239
-Relative difference = 2.0835166567625394e-07
+Avg ME (F77/GPU) = 4.3134712562812831
+Relative difference = 1.7241765260874332e-07
OK (relative difference <= 5E-3)
=========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_m_inl0_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_m_inl0_hrd0/check_hip.exe
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 1.529079e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.561968e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.561968e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.478269e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.509767e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.509767e+05 ) sec^-1
MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0
-TOTAL : 6.973239 sec
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
- 20,550,139,482 cycles # 2.945 GHz
- 51,941,635,065 instructions # 2.53 insn per cycle
- 6.980082779 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 655) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 7.204058 sec
+ 20,755,829,028 cycles # 2.880 GHz
+ 52,917,427,628 instructions # 2.55 insn per cycle
+ 7.209554620 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 705) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 4.313472e+00
-Avg ME (F77/C++) = 4.3134711778082178
-Relative difference = 1.906102050071626e-07
+Avg ME (F77/C++) = 4.3134711782756741
+Relative difference = 1.9050183377028104e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.672019e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.782339e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.782339e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.556152e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.660192e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.660192e+05 ) sec^-1
MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0
-TOTAL : 4.043433 sec
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
- 11,521,778,322 cycles # 2.845 GHz
- 30,615,090,868 instructions # 2.66 insn per cycle
- 4.050715703 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 2972) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 4.213212 sec
+ 12,129,215,180 cycles # 2.876 GHz
+ 32,047,748,942 instructions # 2.64 insn per cycle
+ 4.218582177 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 3309) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 4.313472e+00
-Avg ME (F77/C++) = 4.3134711778082178
-Relative difference = 1.906102050071626e-07
+Avg ME (F77/C++) = 4.3134711778081822
+Relative difference = 1.9061021324348284e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 4.474164e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.781347e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.781347e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.348524e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.643676e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.643676e+05 ) sec^-1
MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0
-TOTAL : 2.469295 sec
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
- 6,758,530,167 cycles # 2.729 GHz
- 13,653,357,404 instructions # 2.02 insn per cycle
- 2.477625143 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3118) (512y: 0) (512z: 0)
+TOTAL : 2.522477 sec
+ 6,765,170,166 cycles # 2.677 GHz
+ 13,545,443,870 instructions # 2.00 insn per cycle
+ 2.527903273 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3162) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 4.313472e+00
-Avg ME (F77/C++) = 4.3134712319139954
-Relative difference = 1.7806676491157786e-07
+Avg ME (F77/C++) = 4.3134712322699498
+Relative difference = 1.7798424336580573e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 4.946193e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.312777e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.312777e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.635693e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.969889e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.969889e+05 ) sec^-1
MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0
-TOTAL : 2.239110 sec
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
- 6,206,585,291 cycles # 2.765 GHz
- 13,005,835,459 instructions # 2.10 insn per cycle
- 2.246664710 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2851) (512y: 150) (512z: 0)
+TOTAL : 2.372202 sec
+ 6,403,479,611 cycles # 2.694 GHz
+ 13,110,521,732 instructions # 2.05 insn per cycle
+ 2.377605412 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2989) (512y: 53) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 4.313472e+00
-Avg ME (F77/C++) = 4.3134712319139954
-Relative difference = 1.7806676491157786e-07
+Avg ME (F77/C++) = 4.3134712322699498
+Relative difference = 1.7798424336580573e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.130780e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.276017e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.276017e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.140442e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.288410e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.288410e+05 ) sec^-1
MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0
-TOTAL : 3.470623 sec
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
- 6,429,525,372 cycles # 1.849 GHz
- 8,729,822,669 instructions # 1.36 insn per cycle
- 3.478318009 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1792) (512y: 130) (512z: 2014)
+TOTAL : 3.450244 sec
+ 6,142,491,223 cycles # 1.779 GHz
+ 8,289,889,012 instructions # 1.35 insn per cycle
+ 3.455551171 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1581) (512y: 61) (512z: 2145)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 4.313472e+00
-Avg ME (F77/C++) = 4.3134712319139954
-Relative difference = 1.7806676491157786e-07
+Avg ME (F77/C++) = 4.3134712322699498
+Relative difference = 1.7798424336580573e-07
OK (relative difference <= 5E-3)
=========================================================================
diff --git a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd1.txt
index 90c964242c..2222bb6c1d 100644
--- a/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_heftggbb_mad/log_heftggbb_mad_m_inl0_hrd1.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx
BACKEND=cpp512y (was cppauto)
OMPFLAGS=
FPTYPE='m'
@@ -7,246 +10,215 @@ HELINL='0'
HRDCOD='0'
HASCURAND=hasCurand
HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
make: Nothing to be done for 'gtestlibs'.
make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx'
-DATE: 2024-10-06_10:02:34
+DATE: 2025-09-24_09:38:11
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP=
+Process = SIGMA_HEFT_GG_BBX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.143359e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.817002e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.430401e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.474103e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.371663e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.430529e+06 ) sec^-1
MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0
-TOTAL : 0.533031 sec
-INFO: No Floating Point Exceptions have been reported
- 2,222,154,822 cycles # 2.885 GHz
- 3,215,427,054 instructions # 1.45 insn per cycle
- 0.826924367 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 216
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.639804 sec
+ 2,818,874,599 cycles # 2.836 GHz
+ 4,491,683,880 instructions # 1.59 insn per cycle
+ 1.052963598 seconds time elapsed
+.........................................................................
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 85
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 94
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 32
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/runTest_cuda.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 4.313472e+00 -Avg ME (F77/GPU) = 4.3134711012809239 -Relative difference = 2.0835166567625394e-07 +Avg ME (F77/GPU) = 4.3134712562812831 +Relative difference = 1.7241765260874332e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_m_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.hip_m_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.616471e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.652773e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.652773e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.543744e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.577982e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.577982e+05 ) sec^-1 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0 -TOTAL : 6.603326 sec -INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW - 19,494,406,226 cycles # 2.950 GHz - 49,966,413,800 instructions # 2.56 insn per cycle - 6.609959024 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 599) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.903615 sec + 19,875,040,203 cycles # 2.878 GHz + 50,962,768,917 instructions # 2.56 insn per cycle + 6.909360746 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 635) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/runTest_cpp.exe -INFO: The following 
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 4.313472e+00
-Avg ME (F77/C++) = 4.3134711778082178
-Relative difference = 1.906102050071626e-07
+Avg ME (F77/C++) = 4.3134711782756741
+Relative difference = 1.9050183377028104e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.890177e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.018164e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.018164e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.566397e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.670719e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.670719e+05 ) sec^-1
 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0
-TOTAL : 3.745798 sec
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
- 11,068,643,232 cycles # 2.950 GHz
- 29,164,471,893 instructions # 2.63 insn per cycle
- 3.753005329 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 2815) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 4.197287 sec
+ 11,967,434,460 cycles # 2.849 GHz
+ 31,529,543,072 instructions # 2.63 insn per cycle
+ 4.202854098 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 3263) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 4.313472e+00
-Avg ME (F77/C++) = 4.3134711778082178
-Relative difference = 1.906102050071626e-07
+Avg ME (F77/C++) = 4.3134711778081822
+Relative difference = 1.9061021324348284e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.744994e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.955254e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.955254e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.400358e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.703188e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.703188e+05 ) sec^-1
 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0
-TOTAL : 2.917714 sec
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
- 8,087,123,435 cycles # 2.766 GHz
- 15,210,355,188 instructions # 1.88 insn per cycle
- 2.924634632 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3203) (512y: 0) (512z: 0)
+TOTAL : 2.493209 sec
+ 6,717,361,260 cycles # 2.689 GHz
+ 13,300,799,614 instructions # 1.98 insn per cycle
+ 2.498736225 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3078) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 4.313472e+00
-Avg ME (F77/C++) = 4.3134712319139954
-Relative difference = 1.7806676491157786e-07
+Avg ME (F77/C++) = 4.3134712322699498
+Relative difference = 1.7798424336580573e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.909194e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.140218e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.140218e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.702572e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.045593e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.045593e+05 ) sec^-1
 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0
-TOTAL : 2.798673 sec
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
- 7,730,347,780 cycles # 2.756 GHz
- 14,498,978,915 instructions # 1.88 insn per cycle
- 2.805768338 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2775) (512y: 304) (512z: 0)
+TOTAL : 2.340637 sec
+ 6,332,876,477 cycles # 2.700 GHz
+ 12,910,869,466 instructions # 2.04 insn per cycle
+ 2.346037040 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2914) (512y: 53) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 4.313472e+00
-Avg ME (F77/C++) = 4.3134712319139954
-Relative difference = 1.7806676491157786e-07
+Avg ME (F77/C++) = 4.3134712322699498
+Relative difference = 1.7798424336580573e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_HEFT_GG_BBX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.049249e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.186111e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.186111e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.136297e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.283420e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.283420e+05 ) sec^-1
 MeanMatrixElemValue = ( 7.148017e+00 +- 1.609110e-01 ) GeV^0
-TOTAL : 3.561293 sec
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
- 6,578,699,260 cycles # 1.844 GHz
- 9,927,155,424 instructions # 1.51 insn per cycle
- 3.569129809 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1565) (512y: 216) (512z: 2216)
+TOTAL : 3.453468 sec
+ 6,141,795,664 cycles # 1.776 GHz
+ 8,267,854,036 instructions # 1.35 insn per cycle
+ 3.458910469 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1536) (512y: 61) (512z: 2140)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 4 channels { no-multichannel : 512 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 DEBUG: MEK (channelid array) processed 512 events across 4 channels { 1 : 128, 2 : 128, 3 : 128, 4 : 128 }
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
-INFO: The following Floating Point Exceptions have been reported: FE_UNDERFLOW
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/P1_gg_bbx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 4.313472e+00
-Avg ME (F77/C++) = 4.3134712319139954
-Relative difference = 1.7806676491157786e-07
+Avg ME (F77/C++) = 4.3134712322699498
+Relative difference = 1.7798424336580573e-07
 OK (relative difference <= 5E-3)
 =========================================================================
diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt
index 2b34ea67ad..9e7ebf842e 100644
--- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd0.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx
 BACKEND=cpp512y (was cppauto)
 OMPFLAGS=
 FPTYPE='m'
@@ -7,248 +10,212 @@ HELINL='0'
 HRDCOD='0'
 HASCURAND=hasCurand
 HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
 Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
 make: Nothing to be done for 'gtestlibs'.
 make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
-DATE: 2024-10-06_10:00:07
+DATE: 2025-09-24_09:33:21
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP=
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 2.760509e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.779507e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.782702e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.189977e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.210295e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.213498e+03 ) sec^-1
 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4
-TOTAL : 0.473450 sec
-INFO: No Floating Point Exceptions have been reported
- 1,994,326,240 cycles # 2.874 GHz
- 2,845,102,706 instructions # 1.43 insn per cycle
- 0.753810347 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.535283 sec
+ 2,207,411,095 cycles # 2.818 GHz
+ 3,160,583,611 instructions # 1.43 insn per cycle
+ 0.840627614 seconds time elapsed
 .........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 7.019067e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.126130e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.133988e+05 ) sec^-1
-MeanMatrixElemValue = ( 8.048215e-03 +- 4.042405e-03 ) GeV^-4
-TOTAL : 0.490494 sec
-INFO: No Floating Point Exceptions have been reported
- 2,031,600,016 cycles # 2.857 GHz
- 2,995,319,726 instructions # 1.47 insn per cycle
- 0.772627668 seconds time elapsed
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 1 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 104
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 76
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 72
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
 Avg ME (C++/GPU) = 8.127459e-06
 Avg ME (F77/GPU) = 8.1274562860176604E-006
 Relative difference = 3.3392753366481633e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_d_inl0_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_d_inl0_hrd0/check_hip.exe
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP=
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 3.383469e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.386752e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.386752e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.307555e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.310855e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.310855e+03 ) sec^-1
 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4
-TOTAL : 0.158264 sec
-INFO: No Floating Point Exceptions have been reported
- 469,342,334 cycles # 2.906 GHz
- 1,390,298,076 instructions # 2.96 insn per cycle
- 0.162106230 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 3908) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 0.161929 sec
+ 471,695,181 cycles # 2.858 GHz
+ 1,386,120,911 instructions # 2.94 insn per cycle
+ 0.165662610 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 4201) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 8.127459e-06
 Avg ME (F77/C++) = 8.1274562860167185E-006
 Relative difference = 3.339276495559746e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP=
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 6.476358e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.488167e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.488167e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.839650e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.849559e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.849559e+03 ) sec^-1
 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4
-TOTAL : 0.083612 sec
-INFO: No Floating Point Exceptions have been reported
- 240,584,825 cycles # 2.769 GHz
- 693,113,903 instructions # 2.88 insn per cycle
- 0.087424946 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 9482) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 0.092454 sec
+ 255,607,375 cycles # 2.672 GHz
+ 718,366,684 instructions # 2.81 insn per cycle
+ 0.096275754 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:12596) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 8.127459e-06
 Avg ME (F77/C++) = 8.1274562860167168E-006
 Relative difference = 3.3392764976441195e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP=
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.432068e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.438681e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.438681e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.315918e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.320963e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.320963e+04 ) sec^-1
 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4
-TOTAL : 0.038906 sec
-INFO: No Floating Point Exceptions have been reported
- 114,140,366 cycles # 2.711 GHz
- 257,891,266 instructions # 2.26 insn per cycle
- 0.042661267 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8501) (512y: 0) (512z: 0)
+TOTAL : 0.042212 sec
+ 119,980,053 cycles # 2.646 GHz
+ 267,131,841 instructions # 2.23 insn per cycle
+ 0.046071538 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8995) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 8.127459e-06
 Avg ME (F77/C++) = 8.1274562860174791E-006
 Relative difference = 3.3392755596761116e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP=
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.618386e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.625883e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.625883e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.473800e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.480265e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.480265e+04 ) sec^-1
 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4
-TOTAL : 0.034671 sec
-INFO: No Floating Point Exceptions have been reported
- 102,555,024 cycles # 2.705 GHz
- 240,017,026 instructions # 2.34 insn per cycle
- 0.038425016 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8143) (512y: 150) (512z: 0)
+TOTAL : 0.037950 sec
+ 108,004,494 cycles # 2.627 GHz
+ 252,474,875 instructions # 2.34 insn per cycle
+ 0.041746760 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8827) (512y: 45) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 8.127459e-06
 Avg ME (F77/C++) = 8.1274562860174791E-006
 Relative difference = 3.3392755596761116e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP=
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.192893e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.198052e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.198052e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.155565e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.160261e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.160261e+04 ) sec^-1
 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4
-TOTAL : 0.046494 sec
-INFO: No Floating Point Exceptions have been reported
- 90,048,800 cycles # 1.806 GHz
- 134,302,710 instructions # 1.49 insn per cycle
- 0.050438224 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1943) (512y: 126) (512z: 7086)
+TOTAL : 0.047943 sec
+ 90,674,116 cycles # 1.777 GHz
+ 139,511,089 instructions # 1.54 insn per cycle
+ 0.051731540 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1725) (512y: 61) (512z: 8276)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 8.127459e-06
 Avg ME (F77/C++) = 8.1274562860174791E-006
 Relative difference = 3.3392755596761116e-07
diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd1.txt
index dc41fe503f..39c980daa5 100644
--- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_d_inl0_hrd1.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx
 BACKEND=cpp512y (was cppauto)
 OMPFLAGS=
 FPTYPE='m'
@@ -7,248 +10,212 @@ HELINL='0'
 HRDCOD='0'
 HASCURAND=hasCurand
 HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
 Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
 make: Nothing to be done for 'gtestlibs'.
 make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx'
-DATE: 2024-10-06_10:00:18
+DATE: 2025-09-24_09:33:51
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP=
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 2.797107e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.816023e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.819423e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.219837e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.236573e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.242139e+03 ) sec^-1
 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4
-TOTAL : 0.469966 sec
-INFO: No Floating Point Exceptions have been reported
- 2,001,057,465 cycles # 2.881 GHz
- 2,930,552,926 instructions # 1.46 insn per cycle
- 0.752195966 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.535604 sec
+ 2,215,769,027 cycles # 2.824 GHz
+ 3,174,955,614 instructions # 1.43 insn per cycle
+ 0.841816639 seconds time elapsed
 .........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 7.121137e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.233030e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.241027e+05 ) sec^-1
-MeanMatrixElemValue = ( 8.048215e-03 +- 4.042405e-03 ) GeV^-4
-TOTAL : 0.489610 sec
-INFO: No Floating Point Exceptions have been reported
- 2,050,200,483 cycles # 2.873 GHz
- 3,056,241,818 instructions # 1.49 insn per cycle
- 0.771808178 seconds time elapsed
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 1 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 108
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 76
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 72
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/runTest_cuda.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2
 Avg ME (C++/GPU) = 8.127459e-06
 Avg ME (F77/GPU) = 8.1274562860176604E-006
 Relative difference = 3.3392753366481633e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_d_inl0_hrd1/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_d_inl0_hrd1/check_hip.exe
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP=
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 3.406266e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.409565e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.409565e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.293505e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.296702e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.296702e+03 ) sec^-1
 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4
-TOTAL : 0.156466 sec
-INFO: No Floating Point Exceptions have been reported
- 465,689,745 cycles # 2.917 GHz
- 1,385,079,930 instructions # 2.97 insn per cycle
- 0.160315659 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 3796) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 0.161859 sec
+ 469,852,209 cycles # 2.853 GHz
+ 1,378,582,277 instructions # 2.93 insn per cycle
+ 0.165480722 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 4321) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/runTest_cpp.exe
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274562860167185E-006 Relative difference = 3.339276495559746e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.388983e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.401822e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.401822e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.784302e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.794144e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.794144e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.083799 sec -INFO: No Floating Point Exceptions have been reported - 238,961,924 cycles # 2.745 GHz - 689,073,758 instructions # 2.88 insn per cycle - 0.087593094 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 9525) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.092403 sec + 253,996,770 cycles # 2.658 GHz + 712,985,517 instructions # 2.81 insn per cycle + 0.096121941 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:12756) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274562860167168E-006 Relative difference = 3.3392764976441195e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.419818e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.425419e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.425419e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.317943e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.323081e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.323081e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.038479 sec -INFO: No Floating Point Exceptions have been reported - 111,800,811 cycles # 2.682 GHz - 253,484,287 instructions # 2.27 insn per cycle - 0.042138594 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8457) (512y: 0) (512z: 0) +TOTAL : 0.041365 sec + 117,280,788 cycles # 2.633 GHz + 262,313,711 instructions # 2.24 insn per cycle + 0.045157630 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8941) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274562860174791E-006 Relative difference = 3.3392755596761116e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.620452e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.628839e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.628839e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.466443e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.472654e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.472654e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.033872 sec -INFO: No Floating Point Exceptions have been reported - 100,998,379 cycles # 2.706 GHz - 235,641,730 instructions # 2.33 insn per cycle - 0.037957581 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8101) (512y: 150) (512z: 0) +TOTAL : 0.037331 sec + 106,205,166 cycles # 2.626 GHz + 247,558,941 instructions # 2.33 insn per cycle + 0.041028602 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8771) (512y: 45) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274562860174791E-006 Relative difference = 3.3392755596761116e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.156678e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.161477e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.161477e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.142653e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.147394e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.147394e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.047111 sec -INFO: No Floating Point Exceptions have been reported - 88,066,978 cycles # 1.743 GHz - 129,735,533 instructions # 1.47 insn per cycle - 0.051105123 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1899) (512y: 126) (512z: 7084) +TOTAL : 0.047796 sec + 88,613,439 cycles # 1.738 GHz + 134,698,543 instructions # 1.52 insn per cycle + 0.051634230 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1669) (512y: 61) (512z: 8276) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274562860174791E-006 Relative difference = 3.3392755596761116e-07 diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt index 4b10dcf1d1..35461d8677 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,251 +10,215 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2024-10-06_10:00:52 +DATE: 2025-09-24_09:35:22 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.214342e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.224285e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.226222e+04 ) sec^-1 -MeanMatrixElemValue = ( 7.188141e-04 +- 6.565202e-04 ) GeV^-4 -TOTAL : 0.476842 sec -INFO: No Floating Point Exceptions have been reported - 1,989,613,876 cycles # 2.873 GHz - 2,928,089,356 instructions # 1.47 insn per cycle - 0.750924959 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 8.271224e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.283601e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.286909e+03 ) sec^-1 +MeanMatrixElemValue = ( 7.188142e-04 +- 6.565203e-04 ) GeV^-4 +TOTAL : 0.534205 sec + 2,200,496,463 cycles # 2.812 GHz + 3,139,642,915 instructions # 1.43 insn per cycle + 0.840870842 seconds time elapsed ......................................................................... 
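Note for readers of these logs: where the 2024 runs profiled a single monolithic "sigmaKin" kernel, the 2025 runNcu blocks above and below report three separate kernels (diagram1, diagram2, color_sum_kernel), each with its own register count and branch-uniformity percentage. The following is a minimal sketch of how such per-kernel metrics can be collected, assuming NVIDIA Nsight Compute (ncu) is on PATH; it is not the actual runNcu wrapper, whose source is not part of this diff, and the function name and paths are illustrative.

    import subprocess

    # Illustrative only: collect the two per-kernel metrics quoted in the logs
    # for every kernel launched by the executable, assuming "ncu" is on PATH.
    METRICS = ("launch__registers_per_thread,"
               "sm__sass_average_branch_targets_threads_uniform.pct")

    def run_ncu(exe, args=("-p", "1", "256", "1")):
        cmd = ["ncu", "--metrics", METRICS, exe, *args]
        return subprocess.run(cmd, capture_output=True, text=True, check=True).stdout

    # Hypothetical usage:
    # print(run_ncu("./build.cuda_d_inl0_hrd1/check_cuda.exe"))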
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.950242e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.029144e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.036217e+05 ) sec^-1 -MeanMatrixElemValue = ( 8.020494e-03 +- 4.025605e-03 ) GeV^-4 -TOTAL : 0.473909 sec -INFO: No Floating Point Exceptions have been reported - 1,995,145,721 cycles # 2.886 GHz - 2,912,342,089 instructions # 1.46 insn per cycle - 0.748274226 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 1 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 70 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 50 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 40 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 8.127250e-06 -Avg ME (F77/GPU) = 8.1272869669930272E-006 -Relative difference = 4.548524165778887e-06 +Avg ME (F77/GPU) = 8.1272868580195144E-006 +Relative difference = 4.535115754234733e-06 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.462777e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.466245e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.466245e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.392974e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.396413e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.396413e+03 ) sec^-1 MeanMatrixElemValue = ( 7.177153e-04 +- 6.554185e-04 ) GeV^-4 -TOTAL : 0.154509 sec -INFO: No Floating Point Exceptions have been reported - 463,950,135 cycles # 2.942 GHz - 1,382,102,782 instructions # 2.98 insn per cycle - 0.158280886 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3058) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.157853 sec + 458,926,719 cycles # 2.854 GHz + 1,383,056,756 instructions # 3.01 insn per cycle + 0.161530858 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3677) (avx2: 0) (512y: 0) (512z: 0) 
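The "=Symbols in CPPProcess_cpp.o=" lines above tally how many SIMD instructions of each flavour (sse4/avx2/512y/512z) end up in the compiled object file. The exact classification script is not shown in this diff; the sketch below uses xmm/ymm/zmm register usage in an objdump disassembly as a crude proxy for those buckets, and the function name and bucketing are illustrative assumptions.

    import re, subprocess

    # Crude proxy, not the actual tput classification: bucket SIMD usage in an
    # object file by which vector register class the disassembly touches.
    def simd_tally(objfile):
        asm = subprocess.run(["objdump", "-d", objfile], capture_output=True,
                             text=True, check=True).stdout
        return {name: len(re.findall(pat, asm))
                for name, pat in (("xmm (~sse4)", r"%xmm\d+"),
                                  ("ymm (~avx2)", r"%ymm\d+"),
                                  ("zmm (~512z)", r"%zmm\d+"))}

    # Hypothetical usage:
    # print(simd_tally("CPPProcess_cpp.o"))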
------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 8.127811e-06 -Avg ME (F77/C++) = 8.1278105271212486E-006 -Relative difference = 5.8180333155894157e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 8.127810e-06 +Avg ME (F77/C++) = 8.1278103111816235E-006 +Relative difference = 3.8286035741012934e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.221716e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.226773e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.226773e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.135252e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.139079e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.139079e+04 ) sec^-1 MeanMatrixElemValue = ( 7.177152e-04 +- 6.554185e-04 ) GeV^-4 -TOTAL : 0.045148 sec -INFO: No Floating Point Exceptions have been reported - 132,927,826 cycles # 2.743 GHz - 372,156,154 instructions # 2.80 insn per cycle - 0.049041087 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:10141) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.048547 sec + 136,061,072 cycles # 2.633 GHz + 383,416,652 instructions # 2.82 insn 
per cycle + 0.052339343 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13141) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 8.127809e-06 -Avg ME (F77/C++) = 8.1278090510674588E-006 -Relative difference = 6.2830535070193674e-09 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 8.127808e-06 +Avg ME (F77/C++) = 8.1278084747868991E-006 +Relative difference = 5.841512231881029e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.776220e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.801025e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.801025e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.565076e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.586954e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.586954e+04 ) sec^-1 MeanMatrixElemValue = ( 7.165746e-04 +- 6.542823e-04 ) GeV^-4 -TOTAL : 0.021005 sec -INFO: No Floating Point Exceptions have been reported - 65,153,242 cycles # 2.690 GHz - 142,838,093 instructions # 2.19 insn per cycle - 0.024771930 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 9241) 
(512y: 0) (512z: 0) +TOTAL : 0.022579 sec + 67,053,332 cycles # 2.610 GHz + 146,052,379 instructions # 2.18 insn per cycle + 0.026232857 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 9511) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127537e-06 -Avg ME (F77/C++) = 8.1275366216540664E-006 -Relative difference = 4.655111786058001e-08 +Avg ME (F77/C++) = 8.1275365583690230E-006 +Relative difference = 5.4337614979978516e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.070417e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.098717e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.098717e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.767006e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.794756e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.794756e+04 ) sec^-1 MeanMatrixElemValue = ( 7.165746e-04 +- 6.542823e-04 ) GeV^-4 -TOTAL : 0.019184 sec -INFO: No Floating Point Exceptions have been reported - 60,296,621 cycles # 2.678 GHz - 132,772,434 instructions # 2.20 insn per cycle - 
0.023065155 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8959) (512y: 28) (512z: 0) +TOTAL : 0.021017 sec + 62,203,202 cycles # 2.571 GHz + 138,374,959 instructions # 2.22 insn per cycle + 0.024742594 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 9383) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127537e-06 -Avg ME (F77/C++) = 8.1275366216540664E-006 -Relative difference = 4.655111786058001e-08 +Avg ME (F77/C++) = 8.1275365583690230E-006 +Relative difference = 5.4337614979978516e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.324469e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.345673e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.345673e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.276511e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.298668e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.298668e+04 ) sec^-1 MeanMatrixElemValue = ( 7.165747e-04 +- 6.542824e-04 ) GeV^-4 -TOTAL : 0.024875 sec -INFO: No Floating Point Exceptions have been reported 
- 52,411,208 cycles # 1.857 GHz - 79,637,147 instructions # 1.52 insn per cycle - 0.028776798 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2836) (512y: 30) (512z: 7437) +TOTAL : 0.025440 sec + 52,353,208 cycles # 1.824 GHz + 81,139,080 instructions # 1.55 insn per cycle + 0.029417169 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2361) (512y: 5) (512z: 8238) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127537e-06 -Avg ME (F77/C++) = 8.1275369863475849E-006 -Relative difference = 1.6797726498700304e-09 +Avg ME (F77/C++) = 8.1275370319141356E-006 +Relative difference = 3.926667622210349e-09 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd1.txt index 67a7328c67..1d521f46f1 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_f_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,251 +10,215 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
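Each cmpExe pair in these logs accepts a run when the relative difference between the C++/CUDA and Fortran average matrix elements is at most 5E-3. The logged figures are consistent with |a - b| / |a|; for example, the 512z_f_inl0_hrd0 comparison just above is reproduced by:

    # Values copied from the 512z_f_inl0_hrd0 cmpExe block above:
    a = 8.127537e-06            # Avg ME (C++/C++)
    b = 8.1275370319141356e-06  # Avg ME (F77/C++)
    rel = abs(a - b) / abs(a)
    print(rel)                  # ~3.9267e-09, matching the logged 3.926667622210349e-09
    assert rel <= 5e-3          # the acceptance threshold quoted in the logs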
make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2024-10-06_10:01:02 +DATE: 2025-09-24_09:35:51 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.235104e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.244507e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.246621e+04 ) sec^-1 -MeanMatrixElemValue = ( 7.188141e-04 +- 6.565202e-04 ) GeV^-4 -TOTAL : 0.477845 sec -INFO: No Floating Point Exceptions have been reported - 1,997,911,903 cycles # 2.876 GHz - 2,886,764,809 instructions # 1.44 insn per cycle - 0.753229194 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 8.303236e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.318225e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.321223e+03 ) sec^-1 +MeanMatrixElemValue = ( 7.188142e-04 +- 6.565203e-04 ) GeV^-4 +TOTAL : 0.536056 sec + 2,212,356,220 cycles # 2.822 GHz + 3,186,924,366 instructions # 1.44 insn per cycle + 0.842335533 seconds time elapsed ......................................................................... 
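The counter triplets in these logs (cycles with an effective GHz, instructions with insn-per-cycle, and elapsed seconds) have the shape of Linux 'perf stat' output. Below is an illustrative sketch of gathering the same counters, assuming perf is available; it is not the script that produced these logs, and the function name and paths are assumptions.

    import subprocess

    # Illustrative sketch: gather the cycles/instructions counters for a
    # command; note that perf stat writes its counter summary to stderr.
    def perf_counters(cmd):
        res = subprocess.run(["perf", "stat", "-e", "cycles,instructions", "--", *cmd],
                             capture_output=True, text=True)
        return res.stderr

    # Hypothetical usage:
    # print(perf_counters(["./build.none_f_inl0_hrd1/check_cpp.exe", "-p", "1", "256", "2"]))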
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.096496e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.193422e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.206590e+05 ) sec^-1 -MeanMatrixElemValue = ( 8.020496e-03 +- 4.025606e-03 ) GeV^-4 -TOTAL : 0.477844 sec -INFO: No Floating Point Exceptions have been reported - 2,000,227,335 cycles # 2.879 GHz - 2,887,661,973 instructions # 1.44 insn per cycle - 0.753759254 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 1 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 70 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 50 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 40 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 8.127250e-06 -Avg ME (F77/GPU) = 8.1272866419447706E-006 -Relative difference = 4.508529302013153e-06 +Avg ME (F77/GPU) = 8.1272866414401287E-006 +Relative difference = 4.508467209435567e-06 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_f_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.435869e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.439325e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.439325e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.345209e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.348975e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.348975e+03 ) sec^-1 MeanMatrixElemValue = ( 7.177153e-04 +- 6.554185e-04 ) GeV^-4 -TOTAL : 0.154994 sec -INFO: No Floating Point Exceptions have been reported - 461,652,768 cycles # 2.918 GHz - 1,376,807,565 instructions # 2.98 insn per cycle - 0.158786297 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2930) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.159052 sec + 459,276,011 cycles # 2.840 GHz + 1,376,864,775 instructions # 3.00 insn per cycle + 0.162966642 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3781) (avx2: 0) (512y: 0) (512z: 0) 
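As a reading aid: the "-p 1 256 2" arguments select (blocks, threads per block, iterations), so each of these runs processes 1 x 256 x 2 = 512 events, matching the "processed 512 events" DEBUG lines. An EvtsPerSec figure is presumably the event count divided by the time spent in the corresponding instrumented section, which covers only part of TOTAL:

    blocks, threads, iterations = 1, 256, 2
    events = blocks * threads * iterations  # 512, as in the DEBUG lines
    total = 0.159052                        # TOTAL (sec) of the none_f_inl0_hrd1 run above
    print(events / total)                   # ~3219 ev/s; the reported MECalcOnly rate
                                            # (3.348975e+03) is higher because its timer
                                            # covers only a subset of TOTAL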
------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 8.127811e-06 -Avg ME (F77/C++) = 8.1278105271212486E-006 -Relative difference = 5.8180333155894157e-08 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 8.127810e-06 +Avg ME (F77/C++) = 8.1278103112056928E-006 +Relative difference = 3.82889970907774e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.215601e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.220158e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.220158e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.145776e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.149797e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.149797e+04 ) sec^-1 MeanMatrixElemValue = ( 7.177152e-04 +- 6.554185e-04 ) GeV^-4 -TOTAL : 0.044587 sec -INFO: No Floating Point Exceptions have been reported - 130,364,411 cycles # 2.725 GHz - 367,274,419 instructions # 2.82 insn per cycle - 0.048380365 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4:10124) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.047450 sec + 133,795,452 cycles # 2.643 GHz + 378,404,458 instructions # 2.83 insn per 
cycle + 0.051234729 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:13303) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 -Avg ME (C++/C++) = 8.127809e-06 -Avg ME (F77/C++) = 8.1278090510674588E-006 -Relative difference = 6.2830535070193674e-09 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +Avg ME (C++/C++) = 8.127808e-06 +Avg ME (F77/C++) = 8.1278084747868991E-006 +Relative difference = 5.841512231881029e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.799777e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.825160e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.825160e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.486627e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.506423e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.506423e+04 ) sec^-1 MeanMatrixElemValue = ( 7.165746e-04 +- 6.542823e-04 ) GeV^-4 -TOTAL : 0.020201 sec -INFO: No Floating Point Exceptions have been reported - 63,211,215 cycles # 2.704 GHz - 138,063,768 instructions # 2.18 insn per cycle - 0.023985955 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 9196) 
(512y: 0) (512z: 0) +TOTAL : 0.022522 sec + 65,118,615 cycles # 2.531 GHz + 141,235,296 instructions # 2.17 insn per cycle + 0.026316984 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 9456) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127537e-06 -Avg ME (F77/C++) = 8.1275366216540664E-006 -Relative difference = 4.655111786058001e-08 +Avg ME (F77/C++) = 8.1275365583690230E-006 +Relative difference = 5.4337614979978516e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.035669e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.062918e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.062918e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.749111e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.772359e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.772359e+04 ) sec^-1 MeanMatrixElemValue = ( 7.165746e-04 +- 6.542823e-04 ) GeV^-4 -TOTAL : 0.018625 sec -INFO: No Floating Point Exceptions have been reported - 57,993,332 cycles # 2.658 GHz - 127,990,808 instructions # 2.21 insn per cycle - 
0.022353301 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8910) (512y: 28) (512z: 0) +TOTAL : 0.020450 sec + 59,809,769 cycles # 2.548 GHz + 133,453,298 instructions # 2.23 insn per cycle + 0.024051724 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 9326) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127537e-06 -Avg ME (F77/C++) = 8.1275366216540664E-006 -Relative difference = 4.655111786058001e-08 +Avg ME (F77/C++) = 8.1275365583690230E-006 +Relative difference = 5.4337614979978516e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.344103e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.363443e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.363443e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.272763e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.292384e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.292384e+04 ) sec^-1 MeanMatrixElemValue = ( 7.165747e-04 +- 6.542824e-04 ) GeV^-4 -TOTAL : 0.024010 sec -INFO: No Floating Point Exceptions have been reported 
- 50,268,269 cycles # 1.840 GHz - 74,785,740 instructions # 1.49 insn per cycle - 0.027917015 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2791) (512y: 30) (512z: 7439) +TOTAL : 0.024694 sec + 50,063,265 cycles # 1.787 GHz + 76,311,691 instructions # 1.52 insn per cycle + 0.028520781 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2302) (512y: 5) (512z: 8341) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127537e-06 -Avg ME (F77/C++) = 8.1275369863475849E-006 -Relative difference = 1.6797726498700304e-09 +Avg ME (F77/C++) = 8.1275370319141356E-006 +Relative difference = 3.926667622210349e-09 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt index 50cf2d796e..b19d440304 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. 
make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2024-10-06_10:00:29 +DATE: 2025-09-24_09:34:21 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.754018e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.771557e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.774637e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.249673e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.266284e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.269736e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.469482 sec -INFO: No Floating Point Exceptions have been reported - 1,992,256,665 cycles # 2.872 GHz - 2,888,484,617 instructions # 1.45 insn per cycle - 0.750839241 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.536874 sec + 2,221,747,197 cycles # 2.822 GHz + 3,163,839,758 instructions # 1.42 insn per cycle + 0.846111338 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.962737e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.089994e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.098896e+05 ) sec^-1 -MeanMatrixElemValue = ( 8.048215e-03 +- 4.042405e-03 ) GeV^-4 -TOTAL : 0.485052 sec -INFO: No Floating Point Exceptions have been reported - 2,027,704,407 cycles # 2.871 GHz - 3,029,735,278 instructions # 1.49 insn per cycle - 0.765353713 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 1 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 104 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 76 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 95 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 8.127459e-06 -Avg ME (F77/GPU) = 8.1274562879405200E-006 -Relative difference = 3.3369094561706885e-07 +Avg ME (F77/GPU) = 8.1274562526495326E-006 +Relative difference = 3.380331376097252e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.401289e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.404577e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.404577e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.285769e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.288921e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.288921e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.157429 sec -INFO: No Floating Point Exceptions have been reported - 471,621,611 cycles # 2.936 GHz - 1,398,387,891 instructions # 2.97 insn per cycle - 0.161191989 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3899) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.162937 sec + 476,178,835 cycles # 2.866 GHz + 1,394,739,218 instructions # 2.93 insn per cycle + 0.166768119 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4201) (avx2: 0) (512y: 0) (512z: 0) 
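The runNcu profiles in these 2025 logs (see the ==PROF== lines above) list three separate kernels (diagram1, diagram2, color_sum_kernel) where the 2024 logs profiled a single monolithic sigmaKin kernel, with visibly lower registers per thread for the per-diagram kernels. A hypothetical sketch of such a split-kernel launch pattern (only the kernel names come from the log; the bodies, signatures and sizes below are placeholders, not the plugin's actual code):

#include <cstdio>
#include <cuda_runtime.h>
// Each diagram accumulates its (placeholder) amplitude into a per-event buffer;
// a final kernel performs the color sum on the accumulated amplitudes.
__global__ void diagram1( double* amp, int n )
{
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if( i < n ) amp[i] += 1.0; // stand-in for the diagram-1 amplitude
}
__global__ void diagram2( double* amp, int n )
{
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if( i < n ) amp[i] += 2.0; // stand-in for the diagram-2 amplitude
}
__global__ void color_sum_kernel( const double* amp, double* me2, int n )
{
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if( i < n ) me2[i] = amp[i] * amp[i]; // stand-in for the color-matrix contraction
}
int main()
{
  const int n = 256;
  double *d_amp = nullptr, *d_me2 = nullptr;
  cudaMalloc( (void**)&d_amp, n * sizeof( double ) );
  cudaMalloc( (void**)&d_me2, n * sizeof( double ) );
  cudaMemset( d_amp, 0, n * sizeof( double ) );
  diagram1<<<1, n>>>( d_amp, n );                // one small kernel per diagram...
  diagram2<<<1, n>>>( d_amp, n );
  color_sum_kernel<<<1, n>>>( d_amp, d_me2, n ); // ...then one color-sum kernel
  cudaDeviceSynchronize();
  double me2_0 = 0;
  cudaMemcpy( &me2_0, d_me2, sizeof( double ), cudaMemcpyDeviceToHost );
  std::printf( "me2[0] = %f\n", me2_0 ); // 9.0 with the placeholder amplitudes
  cudaFree( d_amp );
  cudaFree( d_me2 );
  return 0;
}

Under Nsight Compute, each of these three launches would show up as a separately profiled kernel, as in the ==PROF== lines above.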
------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274562948736117E-006 Relative difference = 3.32837900190667e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.729709e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.743939e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.743939e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.970683e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.990939e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.990939e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.080446 sec -INFO: No Floating Point Exceptions have been reported - 237,178,815 cycles # 2.833 GHz - 688,220,781 instructions # 2.90 insn per cycle - 0.084309693 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 9334) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.090415 sec + 254,903,248 cycles # 2.722 GHz + 712,053,573 instructions # 2.79 insn per cycle + 0.094247271 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4:12853) (avx2: 0) (512y: 0) (512z: 0) 
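The "MEK (channelid array)" DEBUG lines above are consistent with the 512 test events being distributed in blocks of 32 over the first 16 of the 72 available channels. One simple assignment that reproduces those counts (illustrative; the test harness may fill the channelid array differently):

#include <cstdio>
#include <map>
int main()
{
  const int nevents = 512, eventsPerChannel = 32;
  std::map<int, int> counts; // channelid -> number of events
  for( int i = 0; i < nevents; i++ )
    counts[i / eventsPerChannel + 1]++; // blocks of 32 events on channels 1..16
  for( const auto& kv : counts ) std::printf( "%d : %d, ", kv.first, kv.second );
  std::printf( "\n" ); // prints "1 : 32, 2 : 32, ..., 16 : 32," as in the log
  return 0;
}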
------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274563175290919E-006 Relative difference = 3.3005037703909805e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.409119e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.415451e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.415451e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.321517e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.326552e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.326552e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.039507 sec -INFO: No Floating Point Exceptions have been reported - 114,068,471 cycles # 2.665 GHz - 253,096,543 instructions # 2.22 insn per cycle - 0.043335126 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8363) (512y: 0) (512z: 0) +TOTAL : 0.042210 sec + 118,596,276 cycles # 2.612 GHz + 265,047,520 instructions # 2.23 insn per cycle + 0.046012448 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 9126) (512y: 0) (512z: 0) 
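Each "Internal loops fptype_sv" line above states how many events one SIMD instruction processes in that build: VECTOR[4] with 'avx2' means 4 doubles packed into a 256-bit register, VECTOR[8] with '512z' means 8 doubles in a 512-bit register, and so on. A minimal sketch of such a packed type using GCC/Clang vector extensions (illustrative; the plugin's real fptype_sv machinery is more elaborate):

#include <cstdio>
// 4 doubles = 32 bytes = 256 bits, matching VECTOR[4] ('avx2');
// vector_size(64) would give the VECTOR[8] ('512z') layout instead.
typedef double fptype_v __attribute__( ( vector_size( 32 ) ) );
int main()
{
  fptype_v p = { 1.0, 2.0, 3.0, 4.0 }; // one momentum component for 4 events
  fptype_v w = { 0.5, 0.5, 0.5, 0.5 }; // one weight for 4 events
  fptype_v prod = p * w;               // element-wise: 4 events per multiply
  for( int i = 0; i < 4; i++ ) std::printf( "event %d: %f\n", i, prod[i] );
  return 0;
}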
------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274563450143301E-006 Relative difference = 3.266686019634872e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.680681e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.688641e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.688641e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.512441e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.518991e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.518991e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.033493 sec -INFO: No Floating Point Exceptions have been reported - 101,334,967 cycles # 2.753 GHz - 233,610,113 instructions # 2.31 insn per cycle - 0.037380618 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 7501) (512y: 146) (512z: 0) +TOTAL : 0.037109 sec + 107,186,687 cycles # 2.649 GHz + 250,306,884 instructions # 2.34 insn per cycle + 0.040964805 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8963) (512y: 45) (512z: 0) 
------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274563450143301E-006 Relative difference = 3.266686019634872e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.194656e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.199944e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.199944e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.160470e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.165386e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.165386e+04 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.046435 sec -INFO: No Floating Point Exceptions have been reported - 91,210,419 cycles # 1.827 GHz - 133,172,431 instructions # 1.46 insn per cycle - 0.050429905 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2084) (512y: 122) (512z: 6354) +TOTAL : 0.047844 sec + 90,399,024 cycles # 1.768 GHz + 138,717,973 instructions # 1.53 insn per cycle + 0.051641008 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1788) (512y: 61) (512z: 8314) 
------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 8.127459e-06 Avg ME (F77/C++) = 8.1274563450143301E-006 Relative difference = 3.266686019634872e-07 diff --git a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd1.txt index e1fc789bed..70e3ca8e2d 100644 --- a/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_smeftggtttt_mad/log_smeftggtttt_mad_m_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,248 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx' -DATE: 2024-10-06_10:00:41 +DATE: 2025-09-24_09:34:51 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.793622e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.811451e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.814397e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.236382e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.252209e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.255241e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.470592 sec -INFO: No Floating Point Exceptions have been reported - 1,997,502,547 cycles # 2.880 GHz - 2,923,476,215 instructions # 1.46 insn per cycle - 0.750818094 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.535997 sec + 2,217,885,936 cycles # 2.824 GHz + 3,191,898,077 instructions # 1.44 insn per cycle + 0.843021508 seconds time elapsed ......................................................................... 
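The new HASBLAS=hasBlas, CUDACPP_RUNTIME_BLASCOLORSUM= and CUDACPP_RUNTIME_CUBLASTF32TENSOR= lines in these 2025 log headers indicate that the color sum can now be steered to a (cu)BLAS implementation at runtime. The color sum itself is the quadratic form ME^2 = jamp^H C jamp over color flows, which maps naturally onto GEMV/GEMM calls; a toy scalar version of that contraction (plain loops standing in for the BLAS calls, with made-up sizes and values):

#include <complex>
#include <cstdio>
#include <vector>
int main()
{
  const int ncolor = 2; // toy size: the real process has many more color flows
  const std::vector<std::complex<double>> jamp = { { 1.0, 0.5 }, { -0.3, 0.2 } };
  const double colorMatrix[2][2] = { { 3.0, 1.0 }, { 1.0, 3.0 } }; // toy C_ij
  double me2 = 0.0;
  for( int i = 0; i < ncolor; i++ )
    for( int j = 0; j < ncolor; j++ )
      me2 += std::real( std::conj( jamp[i] ) * colorMatrix[i][j] * jamp[j] );
  std::printf( "ME^2 = %f\n", me2 ); // jamp^H * C * jamp
  return 0;
}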
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 64 256 1 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.055830e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.165646e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.173712e+05 ) sec^-1 -MeanMatrixElemValue = ( 8.048215e-03 +- 4.042405e-03 ) GeV^-4 -TOTAL : 0.491474 sec -INFO: No Floating Point Exceptions have been reported - 2,044,918,526 cycles # 2.859 GHz - 3,006,189,896 instructions # 1.47 insn per cycle - 0.774360899 seconds time elapsed +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 1 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 108 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 76 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 95 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 8.127459e-06 -Avg ME (F77/GPU) = 8.1274562879405200E-006 -Relative difference = 3.3369094561706885e-07 +Avg ME (F77/GPU) = 8.1274562526495326E-006 +Relative difference = 3.380331376097252e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_m_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.hip_m_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP= +Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.402707e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.406541e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.406541e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.263072e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.266213e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.266213e+03 ) sec^-1 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4 -TOTAL : 0.156609 sec -INFO: No Floating Point Exceptions have been reported - 468,766,259 cycles # 2.933 GHz - 1,393,706,102 instructions # 2.97 insn per cycle - 0.160398151 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3800) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.163259 sec + 474,783,021 cycles # 2.852 GHz + 1,387,157,133 instructions # 2.92 insn per cycle + 0.166971512 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 4321) (avx2: 0) (512y: 0) (512z: 0) 
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 8.127459e-06
 Avg ME (F77/C++) = 8.1274562948736117E-006
 Relative difference = 3.32837900190667e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP=
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 6.728046e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.740604e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.740604e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.056539e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.068459e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.068459e+03 ) sec^-1
 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4
-TOTAL : 0.079699 sec
-INFO: No Floating Point Exceptions have been reported
- 235,148,851 cycles # 2.837 GHz
- 684,201,633 instructions # 2.91 insn per cycle
- 0.083458032 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 9368) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 0.088414 sec
+ 250,802,322 cycles # 2.738 GHz
+ 707,025,863 instructions # 2.82 insn per cycle
+ 0.092101977 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4:12829) (avx2: 0) (512y: 0) (512z: 0)
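
In the perf-style counter lines above, "insn per cycle" is simply the instruction count divided by the cycle count: for the new sse4 run, 707,025,863 / 250,802,322 is about 2.82, as printed. The GHz annotation divides cycles by the time the counter was actually running (typically the task clock), which is slightly shorter than the wall-clock "seconds time elapsed", so it cannot be reproduced exactly from these two numbers alone. A quick check of the ratio:

# Quick arithmetic check of the counter lines in the sse4 run above.
cycles = 250_802_322
instructions = 707_025_863
print(f"{instructions / cycles:.2f} insn per cycle")  # prints 2.82
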
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 8.127459e-06
 Avg ME (F77/C++) = 8.1274563175290919E-006
 Relative difference = 3.3005037703909805e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP=
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.447554e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.453499e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.453499e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.342626e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.347819e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.347819e+04 ) sec^-1
 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4
-TOTAL : 0.037781 sec
-INFO: No Floating Point Exceptions have been reported
- 111,660,471 cycles # 2.716 GHz
- 248,651,696 instructions # 2.23 insn per cycle
- 0.041691428 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8316) (512y: 0) (512z: 0)
+TOTAL : 0.040651 sec
+ 115,737,876 cycles # 2.641 GHz
+ 260,102,580 instructions # 2.25 insn per cycle
+ 0.044409608 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 9072) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 8.127459e-06
 Avg ME (F77/C++) = 8.1274563450143301E-006
 Relative difference = 3.266686019634872e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP=
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.634149e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.641617e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.641617e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.458481e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.464627e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.464627e+04 ) sec^-1
 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4
-TOTAL : 0.033571 sec
-INFO: No Floating Point Exceptions have been reported
- 99,219,938 cycles # 2.697 GHz
- 229,292,514 instructions # 2.31 insn per cycle
- 0.037291206 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 7452) (512y: 146) (512z: 0)
+TOTAL : 0.037592 sec
+ 106,299,204 cycles # 2.609 GHz
+ 245,391,200 instructions # 2.31 insn per cycle
+ 0.041387780 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 8907) (512y: 45) (512z: 0)
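
The "=Symbols in CPPProcess_cpp.o=" lines count SIMD instructions of each flavour that the compiler emitted into the object file; they show at a glance, for instance, that the 512y build above is still dominated by 256-bit AVX2 code (8907 symbols) with only a handful of 512y instructions (45). A rough stand-alone approximation of such a census, driving objdump from Python (the register-based patterns are simplified placeholders, not the exact filters used by the tput scripts, so the counts will not match the log exactly):

# Rough SIMD census of an object file via objdump; the patterns are
# simplified placeholders and will not reproduce the log's exact counts.
import subprocess

def simd_census(objfile: str) -> dict:
    asm = subprocess.run(["objdump", "-d", objfile],
                         capture_output=True, text=True, check=True).stdout
    lines = asm.splitlines()
    return {
        "xmm (sse-like)": sum("%xmm" in l for l in lines),
        "ymm (avx2/512y-like)": sum("%ymm" in l for l in lines),
        "zmm (512z-like)": sum("%zmm" in l for l in lines),
    }

print(simd_census("build.512y_m_inl0_hrd1/CPPProcess_cpp.o"))
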
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 8.127459e-06
 Avg ME (F77/C++) = 8.1274563450143301E-006
 Relative difference = 3.266686019634872e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/check_cpp.exe -p 1 256 2 OMP=
+Process = SIGMA_SMEFTSIM_TOPU3L_MWSCHEME_UFO_GG_TTXTTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.191988e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.196872e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.196872e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.149106e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.154011e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.154011e+04 ) sec^-1
 MeanMatrixElemValue = ( 7.185537e-04 +- 6.562553e-04 ) GeV^-4
-TOTAL : 0.045809 sec
-INFO: No Floating Point Exceptions have been reported
- 88,834,257 cycles # 1.806 GHz
- 128,615,199 instructions # 1.45 insn per cycle
- 0.049747357 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2035) (512y: 122) (512z: 6355)
+TOTAL : 0.047414 sec
+ 88,430,072 cycles # 1.747 GHz
+ 134,004,604 instructions # 1.52 insn per cycle
+ 0.051267395 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1732) (512y: 61) (512z: 8314)
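
Taken together, the five smeft_gg_tttt C++ runs above give the expected SIMD scaling on this Xeon Silver 4216: relative to the scalar 'none' build, the new MatrixElems throughputs are roughly 1.9x (sse4), 4.1x (avx2), 4.5x (512y) and 3.5x (512z); the 512z build falls back behind 512y, consistent with the lower ~1.75 GHz clock reported for its run. A small sketch tabulating the ratios from the figures quoted above:

# Throughput ratios vs the scalar build, from the new ("+")
# EvtsPerSec[MatrixElems] values in the five runs above.
throughputs = {
    "none": 3.266213e+03,
    "sse4": 6.068459e+03,
    "avx2": 1.347819e+04,
    "512y": 1.464627e+04,
    "512z": 1.154011e+04,
}
base = throughputs["none"]
for backend, eps in throughputs.items():
    print(f"{backend:5s} {eps:12.6e} events/s  x{eps / base:.2f}")
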
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 72 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 72 channels { 1 : 32, 2 : 32, 3 : 32, 4 : 32, 5 : 32, 6 : 32, 7 : 32, 8 : 32, 9 : 32, 10 : 32, 11 : 32, 12 : 32, 13 : 32, 14 : 32, 15 : 32, 16 : 32 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/P1_gg_ttxttx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 8.127459e-06
 Avg ME (F77/C++) = 8.1274563450143301E-006
 Relative difference = 3.266686019634872e-07
diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt
index 107a77153b..12326e5c9f 100644
--- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd0.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x
 BACKEND=cpp512y (was cppauto)
 OMPFLAGS=
 FPTYPE='m'
@@ -7,233 +10,212 @@ HELINL='0'
 HRDCOD='0'
 HASCURAND=hasCurand
 HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
 Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
 make: Nothing to be done for 'gtestlibs'.
 make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
 make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
 make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
 make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
 make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
 make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
-DATE: 2024-10-06_09:58:55
+DATE: 2025-09-24_09:31:41
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP=
+Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.910300e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.325267e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.783205e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.500841e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.477366e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.559159e+07 ) sec^-1
 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0
-TOTAL : 0.523085 sec
-INFO: No Floating Point Exceptions have been reported
- 2,188,593,202 cycles # 2.883 GHz
- 3,112,954,096 instructions # 1.42 insn per cycle
- 0.817031478 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 130
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.549848 sec
+ 2,308,392,260 cycles # 2.828 GHz
+ 3,309,894,720 instructions # 1.43 insn per cycle
+ 0.873545190 seconds time elapsed
+.........................................................................
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 50
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 94
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 28
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/runTest_cuda.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2
 Avg ME (C++/GPU) = 1.477196e-01
-Avg ME (F77/GPU) = 0.14771956172964262
-Relative difference = 2.590743366698123e-07
+Avg ME (F77/GPU) = 0.14771956172964260
+Relative difference = 2.5907433685770594e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_d_inl0_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_d_inl0_hrd0/check_hip.exe
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 9.066686e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.035589e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.035589e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.726949e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.681489e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.681489e+05 ) sec^-1
 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0
-TOTAL : 1.278019 sec
-INFO: No Floating Point Exceptions have been reported
- 3,764,987,469 cycles # 2.931 GHz
- 9,752,169,319 instructions # 2.59 insn per cycle
- 1.285199771 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 341) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 1.470755 sec
+ 4,243,036,983 cycles # 2.875 GHz
+ 9,962,547,463 instructions # 2.35 insn per cycle
+ 1.476417627 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 421) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.477196e-01
 Avg ME (F77/C++) = 0.14771956172964268
 Relative difference = 2.59074336294025e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.478889e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.890818e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.890818e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.324411e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.656950e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.656950e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0
-TOTAL : 0.830635 sec
-INFO: No Floating Point Exceptions have been reported
- 2,356,582,684 cycles # 2.814 GHz
- 5,959,230,788 instructions # 2.53 insn per cycle
- 0.838030934 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1369) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 0.905852 sec
+ 2,485,591,801 cycles # 2.730 GHz
+ 6,187,854,977 instructions # 2.49 insn per cycle
+ 0.911320733 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 1631) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.477196e-01
 Avg ME (F77/C++) = 0.14771956172964268
 Relative difference = 2.59074336294025e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.229956e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.271002e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.271002e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.055426e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.930975e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.930975e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0
-TOTAL : 0.594206 sec
-INFO: No Floating Point Exceptions have been reported
- 1,695,017,656 cycles # 2.820 GHz
- 3,345,002,918 instructions # 1.97 insn per cycle
- 0.601755215 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1499) (512y: 0) (512z: 0)
+TOTAL : 0.624480 sec
+ 1,727,598,030 cycles # 2.746 GHz
+ 3,276,460,156 instructions # 1.90 insn per cycle
+ 0.629818884 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1543) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.477196e-01
 Avg ME (F77/C++) = 0.14771956172964268
 Relative difference = 2.59074336294025e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.272289e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.349942e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.349942e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.134099e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.082651e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.082651e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0
-TOTAL : 0.586413 sec
-INFO: No Floating Point Exceptions have been reported
- 1,670,913,790 cycles # 2.815 GHz
- 3,318,759,581 instructions # 1.99 insn per cycle
- 0.594196558 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1375) (512y: 96) (512z: 0)
+TOTAL : 0.605914 sec
+ 1,675,699,986 cycles # 2.743 GHz
+ 3,304,943,330 instructions # 1.97 insn per cycle
+ 0.611525442 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1527) (512y: 17) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.477196e-01
 Avg ME (F77/C++) = 0.14771956172964268
 Relative difference = 2.59074336294025e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.146635e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.068698e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.068698e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.946458e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.706761e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.706761e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0
-TOTAL : 0.618076 sec
-INFO: No Floating Point Exceptions have been reported
- 1,426,424,228 cycles # 2.279 GHz
- 2,470,718,173 instructions # 1.73 insn per cycle
- 0.626622796 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 580) (512y: 60) (512z: 1021)
+TOTAL : 0.653065 sec
+ 1,417,818,298 cycles # 2.156 GHz
+ 2,345,380,094 instructions # 1.65 insn per cycle
+ 0.658218910 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 642) (512y: 25) (512z: 1138)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.477196e-01
 Avg ME (F77/C++) = 0.14771956172964268
 Relative difference = 2.59074336294025e-07
diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd1.txt
index 00276091a3..a869e443d5 100644
--- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_d_inl0_hrd1.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x
 BACKEND=cpp512y (was cppauto)
 OMPFLAGS=
 FPTYPE='m'
@@ -7,233 +10,212 @@ HELINL='0'
 HRDCOD='0'
 HASCURAND=hasCurand
 HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
 Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
 make: Nothing to be done for 'gtestlibs'.
 make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
 make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
 make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
 make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
 make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
 make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
-DATE: 2024-10-06_09:59:08
+DATE: 2025-09-24_09:32:00
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP=
+Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.969963e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.427733e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.936447e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.501741e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.485765e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.567818e+07 ) sec^-1
 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0
-TOTAL : 0.519852 sec
-INFO: No Floating Point Exceptions have been reported
- 2,172,307,830 cycles # 2.872 GHz
- 3,081,950,905 instructions # 1.42 insn per cycle
- 0.813507263 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 124
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.551341 sec
+ 2,310,675,807 cycles # 2.827 GHz
+ 3,319,532,296 instructions # 1.44 insn per cycle
+ 0.875186841 seconds time elapsed
+.........................................................................
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 50
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 94
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 28
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/runTest_cuda.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2
 Avg ME (C++/GPU) = 1.477196e-01
-Avg ME (F77/GPU) = 0.14771956172964262
-Relative difference = 2.590743366698123e-07
+Avg ME (F77/GPU) = 0.14771956172964254
+Relative difference = 2.5907433723349327e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_d_inl0_hrd1/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_d_inl0_hrd1/check_hip.exe
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 9.156288e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.045734e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.045734e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.779674e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.732552e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.732552e+05 ) sec^-1
 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0
-TOTAL : 1.265578 sec
-INFO: No Floating Point Exceptions have been reported
- 3,747,828,201 cycles # 2.946 GHz
- 9,632,221,913 instructions # 2.57 insn per cycle
- 1.272810702 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 359) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 1.460825 sec
+ 4,199,668,829 cycles # 2.865 GHz
+ 9,806,765,322 instructions # 2.34 insn per cycle
+ 1.466388181 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 452) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.477196e-01
 Avg ME (F77/C++) = 0.14771956172964268
 Relative difference = 2.59074336294025e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.494739e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.931280e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.931280e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.412344e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.799560e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.799560e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0
-TOTAL : 0.827695 sec
-INFO: No Floating Point Exceptions have been reported
- 2,378,817,913 cycles # 2.850 GHz
- 5,912,991,474 instructions # 2.49 insn per cycle
- 0.835517705 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1340) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 0.855928 sec
+ 2,469,646,946 cycles # 2.870 GHz
+ 6,091,622,150 instructions # 2.47 insn per cycle
+ 0.861313869 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 1603) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.477196e-01
 Avg ME (F77/C++) = 0.14771956172964268
 Relative difference = 2.59074336294025e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.079942e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.957305e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.957305e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.050597e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.923483e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.923483e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0
-TOTAL : 0.628333 sec
-INFO: No Floating Point Exceptions have been reported
- 1,788,933,654 cycles # 2.817 GHz
- 3,328,376,953 instructions # 1.86 insn per cycle
- 0.635862534 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1436) (512y: 0) (512z: 0)
+TOTAL : 0.624282 sec
+ 1,719,726,497 cycles # 2.734 GHz
+ 3,226,039,810 instructions # 1.88 insn per cycle
+ 0.629686248 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1460) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/runTest_cpp.exe
 [ PASSED ] 4 tests.
 DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
 DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 }
-INFO: No Floating Point Exceptions have been reported
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2
 Avg ME (C++/C++) = 1.477196e-01
 Avg ME (F77/C++) = 0.14771956172964268
 Relative difference = 2.59074336294025e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.320640e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.437091e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.437091e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.198399e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.213116e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.213116e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0
-TOTAL : 0.574324 sec
-INFO: No Floating Point Exceptions have been reported
- 1,653,934,067 cycles # 2.845 GHz
- 3,291,054,827 instructions # 1.99 insn per cycle
- 0.581926884 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1328) (512y: 96) (512z: 0)
+TOTAL : 0.590149 sec
+ 1,638,224,463 cycles # 2.753 GHz
+ 3,258,304,993 instructions # 1.99 insn per cycle
+ 0.595665429 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1446) (512y: 17) (512z: 0)
 -------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/runTest_cpp.exe
 [ PASSED ]
4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956172964268 Relative difference = 2.59074336294025e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.152026e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.087565e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.087565e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.944452e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.700551e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.700551e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0 -TOTAL : 0.611501 sec -INFO: No Floating Point Exceptions have been reported - 1,420,414,146 cycles # 2.296 GHz - 2,439,626,449 instructions # 1.72 insn per cycle - 0.619276325 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 547) (512y: 60) (512z: 1007) +TOTAL : 0.653709 sec + 1,422,880,012 cycles # 2.163 GHz + 2,319,628,542 instructions # 1.63 insn per cycle + 0.659181710 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 585) (512y: 25) (512z: 1114) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
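The "OK (relative difference <= 5E-3)" verdicts above come from comparing the Fortran (F77) and C++/CUDA average matrix elements. Below is a minimal C++ sketch of that kind of tolerance check, using the Avg ME values from the build.512z_d_inl0_hrd1 block above; this is an illustration of the comparison being reported, not the actual cmpExe/tput script implementation.

// Sketch of the F77-vs-C++ average-ME cross-check reported in these logs.
#include <cmath>
#include <cstdio>

int main()
{
  // Example values taken from the log above (build.512z_d_inl0_hrd1)
  const double avgMeCpp = 1.477196e-01;        // Avg ME (C++/C++)
  const double avgMeF77 = 0.14771956172964268; // Avg ME (F77/C++)
  const double relDiff = std::fabs( avgMeF77 / avgMeCpp - 1. );
  const double tolerance = 5e-3;
  std::printf( "Relative difference = %.15e\n", relDiff );
  std::puts( relDiff <= tolerance ? "OK (relative difference <= 5E-3)"
                                  : "ERROR (relative difference > 5E-3)" );
  return relDiff <= tolerance ? 0 : 1;
}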
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956172964268 Relative difference = 2.59074336294025e-07 diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt index bd2093b69b..8955a0ec97 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2024-10-06_09:59:44 +DATE: 2025-09-24_09:32:51 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.032605e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.087100e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.501992e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.658644e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.361450e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.502552e+07 ) sec^-1 MeanMatrixElemValue = ( 1.486732e-01 +- 3.293572e-05 ) GeV^0 -TOTAL : 0.487961 sec -INFO: No Floating Point Exceptions have been reported - 2,048,884,733 cycles # 2.866 GHz - 2,915,076,407 instructions # 1.42 insn per cycle - 0.773529382 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 97 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.500309 sec + 2,141,761,893 cycles # 2.827 GHz + 3,051,585,121 instructions # 1.42 insn per cycle + 0.815376304 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 32 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 64 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
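The EvtsPerSec figures in these logs are event counts divided by the elapsed time of a timed section; the labels (23), (3) and (3a) correspond to different internal timers of check_cuda.exe/check_cpp.exe whose details are not shown here, which is why they differ from the TOTAL wall time. (Note also that the 2025 ncu profile above shows the former single sigmaKin kernel replaced by separate diagram kernels plus a color_sum_kernel.) A minimal sketch of the throughput arithmetic, assuming a hypothetical computeMEs() stand-in for the real workload:

// Sketch only: time a section and divide events processed by elapsed seconds.
#include <chrono>
#include <cstdio>

static void computeMEs( long nevt ) // hypothetical stand-in for the ME section
{
  volatile double dummy = 0;
  for( long i = 0; i < nevt; ++i ) dummy = dummy + 1e-7;
}

int main()
{
  const int nblocks = 2048, nthreads = 256, niter = 2; // as in "-p 2048 256 2"
  const long nevt = static_cast<long>( nblocks ) * nthreads * niter; // 1048576 events
  const auto t0 = std::chrono::steady_clock::now();
  computeMEs( nevt );
  const auto t1 = std::chrono::steady_clock::now();
  const double secs = std::chrono::duration<double>( t1 - t0 ).count();
  std::printf( "EvtsPerSec[MatrixElems] = ( %e ) sec^-1\n", nevt / secs );
  return 0;
}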
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.477195e-01 -Avg ME (F77/GPU) = 0.14771956735057756 -Relative difference = 4.559355911674916e-07 +Avg ME (F77/GPU) = 0.14771956775803119 +Relative difference = 4.586938839209484e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 9.070270e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.039772e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.039772e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.837047e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.845359e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.845359e+05 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293563e-05 ) GeV^0 -TOTAL : 1.246364 sec -INFO: No Floating Point Exceptions have been reported - 3,688,263,957 cycles # 2.948 GHz - 9,604,598,454 instructions # 2.60 insn per cycle - 1.251819600 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 463) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.429486 sec + 4,115,124,509 cycles # 2.870 GHz + 9,856,209,851 instructions # 2.40 insn per cycle + 1.434721281 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 520) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956094773486 Relative difference = 2.643675256627469e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.214709e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.338045e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.338045e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.915212e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.739938e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.739938e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293563e-05 ) GeV^0 -TOTAL : 0.563106 sec -INFO: No Floating Point Exceptions have been reported - 1,636,975,072 cycles # 2.881 GHz - 3,967,404,939 instructions # 2.42 insn per cycle - 0.568812477 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1579) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.638047 sec + 1,798,711,437 cycles # 2.801 GHz + 4,006,865,720 instructions # 2.23 insn per cycle + 0.643227605 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1845) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The 
following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771955861942843 Relative difference = 2.80129187869649e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.994371e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.295152e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.295152e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.020131e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.370178e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.370178e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293562e-05 ) GeV^0 -TOTAL : 0.439594 sec -INFO: No Floating Point Exceptions have been reported - 1,256,321,725 cycles # 2.826 GHz - 2,497,438,777 instructions # 1.99 insn per cycle - 0.445252542 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1924) (512y: 0) (512z: 0) +TOTAL : 0.437832 sec + 1,215,377,102 cycles # 2.751 GHz + 2,328,338,302 instructions # 1.92 insn per cycle + 0.443031597 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1759) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771955698961392 Relative difference = 2.9116235141448046e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.098864e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.632832e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.632832e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.122881e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.707659e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.707659e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293562e-05 ) GeV^0 -TOTAL : 0.427898 sec -INFO: No Floating Point Exceptions have been reported - 1,236,536,318 cycles # 2.855 GHz - 2,473,365,360 instructions # 2.00 insn per cycle - 0.433705293 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1870) (512y: 1) (512z: 0) +TOTAL : 0.425874 sec + 1,193,251,740 cycles # 2.773 GHz + 2,344,495,568 instructions # 1.96 insn per cycle + 0.430964763 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1766) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 
4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771955698961392 Relative difference = 2.9116235141448046e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.931142e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.994223e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.994223e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.050003e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.443303e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.443303e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293561e-05 ) GeV^0 -TOTAL : 0.448530 sec -INFO: No Floating Point Exceptions have been reported - 1,079,279,667 cycles # 2.379 GHz - 2,073,684,661 instructions # 1.92 insn per cycle - 0.454351959 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1011) (512y: 5) (512z: 1292) +TOTAL : 0.434984 sec + 1,026,532,065 cycles # 2.334 GHz + 1,868,649,986 instructions # 1.82 insn per cycle + 0.440356753 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 828) (512y: 5) (512z: 1248) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
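The removed "INFO: The following Floating Point Exceptions will cause SIGFPE program aborts" lines refer to trapping FE_DIVBYZERO, FE_INVALID and FE_OVERFLOW. A minimal glibc-specific sketch of how such traps are typically enabled follows; this illustrates the mechanism only, not the plugin's actual FPE-handling code (which evidently no longer prints these messages in the 2025 logs).

// Sketch: enable SIGFPE traps for the three exception classes named above.
// feenableexcept() is a glibc extension declared in <fenv.h>.
#define _GNU_SOURCE 1
#include <fenv.h>
#include <cstdio>

int main()
{
  feenableexcept( FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW );
  std::puts( "INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW" );
  // From here on, any division by zero, invalid operation or overflow raises SIGFPE.
  return 0;
}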
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771955262403935 Relative difference = 3.207154680524219e-07 diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd1.txt index 2473496911..807ed3ccfb 100644 --- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_f_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
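The "DEBUG: MEK (channelid array)" lines above tally how many of the 512 test events were assigned to each channel id. A minimal sketch of that bookkeeping; the channel assignment below is hypothetical (the real harness produced the { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } split shown in the logs), and only the counting itself is being illustrated.

// Sketch: count events per channel id, as in the MEK debug tallies above.
#include <cstdio>
#include <map>
#include <vector>

int main()
{
  std::vector<int> channelIds( 512 );
  for( size_t ievt = 0; ievt < channelIds.size(); ++ievt )
    channelIds[ievt] = 2 + ievt % 5; // hypothetical assignment to channels 2..6
  std::map<int, int> tally;
  for( int ch : channelIds ) ++tally[ch];
  std::printf( "DEBUG: MEK (channelid array) processed %zu events {", channelIds.size() );
  bool first = true;
  for( const auto& [ch, n] : tally )
  {
    std::printf( "%s%d : %d", first ? " " : ", ", ch, n );
    first = false;
  }
  std::printf( " }\n" );
  return 0;
}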
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x' -DATE: 2024-10-06_09:59:56 +DATE: 2025-09-24_09:33:06 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.032625e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.129649e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.575777e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.684124e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.395383e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.536783e+07 ) sec^-1 MeanMatrixElemValue = ( 1.486732e-01 +- 3.293572e-05 ) GeV^0 -TOTAL : 0.481858 sec -INFO: No Floating Point Exceptions have been reported - 2,051,512,664 cycles # 2.885 GHz - 2,948,723,179 instructions # 1.44 insn per cycle - 0.768027645 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 86 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.504984 sec + 2,143,393,368 cycles # 2.808 GHz + 3,032,081,133 instructions # 1.41 insn per cycle + 0.820426832 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 32 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 64 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
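The 2025 log headers print HASBLAS=hasBlas together with the empty runtime switches CUDACPP_RUNTIME_BLASCOLORSUM and CUDACPP_RUNTIME_CUBLASTF32TENSOR. How the plugin interprets these variables is not visible in the log, so treat the following as an assumption: a minimal sketch of querying such switches from the environment, where an unset or empty variable is taken to mean "off".

// Sketch only: read runtime switches like those echoed in the log header.
#include <cstdio>
#include <cstdlib>

static bool envFlagIsSet( const char* name )
{
  const char* value = std::getenv( name );
  return value != nullptr && value[0] != '\0'; // unset or empty means "off" (assumption)
}

int main()
{
  for( const char* name : { "CUDACPP_RUNTIME_BLASCOLORSUM", "CUDACPP_RUNTIME_CUBLASTF32TENSOR" } )
  {
    const char* value = std::getenv( name );
    std::printf( "%s=%s (%s)\n", name, value ? value : "", envFlagIsSet( name ) ? "on" : "off" );
  }
  return 0;
}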
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 1.477195e-01 -Avg ME (F77/GPU) = 0.14771956525510177 -Relative difference = 4.4175008557828484e-07 +Avg ME (F77/GPU) = 0.14771956496406347 +Relative difference = 4.3977987646867276e-07 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_f_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_f_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 9.212337e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.061006e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.061006e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.901470e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.947681e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.947681e+05 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293563e-05 ) GeV^0 -TOTAL : 1.227579 sec -INFO: No Floating Point Exceptions have been reported - 3,620,291,769 cycles # 2.937 GHz - 9,471,544,557 instructions # 2.62 insn per cycle - 1.233302650 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 367) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.416669 sec + 4,065,194,569 cycles # 2.861 GHz + 9,713,756,382 instructions # 2.39 insn per cycle + 1.421763510 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 444) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771956094773486 Relative difference = 2.643675256627469e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.220343e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.350531e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.350531e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.788185e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.503348e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.503348e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293563e-05 ) GeV^0 -TOTAL : 0.560958 sec -INFO: No Floating Point Exceptions have been reported - 1,637,220,191 cycles # 2.892 GHz - 3,933,324,289 instructions # 2.40 insn per cycle - 0.566799529 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 1517) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.677027 sec + 1,794,955,620 cycles # 2.640 GHz + 3,955,098,557 instructions # 2.20 insn per cycle + 0.682659938 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 1794) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/runTest_cpp.exe -INFO: The 
following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 1.477196e-01 Avg ME (F77/C++) = 0.14771955861942843 Relative difference = 2.80129187869649e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.995950e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.312007e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.312007e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.983089e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.375375e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.375375e+06 ) sec^-1 MeanMatrixElemValue = ( 1.486735e-01 +- 3.293562e-05 ) GeV^0 -TOTAL : 0.438140 sec -INFO: No Floating Point Exceptions have been reported - 1,255,613,659 cycles # 2.833 GHz - 2,482,092,959 instructions # 1.98 insn per cycle - 0.443764126 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1817) (512y: 0) (512z: 0) +TOTAL : 0.442243 sec + 1,221,294,346 cycles # 2.733 GHz + 2,307,350,404 instructions # 1.89 insn per cycle + 0.447543585 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1664) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.477196e-01
Avg ME (F77/C++) = 0.14771955698961392
Relative difference = 2.9116235141448046e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.087645e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.599722e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.599722e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.117002e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.716134e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.716134e+06 ) sec^-1
MeanMatrixElemValue = ( 1.486735e-01 +- 3.293562e-05 ) GeV^0
-TOTAL : 0.428178 sec
-INFO: No Floating Point Exceptions have been reported
- 1,231,320,501 cycles # 2.843 GHz
- 2,457,271,461 instructions # 2.00 insn per cycle
- 0.433769891 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1773) (512y: 1) (512z: 0)
+TOTAL : 0.426003 sec
+ 1,190,116,896 cycles # 2.764 GHz
+ 2,327,165,551 instructions # 1.96 insn per cycle
+ 0.431219526 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1677) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.477196e-01
Avg ME (F77/C++) = 0.14771955698961392
Relative difference = 2.9116235141448046e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.945345e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.024652e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.024652e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.092007e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.582843e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.582843e+06 ) sec^-1
MeanMatrixElemValue = ( 1.486735e-01 +- 3.293561e-05 ) GeV^0
-TOTAL : 0.444653 sec
-INFO: No Floating Point Exceptions have been reported
- 1,073,447,692 cycles # 2.387 GHz
- 2,057,517,401 instructions # 1.92 insn per cycle
- 0.450271011 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 906) (512y: 5) (512z: 1273)
+TOTAL : 0.428488 sec
+ 1,017,961,260 cycles # 2.351 GHz
+ 1,851,270,904 instructions # 1.82 insn per cycle
+ 0.433719442 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 724) (512y: 5) (512z: 1232)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.477196e-01
Avg ME (F77/C++) = 0.14771955262403935
Relative difference = 3.207154680524219e-07
diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt
index 5ae4907c26..58754172fd 100644
--- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd0.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x
BACKEND=cpp512y (was cppauto)
OMPFLAGS=
FPTYPE='m'
@@ -7,233 +10,212 @@
HELINL='0'
HRDCOD='0'
HASCURAND=hasCurand
HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
make: Nothing to be done for 'gtestlibs'.
make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
-DATE: 2024-10-06_09:59:20
+DATE: 2025-09-24_09:32:19
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP=
+Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.059495e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.307970e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.770458e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.505328e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.491174e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.573966e+07 ) sec^-1
MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0
-TOTAL : 0.521991 sec
-INFO: No Floating Point Exceptions have been reported
- 2,182,804,723 cycles # 2.882 GHz
- 3,091,712,352 instructions # 1.42 insn per cycle
- 0.814546737 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 130
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.550390 sec
+ 2,310,532,668 cycles # 2.826 GHz
+ 3,294,016,749 instructions # 1.43 insn per cycle
+ 0.874757748 seconds time elapsed
+.........................................................................
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 50
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 94
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/runTest_cuda.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2
Avg ME (C++/GPU) = 1.477196e-01
-Avg ME (F77/GPU) = 0.14771956187351573
-Relative difference = 2.5810037581511336e-07
+Avg ME (F77/GPU) = 0.14771956611891737
+Relative difference = 2.2936077970637e-07
OK (relative difference <= 5E-3)
=========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_m_inl0_hrd0/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_m_inl0_hrd0/check_hip.exe
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 9.006175e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.025890e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.025890e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.641873e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.554824e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.554824e+05 ) sec^-1
MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0
-TOTAL : 1.286472 sec
-INFO: No Floating Point Exceptions have been reported
- 3,808,533,169 cycles # 2.945 GHz
- 9,779,238,528 instructions # 2.57 insn per cycle
- 1.294044616 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 341) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 1.485790 sec
+ 4,277,749,857 cycles # 2.870 GHz
+ 9,988,233,894 instructions # 2.33 insn per cycle
+ 1.491506902 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 421) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.477196e-01
Avg ME (F77/C++) = 0.14771956645541506
Relative difference = 2.270828308707201e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.477969e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.892042e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.892042e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.413178e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.797545e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.797545e+06 ) sec^-1
MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0
-TOTAL : 0.834785 sec
-INFO: No Floating Point Exceptions have been reported
- 2,360,159,801 cycles # 2.803 GHz
- 5,954,715,990 instructions # 2.52 insn per cycle
- 0.842708021 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1412) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 0.856045 sec
+ 2,411,968,983 cycles # 2.802 GHz
+ 6,079,988,422 instructions # 2.52 insn per cycle
+ 0.861716549 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 1674) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.477196e-01
Avg ME (F77/C++) = 0.14771956645541506
Relative difference = 2.270828308707201e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.260391e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.350498e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.350498e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.119632e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.062946e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.062946e+06 ) sec^-1
MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0
-TOTAL : 0.587860 sec
-INFO: No Floating Point Exceptions have been reported
- 1,670,861,769 cycles # 2.810 GHz
- 3,283,918,691 instructions # 1.97 insn per cycle
- 0.595426943 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1567) (512y: 0) (512z: 0)
+TOTAL : 0.608749 sec
+ 1,686,930,401 cycles # 2.750 GHz
+ 3,226,597,852 instructions # 1.91 insn per cycle
+ 0.614233677 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1612) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.477196e-01
Avg ME (F77/C++) = 0.14771956674392650
Relative difference = 2.2512972893324335e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.348300e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.498815e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.498815e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.188784e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.193624e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.193624e+06 ) sec^-1
MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0
-TOTAL : 0.573278 sec
-INFO: No Floating Point Exceptions have been reported
- 1,645,784,221 cycles # 2.835 GHz
- 3,247,832,958 instructions # 1.97 insn per cycle
- 0.581347619 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1446) (512y: 101) (512z: 0)
+TOTAL : 0.592651 sec
+ 1,642,010,663 cycles # 2.749 GHz
+ 3,252,569,969 instructions # 1.98 insn per cycle
+ 0.598011332 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1602) (512y: 19) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.477196e-01
Avg ME (F77/C++) = 0.14771956674392650
Relative difference = 2.2512972893324335e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.143317e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.068862e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.068862e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.986612e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.779143e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.779143e+06 ) sec^-1
MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0
-TOTAL : 0.613179 sec
-INFO: No Floating Point Exceptions have been reported
- 1,394,199,360 cycles # 2.248 GHz
- 2,406,597,613 instructions # 1.73 insn per cycle
- 0.620673412 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 768) (512y: 64) (512z: 1063)
+TOTAL : 0.642604 sec
+ 1,399,424,953 cycles # 2.162 GHz
+ 2,321,995,232 instructions # 1.66 insn per cycle
+ 0.648127480 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 787) (512y: 29) (512z: 1168)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.477196e-01
Avg ME (F77/C++) = 0.14771956674392650
Relative difference = 2.2512972893324335e-07
diff --git a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd1.txt
index 3e507cd882..63db15facd 100644
--- a/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_susyggt1t1_mad/log_susyggt1t1_mad_m_inl0_hrd1.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x
BACKEND=cpp512y (was cppauto)
OMPFLAGS=
FPTYPE='m'
@@ -7,233 +10,212 @@
HELINL='0'
HRDCOD='0'
HASCURAND=hasCurand
HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
make: Nothing to be done for 'gtestlibs'.
make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
make USEBUILDDIR=1 BACKEND=cppavx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
make USEBUILDDIR=1 BACKEND=cpp512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
make USEBUILDDIR=1 BACKEND=cpp512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x'
-DATE: 2024-10-06_09:59:32
+DATE: 2025-09-24_09:32:35
+HASBLAS=hasBlas
+CUDACPP_RUNTIME_BLASCOLORSUM=
+CUDACPP_RUNTIME_CUBLASTF32TENSOR=
On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP=
+Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.080757e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.449829e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.987143e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.501613e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.478366e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.559919e+07 ) sec^-1
MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0
-TOTAL : 0.520335 sec
-INFO: No Floating Point Exceptions have been reported
- 2,182,231,478 cycles # 2.885 GHz
- 3,097,447,003 instructions # 1.42 insn per cycle
- 0.813407395 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 124
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.551435 sec
+ 2,306,006,273 cycles # 2.825 GHz
+ 3,311,408,318 instructions # 1.44 insn per cycle
+ 0.874133523 seconds time elapsed
+.........................................................................
+runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1
+==PROF== Profiling "diagram1": launch__registers_per_thread 50
+==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "diagram2": launch__registers_per_thread 94
+==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100%
+==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26
+==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0%
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/runTest_cuda.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/runTest_cuda.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2
Avg ME (C++/GPU) = 1.477196e-01
-Avg ME (F77/GPU) = 0.14771956187351573
-Relative difference = 2.5810037581511336e-07
+Avg ME (F77/GPU) = 0.14771956611891737
+Relative difference = 2.2936077970637e-07
OK (relative difference <= 5E-3)
=========================================================================
-Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_m_inl0_hrd1/check_hip.exe
+Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.hip_m_inl0_hrd1/check_hip.exe
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 8.967180e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.023779e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.023779e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.719475e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.651618e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.651618e+05 ) sec^-1
MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0
-TOTAL : 1.289771 sec
-INFO: No Floating Point Exceptions have been reported
- 3,794,201,935 cycles # 2.927 GHz
- 9,666,542,351 instructions # 2.55 insn per cycle
- 1.297077628 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 359) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 1.471775 sec
+ 4,234,745,657 cycles # 2.868 GHz
+ 9,831,904,342 instructions # 2.32 insn per cycle
+ 1.477406786 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 452) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.477196e-01
Avg ME (F77/C++) = 0.14771956645541506
Relative difference = 2.270828308707201e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.583493e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.064503e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.064503e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.374919e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.737278e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.737278e+06 ) sec^-1
MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0
-TOTAL : 0.784715 sec
-INFO: No Floating Point Exceptions have been reported
- 2,328,374,642 cycles # 2.942 GHz
- 5,878,440,022 instructions # 2.52 insn per cycle
- 0.792155161 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 1371) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 0.876478 sec
+ 2,395,806,213 cycles # 2.719 GHz
+ 5,984,137,010 instructions # 2.50 insn per cycle
+ 0.882136028 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 1646) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.477196e-01
Avg ME (F77/C++) = 0.14771956645541506
Relative difference = 2.270828308707201e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.254464e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.329047e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.329047e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.126066e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.082964e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.082964e+06 ) sec^-1
MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0
-TOTAL : 0.590226 sec
-INFO: No Floating Point Exceptions have been reported
- 1,689,754,472 cycles # 2.827 GHz
- 3,255,343,739 instructions # 1.93 insn per cycle
- 0.598325338 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1483) (512y: 0) (512z: 0)
+TOTAL : 0.606254 sec
+ 1,679,878,737 cycles # 2.750 GHz
+ 3,176,290,390 instructions # 1.89 insn per cycle
+ 0.611543989 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1529) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.477196e-01
Avg ME (F77/C++) = 0.14771956674392650
Relative difference = 2.2512972893324335e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.345727e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.502859e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.502859e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.232873e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.291969e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.291969e+06 ) sec^-1
MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0
-TOTAL : 0.572126 sec
-INFO: No Floating Point Exceptions have been reported
- 1,634,040,486 cycles # 2.820 GHz
- 3,219,951,921 instructions # 1.97 insn per cycle
- 0.580193189 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1382) (512y: 101) (512z: 0)
+TOTAL : 0.582901 sec
+ 1,615,766,448 cycles # 2.750 GHz
+ 3,205,698,167 instructions # 1.98 insn per cycle
+ 0.588288106 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1521) (512y: 19) (512z: 0)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.477196e-01
Avg ME (F77/C++) = 0.14771956674392650
Relative difference = 2.2512972893324335e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
-Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP=
+Process = SIGMA_MSSM_SLHA2_GG_T1T1X_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1]
Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.168828e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.118471e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.118471e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.013041e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.827900e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.827900e+06 ) sec^-1
MeanMatrixElemValue = ( 1.486736e-01 +- 3.293564e-05 ) GeV^0
-TOTAL : 0.609357 sec
-INFO: No Floating Point Exceptions have been reported
- 1,417,478,840 cycles # 2.299 GHz
- 2,399,490,515 instructions # 1.69 insn per cycle
- 0.617376810 seconds time elapsed
-=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 716) (512y: 64) (512z: 1056)
+TOTAL : 0.634290 sec
+ 1,400,699,809 cycles # 2.193 GHz
+ 2,295,187,551 instructions # 1.64 insn per cycle
+ 0.639552302 seconds time elapsed
+=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 730) (512y: 29) (512z: 1144)
-------------------------------------------------------------------------
-runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/runTest_cpp.exe
-INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW
+runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/runTest_cpp.exe
[ PASSED ] 4 tests.
DEBUG: MEK (no multichannel) processed 512 events across 6 channels { no-multichannel : 512 }
-INFO: No Floating Point Exceptions have been reported
DEBUG: MEK (channelid array) processed 512 events across 6 channels { 2 : 128, 3 : 96, 4 : 96, 5 : 96, 6 : 96 }
-INFO: No Floating Point Exceptions have been reported
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/P1_gg_t1t1x/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2
Avg ME (C++/C++) = 1.477196e-01
Avg ME (F77/C++) = 0.14771956674392650
Relative difference = 2.2512972893324335e-07
diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt
index 607647c622..aeaac499d3 100644
--- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd0.txt
@@ -1,5 +1,8 @@
+MADGRAPH_CUDA_ARCHITECTURE=
+MADGRAPH_HIP_ARCHITECTURE=
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx
+HASBLAS=hasBlas
+Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx
BACKEND=cpp512y (was cppauto)
OMPFLAGS=
FPTYPE='m'
@@ -7,233 +10,212 @@
HELINL='0'
HRDCOD='0'
HASCURAND=hasCurand
HASHIPRAND=hasNoHiprand
+HASBLAS=hasBlas
Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1)
make: Nothing to be done for 'gtestlibs'.
make USEBUILDDIR=1 BACKEND=cuda
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 BACKEND=cppnone
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 BACKEND=cppsse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:56:33 +DATE: 2025-09-24_09:28:31 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.270000e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.214418e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.893995e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.399782e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.051193e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.058591e+07 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 0.542175 sec -INFO: No Floating Point Exceptions have been reported - 2,178,993,269 cycles # 2.803 GHz - 3,108,059,533 instructions # 1.43 insn per cycle - 0.838052893 seconds 
time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.626877 sec + 2,749,226,499 cycles # 2.827 GHz + 4,334,745,620 instructions # 1.58 insn per cycle + 1.033290843 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 97 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 70 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 28 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.015836e+00 Avg ME (F77/GPU) = 2.0158358666195562 Relative difference = 6.616631711254798e-08 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] 
[hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.830273e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.876984e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.876984e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.751455e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.795224e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.795224e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 5.836443 sec -INFO: No Floating Point Exceptions have been reported - 17,247,101,824 cycles # 2.952 GHz - 45,921,478,129 instructions # 2.66 insn per cycle - 5.842453521 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 622) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.095203 sec + 17,539,316,031 cycles # 2.876 GHz + 47,145,637,455 instructions # 2.69 insn per cycle + 6.100855118 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 697) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158358666194407 Relative difference = 6.616637439061751e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.179372e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.338539e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.338539e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.939396e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.076198e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.076198e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 3.409251 sec -INFO: No Floating Point Exceptions have been reported - 10,038,815,546 cycles # 2.940 GHz - 27,809,165,185 instructions # 2.77 insn per cycle - 3.415697404 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2537) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.676656 sec + 10,572,965,063 cycles # 2.872 GHz + 29,309,599,190 instructions # 2.77 insn per cycle + 3.682010885 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2912) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158358666194411 Relative difference = 6.616637417031725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.016017e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.397611e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.397611e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.757789e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.113659e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.113659e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 2.202025 sec -INFO: No Floating Point Exceptions have been reported - 6,083,216,423 cycles # 2.757 GHz - 12,595,496,799 instructions # 2.07 insn per cycle - 2.208459235 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2620) (512y: 0) (512z: 0) +TOTAL : 2.312987 sec + 6,232,025,172 cycles # 2.689 GHz + 12,530,852,292 instructions # 2.01 insn per cycle + 2.318453338 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2739) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158358666194953 Relative difference = 6.616634729368461e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.491994e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.947919e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.947919e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.041839e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.435080e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.435080e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 2.018419 sec -INFO: No Floating Point Exceptions have been reported - 5,588,215,007 cycles # 2.761 GHz - 12,004,808,489 instructions # 2.15 insn per cycle - 2.024606102 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2365) (512y: 144) (512z: 0) +TOTAL : 2.187506 sec + 5,882,467,926 cycles # 2.684 GHz + 12,196,990,510 instructions # 2.07 insn per cycle + 2.192967820 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2591) (512y: 47) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158358666194953 Relative difference = 6.616634729368461e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.529303e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.713663e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.713663e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.310937e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.475786e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.475786e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 3.082414 sec -INFO: No Floating Point Exceptions have been reported - 5,763,724,377 cycles # 1.867 GHz - 8,350,228,242 instructions # 1.45 insn per cycle - 3.088879573 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1468) (512y: 122) (512z: 1806) +TOTAL : 3.275794 sec + 5,837,040,134 cycles # 1.780 GHz + 7,907,522,148 instructions # 1.35 insn per cycle + 3.281439473 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1517) (512y: 65) (512z: 1907) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158358666194953 Relative difference = 6.616634729368461e-08 diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd1.txt index 3ed4c3c5ff..5acef4ec65 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_d_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:56:58 +DATE: 2025-09-24_09:29:07 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.306886e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.297289e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.977845e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.365587e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.046816e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.054123e+07 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 0.530204 sec -INFO: No Floating Point Exceptions have been reported - 2,211,323,980 cycles # 2.884 GHz - 3,201,430,578 instructions # 1.45 insn per cycle - 0.823926524 seconds 
time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 212 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.624519 sec + 2,779,082,474 cycles # 2.836 GHz + 4,433,471,999 instructions # 1.60 insn per cycle + 1.037558245 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 98 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 70 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 28 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_d_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.015836e+00 Avg ME (F77/GPU) = 2.0158358666195562 Relative difference = 6.616631711254798e-08 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_d_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] 
[hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.872475e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.921622e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.921622e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.807758e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.855108e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.855108e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 5.706755 sec -INFO: No Floating Point Exceptions have been reported - 16,797,600,798 cycles # 2.941 GHz - 44,912,592,336 instructions # 2.67 insn per cycle - 5.712473159 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 566) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.908887 sec + 16,978,475,167 cycles # 2.872 GHz + 45,996,489,553 instructions # 2.71 insn per cycle + 5.914594704 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 634) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158358666194411 Relative difference = 6.616637417031725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.376254e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.552215e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.552215e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.961290e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.101064e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.101064e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 3.215396 sec -INFO: No Floating Point Exceptions have been reported - 9,523,990,060 cycles # 2.957 GHz - 26,686,144,259 instructions # 2.80 insn per cycle - 3.221864250 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2326) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.650084 sec + 10,495,614,562 cycles # 2.872 GHz + 29,292,900,438 instructions # 2.79 insn per cycle + 3.655616235 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2906) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158358666194411 Relative difference = 6.616637417031725e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.628485e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.953785e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.953785e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.761682e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.118086e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.118086e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 2.376456 sec -INFO: No Floating Point Exceptions have been reported - 6,603,885,103 cycles # 2.772 GHz - 14,117,515,687 instructions # 2.14 insn per cycle - 2.382952116 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2705) (512y: 0) (512z: 0) +TOTAL : 2.310179 sec + 6,236,808,714 cycles # 2.695 GHz + 12,519,643,661 instructions # 2.01 insn per cycle + 2.315615614 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2723) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158358666194953 Relative difference = 6.616634729368461e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.799064e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.148539e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.148539e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.065779e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.461598e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.461598e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 2.297050 sec -INFO: No Floating Point Exceptions have been reported - 6,386,723,525 cycles # 2.773 GHz - 13,726,619,432 instructions # 2.15 insn per cycle - 2.304339219 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2356) (512y: 298) (512z: 0) +TOTAL : 2.177250 sec + 5,873,454,407 cycles # 2.692 GHz + 12,187,790,357 instructions # 2.08 insn per cycle + 2.182588943 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2573) (512y: 47) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158358666194953 Relative difference = 6.616634729368461e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.339110e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.504311e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.504311e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.317632e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.482853e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.482853e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 3.254444 sec -INFO: No Floating Point Exceptions have been reported - 5,974,020,045 cycles # 1.833 GHz - 10,122,964,274 instructions # 1.69 insn per cycle - 3.261538649 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1318) (512y: 208) (512z: 1986) +TOTAL : 3.267572 sec + 5,832,481,864 cycles # 1.783 GHz + 7,898,618,772 instructions # 1.35 insn per cycle + 3.273005255 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1503) (512y: 65) (512z: 1903) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158358666194953 Relative difference = 6.616634729368461e-08 diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt index 7bd4c9bca6..c3f3d78cd2 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,236 +10,215 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:58:13 +DATE: 2025-09-24_09:30:46 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.178914e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.740854e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.866078e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.915796e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.152410e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.168379e+07 ) sec^-1 MeanMatrixElemValue = ( 2.072877e+00 +- 3.361153e-03 ) GeV^0 -TOTAL : 0.492488 sec -INFO: No Floating Point Exceptions have been reported - 2,067,407,730 cycles # 2.879 GHz - 2,921,575,837 instructions # 1.41 insn per cycle - 0.777094459 seconds 
time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 125 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.536916 sec + 2,311,303,351 cycles # 2.820 GHz + 3,451,933,380 instructions # 1.49 insn per cycle + 0.876584729 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 68 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 48 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.015841e+00 -Avg ME (F77/GPU) = 2.0158787037944421 -Relative difference = 1.870375413642407e-05 +Avg ME (F77/GPU) = 2.0158787176478654 +Relative difference = 1.8710626416180963e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, 
FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.933137e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.988210e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.988210e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.826941e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.876900e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.876900e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072937e+00 +- 3.361545e-03 ) GeV^0 -TOTAL : 5.510560 sec -INFO: No Floating Point Exceptions have been reported - 16,216,363,781 cycles # 2.940 GHz - 45,321,064,348 instructions # 2.79 insn per cycle - 5.516237540 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 600) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.827244 sec + 16,745,287,986 cycles # 2.872 GHz + 46,779,095,008 instructions # 2.79 insn per cycle + 5.832669438 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 669) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015849e+00 -Avg ME (F77/C++) = 2.0158491701586172 -Relative difference = 8.441039850630506e-08 +Avg ME (F77/C++) = 2.0158491693437099 +Relative difference = 8.400614836019365e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.554782e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.893509e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.893509e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.099688e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.379802e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.379802e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072937e+00 +- 3.361544e-03 ) GeV^0 -TOTAL : 2.389253 sec -INFO: No Floating Point Exceptions have been reported - 7,056,712,623 cycles # 2.947 GHz - 17,792,064,584 instructions # 2.52 insn per cycle - 2.395009745 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 3147) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.646629 sec + 7,621,153,372 cycles # 2.875 GHz + 18,459,205,647 instructions # 2.42 insn per cycle + 2.651770005 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3490) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015849e+00 -Avg ME (F77/C++) = 2.0158486895961687 -Relative difference = 1.539816876576819e-07 +Avg ME (F77/C++) = 2.0158486870350316 +Relative difference = 1.5525218811688918e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.351394e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.496890e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.496890e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.529214e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.733909e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.733909e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 -TOTAL : 1.343765 sec -INFO: No Floating Point Exceptions have been reported - 3,745,450,403 cycles # 2.777 GHz - 8,262,540,860 instructions # 2.21 insn per cycle - 1.349671424 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3371) (512y: 0) (512z: 0) +TOTAL : 1.317398 sec + 3,571,029,960 cycles # 2.701 GHz + 7,635,561,961 instructions # 2.14 insn per cycle + 1.322807202 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3265) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015847e+00 -Avg ME (F77/C++) = 2.0158474864438176 -Relative difference = 2.4130988992271984e-07 +Avg ME (F77/C++) = 2.0158474886557087 +Relative difference = 2.4240714140793267e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.821818e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.011140e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.011140e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.907154e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.022942e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.022942e+06 ) sec^-1 MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 -TOTAL : 1.275053 sec -INFO: No Floating Point Exceptions have been reported - 3,558,622,083 cycles # 2.780 GHz - 7,915,407,710 instructions # 2.22 insn per cycle - 1.280856743 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3214) (512y: 20) (512z: 0) +TOTAL : 1.264760 sec + 3,427,937,601 cycles # 2.701 GHz + 7,479,564,971 instructions # 2.18 insn per cycle + 1.270093065 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3174) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015847e+00 -Avg ME (F77/C++) = 2.0158474864438176 -Relative difference = 2.4130988992271984e-07 +Avg ME (F77/C++) = 2.0158474886557087 +Relative difference = 2.4240714140793267e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.584138e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.256759e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.256759e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.772456e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.505125e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.505125e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 -TOTAL : 1.679646 sec -INFO: No Floating Point Exceptions have been reported - 3,255,689,642 cycles # 1.933 GHz - 6,101,216,288 instructions # 1.87 insn per cycle - 1.685383243 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2258) (512y: 22) (512z: 2156) +TOTAL : 1.637116 sec + 3,057,489,541 cycles # 1.863 GHz + 5,257,903,588 instructions # 1.72 insn per cycle + 1.642609723 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2082) (512y: 5) (512z: 2093) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015848e+00 -Avg ME (F77/C++) = 2.0158476348733529 -Relative difference = 1.8112806478434436e-07 +Avg ME (F77/C++) = 2.0158476332435384 +Relative difference = 1.8193656547763924e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd1.txt index bd2def4f48..04242d0363 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_f_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,236 +10,215 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:58:33 +DATE: 2025-09-24_09:31:13 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.136229e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.747823e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.880709e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.924572e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.163960e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.180163e+07 ) sec^-1 MeanMatrixElemValue = ( 2.072877e+00 +- 3.361153e-03 ) GeV^0 -TOTAL : 0.488528 sec -INFO: No Floating Point Exceptions have been reported - 2,057,813,122 cycles # 2.874 GHz - 2,903,563,490 instructions # 1.41 insn per cycle - 0.774040886 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 124 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.531938 sec + 2,300,025,009 cycles # 2.829 GHz + 3,436,350,240 instructions # 1.49 insn per cycle + 0.870280136 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 68 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 48 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 20 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_f_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.015841e+00 -Avg ME (F77/GPU) = 2.0158787037944421 -Relative difference = 1.870375413642407e-05 +Avg ME (F77/GPU) = 2.0158787176478654 +Relative difference = 1.8710626416180963e-05 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_f_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.970300e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.026987e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.026987e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.856598e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.908262e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.908262e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072937e+00 +- 3.361545e-03 ) GeV^0 -TOTAL : 5.407589 sec -INFO: No Floating Point Exceptions have been reported - 15,991,185,925 cycles # 2.955 GHz - 44,429,993,623 instructions # 2.78 insn per cycle - 5.412895968 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 533) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.734798 sec + 16,473,045,879 cycles # 2.870 GHz + 45,850,587,770 instructions # 2.78 insn per cycle + 5.740170772 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 602) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions 
will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015849e+00 -Avg ME (F77/C++) = 2.0158491701586172 -Relative difference = 8.441039850630506e-08 +Avg ME (F77/C++) = 2.0158491693437099 +Relative difference = 8.400614836019365e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.328908e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.798682e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.798682e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.096624e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.376244e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.376244e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072937e+00 +- 3.361544e-03 ) GeV^0 -TOTAL : 2.053409 sec -INFO: No Floating Point Exceptions have been reported - 6,061,427,520 cycles # 2.945 GHz - 17,076,312,832 instructions # 2.82 insn per cycle - 2.059026016 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2862) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.647878 sec + 7,614,623,600 cycles # 2.871 GHz + 18,447,507,149 instructions # 2.42 insn per cycle + 2.653267513 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 3483) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015849e+00 -Avg ME (F77/C++) = 2.0158486895961687 -Relative difference = 1.539816876576819e-07 +Avg ME (F77/C++) = 2.0158486870350316 +Relative difference = 1.5525218811688918e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.019252e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.594125e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.594125e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.522002e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.728728e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.728728e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 -TOTAL : 1.827330 sec -INFO: No Floating Point Exceptions have been reported - 5,036,041,688 cycles # 2.749 GHz - 10,223,391,747 instructions # 2.03 insn per cycle - 1.833165934 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3906) (512y: 0) (512z: 0) +TOTAL : 1.317072 sec + 3,572,319,102 cycles # 2.703 GHz + 7,627,371,635 instructions # 2.14 insn per cycle + 1.322524895 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3242) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015847e+00 -Avg ME (F77/C++) = 2.0158474864438176 -Relative difference = 2.4130988992271984e-07 +Avg ME (F77/C++) = 2.0158474886557087 +Relative difference = 2.4240714140793267e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.156943e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.756865e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.756865e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.915944e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.023940e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.023940e+06 ) sec^-1 MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 -TOTAL : 1.789449 sec -INFO: No Floating Point Exceptions have been reported - 4,972,642,094 cycles # 2.772 GHz - 9,995,367,434 instructions # 2.01 insn per cycle - 1.795052964 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3805) (512y: 2) (512z: 0) +TOTAL : 1.262360 sec + 3,416,193,686 cycles # 2.696 GHz + 7,470,998,329 instructions # 2.19 insn per cycle + 1.267574585 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 3149) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015847e+00 -Avg ME (F77/C++) = 2.0158474864438176 -Relative difference = 2.4130988992271984e-07 +Avg ME (F77/C++) = 2.0158474886557087 +Relative difference = 2.4240714140793267e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.670992e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.000057e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.000057e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.765679e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.496653e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.496653e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072967e+00 +- 3.361967e-03 ) GeV^0 -TOTAL : 2.331763 sec -INFO: No Floating Point Exceptions have been reported - 4,369,500,962 cycles # 1.870 GHz - 8,444,287,674 instructions # 1.93 insn per cycle - 2.337616992 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2744) (512y: 4) (512z: 2754) +TOTAL : 1.637363 sec + 3,050,184,653 cycles # 1.858 GHz + 5,250,274,198 instructions # 1.72 insn per cycle + 1.642488720 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2054) (512y: 5) (512z: 2092) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015848e+00 -Avg ME (F77/C++) = 2.0158476348733529 -Relative difference = 1.8112806478434436e-07 +Avg ME (F77/C++) = 2.0158476332435384 +Relative difference = 1.8193656547763924e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt index 9029ad668b..7c64f69cf6 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd0.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:57:23 +DATE: 2025-09-24_09:29:40 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.278122e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.299718e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.972605e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.407003e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.051625e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.058988e+07 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 0.535533 sec -INFO: No Floating Point Exceptions have been reported - 2,218,013,615 cycles # 2.871 GHz - 3,167,587,965 instructions # 1.43 insn per cycle - 0.830721869 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.626628 sec + 2,745,317,317 cycles # 2.834 GHz + 4,375,733,633 instructions # 1.59 insn per cycle + 1.030270764 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 97 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 70 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/runTest_cuda.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd0/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.015836e+00 -Avg ME (F77/GPU) = 2.0158358639104246 -Relative difference = 6.751024171044779e-08 +Avg ME (F77/GPU) = 2.0158359252211273 +Relative difference = 3.709571258359381e-08 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd0/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.807535e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.852925e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.852925e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.733243e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.776197e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.776197e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 5.910224 sec -INFO: No Floating Point Exceptions have been reported - 17,388,420,068 cycles # 2.940 GHz - 46,077,588,135 instructions # 2.65 insn per cycle - 5.916245730 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 622) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.158417 sec + 17,715,939,856 cycles # 2.875 GHz + 47,246,785,246 instructions # 2.67 insn per cycle + 6.164118051 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 697) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions 
will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158359218686011 Relative difference = 3.8758807327712803e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.226882e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.387878e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.387878e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.957728e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.098130e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.098130e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 3.360909 sec -INFO: No Floating Point Exceptions have been reported - 9,940,043,952 cycles # 2.953 GHz - 27,598,360,403 instructions # 2.78 insn per cycle - 3.367569953 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2581) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.654698 sec + 10,452,582,819 cycles # 2.857 GHz + 28,908,318,583 instructions # 2.77 insn per cycle + 3.660369759 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2952) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest 
/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158359218686011 Relative difference = 3.8758807327712803e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.038546e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.426797e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.426797e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.875920e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.250078e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.250078e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 2.194996 sec -INFO: No Floating Point Exceptions have been reported - 6,084,814,623 cycles # 2.765 GHz - 12,511,133,896 instructions # 2.06 insn per cycle - 2.201688699 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2776) (512y: 0) (512z: 0) +TOTAL : 2.259915 sec + 6,093,808,686 cycles # 2.691 GHz + 12,337,204,259 instructions # 2.02 insn per cycle + 2.265459762 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2881) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158359178371690 Relative difference = 4.0758688308634e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.589922e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.068248e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.068248e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.177033e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.591064e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.591064e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 1.988387 sec -INFO: No Floating Point Exceptions have been reported - 5,540,380,764 cycles # 2.778 GHz - 11,938,541,192 instructions # 2.15 insn per cycle - 1.995322896 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2521) (512y: 146) (512z: 0) +TOTAL : 2.133158 sec + 5,748,650,852 cycles # 2.689 GHz + 11,995,867,966 instructions # 2.09 insn per cycle + 2.138612056 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2715) (512y: 49) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158359178371690 Relative difference = 4.0758688308634e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.615006e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.807457e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.807457e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.338127e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.505246e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.505246e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 3.015683 sec -INFO: No Floating Point Exceptions have been reported - 5,630,115,254 cycles # 1.863 GHz - 8,130,918,173 instructions # 1.44 insn per cycle - 3.022730001 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1671) (512y: 126) (512z: 1865) +TOTAL : 3.249823 sec + 5,782,616,223 cycles # 1.777 GHz + 7,825,984,086 instructions # 1.35 insn per cycle + 3.255283004 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1669) (512y: 69) (512z: 1937) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158359178371690 Relative difference = 4.0758688308634e-08 diff --git a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd1.txt index 44aa1a6a94..e76b553aa7 100644 --- a/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_susyggtt_mad/log_susyggtt_mad_m_inl0_hrd1.txt @@ -1,5 +1,8 @@ +MADGRAPH_CUDA_ARCHITECTURE= +MADGRAPH_HIP_ARCHITECTURE= -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx +HASBLAS=hasBlas +Building in /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx BACKEND=cpp512y (was cppauto) OMPFLAGS= FPTYPE='m' @@ -7,233 +10,212 @@ HELINL='0' HRDCOD='0' HASCURAND=hasCurand HASHIPRAND=hasNoHiprand +HASBLAS=hasBlas Building in BUILDDIR=build.auto_m_inl0_hrd0 for tag=512y_m_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR == 1) make: Nothing to be done for 'gtestlibs'. make USEBUILDDIR=1 BACKEND=cuda -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppnone -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppsse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cppavx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 BACKEND=cpp512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-10-06_09:57:48 +DATE: 2025-09-24_09:30:13 +HASBLAS=hasBlas +CUDACPP_RUNTIME_BLASCOLORSUM= +CUDACPP_RUNTIME_CUBLASTF32TENSOR= On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.5.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.308177e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.314026e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.965515e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.373817e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.047737e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.055011e+07 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 0.530653 sec -INFO: No Floating Point Exceptions have been reported - 2,220,013,015 cycles # 2.891 GHz - 3,185,773,009 instructions # 1.44 insn per cycle - 0.824701846 seconds 
time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 212 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.624694 sec + 2,775,081,370 cycles # 2.833 GHz + 4,447,666,860 instructions # 1.60 insn per cycle + 1.037097767 seconds time elapsed +......................................................................... +runNcu /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe -p 2048 256 1 +==PROF== Profiling "diagram1": launch__registers_per_thread 98 +==PROF== Profiling "diagram1": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "diagram2": launch__registers_per_thread 70 +==PROF== Profiling "diagram2": sm__sass_average_branch_targets_threads_uniform.pct 100% +==PROF== Profiling "color_sum_kernel": launch__registers_per_thread 26 +==PROF== Profiling "color_sum_kernel": sm__sass_average_branch_targets_threads_uniform.pct 0% ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/runTest_cuda.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/runTest_cuda.exe [ PASSED ] 4 tests. DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/check_cuda.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.cuda_m_inl0_hrd1/fcheck_cuda.exe 2 64 2 Avg ME (C++/GPU) = 2.015836e+00 -Avg ME (F77/GPU) = 2.0158358639104246 -Relative difference = 6.751024171044779e-08 +Avg ME (F77/GPU) = 2.0158359252211273 +Relative difference = 3.709571258359381e-08 OK (relative difference <= 5E-3) ========================================================================= -Not found: /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd1/check_hip.exe +Not found: /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.hip_m_inl0_hrd1/check_hip.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, 
FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.857128e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.905464e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.905464e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.787280e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.833233e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.833233e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 5.753526 sec -INFO: No Floating Point Exceptions have been reported - 16,958,834,547 cycles # 2.945 GHz - 45,095,701,979 instructions # 2.66 insn per cycle - 5.759360611 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 567) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.974558 sec + 17,164,367,416 cycles # 2.871 GHz + 46,097,766,780 instructions # 2.69 insn per cycle + 5.980350288 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 634) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
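The cmpExe steps above accept a sample when the relative difference between the C++/GPU and F77/GPU average MEs stays below 5E-3. The formula itself is not part of this patch; a plausible bash/awk reconstruction, consistent with the numbers printed above (2.015836e+00 vs 2.0158359252211273 giving 3.709571e-08), is:

  # Hypothetical re-derivation of the printed relative difference:
  # the actual cmpExe arithmetic is not shown in this patch.
  me1=2.015836e+00         # Avg ME (C++/GPU)
  me2=2.0158359252211273   # Avg ME (F77/GPU)
  awk -v me1=$me1 -v me2=$me2 'BEGIN {
    d = me1 - me2; if ( d < 0 ) d = -d;   # absolute difference
    print d / ( 0.5 * ( me1 + me2 ) )     # ~3.709571e-08
  }'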
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158359218686011 Relative difference = 3.8758807327712803e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.365466e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.544754e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.544754e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.980851e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.122264e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.122264e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 3.232551 sec -INFO: No Floating Point Exceptions have been reported - 9,533,065,833 cycles # 2.943 GHz - 26,273,852,197 instructions # 2.76 insn per cycle - 3.239846074 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 2386) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.626612 sec + 10,414,531,441 cycles # 2.869 GHz + 28,891,877,424 instructions # 2.77 insn per cycle + 3.632098254 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 2946) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158359218686011 Relative difference = 3.8758807327712803e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.514012e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.821697e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.821697e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.880045e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.253649e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.253649e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 2.435584 sec -INFO: No Floating Point Exceptions have been reported - 6,758,526,375 cycles # 2.768 GHz - 14,047,168,742 instructions # 2.08 insn per cycle - 2.442338814 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2895) (512y: 0) (512z: 0) +TOTAL : 2.257801 sec + 6,087,823,639 cycles # 2.691 GHz + 12,326,159,258 instructions # 2.02 insn per cycle + 2.263202472 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2865) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158359178371690 Relative difference = 4.0758688308634e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.791737e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.138604e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.138604e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.183890e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.604893e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.604893e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 2.301242 sec -INFO: No Floating Point Exceptions have been reported - 6,403,253,635 cycles # 2.776 GHz - 13,529,712,107 instructions # 2.11 insn per cycle - 2.307614270 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2531) (512y: 302) (512z: 0) +TOTAL : 2.129962 sec + 5,750,241,658 cycles # 2.694 GHz + 11,985,569,411 instructions # 2.08 insn per cycle + 2.135441578 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 2697) (512y: 49) (512z: 0) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158359178371690 Relative difference = 4.0758688308634e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW -Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +runExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe -p 2048 256 2 OMP= +Process = SIGMA_MSSM_SLHA2_GG_TTX_CPP [gcc 11.5.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.627313e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.823087e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.823087e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.334001e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.500633e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.500633e+05 ) sec^-1 MeanMatrixElemValue = ( 2.072848e+00 +- 3.360985e-03 ) GeV^0 -TOTAL : 3.002431 sec -INFO: No Floating Point Exceptions have been reported - 5,614,669,392 cycles # 1.866 GHz - 9,218,497,811 instructions # 1.64 insn per cycle - 3.009264991 seconds time elapsed -=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1456) (512y: 212) (512z: 2059) +TOTAL : 3.252743 sec + 5,780,173,629 cycles # 1.775 GHz + 7,818,044,121 instructions # 1.35 insn per cycle + 3.258145265 seconds time elapsed +=Symbols in CPPProcess_cpp.o= (~sse4: 0) (avx2: 1655) (512y: 69) (512z: 1933) ------------------------------------------------------------------------- -runTest /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest_cpp.exe -INFO: The following Floating Point Exceptions will cause SIGFPE program aborts: FE_DIVBYZERO, FE_INVALID, FE_OVERFLOW +runTest /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest_cpp.exe [ PASSED ] 4 tests. 
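Each '=Symbols in CPPProcess_cpp.o=' line above tallies SIMD instruction flavours in the compiled object; the counting code lives elsewhere in throughputX.sh and is untouched by this patch. A hypothetical sketch of such a tally (the instruction patterns here are illustrative assumptions, not the script's actual lists):

  # Hypothetical sketch: disassemble the object and count representative
  # instructions per SIMD family (patterns are assumed, not the real lists).
  obj=CPPProcess_cpp.o
  sse4=$(objdump -d $obj | grep -c 'pmulld\|blendv')
  avx2=$(objdump -d $obj | grep -c 'vfmadd\|vperm')
  echo "=Symbols in $obj= (~sse4: $sse4) (avx2: $avx2)"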
DEBUG: MEK (no multichannel) processed 512 events across 3 channels { no-multichannel : 512 } -INFO: No Floating Point Exceptions have been reported DEBUG: MEK (channelid array) processed 512 events across 3 channels { 1 : 192, 2 : 160, 3 : 160 } -INFO: No Floating Point Exceptions have been reported ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check_cpp.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2025/test-madgraph4gpu/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/fcheck_cpp.exe 2 64 2 Avg ME (C++/C++) = 2.015836e+00 Avg ME (F77/C++) = 2.0158359178371690 Relative difference = 4.0758688308634e-08 diff --git a/epochX/cudacpp/tput/teeThroughputX.sh b/epochX/cudacpp/tput/teeThroughputX.sh index 088371cb95..c4180b6725 100755 --- a/epochX/cudacpp/tput/teeThroughputX.sh +++ b/epochX/cudacpp/tput/teeThroughputX.sh @@ -1,8 +1,8 @@ #!/bin/bash -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -# Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. scrdir=$(cd $(dirname $0); pwd) bckend=$(basename $(cd $scrdir; cd ..; pwd)) # cudacpp or alpaka @@ -10,7 +10,7 @@ cd $scrdir function usage() { - echo "Usage: $0 [-nocuda] [-sa] [-noalpaka] [-dblonly|-fltonly|-d_f|-dmf] [-inl|-inlonly] [-hrd|-hrdonly] [-common|-curhst] [-rmbhst|-bridge] [-makeonly] [-makeclean] [-makej] [-dlp ]" # -nofpe is no longer supported + echo "Usage: $0 [-nocuda] [-sa] [-noalpaka] [-dblonly|-fltonly|-d_f|-dmf] [-inl|-inlonly] [-hrd|-hrdonly] [-common|-curhst] [-rmbhst|-bridge] [-noBlas|-blasOn] [-makeonly] [-makeclean] [-makej] [-scaling] [-dlp ]" # -nofpe is no longer supported exit 1 } @@ -33,8 +33,10 @@ helinls="0" hrdcods="0" rndgen= rmbsmp= +blas="" # build with blas but disable it at runtime steps="make test" makej= +scaling= ###nofpe= dlp= dlpset=0 @@ -117,6 +119,12 @@ for arg in $*; do rmbsmp=$arg elif [ "$arg" == "-bridge" ]; then rmbsmp=$arg + elif [ "$arg" == "-noBlas" ]; then # build with blas but disable it at runtime + if [ "${blas}" == "-blasOn" ]; then echo "ERROR! Options -noBlas and -blasOn are incompatible"; usage; fi + blas=$arg + elif [ "$arg" == "-blasOn" ]; then # build with blas and enable it at runtime + if [ "${blas}" == "-noBlas" ]; then echo "ERROR! 
Options -noBlas and -blasOn are incompatible"; usage; fi + blas=$arg elif [ "$arg" == "-makeonly" ]; then if [ "${steps}" == "make test" ]; then steps="make" @@ -131,6 +139,8 @@ for arg in $*; do fi elif [ "$arg" == "-makej" ]; then makej=-makej + elif [ "$arg" == "-scaling" ]; then + scaling=$arg ###elif [ "$arg" == "-nofpe" ]; then ### nofpe=-nofpe else @@ -175,6 +185,8 @@ for step in $steps; do args="${args} ${alpaka}" # optionally disable alpaka tests args="${args} ${rndgen}" # optionally use common random numbers or curand on host args="${args} ${rmbsmp}" # optionally use rambo or bridge on host + args="${args} ${scaling}" # optionally run scaling tests + args="${args} ${blas}" # optionally build with no blas or instead enable it at runtime ###args="${args} ${nofpe}" # optionally disable FPEs args="${args} ${bldall}" # avx, fptype, helinl and hrdcod are now supported for all processes if [ "${step}" == "makeclean" ]; then @@ -191,6 +203,8 @@ for step in $steps; do logfile=logs_${proc#-}_${sufflog}/log_${proc#-}_${sufflog}_${fptype}_inl${helinl}_hrd${hrdcod}.txt if [ "${rndgen}" != "" ]; then logfile=${logfile%.txt}_${rndgen#-}.txt; fi if [ "${rmbsmp}" != "" ]; then logfile=${logfile%.txt}_${rmbsmp#-}.txt; fi + if [ "${blas}" != "" ]; then logfile=${logfile%.txt}_${blas#-}.txt; fi + if [ "${scaling}" != "" ]; then logfile=${logfile%.txt}.scaling; fi printf "\n%80s\n" |tr " " "*" printf "*** ./throughputX.sh $args | tee $logfile" printf "\n%80s\n" |tr " " "*" diff --git a/epochX/cudacpp/tput/throughputX.sh b/epochX/cudacpp/tput/throughputX.sh index 68df662f58..78316f963b 100755 --- a/epochX/cudacpp/tput/throughputX.sh +++ b/epochX/cudacpp/tput/throughputX.sh @@ -1,8 +1,8 @@ #!/bin/bash -# Copyright (C) 2020-2024 CERN and UCLouvain. +# Copyright (C) 2020-2025 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -# Further modified by: A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. +# Further modified by: A. Valassi (2021-2025) for the MG5aMC CUDACPP plugin. set +x # not verbose set -e # fail on error @@ -19,7 +19,7 @@ export MG5AMC_CHANNELID_DEBUG=1 function usage() { - echo "Usage: $0 [-bldall|-nocuda|-cpponly|-cudaonly|-hiponly|-noneonly|-sse4only|-avx2only|-512yonly|-512zonly] [-sa] [-noalpaka] [-dblonly|-fltonly|-d_f|-dmf] [-inl|-inlonly] [-hrd|-hrdonly] [-common|-curhst] [-rmbhst|-bridge] [-omp] [-makeonly|-makeclean|-makecleanonly|-dryrun] [-makej] [-3a3b] [-div] [-req] [-detailed] [-gtest(default)|-nogtest] [-v] [-dlp ]" # -nofpe is no longer supported + echo "Usage: $0 [-bldall|-nocuda|-cpponly|-cudaonly|-hiponly|-noneonly|-sse4only|-avx2only|-512yonly|-512zonly] [-sa] [-noalpaka] [-dblonly|-fltonly|-d_f|-dmf] [-inl|-inlonly] [-hrd|-hrdonly] [-common|-curhst] [-rmbhst|-bridge] [-noBlas|-blasOn] [-omp] [-makeonly|-makeclean|-makecleanonly|-dryrun] [-makej] [-3a3b] [-div] [-req] [-detailed] [-gtest(default)|-nogtest] [-scaling] [-v] [-dlp ]" # -nofpe is no longer supported exit 1 } @@ -49,7 +49,9 @@ fptypes="m" # new default #995 (was "d") helinls="0" hrdcods="0" rndgen="" -rmbsam="" +rmbsmp="" + +blas="" # build with blas but disable it at runtime maketype= makej= @@ -59,6 +61,7 @@ div=0 req=0 detailed=0 gtest= +scaling=0 ###nofpe=0 verbose=0 @@ -211,6 +214,14 @@ while [ "$1" != "" ]; do elif [ "$1" == "-bridge" ]; then rmbsmp=" -${1}" shift + elif [ "$1" == "-noBlas" ]; then # build without blas + if [ "${blas}" == "-blasOn" ]; then echo "ERROR! 
Options -noBlas and -blasOn are incompatible"; usage; fi + blas=$1 + shift + elif [ "$1" == "-blasOn" ]; then # build with blas and enable it at runtime + if [ "${blas}" == "-noBlas" ]; then echo "ERROR! Options -noBlas and -blasOn are incompatible"; usage; fi + blas=$1 + shift elif [ "$1" == "-makeonly" ] || [ "$1" == "-makeclean" ] || [ "$1" == "-makecleanonly" ] || [ "$1" == "-dryrun" ]; then if [ "${maketype}" != "" ] && [ "${maketype}" != "$1" ]; then echo "ERROR! Options -makeonly, -makeclean, -makecleanonly and -dryrun are incompatible"; usage @@ -245,6 +256,9 @@ while [ "$1" != "" ]; do fi gtest=0 shift + elif [ "$1" == "-scaling" ]; then + scaling=1 + shift ###elif [ "$1" == "-nofpe" ]; then ### nofpe=1 ### shift @@ -371,6 +385,9 @@ function showdir() echo $dir } +echo MADGRAPH_CUDA_ARCHITECTURE=${MADGRAPH_CUDA_ARCHITECTURE} +echo MADGRAPH_HIP_ARCHITECTURE=${MADGRAPH_HIP_ARCHITECTURE} + ###echo -e "\n********************************************************************************\n" printf "\n" @@ -434,6 +451,13 @@ done # PART 2 - build the executables which should be run ########################################################################## +if [ "${blas}" == "-noBlas" ]; then + export HASBLAS=hasNoBlas +else + export HASBLAS=hasBlas +fi +echo HASBLAS=${HASBLAS} + unset GTEST_ROOT unset LOCALGTEST @@ -497,6 +521,18 @@ if [ "${maketype}" != "-dryrun" ]; then printf "DATE: $(date '+%Y-%m-%d_%H:%M:%S')\n\n" fi +echo HASBLAS=${HASBLAS} + +if [ "${blas}" == "-blasOn" ]; then + export CUDACPP_RUNTIME_BLASCOLORSUM=1 +else + unset CUDACPP_RUNTIME_BLASCOLORSUM +fi +echo CUDACPP_RUNTIME_BLASCOLORSUM=${CUDACPP_RUNTIME_BLASCOLORSUM} + +unset CUDACPP_RUNTIME_CUBLASTF32TENSOR +echo CUDACPP_RUNTIME_CUBLASTF32TENSOR=${CUDACPP_RUNTIME_CUBLASTF32TENSOR} + function runExe() { exe1=$1 args="$2" @@ -507,6 +543,7 @@ function runExe() { # Optionally add other patterns here for some specific configurations (e.g. clang) if [ "${exe1%%/check_cuda*}" != "${exe1}" ] || [ "${exe1%%/check_hip*}" != "${exe1}" ]; then pattern="${pattern}|EvtsPerSec\[Matrix"; fi pattern="${pattern}|Workflow" + ###pattern="${pattern}|BLASCOLORSUM" ###pattern="${pattern}|CUCOMPLEX" ###pattern="${pattern}|COMMON RANDOM|CURAND HOST \(CUDA" pattern="${pattern}|ERROR" @@ -523,7 +560,7 @@ function runExe() { if [ "${detailed}" == "1" ]; then pattern="${pattern}|#"; fi if [ "${verbose}" == "1" ]; then set -x; fi ###perf stat -d $exe1 $args 2>&1 | grep -v "Performance counter stats" - perf stat -d $exe1 $args 2>&1 | egrep "(${pattern})" | grep -v "Performance counter stats" + perf stat -d $exe1 $args 2>&1 | egrep "(${pattern})" | grep -v "Performance counter stats" |& sed 's/.*rocdevice.cpp.*Aborting.*/rocdevice.cpp: Aborting/' set +x else # -- Older version using time @@ -539,6 +576,7 @@ function runTest() { echo "runTest $exe1" if [ "${maketype}" == "-dryrun" ]; then return; fi pattern="PASS|FAIL" + ###pattern="${pattern}|BLASCOLORSUM" pattern="${pattern}|ERROR" pattern="${pattern}|WARNING" pattern="${pattern}|Floating Point Exception" @@ -563,10 +601,12 @@ function cmpExe() { echo "ERROR! C++ calculation (C++${tag} failed"; exit 1 # expose FPE crash #1003 on HIP fi me1=$(cat ${tmp1} | grep MeanMatrix | awk '{print $4}'); cat ${tmp2} + ###cat ${tmp1} | grep BLASCOLORSUM if ! ${exef} ${argsf} 2>${tmp2} >${tmp1}; then echo "ERROR! 
Fortran calculation (F77${tag} failed"; exit 1 fi me2=$(cat ${tmp1} | grep Average | awk '{print $4}'); cat ${tmp2} + ###cat ${tmp1} | grep BLASCOLORSUM echo -e "Avg ME (C++${tag} = ${me1}\nAvg ME (F77${tag} = ${me2}" if [ "${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77${tag} returned NaN"; exit 1 @@ -588,16 +628,24 @@ function runNcu() { args="$2" args="$args$rndgen$rmbsmp" echo "runNcu $exe1 $args" - if [ "${verbose}" == "1" ]; then set -x; fi - #$(which ncu) --metrics launch__registers_per_thread,sm__sass_average_branch_targets_threads_uniform.pct --target-processes all --kernel-id "::sigmaKin:" --kernel-name-base function $exe1 $args | egrep '(sigmaKin|registers| sm)' | tr "\n" " " | awk '{print $1, $2, $3, $15, $17; print $1, $2, $3, $18, $20$19}' - set +e # do not fail on error - out=$($(which ncu) --metrics launch__registers_per_thread,sm__sass_average_branch_targets_threads_uniform.pct --target-processes all --kernel-id "::sigmaKin:" --kernel-name-base function $exe1 $args) - echo "$out" | egrep '(ERROR|WARNING)' # NB must escape $out in between quotes - set -e # fail on error (after ncu and after egrep!) - out=$(echo "${out}" | egrep '(sigmaKin|registers| sm)' | tr "\n" " ") # NB must escape $out in between quotes - echo $out | awk -v key1="launch__registers_per_thread" '{val1="N/A"; for (i=1; i<=NF; i++){if ($i==key1 && $(i+1)!="(!)") val1=$(i+2)}; print $1, $2, $3, key1, val1}' - echo $out | awk -v key1="sm__sass_average_branch_targets_threads_uniform.pct" '{val1="N/A"; for (i=1; i<=NF; i++){if ($i==key1 && $(i+1)!="(!)") val1=$(i+2)$(i+1)}; print $1, $2, $3, key1, val1}' - set +x + ###echoblas=1 + ###kernels="calculate_jamps color_sum_kernel" # before kernel splitting + kernels="diagram1 diagram2 color_sum_kernel" # with kernel splitting + ###if [ "${CUDACPP_RUNTIME_BLASCOLORSUM}" == "1" ]; then kernels="$kernels kernel"; fi # heavy to profile... + ###if [ "${CUDACPP_RUNTIME_BLASCOLORSUM}" == "1" ]; then kernels="$kernels regex:gemm"; fi # output to be improved... + for kernel in $kernels; do + if [ "${verbose}" == "1" ]; then set -x; fi + #$(which ncu) --metrics launch__registers_per_thread,sm__sass_average_branch_targets_threads_uniform.pct --target-processes all --kernel-id "::${kernel}:" --kernel-name-base function $exe1 $args | egrep '(calculate_jamps|registers| sm)' | tr "\n" " " | awk '{print $1, $2, $3, $15, $17; print $1, $2, $3, $18, $20$19}' + set +e # do not fail on error + out=$($(which ncu) --metrics launch__registers_per_thread,sm__sass_average_branch_targets_threads_uniform.pct --target-processes all --kernel-id "::${kernel}:" --kernel-name-base function $exe1 $args) + echo "$out" | egrep '(ERROR|WARNING)' # NB must escape $out in between quotes + ###if [ "${echoblas}" == "1" ]; then echo "$out" | egrep '(BLASCOLORSUM)'; echoblas=0; fi + set -e # fail on error (after ncu and after egrep!) 
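Taken together, the -noBlas/-blasOn hunks above implement a two-level switch: HASBLAS selects at build time whether BLAS support is compiled in, while CUDACPP_RUNTIME_BLASCOLORSUM enables the BLAS color sum at run time. A minimal sketch of that logic, condensed from the hunks rather than copied verbatim from the script:

  # blas is "", "-noBlas" or "-blasOn" after option parsing
  # (the two flags are rejected as incompatible if both are given)
  if [ "${blas}" == "-noBlas" ]; then
    export HASBLAS=hasNoBlas                # build without BLAS
  else
    export HASBLAS=hasBlas                  # build with BLAS (default)
  fi
  if [ "${blas}" == "-blasOn" ]; then
    export CUDACPP_RUNTIME_BLASCOLORSUM=1   # enable BLAS color sum at run time
  else
    unset CUDACPP_RUNTIME_BLASCOLORSUM      # BLAS built in but disabled
  fi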
+ out=$(echo "${out}" | egrep "(${kernel}|registers| sm)" | tr "\n" " ") # NB must escape $out in between quotes + echo $out | awk -v key1="launch__registers_per_thread" '{val1="N/A"; for (i=1; i<=NF; i++){if ($i==key1 && $(i+1)!="(!)") val1=$(i+2)}; print $1, $2, $3, key1, val1}' + echo $out | awk -v key1="sm__sass_average_branch_targets_threads_uniform.pct" '{val1="N/A"; for (i=1; i<=NF; i++){if ($i==key1 && $(i+1)!="(!)") val1=$(i+2)$(i+1)}; print $1, $2, $3, key1, val1}' + set +x + done } # Profile divergence metrics more in detail @@ -613,11 +661,11 @@ function runNcuDiv() { ###echo "runNcuDiv $exe1 $args" if [ "${verbose}" == "1" ]; then set -x; fi ###$(which ncu) --query-metrics $exe1 $args - ###$(which ncu) --metrics regex:.*branch_targets.* --target-processes all --kernel-id "::sigmaKin:" --kernel-name-base function $exe1 $args - ###$(which ncu) --metrics regex:.*stalled_barrier.* --target-processes all --kernel-id "::sigmaKin:" --kernel-name-base function $exe1 $args - ###$(which ncu) --metrics sm__sass_average_branch_targets_threads_uniform.pct,smsp__warps_launched.sum,smsp__sass_branch_targets.sum,smsp__sass_branch_targets_threads_divergent.sum,smsp__sass_branch_targets_threads_uniform.sum --target-processes all --kernel-id "::sigmaKin:" --kernel-name-base function $exe1 $args | egrep '(sigmaKin| sm)' | tr "\n" " " | awk '{printf "%29s: %-51s %s\n", "", $18, $19; printf "%29s: %-51s %s\n", "", $22, $23; printf "%29s: %-51s %s\n", "", $20, $21; printf "%29s: %-51s %s\n", "", $24, $26}' - #$(which ncu) --metrics sm__sass_average_branch_targets_threads_uniform.pct,smsp__warps_launched.sum,smsp__sass_branch_targets.sum,smsp__sass_branch_targets_threads_divergent.sum,smsp__sass_branch_targets_threads_uniform.sum,smsp__sass_branch_targets.sum.per_second,smsp__sass_branch_targets_threads_divergent.sum.per_second,smsp__sass_branch_targets_threads_uniform.sum.per_second --target-processes all --kernel-id "::sigmaKin:" --kernel-name-base function $exe1 $args | egrep '(sigmaKin| sm)' | tr "\n" " " | awk '{printf "%29s: %-51s %-10s %s\n", "", $18, $19, $22$21; printf "%29s: %-51s %-10s %s\n", "", $28, $29, $32$31; printf "%29s: %-51s %-10s %s\n", "", $23, $24, $27$26; printf "%29s: %-51s %s\n", "", $33, $35}' - out=$($(which ncu) --metrics sm__sass_average_branch_targets_threads_uniform.pct,smsp__warps_launched.sum,smsp__sass_branch_targets.sum,smsp__sass_branch_targets_threads_divergent.sum,smsp__sass_branch_targets_threads_uniform.sum,smsp__sass_branch_targets.sum.per_second,smsp__sass_branch_targets_threads_divergent.sum.per_second,smsp__sass_branch_targets_threads_uniform.sum.per_second --target-processes all --kernel-id "::sigmaKin:" --kernel-name-base function $exe1 $args | egrep '(sigmaKin| sm)' | tr "\n" " ") + ###$(which ncu) --metrics regex:.*branch_targets.* --target-processes all --kernel-id "::calculate_jamps:" --kernel-name-base function $exe1 $args + ###$(which ncu) --metrics regex:.*stalled_barrier.* --target-processes all --kernel-id "::calculate_jamps:" --kernel-name-base function $exe1 $args + ###$(which ncu) --metrics sm__sass_average_branch_targets_threads_uniform.pct,smsp__warps_launched.sum,smsp__sass_branch_targets.sum,smsp__sass_branch_targets_threads_divergent.sum,smsp__sass_branch_targets_threads_uniform.sum --target-processes all --kernel-id "::calculate_jamps:" --kernel-name-base function $exe1 $args | egrep '(calculate_jamps| sm)' | tr "\n" " " | awk '{printf "%29s: %-51s %s\n", "", $18, $19; printf "%29s: %-51s %s\n", "", $22, $23; printf "%29s: %-51s %s\n", 
"", $20, $21; printf "%29s: %-51s %s\n", "", $24, $26}' + #$(which ncu) --metrics sm__sass_average_branch_targets_threads_uniform.pct,smsp__warps_launched.sum,smsp__sass_branch_targets.sum,smsp__sass_branch_targets_threads_divergent.sum,smsp__sass_branch_targets_threads_uniform.sum,smsp__sass_branch_targets.sum.per_second,smsp__sass_branch_targets_threads_divergent.sum.per_second,smsp__sass_branch_targets_threads_uniform.sum.per_second --target-processes all --kernel-id "::calculate_jamps:" --kernel-name-base function $exe1 $args | egrep '(calculate_jamps| sm)' | tr "\n" " " | awk '{printf "%29s: %-51s %-10s %s\n", "", $18, $19, $22$21; printf "%29s: %-51s %-10s %s\n", "", $28, $29, $32$31; printf "%29s: %-51s %-10s %s\n", "", $23, $24, $27$26; printf "%29s: %-51s %s\n", "", $33, $35}' + out=$($(which ncu) --metrics sm__sass_average_branch_targets_threads_uniform.pct,smsp__warps_launched.sum,smsp__sass_branch_targets.sum,smsp__sass_branch_targets_threads_divergent.sum,smsp__sass_branch_targets_threads_uniform.sum,smsp__sass_branch_targets.sum.per_second,smsp__sass_branch_targets_threads_divergent.sum.per_second,smsp__sass_branch_targets_threads_uniform.sum.per_second --target-processes all --kernel-id "::calculate_jamps:" --kernel-name-base function $exe1 $args | egrep '(calculate_jamps| sm)' | tr "\n" " ") ###echo $out echo $out | awk -v key1="smsp__sass_branch_targets.sum" '{key2=key1".per_second"; val1="N/A"; val2=""; for (i=1; i<=NF; i++){if ($i==key1 && $(i+1)!="(!)") val1=$(i+1); if ($i==key2 && $(i+1)!="(!)") val2=$(i+2)$(i+1)}; printf "%29s: %-51s %-10s %s\n", "", key1, val1, val2}' echo $out | awk -v key1="smsp__sass_branch_targets_threads_uniform.sum" '{key2=key1".per_second"; val1="N/A"; val2=""; for (i=1; i<=NF; i++){if ($i==key1 && $(i+1)!="(!)") val1=$(i+1); if ($i==key2 && $(i+1)!="(!)") val2=$(i+2)$(i+1)}; printf "%29s: %-51s %-10s %s\n", "", key1, val1, val2}' @@ -637,7 +685,7 @@ function runNcuReq() { for args in "-p 1 1 1" "-p 1 4 1" "-p 1 8 1" "-p 1 32 1" "$ncuArgs"; do ###echo "runNcuReq $exe1 $args" # NB This will print nothing if $args are invalid (eg "-p 1 4 1" when neppR=8) - $(which ncu) --metrics l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum,l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum,launch__registers_per_thread,sm__sass_average_branch_targets_threads_uniform.pct --target-processes all --kernel-id "::sigmaKin:" --kernel-name-base function $exe1 $args | egrep '(sigmaKin|registers| sm|l1tex)' | tr "\n" " " | awk -vtag="[$args]" '{print $1, $2, $3, $16"s", $17";", $19"s", $20, tag}' + $(which ncu) --metrics l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum,l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum,launch__registers_per_thread,sm__sass_average_branch_targets_threads_uniform.pct --target-processes all --kernel-id "::calculate_jamps:" --kernel-name-base function $exe1 $args | egrep '(calculate_jamps|registers| sm|l1tex)' | tr "\n" " " | awk -vtag="[$args]" '{print $1, $2, $3, $16"s", $17";", $19"s", $20, tag}' done set +x } @@ -659,10 +707,19 @@ else fi echo -e "On $HOSTNAME [CPU: $cpuTxt] [GPU: $gpuTxt]:" +# Configure scaling tests +if [ "${scaling}" == "0" ]; then # no scaling tests (throughput tests only) + exesSc= +elif [ "${scaling}" == "1" ]; then # scaling tests only (skip throughput tests) + exesSc=$exes + exes= +fi + # These two settings are needed by BMK containers: do not change them BMKEXEARGS="" # if BMKEXEARGS is set, exeArgs is set equal to BMKEXEARGS, while exeArgs2 is set to "" BMKMULTIPLIER=1 # the pre-defined numbers of iterations (including 
those in BMKEXEARGS) are multiplied by BMKMULTIPLIER +# (1) TRADITIONAL THROUGHPUT TESTS ###lastExe= lastExeDir= ###echo "exes=$exes" @@ -699,7 +756,7 @@ for exe in $exes; do exeArgs="-p 1 256 2" ncuArgs="-p 1 256 1" # For smeftggtttt, use the same settings as for ggttggg (may be far too short!) - exeArgs2="-p 64 256 1" + ###exeArgs2="-p 64 256 1" # Sep 2025: aborts (and not needed as the plateau is reached earlier) with helicity streams and diagram kernel splitting elif [ "${exe%%/susy_gg_tt*}" != "${exe}" ]; then # For susyggtt, use the same settings as for SM ggtt exeArgs="-p 2048 256 2" @@ -713,27 +770,27 @@ for exe in $exes; do exeArgs="-p 64 256 10" ncuArgs="-p 64 256 1" # For gqttq, use the same settings as for ggttg - exeArgs2="-p 2048 256 1" + ###exeArgs2="-p 2048 256 1" # Sep 2025: remove this for consistency as all others were removed (and not needed as the plateau is reached earlier) elif [ "${exe%%/gg_ttggg*}" != "${exe}" ]; then # For ggttggg: this is far too little for GPU (4.8E2), but it keeps the CPU to a manageble level (1sec with 512y) ###exeArgs="-p 1 256 1" # too short! see https://its.cern.ch/jira/browse/BMK-1056 exeArgs="-p 1 256 2" ncuArgs="-p 1 256 1" # For ggttggg: on GPU test also "64 256" to reach the plateau (only ~5% lower than "2048 256": 1.18E4 vs 1.25E4 on cuda116/gcc102) - exeArgs2="-p 64 256 1" + ###exeArgs2="-p 64 256 1" # Sep 2025: aborts (and not needed as the plateau is reached earlier) with helicity streams and diagram kernel splitting elif [ "${exe%%/gg_ttgg*}" != "${exe}" ]; then # For ggttgg (OLD): this is a good GPU middle point: tput is 1.5x lower with "32 256 1", only a few% higher with "128 256 1" exeArgs="-p 64 256 1" ncuArgs="-p 64 256 1" # For ggttgg (NEW): on GPU test both "64 256" and "2048 256" for ggttgg as the latter gives ~10% higher throughput on cuda110/gcc92 - exeArgs2="-p 2048 256 1" + ###exeArgs2="-p 2048 256 1" # Sep 2025: aborts (and not needed as the plateau is reached earlier) with helicity streams elif [ "${exe%%/gg_ttg*}" != "${exe}" ]; then # For ggttg, as on ggttgg: this is a good GPU middle point: tput is 1.5x lower with "32 256 1", only a few% higher with "128 256 1" ###exeArgs="-p 64 256 1" # too short! see https://its.cern.ch/jira/browse/BMK-1056 exeArgs="-p 64 256 10" ncuArgs="-p 64 256 1" # For ggttg, as on ggttgg: on GPU test both "64 256" and "2048 256" for ggttg as the latter gives ~10% higher throughput on cuda110/gcc92 - exeArgs2="-p 2048 256 1" + ###exeArgs2="-p 2048 256 1" # Sep 2025: aborts (and not needed as the plateau is reached earlier) with helicity streams and diagram kernel splitting elif [ "${exe%%/gg_tt*}" != "${exe}" ]; then ###exeArgs="-p 2048 256 1" # too short! see https://its.cern.ch/jira/browse/BMK-1056 exeArgs="-p 2048 256 2" @@ -760,9 +817,16 @@ for exe in $exes; do unset OMP_NUM_THREADS fi elif [[ "${exe%%/check_cuda*}" != "${exe}" || "${exe%%/check_hip*}" != "${exe}" ]] || [ "${exe%%/alpcheck*}" != "${exe}" ]; then + echo "........................................................................." runNcu $exe "$ncuArgs" - if [ "${div}" == "1" ]; then runNcuDiv $exe; fi - if [ "${req}" == "1" ]; then runNcuReq $exe "$ncuArgs"; fi + if [ "${div}" == "1" ]; then + echo "........................................................................." + runNcuDiv $exe + fi + if [ "${req}" == "1" ]; then + echo "........................................................................." 
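The runNcu rewrite above follows the diagram kernel splitting: the former monolithic sigmaKin kernel no longer exists, so ncu is now invoked once per split kernel (diagram1, diagram2, color_sum_kernel). A condensed sketch of that loop, with the awk post-processing of the metrics omitted:

  exe=./build.cuda_m_inl0_hrd1/check_cuda.exe   # example executable from the logs above
  args="-p 2048 256 1"
  kernels="diagram1 diagram2 color_sum_kernel"  # with kernel splitting
  for kernel in $kernels; do
    ncu --metrics launch__registers_per_thread,sm__sass_average_branch_targets_threads_uniform.pct \
        --target-processes all --kernel-id "::${kernel}:" --kernel-name-base function $exe $args
  done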
+ runNcuReq $exe "$ncuArgs" + fi if [ "${exeArgs2}" != "" ]; then echo "........................................................................."; runExe $exe "$exeArgs2"; fi fi if [ "${gtest}" == "1" ]; then @@ -777,6 +841,46 @@ for exe in $exes; do cmpExe $exe fi done +###echo "=========================================================================" + +# (2) SCALING TESTS +lastExeDir= +for exe in $exesSc; do + if [ "$(basename $(dirname $exe))" != "$lastExeDir" ]; then + echo "=========================================================================" + lastExeDir=$(basename $(dirname $exe)) + else + echo "-------------------------------------------------------------------------" + fi + echo "scalingTest $exe" + if [ ! -f $exe ]; then echo "Not found: $exe"; continue; fi + if [ "${unamep}" != "x86_64" ]; then + if [ "${exe/build.avx2}" != "${exe}" ]; then echo "$exe is not supported on ${unamep}"; continue; fi + if [ "${exe/build.512y}" != "${exe}" ]; then echo "$exe is not supported on ${unamep}"; continue; fi + if [ "${exe/build.512z}" != "${exe}" ]; then echo "$exe is not supported on ${unamep}"; continue; fi + elif [ "${unames}" == "Darwin" ]; then + if [ "${exe/build.512y}" != "${exe}" ]; then echo "$exe is not supported on ${unames}"; continue; fi + if [ "${exe/build.512z}" != "${exe}" ]; then echo "$exe is not supported on ${unames}"; continue; fi + elif [ "$(grep -m1 -c avx512vl /proc/cpuinfo)" != "1" ]; then + if [ "${exe/build.512y}" != "${exe}" ]; then echo "$exe is not supported (no avx512vl in /proc/cpuinfo)"; continue; fi + if [ "${exe/build.512z}" != "${exe}" ]; then echo "$exe is not supported (no avx512vl in /proc/cpuinfo)"; continue; fi + fi + exeDir=$(dirname $exe) + cd $exeDir/.. # workaround for reading '../../Cards/param_card.dat' without setting MG5AMC_CARD_PATH + unset OMP_NUM_THREADS + # Scaling test with 256 threads per block + if [[ "${exe%%/check_cuda*}" != "${exe}" || "${exe%%/check_hip*}" != "${exe}" ]]; then + echo "### GPU: scaling test 256" + for b in 1 2 4 8 16 32 64 128 256 512 1024; do ( $exe -p $b 256 1 | \grep "EvtsPerSec\[MECalcOnly\]" | awk -vb=$b "{printf \"%s %4d %3d\n\", \$5, b, 256}" ) |& sed "s/Gpu.*Assert/Assert/" |& sed 's/.*rocdevice.cpp.*Aborting.*/rocdevice.cpp: Aborting/'; done + echo "### GPU: scaling test 32" + for b in 1 2 4 8 16 32 64 128 256 512 1024 2048 4096 8192; do ( $exe -p $b 32 1 | \grep "EvtsPerSec\[MECalcOnly\]" | awk -vb=$b "{printf \"%s %4d %3d\n\", \$5, b, 32}" ) |& sed "s/Gpu.*Assert/Assert/" |& sed 's/.*rocdevice.cpp.*Aborting.*/rocdevice.cpp: Aborting/'; done + else + echo "### CPU: scaling test 256" + for b in 1 2 4; do ( $exe -p $b 256 1 | \grep "EvtsPerSec\[MECalcOnly\]" | awk -vb=$b "{printf \"%s %4d %3d\n\", \$5, b, 256}" ); done + echo "### CPU: scaling test 32" + for b in 1 2 4; do ( $exe -p $b 32 1 | \grep "EvtsPerSec\[MECalcOnly\]" | awk -vb=$b "{printf \"%s %4d %3d\n\", \$5, b, 32}" ); done + fi +done echo "=========================================================================" # Workaround for reading of data files
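Aside: the key/value scan used by runNcuDiv above is a reusable pattern. ncu prints one metric per line; the script flattens the table with tr '\n' ' ' and then walks the fields in awk, taking the field after the metric name and skipping values that ncu marks as unavailable with "(!)". A minimal standalone sketch of that pattern (the parse_metric helper name and the sample flattened string are illustrative, not part of the script):

#!/bin/bash
# Sketch of the runNcuDiv key/value scan: look up one metric in a table that
# has already been flattened into a single whitespace-separated line.
parse_metric() {
  local key=$1; shift
  echo "$@" | awk -v key="$key" '
    { val="N/A"
      # take the field after the key, unless ncu flagged the metric with "(!)"
      for (i=1; i<=NF; i++) if ($i==key && $(i+1)!="(!)") val=$(i+1)
      printf "%-51s %s\n", key, val }'
}
out="smsp__warps_launched.sum 32 smsp__sass_branch_targets.sum 123456" # hypothetical flattened ncu output
parse_metric smsp__warps_launched.sum $out
parse_metric smsp__sass_branch_targets.sum $out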

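The scaling tests added in the hunk above reduce each run to one "throughput, #blocks, #threads" record. Stripped of the per-platform guards and the GPU abort workarounds, the shape of the sweep is the loop below (a sketch: it assumes an executable taking the check_cuda-style "-p <blocks> <threads> <iterations>" arguments and printing an "EvtsPerSec[MECalcOnly] ... = ( <value> ) sec^-1" line, from which awk field $5 extracts the value, as in the script itself):

#!/bin/bash
# Sketch of the GPU scaling sweep: double the number of thread blocks at each
# step and tabulate the throughput reported by the executable itself.
exe=$1
tpb=${2:-256} # threads per block (the tests above sweep both 256 and 32)
for (( b=1; b<=1024; b*=2 )); do
  tput=$($exe -p $b $tpb 1 | grep 'EvtsPerSec\[MECalcOnly\]' | awk '{print $5}')
  printf '%-12s %5d blocks x %4d threads/block\n' "${tput:-n/a}" "$b" "$tpb"
done

Plotting throughput against the number of blocks makes the plateau mentioned in the Sep 2025 comments visible: past a certain grid size, doubling the block count no longer increases EvtsPerSec[MECalcOnly].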
       Postscript Diagrams for $proc\<\/A\> \ \n";
   for($j=1;$j<$pages;$j++){
-      print PAGE "\\"Page \ \n";
+      print PAGE "\\"Page \ \n";
   }#end of for
 #
-# In case I didn't include all of the diagrams as jpeg, warn user
+# In case I didn't include all of the diagrams as PNG, warn user
 #
-  if (-e "matrix$imatrix$max_jpg.jpg" ) {
-      print PAGE "